mineru处理后的文档生成对应层级的标题
MinerU 可以把 PDF 文档转换为 Markdown 文档,但转换结果中的标题全部是一级标题。
本脚本对 MinerU 处理后的文档进行再处理,根据标题编号生成对应层级的标题。
# md_head.py
import re
import sys

# Markdown (ATX) headings go from "#" to "######".
MAX_HEADER_LEVEL = 6

# "<digits> 第..." -> chapter-style title, always rendered as a level-1 header.
# The patterns are used with fullmatch(), so no ^/$ anchors are needed.
_LEVEL1_RE = re.compile(r'\s*(\d+)\s*第\s*(.*)')

# "a.b", "a.b.c", ... (single-digit components, 2 to 6 of them), then any text.
_MULTI_LEVEL_RE = re.compile(r'\s*\d(?:\.\d){1,5}\s*.*')

# The leading dotted number of a title, e.g. "1.2.3" in "1.2.3 Scope v2.0".
_NUMBERING_RE = re.compile(r'\s*(\d+(?:\.\d+)*)')

# A heading marker already present on the line, e.g. the "# " in "# 1.1 Scope".
_EXISTING_MARKER_RE = re.compile(r'\s*#{1,6}\s+')


def is_valid_title(title: str) -> bool:
    """Return True if *title* looks like a numbered heading.

    Two shapes are recognised:

    1. Level-1 title: digits, optional whitespace, then the character "第"
       (for example "1 第一章").
    2. Multi-level title: "a.b" up to "a.b.c.d.e.f" where every component is
       a single digit, optionally followed by arbitrary text.
    """
    return bool(_LEVEL1_RE.fullmatch(title) or _MULTI_LEVEL_RE.fullmatch(title))


def convert_to_markdown_header(title: str) -> str:
    """Convert a numbered title into a Markdown header line.

    * "<n> 第..." becomes "# <n> 第...".
    * "a.b.c ..." becomes a header whose level equals the number of
      components in the leading numbering ("1.2" -> "##", "1.2.3" -> "###").

    Titles that match neither shape, or whose numbering is deeper than
    MAX_HEADER_LEVEL, are returned unchanged.
    """
    level1_match = _LEVEL1_RE.fullmatch(title)
    if level1_match:
        number = level1_match.group(1)
        rest = level1_match.group(2).strip()
        return f"# {number} 第{rest}"

    if not _MULTI_LEVEL_RE.fullmatch(title):
        return title

    # Count dots in the leading numbering only. Splitting the whole title on
    # "." would let dots in the heading text ("1.2 Version 3.5") or a trailing
    # dot ("1.2. Scope") inflate the level.
    numbering = _NUMBERING_RE.match(title).group(1)
    level = numbering.count('.') + 1
    if level > MAX_HEADER_LEVEL:
        return title
    return f"{'#' * level} {title.strip()}"


def _convert_line(line: str) -> str:
    """Return *line* with its heading level fixed, or unchanged if not a title."""
    # Look past an existing "#" marker so that a line such as "# 1.1 Scope"
    # is classified by its numbering rather than skipped.
    marker = _EXISTING_MARKER_RE.match(line)
    candidate = line[marker.end():] if marker else line
    if not is_valid_title(candidate):
        return line
    converted = convert_to_markdown_header(candidate)
    # If nothing was converted (numbering too deep), keep the original line
    # so an existing marker is not silently dropped.
    return line if converted == candidate else converted


def process_file(input_file: str, output_file: str):
    """Read *input_file*, fix heading levels, and write the result to *output_file*.

    Both files are handled as UTF-8. Lines that are not numbered titles are
    copied through untouched, and a trailing newline at the end of the input
    is preserved. The input is read completely before the output is opened,
    so both arguments may name the same file.
    """
    with open(input_file, 'r', encoding='utf-8') as f_in:
        text = f_in.read()

    # split/join on "\n" round-trips the text exactly, including whether or
    # not the file ends with a newline.
    converted_lines = [_convert_line(line) for line in text.split('\n')]

    with open(output_file, 'w', encoding='utf-8') as f_out:
        f_out.write('\n'.join(converted_lines))


def main(argv=None) -> int:
    """Command-line entry point.

    *argv* is the list of arguments without the program name; when None,
    sys.argv[1:] is used. Returns the process exit status: 0 on success,
    1 on a usage error or when processing fails.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 2:
        print("用法: python md_head.py <输入文件> <输出文件>", file=sys.stderr)
        return 1

    input_filename, output_filename = args
    try:
        process_file(input_filename, output_filename)
    except FileNotFoundError as e:
        # The missing path may be the output location, so report the path
        # the OS actually complained about when it is available.
        print(f"错误:文件 {e.filename or input_filename} 不存在", file=sys.stderr)
        return 1
    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit status.
        print(f"处理过程中发生错误:{e}", file=sys.stderr)
        return 1

    print(f"处理完成,结果已保存到 {output_filename}")
    return 0


if __name__ == "__main__":
    sys.exit(main())