当前位置：首页 > news >正文

mineru处理后的文档生成对应层级的标题

news 2025/10/7 7:30:27

MinerU 可以把 PDF 文档转换为 Markdown 文档，但转换后的标题全部是一级标题。
下面的脚本对 MinerU 处理后的文档进行再处理，根据标题的编号（如 1.2.3）生成对应层级的标题。

# md_head.py
import re
import sys

# Optional Markdown heading marker already present on the line (MinerU emits
# every title as "# ..."); it is discarded and re-emitted at the right depth.
_HASH_PREFIX = r'\s*(?:#+\s*)?'

# Chapter heading: "<digits> 第..." (e.g. "1 第一章 绪论").
_CHAPTER_RE = re.compile(_HASH_PREFIX + r'(\d+)\s*第\s*(.*)')

# Numbered section heading: "<n>.<n>[.<n>...]" followed by arbitrary text.
# Group 1 is the numbering only, so dots in the title text are never counted.
_SECTION_RE = re.compile(_HASH_PREFIX + r'(\d+(?:\.\d+)+)(.*)')

# Markdown (ATX) headings go from "#" to "######".
_MAX_MARKDOWN_LEVEL = 6


def is_valid_title(title: str) -> bool:
    """Return True if *title* looks like a numbered heading.

    Two shapes are recognised, each optionally preceded by whitespace and an
    existing run of ``#`` characters:

    1. Chapter heading: digits, optional whitespace, then ``第``
       (e.g. ``"1 第一章"``).
    2. Section heading: dot-separated numbers such as ``1.2`` or ``1.2.3``
       followed by any text.

    The check is purely syntactic; body text that happens to start with a
    decimal number (``"3.14 is pi"``) is also reported as a title.
    """
    return bool(_CHAPTER_RE.fullmatch(title) or _SECTION_RE.fullmatch(title))


def convert_to_markdown_header(title: str) -> str:
    """Return *title* as a Markdown heading whose depth matches its numbering.

    * Chapter headings (``"1 第一章 ..."``) become level-1: ``"# 1 第一章 ..."``.
    * Section headings get one ``#`` per number in the numbering prefix:
      ``"1.2 ..."`` -> ``"## 1.2 ..."``, ``"1.2.3 ..."`` -> ``"### 1.2.3 ..."``.

    Any leading whitespace and pre-existing ``#`` markers are replaced by the
    computed marker. Lines that are not numbered headings, or whose numbering
    is deeper than six levels, are returned unchanged.
    """
    chapter = _CHAPTER_RE.fullmatch(title)
    if chapter:
        number = chapter.group(1)
        text = chapter.group(2).strip()
        return f"# {number} 第{text}"

    section = _SECTION_RE.fullmatch(title)
    if section is None:
        return title  # not a numbered heading: leave the line untouched

    # Depth comes from the numbering prefix only ("1.2.3" -> 3), never from
    # dots that appear later in the title text ("1.1 Intro to v2.0" -> 2).
    level = section.group(1).count('.') + 1
    if level > _MAX_MARKDOWN_LEVEL:
        return title  # deeper than Markdown supports: leave as-is

    return '#' * level + ' ' + title[section.start(1):]


def process_file(input_file: str, output_file: str) -> None:
    """Read *input_file*, re-level its numbered headings, write *output_file*.

    Both files are UTF-8. Every line is passed through
    ``convert_to_markdown_header``; non-heading lines come back unchanged.
    Splitting and re-joining on ``"\\n"`` keeps the presence (or absence) of a
    trailing newline exactly as in the input.
    """
    with open(input_file, 'r', encoding='utf-8') as f_in:
        lines = f_in.read().split('\n')

    converted = [convert_to_markdown_header(line) for line in lines]

    with open(output_file, 'w', encoding='utf-8') as f_out:
        f_out.write('\n'.join(converted))


def main() -> int:
    """Command-line entry point; returns the process exit status."""
    if len(sys.argv) != 3:
        print("用法: python md_head.py <输入文件> <输出文件>")
        return 1

    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    try:
        process_file(input_filename, output_filename)
    except FileNotFoundError:
        print(f"错误：文件 {input_filename} 不存在")
        return 1
    except Exception as e:  # top-level boundary: report and fail
        print(f"处理过程中发生错误：{e}")
        return 1

    print(f"处理完成，结果已保存到 {output_filename}")
    return 0


if __name__ == "__main__":
    sys.exit(main())