Python快速落地的临床知识问答与检索项目(2025年9月教学实现部分)
项目代码实现与工程化
数据加载模块(loader.py)实现
临床知识问答与检索项目的核心需求之一,是高效处理多种格式的医疗文档,包括 PDF、DOCX 以及各类表格文件(CSV、XLSX、XLS)。
以下是 loader.py 的实现方案,支持批量解析上述文件并返回带元数据的文档列表,同时兼顾非结构化文本处理与医学文档元数据提取的专业需求:
import os
import re
from pathlib import Path
from typing import List, Dict, Any
import PyPDF2
from docx import Document
import pandas as pd
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.docx import partition_docx
from unstructured.table import Table
from unstructured.documents.elements import Text, Titleclass ClinicalDocumentLoader:"""医疗文档加载器,支持PDF/DOCX/表格批量解析并保留元数据"""def __init__(self, data_dir: str = "data/clinical_docs"):self.data_dir = Path(data_dir)self.supported_extensions = {".pdf", ".docx", ".csv", ".xlsx", ".xls"}self.metadata_keys = [[15]()][[15]()][[15]()][[15]()][[15]()]def _extract_pdf_metadata(self, file_path: Path) -> List[Dict[str, Any]]:"""解析PDF文档,提取文本内容与元数据"""elements = partition_pdf(filename=str(file_path),strategy="hi_res",extract_images_in_pdf=False,infer_table_structure=True)docs_with_meta = []current_section = "Unknown"page_number = 1for element in elements:# 提取页码信息(从元素元数据中获取)if hasattr(element, "metadata") and "page_number" in element.metadata:page_number = element.metadata[[15]()]# 识别标题元素作为章节划分依据if isinstance(element, Title):current_section = element.text.strip()continue# 处理表格元素if isinstance(element, Table):text = element.text# 转换表格为Markdown格式以便LLM处理markdown_table = self._table_to_markdown(element)docs_with_meta.append({"text": markdown_table,"metadata": {"source": str(file_path),"file_type": "pdf","page_number": page_number,"section": current_section,"document_type": "table"}})continue# 处理文本元素if isinstance(element, Text) and element.text.strip():docs_with_meta.append({"text": element.text.strip(),"metadata": {"source": str(file_path),"file_type": "pdf","page_number": page_number,"section": current_section,"document_type": "text"}})return docs_with_metadef _extract_docx_metadata(self, file_path: Path) -> List[Dict[str, Any]]:"""解析DOCX文档,提取文本内容与元数据"""elements = partition_docx(filename=str(file_path))docs_with_meta = []current_section = "Unknown"for element in elements:# 识别标题元素作为章节划分依据if isinstance(element, Title):current_section = element.text.strip()continue# 处理表格元素if isinstance(element, Table):markdown_table = self._table_to_markdown(element)docs_with_meta.append({"text": markdown_table,"metadata": {"source": str(file_path),"file_type": "docx","section": 
current_section,"document_type": "table"}})continue# 处理文本元素if isinstance(element, Text) and element.text.strip():docs_with_meta.append({"text": element.text.strip(),"metadata": {"source": str(file_path),"file_type": "docx","section": current_section,"document_type": "text"}})return docs_with_metadef _extract_table_metadata(self, file_path: Path) -> List[Dict[str, Any]]:"""解析表格文件(CSV/XLSX),提取内容与元数据"""docs_with_meta = []file_ext = file_path.suffix.lower()try:if file_ext == ".csv":df = pd.read_csv(file_path)elif file_ext in [[15]()][[15]()]:df = pd.read_excel(file_path)else:return []# 将DataFrame转换为Markdown表格markdown_table = df.to_markdown(index=False)docs_with_meta.append({"text": markdown_table,"metadata": {"source": str(file_path),"file_type": file_ext[1:],"sheet_nam