使用Python一站式提取Word、Excel、PDF 和PPT文档内容v1.0
代码介绍
本文旨在系统介绍如何利用Python主流库从四种常见格式文档中提取原始文本
源码和打包好的软件均放在文末。
相关软件安装
Python安装
第三方库安装
在cmd界面输入:
pip install python-docx openpyxl pdfminer.six python-pptx PyQt5
程序打包
可以用这个一键打包文档代码
完整代码
import os
import sys
from docx import Document
import openpyxl
from pdfminer.high_level import extract_text
from pptx import Presentation
from PyQt5.QtWidgets import (QApplication, QMainWindow, QPushButton, QLabel, QFileDialog, QTextEdit, QVBoxLayout, QHBoxLayout, QWidget, QProgressBar, QMessageBox, QGroupBox,QRadioButton, QButtonGroup)
from PyQt5.QtCore import Qt, QThread, pyqtSignal, QObject
from PyQt5.QtGui import QFont


class BatchExtractorThread(QThread):
    """Worker thread that extracts text from a single file or a folder tree.

    Signals:
        progress_updated(int): overall progress in percent.
        batch_finished(str, int, int): summary text, success count, failure
            count. Emitted exactly once per run, including when the run is
            stopped early or fails, so the GUI can always restore its state.
        log_message(str): one line for the operation log.
    """

    progress_updated = pyqtSignal(int)
    batch_finished = pyqtSignal(str, int, int)
    log_message = pyqtSignal(str)

    def __init__(self, input_path, output_path, is_folder=False, parent=None):
        """Store the job description.

        Args:
            input_path: source file, or source folder when ``is_folder``.
            output_path: target text file, or target root folder when
                ``is_folder``.
            is_folder: True to walk ``input_path`` recursively.
            parent: optional Qt parent object.
        """
        super().__init__(parent)
        self.input_path = input_path
        self.output_path = output_path
        self.is_folder = is_folder
        self.supported_exts = ['.docx', '.xlsx', '.pdf', '.pptx']
        self.success_count = 0
        self.fail_count = 0
        self.is_running = False  # cleared by stop() to request early exit
        # Output files written during this run (case-normalised), used to
        # avoid overwriting when two inputs share a base name.
        self._written_outputs = set()

    def run(self):
        """Thread entry point: process the file or folder and report results."""
        self.is_running = True
        try:
            self.progress_updated.emit(5)
            if self.is_folder:
                self.log_message.emit(f"开始批量处理文件夹: {self.input_path}")
                file_list = self.get_supported_files(self.input_path)
                total_files = len(file_list)
                if total_files == 0:
                    self.log_message.emit("文件夹中未找到支持格式的文档")
                    self.batch_finished.emit("未找到可处理的文档", 0, 0)
                    self.progress_updated.emit(100)
                    return
                self.log_message.emit(f"共发现 {total_files} 个支持格式的文档")
                for idx, file_path in enumerate(file_list):
                    if not self.is_running:
                        # Stop was requested: still emit batch_finished so the
                        # main window re-enables its controls only after the
                        # worker has really stopped touching files.
                        self.log_message.emit("提取任务已被终止")
                        summary = (
                            f"提取已终止!已处理 {idx} 个文件,"
                            f"成功 {self.success_count} 个,失败 {self.fail_count} 个"
                        )
                        self.batch_finished.emit(
                            summary, self.success_count, self.fail_count)
                        return
                    self.process_single_file(file_path, self.output_path)
                    # 5% is reserved for start-up, 90% is spread over the files.
                    progress = 5 + int((idx + 1) / total_files * 90)
                    self.progress_updated.emit(min(100, progress))
                summary = f"批量处理完成!共处理 {total_files} 个文件,成功 {self.success_count} 个,失败 {self.fail_count} 个"
                self.batch_finished.emit(summary, self.success_count, self.fail_count)
            else:
                self.log_message.emit(f"开始处理单个文件: {self.input_path}")
                if self.is_running:
                    self.process_single_file(
                        self.input_path, self.output_path, is_single=True)
                summary = f"单文件处理完成!成功: {self.success_count} 个,失败: {self.fail_count} 个"
                self.batch_finished.emit(summary, self.success_count, self.fail_count)
            self.progress_updated.emit(100)
        except Exception as e:
            # Top-level boundary of the worker: report instead of crashing.
            self.log_message.emit(f"处理过程总错误: {str(e)}")
            self.batch_finished.emit(f"处理失败: {str(e)}", 0, 1)
        finally:
            self.is_running = False

    def get_supported_files(self, folder_path):
        """Return all files under ``folder_path`` (recursive) with a supported extension."""
        supported_files = []
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_ext = os.path.splitext(file)[1].lower()
                if file_ext in self.supported_exts:
                    supported_files.append(os.path.join(root, file))
        return supported_files

    def process_single_file(self, file_path, output_root, is_single=False):
        """Extract one document and save its text; update the counters.

        Args:
            file_path: document to read.
            output_root: the output file itself when ``is_single``; otherwise
                the root folder under which the input folder structure is
                mirrored.
            is_single: True in single-file mode.

        Errors are logged and counted as failures rather than raised.
        """
        file_name = os.path.basename(file_path)
        file_ext = os.path.splitext(file_path)[1].lower()

        if is_single:
            save_path = output_root
        else:
            # Mirror the input folder layout below the output root.
            relative_path = os.path.relpath(os.path.dirname(file_path), self.input_path)
            output_dir = os.path.join(output_root, relative_path)
            os.makedirs(output_dir, exist_ok=True)
            base_name = os.path.splitext(file_name)[0]
            save_path = os.path.join(output_dir, f"{base_name}_提取结果.txt")
            # e.g. report.docx and report.pdf in the same folder would both
            # map to report_提取结果.txt; keep both by adding the extension.
            if os.path.normcase(save_path) in self._written_outputs:
                save_path = os.path.join(
                    output_dir, f"{base_name}_{file_ext.lstrip('.')}_提取结果.txt")

        extractors = {
            '.docx': self.doc_extract,
            '.xlsx': self.excel_extract,
            '.pdf': self.pdf_extract,
            '.pptx': self.ppt_extract,
        }

        try:
            self.log_message.emit(f"正在处理: {file_name}")
            extractor = extractors.get(file_ext)
            if extractor is None:
                self.log_message.emit(f"跳过不支持的文件: {file_name}")
                self.fail_count += 1
                return
            content = extractor(file_path)
            if content:
                self.save_content(content, save_path)
                self._written_outputs.add(os.path.normcase(save_path))
                self.log_message.emit(f"✅ 成功: {file_name} -> 保存至 {os.path.basename(save_path)}")
                self.success_count += 1
            else:
                # Empty extraction is counted as a failure; no file is written.
                self.log_message.emit(f"⚠️ 无内容: {file_name}(未生成输出文件)")
                self.fail_count += 1
        except Exception as e:
            self.log_message.emit(f"❌ 失败: {file_name} - {str(e)}")
            self.fail_count += 1

    def doc_extract(self, file_path) -> str:
        """Return the paragraph text of a Word (.docx) file, one paragraph per line.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"文件不存在: {file_path}")
        doc = Document(file_path)
        return '\n'.join(para.text for para in doc.paragraphs)

    def excel_extract(self, file_path) -> str:
        """Return the cell values of an Excel (.xlsx) file as tab-separated rows.

        Each sheet is preceded by a header line; ``None`` cells become empty
        strings.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"文件不存在: {file_path}")
        data = []
        wb = openpyxl.load_workbook(file_path)
        try:
            for sheet in wb.sheetnames:
                data.append(f"=== 工作表: {sheet} ===")
                ws = wb[sheet]
                # Explicit bounds keep the same 1..max_row x 1..max_column
                # grid the cell-by-cell loop covered.
                rows = ws.iter_rows(
                    min_row=1, max_row=ws.max_row,
                    min_col=1, max_col=ws.max_column,
                    values_only=True,
                )
                for row in rows:
                    data.append("\t".join(
                        "" if value is None else str(value) for value in row))
        finally:
            wb.close()  # release the underlying file handle
        return '\n'.join(data)

    def pdf_extract(self, file_path) -> str:
        """Return the text of a PDF file via pdfminer's ``extract_text``.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"文件不存在: {file_path}")
        return extract_text(file_path)

    def ppt_extract(self, file_path) -> str:
        """Return the text of every shape in a PowerPoint (.pptx) file.

        Each slide is preceded by a header line with its 1-based index.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"文件不存在: {file_path}")
        content = []
        ppt = Presentation(file_path)
        for slide_idx, slide in enumerate(ppt.slides, 1):
            content.append(f"=== 幻灯片 {slide_idx} ===")
            for shape in slide.shapes:
                if hasattr(shape, 'text') and shape.text:
                    content.append(shape.text)
        return '\n'.join(content)

    def save_content(self, content, save_path) -> None:
        """Write ``content`` to ``save_path`` as UTF-8 text."""
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(content)

    def stop(self):
        """Request a cooperative stop; checked between files in folder mode."""
        self.is_running = False


class DocumentExtractorApp(QMainWindow):
    """Main window: choose single-file or folder mode, pick paths, run extraction."""

    # Carries preview text from the worker thread to the GUI thread. Widgets
    # must only be touched from the GUI thread, so the worker emits this
    # signal instead of calling result_text.setText() itself.
    preview_ready = pyqtSignal(str)

    def __init__(self):
        super().__init__()
        self.extractor_thread = None  # currently active worker, if any
        self.init_ui()
        self.preview_ready.connect(self.result_text.setText)

    def init_ui(self):
        """Build all widgets and layouts and initialise the path state."""
        self.setWindowTitle("一站式提取Word、Excel、PDF 和PPT文档内容@阿幸")
        self.setGeometry(100, 100, 1000, 750)

        # Base font for Chinese text.
        base_font = QFont()
        base_font.setFamily("SimHei")
        base_font.setPointSize(10)
        self.setFont(base_font)

        main_layout = QVBoxLayout()
        main_layout.setContentsMargins(15, 15, 15, 15)
        main_layout.setSpacing(12)

        # 1. Processing mode (single file / folder).
        mode_group = QGroupBox("处理模式")
        mode_group.setFont(QFont("SimHei", 11, QFont.Bold))
        mode_layout = QHBoxLayout()
        mode_layout.setContentsMargins(20, 15, 20, 15)
        mode_layout.setSpacing(30)
        self.single_mode_radio = QRadioButton("单文件处理")
        self.folder_mode_radio = QRadioButton("文件夹批量处理")
        radio_font = QFont("SimHei", 10)
        self.single_mode_radio.setFont(radio_font)
        self.folder_mode_radio.setFont(radio_font)
        self.mode_btn_group = QButtonGroup(self)
        self.mode_btn_group.addButton(self.single_mode_radio, 0)
        self.mode_btn_group.addButton(self.folder_mode_radio, 1)
        self.single_mode_radio.setChecked(True)  # single-file mode by default
        self.mode_btn_group.buttonClicked.connect(self.switch_mode)
        mode_layout.addWidget(self.single_mode_radio)
        mode_layout.addWidget(self.folder_mode_radio)
        mode_group.setLayout(mode_layout)
        main_layout.addWidget(mode_group)

        # 2. Path selection.
        path_group = QGroupBox("路径设置")
        path_group.setFont(QFont("SimHei", 11, QFont.Bold))
        path_layout = QVBoxLayout()
        path_layout.setContentsMargins(15, 15, 15, 15)
        path_layout.setSpacing(10)

        # Input path row.
        input_layout = QHBoxLayout()
        input_layout.setSpacing(10)
        self.input_label = QLabel("未选择输入(文件/文件夹)")
        self.input_label.setWordWrap(True)
        self.input_label.setStyleSheet("border: 1px solid #ccc; padding: 8px; min-height: 35px;")
        self.input_label.setFont(QFont("SimHei", 9))
        self.select_input_btn = QPushButton("选择输入路径")
        self.select_input_btn.setFont(QFont("SimHei", 10))
        self.select_input_btn.setMinimumWidth(120)
        self.select_input_btn.clicked.connect(self.select_input_path)
        input_layout.addWidget(self.input_label, 7)
        input_layout.addWidget(self.select_input_btn, 3)

        # Output path row.
        output_layout = QHBoxLayout()
        output_layout.setSpacing(10)
        self.output_label = QLabel("未选择输出(文件/文件夹)")
        self.output_label.setWordWrap(True)
        self.output_label.setStyleSheet("border: 1px solid #ccc; padding: 8px; min-height: 35px;")
        self.output_label.setFont(QFont("SimHei", 9))
        self.select_output_btn = QPushButton("选择输出路径")
        self.select_output_btn.setFont(QFont("SimHei", 10))
        self.select_output_btn.setMinimumWidth(120)
        self.select_output_btn.clicked.connect(self.select_output_path)
        output_layout.addWidget(self.output_label, 7)
        output_layout.addWidget(self.select_output_btn, 3)

        path_layout.addLayout(input_layout)
        path_layout.addLayout(output_layout)
        path_group.setLayout(path_layout)
        main_layout.addWidget(path_group)

        # 3. Action buttons (start / stop / clear).
        action_layout = QHBoxLayout()
        action_layout.setSpacing(15)
        self.start_btn = QPushButton("开始提取")
        self.start_btn.setStyleSheet("font-size: 14px; padding: 10px; background-color: #4CAF50; color: white;")
        self.start_btn.setFont(QFont("SimHei", 10, QFont.Bold))
        self.start_btn.setMinimumWidth(150)
        self.start_btn.clicked.connect(self.start_extraction)
        self.start_btn.setEnabled(False)
        self.stop_btn = QPushButton("终止提取")
        self.stop_btn.setStyleSheet("font-size: 14px; padding: 10px; background-color: #f44336; color: white;")
        self.stop_btn.setFont(QFont("SimHei", 10, QFont.Bold))
        self.stop_btn.setMinimumWidth(150)
        self.stop_btn.clicked.connect(self.stop_extraction)
        self.stop_btn.setEnabled(False)  # only enabled while a job runs
        self.clear_btn = QPushButton("清空日志")
        self.clear_btn.setStyleSheet("font-size: 14px; padding: 10px;")
        self.clear_btn.setFont(QFont("SimHei", 10))
        self.clear_btn.setMinimumWidth(150)
        self.clear_btn.clicked.connect(self.clear_logs)
        action_layout.addWidget(self.start_btn)
        action_layout.addWidget(self.stop_btn)
        action_layout.addWidget(self.clear_btn)
        main_layout.addLayout(action_layout)

        # 4. Progress bar (hidden until a job starts).
        self.progress_bar = QProgressBar()
        self.progress_bar.setVisible(False)
        self.progress_bar.setStyleSheet("margin-bottom: 10px; height: 25px;")
        main_layout.addWidget(self.progress_bar)

        # 5. Result preview.
        result_group = QGroupBox("提取结果预览(仅显示最后一个文件的前2000字符)")
        result_group.setFont(QFont("SimHei", 11, QFont.Bold))
        result_layout = QVBoxLayout()
        result_layout.setContentsMargins(10, 10, 10, 10)
        self.result_text = QTextEdit()
        self.result_text.setReadOnly(True)
        self.result_text.setFont(QFont("SimHei", 9))
        result_layout.addWidget(self.result_text)
        result_group.setLayout(result_layout)
        main_layout.addWidget(result_group, 2)

        # 6. Operation log.
        log_group = QGroupBox("操作日志")
        log_group.setFont(QFont("SimHei", 11, QFont.Bold))
        log_layout = QVBoxLayout()
        log_layout.setContentsMargins(10, 10, 10, 10)
        self.log_text = QTextEdit()
        self.log_text.setReadOnly(True)
        self.log_text.setMaximumHeight(120)
        self.log_text.setFont(QFont("SimHei", 9))
        log_layout.addWidget(self.log_text)
        log_group.setLayout(log_layout)
        main_layout.addWidget(log_group)

        central_widget = QWidget()
        central_widget.setLayout(main_layout)
        self.setCentralWidget(central_widget)

        # Path state.
        self.input_path = ""
        self.output_path = ""
        self.is_folder_mode = False

    def switch_mode(self):
        """Switch between single-file and folder mode and reset both paths."""
        if self.extractor_thread and self.extractor_thread.isRunning():
            QMessageBox.warning(self, "警告", "正在进行提取操作,无法切换模式")
            # Put the radio selection back to the mode that is still active.
            if self.is_folder_mode:
                self.folder_mode_radio.setChecked(True)
            else:
                self.single_mode_radio.setChecked(True)
            return
        self.is_folder_mode = (self.mode_btn_group.checkedId() == 1)
        self.input_path = ""
        self.output_path = ""
        self.input_label.setText("未选择输入文件夹" if self.is_folder_mode else "未选择输入文件")
        self.output_label.setText("未选择输出文件夹" if self.is_folder_mode else "未选择输出文件")
        self.start_btn.setEnabled(False)
        self.log_message(f"已切换至{'文件夹批量处理' if self.is_folder_mode else '单文件处理'}模式")

    def select_input_path(self):
        """Ask for the input file or folder and suggest an output path if none is set."""
        if self.extractor_thread and self.extractor_thread.isRunning():
            QMessageBox.warning(self, "警告", "正在进行提取操作,无法修改路径")
            return
        if self.is_folder_mode:
            folder_path = QFileDialog.getExistingDirectory(
                self, "选择输入文件夹", "", QFileDialog.ShowDirsOnly)
            if folder_path:
                self.input_path = folder_path
                self.input_label.setText(f"输入文件夹: {folder_path}")
                self.log_message(f"已选择输入文件夹: {folder_path}")
                # Suggest a sibling folder named "<input>_提取结果".
                if not self.output_path:
                    folder_name = os.path.basename(folder_path)
                    self.output_path = os.path.join(
                        os.path.dirname(folder_path), f"{folder_name}_提取结果")
                    self.output_label.setText(f"输出文件夹: {self.output_path}")
        else:
            file_path, _ = QFileDialog.getOpenFileName(
                self, "选择文档文件", "",
                "支持的文件 (*.docx *.xlsx *.pdf *.pptx);;Word 文件 (*.docx);;Excel 文件 (*.xlsx);;PDF 文件 (*.pdf);;PPT 文件 (*.pptx);;所有文件 (*)")
            if file_path:
                self.input_path = file_path
                self.input_label.setText(f"输入文件: {os.path.basename(file_path)}")
                self.log_message(f"已选择输入文件: {file_path}")
                # Suggest "<input>_提取结果.txt" next to the input file.
                if not self.output_path:
                    base_name = os.path.splitext(os.path.basename(file_path))[0]
                    self.output_path = os.path.join(
                        os.path.dirname(file_path), f"{base_name}_提取结果.txt")
                    self.output_label.setText(f"输出文件: {os.path.basename(self.output_path)}")
        self.check_btn_state()

    def select_output_path(self):
        """Ask for the output file or folder; requires an input path first."""
        if self.extractor_thread and self.extractor_thread.isRunning():
            QMessageBox.warning(self, "警告", "正在进行提取操作,无法修改路径")
            return
        if not self.input_path:
            QMessageBox.warning(self, "警告", f"请先选择输入{'文件夹' if self.is_folder_mode else '文件'}")
            return
        if self.is_folder_mode:
            folder_path = QFileDialog.getExistingDirectory(
                self, "选择输出文件夹", os.path.dirname(self.input_path),
                QFileDialog.ShowDirsOnly)
            if folder_path:
                self.output_path = folder_path
                self.output_label.setText(f"输出文件夹: {folder_path}")
                self.log_message(f"已选择输出文件夹: {folder_path}")
        else:
            default_name = os.path.splitext(os.path.basename(self.input_path))[0] + "_提取结果.txt"
            file_path, _ = QFileDialog.getSaveFileName(
                self, "保存提取结果",
                os.path.join(os.path.dirname(self.input_path), default_name),
                "文本文件 (*.txt);;所有文件 (*)")
            if file_path:
                self.output_path = file_path
                self.output_label.setText(f"输出文件: {os.path.basename(file_path)}")
                self.log_message(f"已选择输出文件: {file_path}")
        self.check_btn_state()

    def check_btn_state(self):
        """Enable Start only when both paths are set and no job is running."""
        can_start = bool(self.input_path) and bool(self.output_path)
        if self.extractor_thread and self.extractor_thread.isRunning():
            can_start = False
        self.start_btn.setEnabled(can_start)

    def log_message(self, message):
        """Append ``message`` to the log view and scroll to the end."""
        self.log_text.append(message)
        self.log_text.moveCursor(self.log_text.textCursor().End)

    def clear_logs(self):
        """Clear both the log and the preview (allowed while a job runs)."""
        self.log_text.clear()
        self.result_text.clear()
        self.log_message("已清空日志和结果预览")

    def start_extraction(self):
        """Validate the input path, lock the UI and start the worker thread."""
        # Never run two workers at once.
        if self.extractor_thread and self.extractor_thread.isRunning():
            return
        if not os.path.exists(self.input_path):
            QMessageBox.warning(self, "错误", f"输入{'文件夹' if self.is_folder_mode else '文件'}不存在")
            return

        self.select_input_btn.setEnabled(False)
        self.select_output_btn.setEnabled(False)
        self.start_btn.setEnabled(False)
        self.stop_btn.setEnabled(True)
        self.progress_bar.setVisible(True)
        self.progress_bar.setValue(0)
        self.result_text.clear()

        self.extractor_thread = BatchExtractorThread(
            input_path=self.input_path,
            output_path=self.output_path,
            is_folder=self.is_folder_mode,
            parent=self,  # the window is the thread object's Qt parent
        )
        self.extractor_thread.progress_updated.connect(self.update_progress)
        self.extractor_thread.batch_finished.connect(self.on_extraction_finished)
        self.extractor_thread.log_message.connect(self.log_message)
        # In single-file mode, wrap the processing method so a preview of the
        # saved result is produced afterwards.
        if not self.is_folder_mode:
            self.extractor_thread.process_single_file = self.wrap_single_file_process(
                self.extractor_thread.process_single_file)
        self.extractor_thread.start()

    def stop_extraction(self):
        """Ask the worker to stop after confirmation.

        The controls are NOT re-enabled here: the worker may still be busy
        with the current file. They are restored by on_extraction_finished()
        once the worker reports that it has stopped.
        """
        if self.extractor_thread and self.extractor_thread.isRunning():
            reply = QMessageBox.question(
                self, "确认终止",
                "确定要终止当前提取操作吗?已处理的文件会保留,未处理的将停止。",
                QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
            if reply == QMessageBox.Yes:
                self.extractor_thread.stop()
                self.stop_btn.setEnabled(False)
                self.log_message("正在终止提取操作,请等待当前文件处理完成...")

    def wrap_single_file_process(self, original_func):
        """Return a wrapper around ``original_func`` that also publishes a preview.

        The wrapper runs in the worker thread, so it hands the preview text to
        the GUI through the ``preview_ready`` signal instead of touching the
        text widget directly.
        """
        def wrapper(file_path, output_root, is_single=False):
            original_func(file_path, output_root, is_single)
            if os.path.isfile(output_root):
                with open(output_root, 'r', encoding='utf-8') as f:
                    # One character beyond the limit is enough to detect
                    # truncation without loading a large file into memory.
                    content = f.read(2001)
                preview = content[:2000]
                if len(content) > 2000:
                    preview += "\n\n... 内容过长,仅显示前2000字符 ..."
                self.preview_ready.emit(preview)
        return wrapper

    def update_progress(self, value):
        """Show ``value`` (percent) on the progress bar."""
        self.progress_bar.setValue(value)

    def on_extraction_finished(self, summary, success_count, fail_count):
        """Restore the controls and show the summary when the worker reports back."""
        self.select_input_btn.setEnabled(True)
        self.select_output_btn.setEnabled(True)
        self.start_btn.setEnabled(True)
        self.stop_btn.setEnabled(False)
        self.progress_bar.setVisible(False)
        QMessageBox.information(self, "处理完成", summary)
        self.log_message(f"\n{summary}")
        self.extractor_thread = None

    def closeEvent(self, event):
        """Confirm before closing while a job runs, then wait for the worker."""
        if self.extractor_thread and self.extractor_thread.isRunning():
            reply = QMessageBox.question(
                self, "确认关闭",
                "正在进行提取操作,强制关闭可能导致文件损坏,确定要关闭吗?",
                QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
            if reply == QMessageBox.Yes:
                self.extractor_thread.stop()
                self.extractor_thread.wait()  # let the current file finish
                event.accept()
            else:
                event.ignore()
        else:
            event.accept()


if __name__ == '__main__':
    app = QApplication(sys.argv)
    app.setFont(QFont("SimHei", 10))  # make sure Chinese text renders
    window = DocumentExtractorApp()
    window.show()
    exit_code = app.exec_()
    # Make sure no worker is still running before the interpreter exits.
    if hasattr(window, 'extractor_thread') and window.extractor_thread and window.extractor_thread.isRunning():
        window.extractor_thread.stop()
        window.extractor_thread.wait()
    sys.exit(exit_code)
软件使用
可以选择单文件也可以选择文件夹
点击开始提取
软件下载
夸克