自动化办公:用Python操作Excel、Word和PDF
目录
- 自动化办公:用Python操作Excel、Word和PDF
- 1. 引言
- 1.1 Python办公自动化的优势
- 2. 环境准备和基础库介绍
- 2.1 安装必要的库
- 2.2 各库的主要功能
- 3. Excel自动化处理
- 3.1 基础Excel操作
- 3.2 高级Excel功能
- 4. Word文档自动化
- 4.1 基础Word操作
- 4.2 高级Word功能
- 5. PDF文档自动化
- 5.1 基础PDF操作
- 5.2 高级PDF功能
- 6. 完整代码实现
- 7. 代码自查和优化
- 7.1 代码质量检查
- 7.2 性能优化
- 7.3 安全性改进
- 7.4 健壮性提升
- 8. 总结
- 8.1 主要收获
- 8.2 最佳实践建议
- 8.3 应用前景
『宝藏代码胶囊开张啦!』—— 我的 CodeCapsule 来咯!✨
写代码不再头疼!我的新站点 CodeCapsule 主打一个 “白菜价”+“量身定制”!无论是卡脖子的毕设/课设/文献复现,需要灵光一现的算法改进,还是想给项目加个“外挂”,这里都有便宜又好用的代码方案等你发现!低成本,高适配,助你轻松通关!速来围观 👉 CodeCapsule官网
自动化办公:用Python操作Excel、Word和PDF
1. 引言
在现代办公环境中,我们每天都要处理大量的文档工作。根据统计,知识工作者平均每周花费8-10小时在重复性的文档处理任务上,这些时间完全可以通过自动化来节省。无论是数据报表生成、合同文档处理,还是批量PDF操作,手动完成这些任务既耗时又容易出错。
Python作为一门功能强大的编程语言,提供了丰富的库来简化办公文档的处理任务。通过Python自动化,我们能够:
- 提高工作效率:自动化重复性任务,释放人力资源
- 减少人为错误:标准化处理流程,确保结果一致性
- 实现批量处理:一次性处理成百上千个文档
- 集成工作流:将不同格式的文档处理串联起来
1.1 Python办公自动化的优势
2. 环境准备和基础库介绍
2.1 安装必要的库
在开始之前,我们需要安装以下Python库:
# Excel处理
pip install openpyxl
pip install xlrd
pip install xlwt
pip install xlsxwriter# Word处理
pip install python-docx
pip install docx2txt# PDF处理
pip install PyPDF2
pip install pdfplumber
pip install reportlab
pip install fpdf# 其他工具
pip install pandas
pip install numpy
pip install pillow
2.2 各库的主要功能
# 导入所有需要的库
import pandas as pd
import numpy as np
from openpyxl import Workbook, load_workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.chart import BarChart, PieChart, LineChart, Reference
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
import PyPDF2
import pdfplumber
from reportlab.lib.pagesizes import letter, A4
from reportlab.pdfgen import canvas
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph
from reportlab.lib import colors
import os
from datetime import datetime
import logging
3. Excel自动化处理
3.1 基础Excel操作
让我们从最基本的Excel文件创建和读写开始:
class ExcelAutomator:"""Excel自动化处理器提供完整的Excel文件操作功能"""def __init__(self):"""初始化Excel处理器"""self.setup_logging()def setup_logging(self):"""设置日志记录"""logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s - %(message)s',handlers=[logging.FileHandler('excel_automation.log', encoding='utf-8'),logging.StreamHandler()])self.logger = logging.getLogger(__name__)def create_workbook(self, filepath: str, data: dict = None) -> bool:"""创建新的Excel工作簿Args:filepath: 文件保存路径data: 初始数据字典 {sheet_name: [[row1], [row2], ...]}Returns:bool: 创建是否成功"""try:wb = Workbook()# 删除默认创建的空工作表wb.remove(wb.active)# 添加数据工作表if data:for sheet_name, sheet_data in data.items():ws = wb.create_sheet(title=sheet_name)for row in sheet_data:ws.append(row)else:# 如果没有提供数据,创建一个空的工作表wb.create_sheet(title="Sheet1")# 保存工作簿wb.save(filepath)self.logger.info(f"成功创建Excel文件: {filepath}")return Trueexcept Exception as e:self.logger.error(f"创建Excel文件失败: {str(e)}")return Falsedef read_excel(self, filepath: str, sheet_name: str = None) -> pd.DataFrame:"""读取Excel文件为DataFrameArgs:filepath: Excel文件路径sheet_name: 工作表名称,如果为None则读取第一个工作表Returns:pd.DataFrame: 读取的数据"""try:if sheet_name:df = pd.read_excel(filepath, sheet_name=sheet_name)else:df = pd.read_excel(filepath)self.logger.info(f"成功读取Excel文件: {filepath}")return dfexcept Exception as e:self.logger.error(f"读取Excel文件失败: {str(e)}")return pd.DataFrame()def write_excel(self, df: pd.DataFrame, filepath: str, sheet_name: str = 'Sheet1', index: bool = False) -> bool:"""将DataFrame写入Excel文件Args:df: 要写入的DataFramefilepath: 文件保存路径sheet_name: 工作表名称index: 是否包含索引Returns:bool: 写入是否成功"""try:# 如果文件已存在,以追加模式写入if os.path.exists(filepath):with pd.ExcelWriter(filepath, mode='a', if_sheet_exists='replace') as writer:df.to_excel(writer, sheet_name=sheet_name, index=index)else:df.to_excel(filepath, sheet_name=sheet_name, index=index)self.logger.info(f"成功写入Excel文件: {filepath}")return Trueexcept Exception as e:self.logger.error(f"写入Excel文件失败: {str(e)}")return Falsedef apply_formatting(self, filepath: str, formatting_rules: dict) -> bool:"""应用格式设置到Excel文件Args:filepath: Excel文件路径formatting_rules: 格式规则字典Returns:bool: 格式应用是否成功"""try:wb = load_workbook(filepath)for sheet_name, rules in formatting_rules.items():if sheet_name in wb.sheetnames:ws = wb[sheet_name]# 应用单元格格式if 'cell_formats' in rules:for cell_ref, format_config in rules['cell_formats'].items():cell = ws[cell_ref]# 字体设置if 'font' in format_config:font_config = format_config['font']cell.font = Font(name=font_config.get('name', 'Arial'),size=font_config.get('size', 11),bold=font_config.get('bold', False),italic=font_config.get('italic', False),color=font_config.get('color', '000000'))# 填充颜色if 'fill' in format_config:fill_config = format_config['fill']cell.fill = PatternFill(start_color=fill_config.get('color', 'FFFFFF'),fill_type=fill_config.get('type', 'solid'))# 对齐方式if 'alignment' in format_config:align_config = format_config['alignment']cell.alignment = Alignment(horizontal=align_config.get('horizontal', 'general'),vertical=align_config.get('vertical', 'center'))# 设置列宽if 'column_widths' in rules:for col, width in rules['column_widths'].items():ws.column_dimensions[col].width = width# 设置行高if 'row_heights' in rules:for row, height in rules['row_heights'].items():ws.row_dimensions[row].height = heightwb.save(filepath)self.logger.info(f"成功应用格式设置: {filepath}")return Trueexcept Exception as e:self.logger.error(f"应用格式设置失败: {str(e)}")return Falsedef create_chart(self, filepath: str, chart_config: dict) -> bool:"""在Excel中创建图表Args:filepath: Excel文件路径chart_config: 图表配置字典Returns:bool: 图表创建是否成功"""try:wb = load_workbook(filepath)ws = wb[chart_config['sheet_name']]# 创建图表对象if chart_config['chart_type'] == 'bar':chart = BarChart()elif chart_config['chart_type'] == 'pie':chart = PieChart()elif chart_config['chart_type'] == 'line':chart = LineChart()else:chart = BarChart()# 设置图表数据data = Reference(ws, min_col=chart_config['data_start_col'],min_row=chart_config['data_start_row'],max_col=chart_config['data_end_col'],max_row=chart_config['data_end_row'])categories = Reference(ws,min_col=chart_config['categories_col'],min_row=chart_config['categories_start_row'],max_row=chart_config['categories_end_row'])chart.add_data(data, titles_from_data=True)chart.set_categories(categories)# 设置图表标题和样式chart.title = chart_config.get('title', 'Chart')chart.style = chart_config.get('style', 10)# 将图表添加到工作表ws.add_chart(chart, chart_config['position'])wb.save(filepath)self.logger.info(f"成功创建图表: {chart_config['chart_type']}")return Trueexcept Exception as e:self.logger.error(f"创建图表失败: {str(e)}")return False# 使用示例
def demo_excel_automation():"""演示Excel自动化功能"""automator = ExcelAutomator()# 创建示例数据sample_data = {'销售数据': [['产品', '一月', '二月', '三月', '总计'],['产品A', 1000, 1200, 1500, '=SUM(B2:D2)'],['产品B', 800, 900, 1100, '=SUM(B3:D3)'],['产品C', 1200, 1300, 1400, '=SUM(B4:D4)'],['产品D', 900, 1000, 1200, '=SUM(B5:D5)']],'客户信息': [['客户ID', '姓名', '邮箱', '电话'],[1, '张三', 'zhangsan@example.com', '13800138001'],[2, '李四', 'lisi@example.com', '13800138002'],[3, '王五', 'wangwu@example.com', '13800138003']]}# 创建Excel文件filepath = 'demo_excel.xlsx'print("=== 创建Excel文件 ===")automator.create_workbook(filepath, sample_data)# 应用格式设置print("\n=== 应用格式设置 ===")formatting_rules = {'销售数据': {'cell_formats': {'A1:E1': {'font': {'bold': True, 'size': 12, 'color': 'FFFFFF'},'fill': {'color': '366092', 'type': 'solid'},'alignment': {'horizontal': 'center'}},'A2:A5': {'font': {'bold': True},'fill': {'color': 'D9E1F2', 'type': 'solid'}},'E2:E5': {'font': {'bold': True, 'color': 'FF0000'},'fill': {'color': 'FFF2CC', 'type': 'solid'}}},'column_widths': {'A': 15, 'B': 10, 'C': 10, 'D': 10, 'E': 12},'row_heights': {1: 25}}}automator.apply_formatting(filepath, formatting_rules)# 创建图表print("\n=== 创建图表 ===")chart_config = {'sheet_name': '销售数据','chart_type': 'bar','data_start_col': 2,'data_start_row': 1,'data_end_col': 4,'data_end_row': 5,'categories_col': 1,'categories_start_row': 2,'categories_end_row': 5,'title': '产品销售图表','position': 'F1'}automator.create_chart(filepath, chart_config)# 读取数据print("\n=== 读取Excel数据 ===")df = automator.read_excel(filepath, '销售数据')print("销售数据:")print(df.head())# 数据处理示例print("\n=== 数据处理 ===")# 计算平均销售额numeric_columns = ['一月', '二月', '三月']df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')monthly_avg = df[numeric_columns].mean()print("月平均销售额:")for month, avg in monthly_avg.items():print(f" {month}: {avg:.2f}")return filepathif __name__ == "__main__":demo_excel_automation()
3.2 高级Excel功能
class AdvancedExcelAutomator(ExcelAutomator):"""高级Excel自动化处理器扩展基础功能,提供更复杂的操作"""def merge_excel_files(self, filepaths: list, output_path: str, merge_by: str = 'vertical') -> bool:"""合并多个Excel文件Args:filepaths: 要合并的文件路径列表output_path: 输出文件路径merge_by: 合并方式 - 'vertical'(纵向)或 'horizontal'(横向)Returns:bool: 合并是否成功"""try:all_data = []for filepath in filepaths:if os.path.exists(filepath):# 读取所有工作表xl_file = pd.ExcelFile(filepath)for sheet_name in xl_file.sheet_names:df = pd.read_excel(filepath, sheet_name=sheet_name)# 添加来源标识df['_source_file'] = os.path.basename(filepath)df['_source_sheet'] = sheet_nameall_data.append(df)if not all_data:self.logger.warning("没有找到可合并的数据")return Falseif merge_by == 'vertical':# 纵向合并merged_df = pd.concat(all_data, ignore_index=True)else:# 横向合并(需要相同行数)merged_df = pd.concat(all_data, axis=1)# 保存合并结果merged_df.to_excel(output_path, index=False)self.logger.info(f"成功合并 {len(filepaths)} 个文件到: {output_path}")return Trueexcept Exception as e:self.logger.error(f"合并Excel文件失败: {str(e)}")return Falsedef apply_conditional_formatting(self, filepath: str, rules: list) -> bool:"""应用条件格式Args:filepath: Excel文件路径rules: 条件格式规则列表Returns:bool: 应用是否成功"""try:from openpyxl.formatting.rule import Rulefrom openpyxl.styles.differential import DifferentialStylefrom openpyxl.styles import Font, PatternFillwb = load_workbook(filepath)for rule_config in rules:sheet_name = rule_config['sheet_name']if sheet_name not in wb.sheetnames:continuews = wb[sheet_name]# 创建条件格式规则if rule_config['type'] == 'cell_is':# 单元格值条件from openpyxl.formatting.rule import CellIsRulerule = CellIsRule(operator=rule_config['operator'],formula=rule_config['formula'],stopIfTrue=rule_config.get('stopIfTrue', True),font=Font(color=rule_config.get('font_color', 'FF0000')),fill=PatternFill(start_color=rule_config.get('fill_color', 'FFFF00'),fill_type='solid'))elif rule_config['type'] == 'color_scale':# 色阶条件格式from openpyxl.formatting.rule import ColorScaleRulerule = ColorScaleRule(start_type=rule_config.get('start_type', 'min'),start_value=rule_config.get('start_value'),start_color=rule_config.get('start_color', 'FF0000'),mid_type=rule_config.get('mid_type'),mid_value=rule_config.get('mid_value'),mid_color=rule_config.get('mid_color', 'FFFF00'),end_type=rule_config.get('end_type', 'max'),end_value=rule_config.get('end_value'),end_color=rule_config.get('end_color', '00FF00'))# 应用规则到指定范围ws.conditional_formatting.add(rule_config['range'], rule)wb.save(filepath)self.logger.info("成功应用条件格式")return Trueexcept Exception as e:self.logger.error(f"应用条件格式失败: {str(e)}")return Falsedef create_pivot_table(self, filepath: str, pivot_config: dict) -> bool:"""创建数据透视表Args:filepath: Excel文件路径pivot_config: 数据透视表配置Returns:bool: 创建是否成功"""try:# 使用pandas创建数据透视表df = self.read_excel(filepath, pivot_config['source_sheet'])# 创建数据透视表pivot_df = pd.pivot_table(df,values=pivot_config['values'],index=pivot_config.get('index'),columns=pivot_config.get('columns'),aggfunc=pivot_config.get('aggfunc', 'sum'),fill_value=0)# 将数据透视表写入新工作表pivot_sheet_name = pivot_config.get('pivot_sheet_name', 'PivotTable')with pd.ExcelWriter(filepath, mode='a', if_sheet_exists='replace') as writer:pivot_df.to_excel(writer, sheet_name=pivot_sheet_name)self.logger.info(f"成功创建数据透视表: {pivot_sheet_name}")return Trueexcept Exception as e:self.logger.error(f"创建数据透视表失败: {str(e)}")return False# 使用示例
def demo_advanced_excel():"""演示高级Excel功能"""automator = AdvancedExcelAutomator()# 创建测试文件file1 = 'test_file1.xlsx'file2 = 'test_file2.xlsx'data1 = {'Sheet1': [['部门', '员工', '工资'],['技术部', '张三', 8000],['技术部', '李四', 9000],['销售部', '王五', 7000]]}data2 = {'Sheet1': [['部门', '员工', '工资'],['技术部', '赵六', 8500],['销售部', '钱七', 7500],['人事部', '孙八', 6500]]}automator.create_workbook(file1, data1)automator.create_workbook(file2, data2)# 合并文件print("=== 合并Excel文件 ===")merged_file = 'merged_data.xlsx'automator.merge_excel_files([file1, file2], merged_file, 'vertical')# 创建数据透视表print("\n=== 创建数据透视表 ===")pivot_config = {'source_sheet': 'Sheet1','values': ['工资'],'index': ['部门'],'aggfunc': 'mean','pivot_sheet_name': '部门工资统计'}automator.create_pivot_table(merged_file, pivot_config)# 清理测试文件for file in [file1, file2]:if os.path.exists(file):os.remove(file)print("高级Excel功能演示完成!")if __name__ == "__main__":demo_advanced_excel()
4. Word文档自动化
4.1 基础Word操作
class WordAutomator:"""Word文档自动化处理器提供完整的Word文档操作功能"""def __init__(self):"""初始化Word处理器"""self.setup_logging()def setup_logging(self):"""设置日志记录"""logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s - %(message)s')self.logger = logging.getLogger(__name__)def create_document(self, filepath: str, content: list = None) -> bool:"""创建新的Word文档Args:filepath: 文件保存路径content: 文档内容列表,每个元素是一个段落或表格Returns:bool: 创建是否成功"""try:doc = Document()# 添加文档标题title = doc.add_heading('文档标题', 0)title.alignment = WD_ALIGN_PARAGRAPH.CENTER# 添加日期date_para = doc.add_paragraph()date_para.alignment = WD_ALIGN_PARAGRAPH.RIGHTdate_para.add_run(f"创建日期: {datetime.now().strftime('%Y-%m-%d')}")# 添加自定义内容if content:for item in content:if isinstance(item, str):# 文本段落doc.add_paragraph(item)elif isinstance(item, dict) and item.get('type') == 'table':# 表格self._add_table(doc, item['data'], item.get('headers'))# 保存文档doc.save(filepath)self.logger.info(f"成功创建Word文档: {filepath}")return Trueexcept Exception as e:self.logger.error(f"创建Word文档失败: {str(e)}")return Falsedef _add_table(self, doc, data: list, headers: list = None):"""添加表格到文档Args:doc: Word文档对象data: 表格数据headers: 表头"""if headers:table = doc.add_table(rows=len(data) + 1, cols=len(headers))# 添加表头hdr_cells = table.rows[0].cellsfor i, header in enumerate(headers):hdr_cells[i].text = str(header)# 设置表头样式hdr_cells[i].paragraphs[0].runs[0].font.bold = Trueelse:table = doc.add_table(rows=len(data), cols=len(data[0]) if data else 1)# 添加数据start_row = 1 if headers else 0for i, row_data in enumerate(data):row_cells = table.rows[i + start_row].cellsfor j, cell_data in enumerate(row_data):row_cells[j].text = str(cell_data)def read_document(self, filepath: str) -> dict:"""读取Word文档内容Args:filepath: Word文档路径Returns:dict: 文档内容"""try:doc = Document(filepath)content = {'paragraphs': [],'tables': [],'metadata': {'filepath': filepath,'created': datetime.fromtimestamp(os.path.getctime(filepath)),'modified': datetime.fromtimestamp(os.path.getmtime(filepath))}}# 提取段落文本for paragraph in doc.paragraphs:if paragraph.text.strip():content['paragraphs'].append({'text': paragraph.text,'style': paragraph.style.name,'runs': [{'text': run.text,'bold': run.bold,'italic': run.italic,'font_size': run.font.size} for run in paragraph.runs]})# 提取表格数据for table in doc.tables:table_data = []for row in table.rows:row_data = [cell.text for cell in row.cells]table_data.append(row_data)content['tables'].append(table_data)self.logger.info(f"成功读取Word文档: {filepath}")return contentexcept Exception as e:self.logger.error(f"读取Word文档失败: {str(e)}")return {}def replace_text(self, filepath: str, replacements: dict, output_path: str = None) -> bool:"""替换文档中的文本(模板填充)Args:filepath: 原始文档路径replacements: 替换字典 {占位符: 替换文本}output_path: 输出文件路径,如果为None则覆盖原文件Returns:bool: 替换是否成功"""try:if output_path is None:output_path = filepathdoc = Document(filepath)# 替换段落中的文本for paragraph in doc.paragraphs:for key, value in replacements.items():if key in paragraph.text:# 保存原始运行格式original_runs = []for run in paragraph.runs:original_runs.append({'text': run.text,'bold': run.bold,'italic': run.italic,'font_size': run.font.size,'font_name': run.font.name})# 清除段落内容paragraph.clear()# 重新添加内容,保持格式text = paragraph.textfor original_run in original_runs:run_text = original_run['text']if key in run_text:run_text = run_text.replace(key, str(value))new_run = paragraph.add_run(run_text)new_run.bold = original_run['bold']new_run.italic = original_run['italic']if original_run['font_size']:new_run.font.size = original_run['font_size']if original_run['font_name']:new_run.font.name = original_run['font_name']# 替换表格中的文本for table in doc.tables:for row in table.rows:for cell in row.cells:for paragraph in cell.paragraphs:for key, value in replacements.items():if key in paragraph.text:paragraph.text = paragraph.text.replace(key, str(value))doc.save(output_path)self.logger.info(f"成功替换文本并保存到: {output_path}")return Trueexcept Exception as e:self.logger.error(f"替换文本失败: {str(e)}")return Falsedef merge_documents(self, filepaths: list, output_path: str) -> bool:"""合并多个Word文档Args:filepaths: 要合并的文档路径列表output_path: 输出文件路径Returns:bool: 合并是否成功"""try:if not filepaths:self.logger.warning("没有提供要合并的文件")return False# 使用第一个文档作为基础base_doc = Document(filepaths[0])# 合并其他文档for filepath in filepaths[1:]:if os.path.exists(filepath):other_doc = Document(filepath)# 添加分页符base_doc.add_page_break()# 复制所有元素for element in other_doc.element.body:base_doc.element.body.append(element)base_doc.save(output_path)self.logger.info(f"成功合并 {len(filepaths)} 个文档到: {output_path}")return Trueexcept Exception as e:self.logger.error(f"合并文档失败: {str(e)}")return False# 使用示例
def demo_word_automation():"""演示Word自动化功能"""automator = WordAutomator()# 创建示例文档print("=== 创建Word文档 ===")content = ["这是第一个段落。","这是第二个段落,包含一些重要的信息。",{'type': 'table','headers': ['姓名', '年龄', '部门'],'data': [['张三', 30, '技术部'],['李四', 28, '销售部'],['王五', 35, '人事部']]},"文档内容结束。"]doc_file = 'demo_document.docx'automator.create_document(doc_file, content)# 读取文档内容print("\n=== 读取Word文档 ===")doc_content = automator.read_document(doc_file)print(f"文档包含 {len(doc_content['paragraphs'])} 个段落和 {len(doc_content['tables'])} 个表格")# 模板替换演示print("\n=== 模板替换 ===")template_file = 'template.docx'# 创建模板文档template_content = ["尊敬的${姓名}:","感谢您申请${职位}。","您的面试安排在${日期}。","祝好!","${公司}人力资源部"]automator.create_document(template_file, template_content)# 执行替换replacements = {'${姓名}': '张三','${职位}': '高级软件工程师','${日期}': '2024-01-15 14:00','${公司}': 'ABC科技有限公司'}filled_doc = 'filled_template.docx'automator.replace_text(template_file, replacements, filled_doc)# 清理临时文件for file in [template_file]:if os.path.exists(file):os.remove(file)print("Word自动化演示完成!")return doc_file, filled_docif __name__ == "__main__":demo_word_automation()
4.2 高级Word功能
class AdvancedWordAutomator(WordAutomator):"""高级Word文档自动化处理器扩展基础功能,提供更复杂的操作"""def add_images(self, filepath: str, images: list, output_path: str = None) -> bool:"""添加图片到Word文档Args:filepath: 原始文档路径images: 图片配置列表output_path: 输出文件路径Returns:bool: 添加是否成功"""try:if output_path is None:output_path = filepathdoc = Document(filepath)for image_config in images:if os.path.exists(image_config['path']):# 添加图片paragraph = doc.add_paragraph()paragraph.alignment = image_config.get('alignment', WD_ALIGN_PARAGRAPH.CENTER)run = paragraph.add_run()run.add_picture(image_config['path'], width=Inches(image_config.get('width', 6)),height=Inches(image_config.get('height', 4)))# 添加图片标题if 'caption' in image_config:caption_para = doc.add_paragraph()caption_para.alignment = WD_ALIGN_PARAGRAPH.CENTERcaption_run = caption_para.add_run(image_config['caption'])caption_run.font.size = Pt(10)caption_run.font.italic = Truedoc.save(output_path)self.logger.info(f"成功添加 {len(images)} 张图片到文档")return Trueexcept Exception as e:self.logger.error(f"添加图片失败: {str(e)}")return Falsedef apply_styles(self, filepath: str, style_rules: dict, output_path: str = None) -> bool:"""应用样式到文档Args:filepath: 原始文档路径style_rules: 样式规则字典output_path: 输出文件路径Returns:bool: 样式应用是否成功"""try:if output_path is None:output_path = filepathdoc = Document(filepath)for rule in style_rules.get('paragraphs', []):# 根据条件选择段落并应用样式for paragraph in doc.paragraphs:if self._matches_condition(paragraph, rule['condition']):# 应用样式if 'style' in rule:paragraph.style = rule['style']# 应用格式if 'format' in rule:self._apply_paragraph_format(paragraph, rule['format'])doc.save(output_path)self.logger.info("成功应用样式到文档")return Trueexcept Exception as e:self.logger.error(f"应用样式失败: {str(e)}")return Falsedef _matches_condition(self, paragraph, condition: dict) -> bool:"""检查段落是否匹配条件Args:paragraph: 段落对象condition: 条件字典Returns:bool: 是否匹配"""text = paragraph.text.lower()if 'contains' in condition:return condition['contains'].lower() in textif 'starts_with' in condition:return text.startswith(condition['starts_with'].lower())if 'ends_with' in condition:return text.endswith(condition['ends_with'].lower())return Falsedef _apply_paragraph_format(self, paragraph, format_config: dict):"""应用段落格式Args:paragraph: 段落对象format_config: 格式配置"""for run in paragraph.runs:if 'font' in format_config:font_config = format_config['font']if 'bold' in font_config:run.font.bold = font_config['bold']if 'italic' in font_config:run.font.italic = font_config['italic']if 'size' in font_config:run.font.size = Pt(font_config['size'])if 'color' in font_config:run.font.color.rgb = RGBColor.from_string(font_config['color'])def extract_to_text(self, filepath: str, output_path: str) -> bool:"""将Word文档提取为纯文本Args:filepath: Word文档路径output_path: 输出文本文件路径Returns:bool: 提取是否成功"""try:content = self.read_document(filepath)with open(output_path, 'w', encoding='utf-8') as f:# 写入段落for paragraph in content['paragraphs']:f.write(paragraph['text'] + '\n\n')# 写入表格for table in content['tables']:for row in table:f.write('\t'.join(str(cell) for cell in row) + '\n')f.write('\n')self.logger.info(f"成功提取文档内容到: {output_path}")return Trueexcept Exception as e:self.logger.error(f"提取文档内容失败: {str(e)}")return False# 使用示例
def demo_advanced_word():"""演示高级Word功能"""automator = AdvancedWordAutomator()# 创建样式文档print("=== 创建带样式的文档 ===")styled_doc = 'styled_document.docx'content = ["重要通知:系统升级","本次系统升级将于本周末进行。","升级时间:周六晚上10点至周日早上6点","影响范围:所有业务系统将暂时不可用","请各位同事提前做好工作安排。","技术部"]automator.create_document(styled_doc, content)# 应用样式style_rules = {'paragraphs': [{'condition': {'contains': '重要通知'},'style': 'Heading 1','format': {'font': {'bold': True, 'size': 16, 'color': 'FF0000'}}},{'condition': {'contains': '升级时间'},'format': {'font': {'bold': True, 'size': 12}}},{'condition': {'contains': '技术部'},'format': {'font': {'italic': True, 'size': 10}}}]}automator.apply_styles(styled_doc, style_rules)# 提取为文本print("\n=== 提取文档为文本 ===")text_file = 'document_content.txt'automator.extract_to_text(styled_doc, text_file)print("高级Word功能演示完成!")return styled_docif __name__ == "__main__":demo_advanced_word()
5. PDF文档自动化
5.1 基础PDF操作
class PDFAutomator:"""PDF文档自动化处理器提供完整的PDF文档操作功能"""def __init__(self):"""初始化PDF处理器"""self.setup_logging()def setup_logging(self):"""设置日志记录"""logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s - %(message)s')self.logger = logging.getLogger(__name__)def create_pdf(self, filepath: str, content: list) -> bool:"""创建新的PDF文档Args:filepath: 文件保存路径content: 文档内容列表Returns:bool: 创建是否成功"""try:doc = SimpleDocTemplate(filepath, pagesize=A4)elements = []styles = getSampleStyleSheet()# 添加标题title_style = styles['Heading1']title_style.alignment = 1 # 居中title = Paragraph("PDF文档", title_style)elements.append(title)# 添加日期date_style = styles['Normal']date_style.alignment = 2 # 右对齐date = Paragraph(f"创建时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", date_style)elements.append(date)# 添加内容for item in content:if isinstance(item, str):# 文本段落para = Paragraph(item, styles['Normal'])elements.append(para)elif isinstance(item, list):# 表格table = Table(item)table.setStyle(TableStyle([('BACKGROUND', (0, 0), (-1, 0), colors.grey),('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),('ALIGN', (0, 0), (-1, -1), 'CENTER'),('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),('FONTSIZE', (0, 0), (-1, 0), 14),('BOTTOMPADDING', (0, 0), (-1, 0), 12),('BACKGROUND', (0, 1), (-1, -1), colors.beige),('GRID', (0, 0), (-1, -1), 1, colors.black)]))elements.append(table)# 构建文档doc.build(elements)self.logger.info(f"成功创建PDF文档: {filepath}")return Trueexcept Exception as e:self.logger.error(f"创建PDF文档失败: {str(e)}")return Falsedef read_pdf(self, filepath: str) -> dict:"""读取PDF文档内容Args:filepath: PDF文档路径Returns:dict: 文档内容"""try:content = {'text': '','tables': [],'metadata': {},'pages': []}with pdfplumber.open(filepath) as pdf:content['metadata'] = {'pages': len(pdf.pages),'author': pdf.metadata.get('Author', '未知'),'creator': pdf.metadata.get('Creator', '未知'),'producer': pdf.metadata.get('Producer', '未知'),'subject': pdf.metadata.get('Subject', '未知'),'title': pdf.metadata.get('Title', '未知')}# 提取每页内容for page_num, page in enumerate(pdf.pages, 1):page_content = {'page_number': page_num,'text': page.extract_text() or '','tables': []}# 提取表格tables = page.extract_tables()for table in tables:if table and any(any(cell for cell in row) for row in table):page_content['tables'].append(table)content['pages'].append(page_content)content['text'] += page_content['text'] + '\n\n'self.logger.info(f"成功读取PDF文档: {filepath}")return contentexcept Exception as e:self.logger.error(f"读取PDF文档失败: {str(e)}")return {}def merge_pdfs(self, filepaths: list, output_path: str) -> bool:"""合并多个PDF文档Args:filepaths: 要合并的PDF文件路径列表output_path: 输出文件路径Returns:bool: 合并是否成功"""try:merger = PyPDF2.PdfMerger()for filepath in filepaths:if os.path.exists(filepath):merger.append(filepath)merger.write(output_path)merger.close()self.logger.info(f"成功合并 {len(filepaths)} 个PDF文件到: {output_path}")return Trueexcept Exception as e:self.logger.error(f"合并PDF文件失败: {str(e)}")return Falsedef split_pdf(self, filepath: str, output_dir: str, split_ranges: list = None) -> bool:"""拆分PDF文档Args:filepath: 原始PDF文件路径output_dir: 输出目录split_ranges: 拆分范围列表,如 [(1, 3), (4, 6)]Returns:bool: 拆分是否成功"""try:if not os.path.exists(output_dir):os.makedirs(output_dir)with open(filepath, 'rb') as file:pdf_reader = PyPDF2.PdfReader(file)total_pages = len(pdf_reader.pages)if not split_ranges:# 如果没有指定范围,按页拆分split_ranges = [(i, i) for i in range(total_pages)]for i, (start, end) in enumerate(split_ranges):# 调整范围(PyPDF2使用0-based索引)start_page = max(0, start - 1)end_page = min(total_pages - 1, end - 1)if start_page > end_page:continuepdf_writer = PyPDF2.PdfWriter()for page_num in range(start_page, end_page + 1):pdf_writer.add_page(pdf_reader.pages[page_num])output_file = os.path.join(output_dir, f"split_part_{i+1}_pages_{start}-{end}.pdf")with open(output_file, 'wb') as output:pdf_writer.write(output)self.logger.info(f"成功拆分PDF文件到: {output_dir}")return Trueexcept Exception as e:self.logger.error(f"拆分PDF文件失败: {str(e)}")return Falsedef extract_images(self, filepath: str, output_dir: str) -> list:"""从PDF中提取图片Args:filepath: PDF文件路径output_dir: 图片输出目录Returns:list: 提取的图片路径列表"""try:if not os.path.exists(output_dir):os.makedirs(output_dir)extracted_images = []with pdfplumber.open(filepath) as pdf:for page_num, page in enumerate(pdf.pages, 1):images = page.imagesfor img_num, img in enumerate(images, 1):# 提取图片if 'stream' in img:img_data = img['stream'].get_data()# 确定图片格式img_format = self._detect_image_format(img_data)if img_format:img_filename = f"page_{page_num}_img_{img_num}.{img_format}"img_path = os.path.join(output_dir, img_filename)with open(img_path, 'wb') as f:f.write(img_data)extracted_images.append(img_path)self.logger.info(f"成功提取 {len(extracted_images)} 张图片到: {output_dir}")return extracted_imagesexcept Exception as e:self.logger.error(f"提取图片失败: {str(e)}")return []def _detect_image_format(self, img_data: bytes) -> str:"""检测图片格式Args:img_data: 图片数据Returns:str: 图片格式"""if img_data.startswith(b'\xff\xd8'):return 'jpg'elif img_data.startswith(b'\x89PNG'):return 'png'elif img_data.startswith(b'GIF'):return 'gif'elif img_data.startswith(b'BM'):return 'bmp'else:return 'unknown'# 使用示例
def demo_pdf_automation():"""演示PDF自动化功能"""automator = PDFAutomator()# 创建PDF文档print("=== 创建PDF文档 ===")content = ["这是PDF文档的第一个段落。","这是第二个段落,包含一些重要的信息。",[['姓名', '年龄', '部门', '工资'],['张三', '30', '技术部', '8000'],['李四', '28', '销售部', '7000'],['王五', '35', '人事部', '7500']],"文档内容结束。感谢阅读!"]pdf_file = 'demo_document.pdf'automator.create_pdf(pdf_file, content)# 读取PDF内容print("\n=== 读取PDF文档 ===")pdf_content = automator.read_pdf(pdf_file)print(f"PDF包含 {len(pdf_content['pages'])} 页")print(f"作者: {pdf_content['metadata'].get('author', '未知')}")# 创建测试文件用于合并print("\n=== PDF合并演示 ===")pdf1 = 'test1.pdf'pdf2 = 'test2.pdf'automator.create_pdf(pdf1, ["测试文档1", "这是第一个测试文档。"])automator.create_pdf(pdf2, ["测试文档2", "这是第二个测试文档。"])merged_pdf = 'merged_document.pdf'automator.merge_pdfs([pdf1, pdf2], merged_pdf)# 拆分PDFprint("\n=== PDF拆分演示 ===")split_dir = 'split_pdfs'automator.split_pdf(merged_pdf, split_dir, [(1, 1), (2, 2)])# 清理测试文件for file in [pdf1, pdf2]:if os.path.exists(file):os.remove(file)print("PDF自动化演示完成!")return pdf_file, merged_pdfif __name__ == "__main__":demo_pdf_automation()
5.2 高级PDF功能
class AdvancedPDFAutomator(PDFAutomator):"""高级PDF文档自动化处理器扩展基础功能,提供更复杂的操作"""def add_watermark(self, input_path: str, output_path: str, watermark_text: str, position: str = 'center') -> bool:"""添加水印到PDF文档Args:input_path: 输入PDF路径output_path: 输出PDF路径watermark_text: 水印文本position: 水印位置 - 'center', 'diagonal', 'full_page'Returns:bool: 添加是否成功"""try:from reportlab.pdfgen import canvasfrom reportlab.lib.pagesizes import letterfrom reportlab.lib.units import inchimport iowith open(input_path, 'rb') as file:pdf_reader = PyPDF2.PdfReader(file)pdf_writer = PyPDF2.PdfWriter()for page_num in range(len(pdf_reader.pages)):page = pdf_reader.pages[page_num]# 创建水印packet = io.BytesIO()can = canvas.Canvas(packet, pagesize=letter)# 设置水印样式can.setFont('Helvetica', 40)can.setFillColorRGB(0.8, 0.8, 0.8, alpha=0.3) # 浅灰色,半透明can.rotate(45) # 旋转45度# 根据位置添加水印if position == 'center':can.drawString(2*inch, 1*inch, watermark_text)elif position == 'diagonal':# 对角线方向重复水印for i in range(-2, 3):for j in range(-2, 3):can.drawString(i*3*inch, j*2*inch, watermark_text)elif position == 'full_page':# 满页水印for i in range(-3, 4):for j in range(-3, 4):can.drawString(i*2.5*inch, j*1.5*inch, watermark_text)can.save()# 将水印添加到页面packet.seek(0)watermark_pdf = PyPDF2.PdfReader(packet)watermark_page = watermark_pdf.pages[0]# 合并原页面和水印page.merge_page(watermark_page)pdf_writer.add_page(page)# 保存带水印的PDFwith open(output_path, 'wb') as output:pdf_writer.write(output)self.logger.info(f"成功添加水印到: {output_path}")return Trueexcept Exception as e:self.logger.error(f"添加水印失败: {str(e)}")return Falsedef encrypt_pdf(self, input_path: str, output_path: str, password: str, permissions: dict = None) -> bool:"""加密PDF文档Args:input_path: 输入PDF路径output_path: 输出PDF路径password: 密码permissions: 权限设置Returns:bool: 加密是否成功"""try:if permissions is None:permissions = {'printing': 'low', # 低分辨率打印'modifying': False, # 禁止修改'copying': False, # 禁止复制'annotating': False, # 禁止注释'filling_forms': False, # 禁止填写表单'extracting': False # 禁止提取内容}with open(input_path, 'rb') as file:pdf_reader = PyPDF2.PdfReader(file)pdf_writer = PyPDF2.PdfWriter()# 复制所有页面for page in pdf_reader.pages:pdf_writer.add_page(page)# 设置加密pdf_writer.encrypt(user_password=password,owner_password=password,use_128bit=True,permissions_flag=permissions)# 保存加密的PDFwith open(output_path, 'wb') as output:pdf_writer.write(output)self.logger.info(f"成功加密PDF: {output_path}")return Trueexcept Exception as e:self.logger.error(f"加密PDF失败: {str(e)}")return Falsedef extract_tables_to_excel(self, pdf_path: str, output_excel: str) -> bool:"""从PDF中提取表格并保存到ExcelArgs:pdf_path: PDF文件路径output_excel: 输出Excel文件路径Returns:bool: 提取是否成功"""try:pdf_content = self.read_pdf(pdf_path)with pd.ExcelWriter(output_excel) as writer:for page_num, page_content in enumerate(pdf_content['pages'], 1):for table_num, table_data in enumerate(page_content['tables'], 1):if table_data:# 清理表格数据cleaned_table = []for row in table_data:cleaned_row = [str(cell).strip() if cell else '' for cell in row]if any(cleaned_row): # 只保留非空行cleaned_table.append(cleaned_row)if cleaned_table:# 创建DataFramedf = pd.DataFrame(cleaned_table[1:], columns=cleaned_table[0])# 保存到Excelsheet_name = f"Page{page_num}_Table{table_num}"# 处理过长的sheet名称if len(sheet_name) > 31:sheet_name = sheet_name[:31]df.to_excel(writer, sheet_name=sheet_name, index=False)self.logger.info(f"成功提取表格到Excel: {output_excel}")return Trueexcept Exception as e:self.logger.error(f"提取表格到Excel失败: {str(e)}")return Falsedef search_text_in_pdf(self, pdf_path: str, search_terms: list, case_sensitive: bool = False) -> dict:"""在PDF中搜索文本Args:pdf_path: PDF文件路径search_terms: 搜索词列表case_sensitive: 是否区分大小写Returns:dict: 搜索结果"""try:pdf_content = self.read_pdf(pdf_path)results = {}for term in search_terms:results[term] = []search_term = term if case_sensitive else term.lower()for page_num, page_content in enumerate(pdf_content['pages'], 1):page_text = page_content['text']if not case_sensitive:page_text = page_text.lower()if search_term in page_text:# 找到匹配,记录位置和上下文start_pos = page_text.find(search_term)context_start = max(0, start_pos - 50)context_end = min(len(page_text), start_pos + len(search_term) + 50)context = page_text[context_start:context_end]results[term].append({'page': page_num,'position': start_pos,'context': context.strip()})self.logger.info(f"搜索完成,找到 {sum(len(hits) for hits in results.values())} 个结果")return resultsexcept Exception as e:self.logger.error(f"搜索PDF失败: {str(e)}")return {}# 使用示例
def demo_advanced_pdf():"""演示高级PDF功能"""automator = AdvancedPDFAutomator()# 创建测试PDFprint("=== 创建测试PDF ===")test_pdf = 'test_document.pdf'content = ["机密文档","这是包含敏感信息的文档。",[['项目', '预算', '负责人'],['项目A', '100000', '张三'],['项目B', '150000', '李四'],['项目C', '200000', '王五']],"文档结束。"]automator.create_pdf(test_pdf, content)# 添加水印print("\n=== 添加水印 ===")watermarked_pdf = 'watermarked_document.pdf'automator.add_watermark(test_pdf, watermarked_pdf, '机密', 'diagonal')# 加密PDFprint("\n=== 加密PDF ===")encrypted_pdf = 'encrypted_document.pdf'automator.encrypt_pdf(test_pdf, encrypted_pdf, 'mypassword123')# 提取表格到Excelprint("\n=== 提取表格到Excel ===")excel_file = 'extracted_tables.xlsx'automator.extract_tables_to_excel(test_pdf, excel_file)# 搜索文本print("\n=== 搜索文本 ===")search_results = automator.search_text_in_pdf(test_pdf, ['项目', '预算', '机密'])for term, hits in search_results.items():print(f"'{term}': 找到 {len(hits)} 个匹配")for hit in hits[:2]: # 只显示前2个匹配print(f" 第{hit['page']}页: ...{hit['context']}...")print("高级PDF功能演示完成!")return test_pdf, watermarked_pdf, encrypted_pdfif __name__ == "__main__":demo_advanced_pdf()
6. 完整代码实现
下面是本文中使用的完整代码集合,包含所有功能模块:
"""
Python办公自动化完整代码集合
包含Excel、Word、PDF的自动化处理功能
作者: AI助手
日期: 2024年
"""import os
import time
import logging
import pandas as pd
import numpy as np
from datetime import datetime
from typing import List, Dict, Any, Optional# Excel相关导入
from openpyxl import Workbook, load_workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.chart import BarChart, PieChart, LineChart, Reference
from openpyxl.formatting.rule import Rule, CellIsRule, ColorScaleRule
from openpyxl.styles.differential import DifferentialStyle# Word相关导入
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT# PDF相关导入
import PyPDF2
import pdfplumber
from reportlab.lib.pagesizes import letter, A4
from reportlab.pdfgen import canvas
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph
from reportlab.lib import colors
import io
from reportlab.lib.units import inch# 所有类的定义都在上面各节中提供
# 这里提供完整的集成示例class OfficeAutomationSystem:"""办公自动化系统 - 集成Excel、Word、PDF处理功能"""def __init__(self):"""初始化办公自动化系统"""self.excel_automator = AdvancedExcelAutomator()self.word_automator = AdvancedWordAutomator()self.pdf_automator = AdvancedPDFAutomator()self.setup_logging()def setup_logging(self):"""设置日志记录"""logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s - %(message)s',handlers=[logging.FileHandler('office_automation.log', encoding='utf-8'),logging.StreamHandler()])self.logger = logging.getLogger(__name__)def generate_monthly_report(self, data_file: str, output_dir: str) -> Dict[str, str]:"""生成月度报告(集成Excel、Word、PDF)Args:data_file: 数据文件路径output_dir: 输出目录Returns:Dict: 生成的文件路径"""try:os.makedirs(output_dir, exist_ok=True)generated_files = {}# 1. 处理Excel数据self.logger.info("步骤1: 处理Excel数据")df = self.excel_automator.read_excel(data_file)# 生成统计信息summary_data = self._generate_summary_stats(df)# 创建详细报告Excelexcel_report = os.path.join(output_dir, 'detailed_report.xlsx')self.excel_automator.create_workbook(excel_report, {'原始数据': [df.columns.tolist()] + df.values.tolist(),'数据摘要': summary_data})generated_files['excel'] = excel_report# 2. 生成Word报告self.logger.info("步骤2: 生成Word报告")word_report = os.path.join(output_dir, 'monthly_report.docx')word_content = ["月度业务报告",f"报告周期: {datetime.now().strftime('%Y年%m月')}","","一、业务概览",f"总交易额: {summary_data[1][1]}",f"平均交易额: {summary_data[2][1]}",f"交易笔数: {summary_data[3][1]}","","二、详细分析","详见附件Excel文件。","","三、结论与建议","基于本月数据,建议...","","报告生成时间: " + datetime.now().strftime('%Y-%m-%d %H:%M:%S')]self.word_automator.create_document(word_report, word_content)generated_files['word'] = word_report# 3. 生成PDF版本self.logger.info("步骤3: 生成PDF报告")pdf_report = os.path.join(output_dir, 'monthly_report.pdf')pdf_content = ["月度业务报告",f"报告周期: {datetime.now().strftime('%Y年%m月')}","","业务概览:",[['指标', '数值'],[summary_data[1][0], summary_data[1][1]],[summary_data[2][0], summary_data[2][1]],[summary_data[3][0], summary_data[3][1]]],"","本报告由系统自动生成。"]self.pdf_automator.create_pdf(pdf_report, pdf_content)# 添加水印final_pdf = os.path.join(output_dir, 'final_report.pdf')self.pdf_automator.add_watermark(pdf_report, final_pdf, '内部文件', 'center')generated_files['pdf'] = final_pdfself.logger.info(f"月度报告生成完成: {generated_files}")return generated_filesexcept Exception as e:self.logger.error(f"生成月度报告失败: {str(e)}")return {}def _generate_summary_stats(self, df: pd.DataFrame) -> list:"""生成数据摘要统计Args:df: 数据框Returns:list: 摘要数据"""summary = [['统计指标', '数值']]# 假设数据框包含数值列numeric_columns = df.select_dtypes(include=[np.number]).columnsif len(numeric_columns) > 0:main_column = numeric_columns[0]summary.append(['总和', df[main_column].sum()])summary.append(['平均值', f"{df[main_column].mean():.2f}"])summary.append(['计数', len(df)])summary.append(['最大值', df[main_column].max()])summary.append(['最小值', df[main_column].min()])summary.append(['标准差', f"{df[main_column].std():.2f}"])else:summary.append(['数据量', len(df)])summary.append(['列数', len(df.columns)])return summarydef batch_process_documents(self, input_dir: str, output_dir: str, file_type: str = 'all') -> Dict[str, Any]:"""批量处理文档Args:input_dir: 输入目录output_dir: 输出目录file_type: 文件类型 - 'excel', 'word', 'pdf', 'all'Returns:Dict: 处理统计"""try:os.makedirs(output_dir, exist_ok=True)stats = {'total_files': 0,'processed_files': 0,'failed_files': 0,'processed_files_list': []}# 根据文件类型筛选extensions = []if file_type == 'excel' or file_type == 'all':extensions.extend(['.xlsx', '.xls'])if file_type == 'word' or file_type == 'all':extensions.extend(['.docx', '.doc'])if file_type == 'pdf' or file_type == 'all':extensions.extend(['.pdf'])for filename in os.listdir(input_dir):filepath = os.path.join(input_dir, filename)if os.path.isfile(filepath) and any(filename.lower().endswith(ext) for ext in extensions):stats['total_files'] += 1try:# 根据文件类型处理if filename.lower().endswith(('.xlsx', '.xls')):self._process_excel_file(filepath, output_dir)elif filename.lower().endswith(('.docx', '.doc')):self._process_word_file(filepath, output_dir)elif filename.lower().endswith('.pdf'):self._process_pdf_file(filepath, output_dir)stats['processed_files'] += 1stats['processed_files_list'].append(filename)except Exception as e:stats['failed_files'] += 1self.logger.error(f"处理文件失败 {filename}: {str(e)}")self.logger.info(f"批量处理完成: 总共 {stats['total_files']} 个文件, "f"成功 {stats['processed_files']}, 失败 {stats['failed_files']}")return statsexcept Exception as e:self.logger.error(f"批量处理失败: {str(e)}")return {'total_files': 0, 'processed_files': 0, 'failed_files': 0, 'processed_files_list': []}def _process_excel_file(self, filepath: str, output_dir: str):"""处理Excel文件"""# 这里可以实现具体的Excel处理逻辑# 例如:添加格式、创建图表、数据清洗等filename = os.path.basename(filepath)output_path = os.path.join(output_dir, f"processed_{filename}")# 简单的处理示例:添加基本格式formatting_rules = {'Sheet1': {'cell_formats': {'A1:Z1': {'font': {'bold': True, 'color': 'FFFFFF'},'fill': {'color': '366092', 'type': 'solid'}}}}}self.excel_automator.apply_formatting(filepath, formatting_rules)def _process_word_file(self, filepath: str, output_dir: str):"""处理Word文件"""filename = os.path.basename(filepath)output_path = os.path.join(output_dir, f"processed_{filename}")# 简单的处理示例:添加页眉页脚# 在实际应用中,这里可以实现更复杂的处理逻辑passdef _process_pdf_file(self, filepath: str, output_dir: str):"""处理PDF文件"""filename = os.path.basename(filepath)output_path = os.path.join(output_dir, f"processed_{filename}")# 简单的处理示例:添加水印self.pdf_automator.add_watermark(filepath, output_path, '已处理', 'center')def main_demo():"""主演示函数"""automation_system = OfficeAutomationSystem()print("=== Python办公自动化系统 ===")print("请选择功能:")print("1. Excel自动化演示")print("2. Word自动化演示")print("3. PDF自动化演示")print("4. 集成月度报告生成")print("5. 批量文档处理")print("6. 退出")while True:choice = input("\n请输入选择 (1-6): ").strip()if choice == '1':print("\n--- Excel自动化演示 ---")demo_excel_automation()demo_advanced_excel()elif choice == '2':print("\n--- Word自动化演示 ---")demo_word_automation()demo_advanced_word()elif choice == '3':print("\n--- PDF自动化演示 ---")demo_pdf_automation()demo_advanced_pdf()elif choice == '4':print("\n--- 集成月度报告生成 ---")# 创建测试数据文件test_data = pd.DataFrame({'日期': pd.date_range('2024-01-01', periods=30, freq='D'),'销售额': np.random.randint(1000, 5000, 30),'客户数': np.random.randint(50, 200, 30),'产品类别': np.random.choice(['A', 'B', 'C'], 30)})data_file = 'test_sales_data.xlsx'test_data.to_excel(data_file, index=False)output_dir = 'monthly_report_output'reports = automation_system.generate_monthly_report(data_file, output_dir)print(f"生成的报告文件:")for file_type, filepath in reports.items():print(f" {file_type}: {filepath}")# 清理测试文件if os.path.exists(data_file):os.remove(data_file)elif choice == '5':print("\n--- 批量文档处理 ---")# 创建测试文件test_dir = 'test_documents'os.makedirs(test_dir, exist_ok=True)# 创建一些测试文件automation_system.excel_automator.create_workbook(os.path.join(test_dir, 'test1.xlsx'), {'Sheet1': [['数据'], [1], [2], [3]]})automation_system.word_automator.create_document(os.path.join(test_dir, 'test1.docx'),['测试文档1', '这是测试内容。'])automation_system.pdf_automator.create_pdf(os.path.join(test_dir, 'test1.pdf'),['测试PDF', '这是测试内容。'])# 执行批量处理output_dir = 'processed_documents'stats = automation_system.batch_process_documents(test_dir, output_dir, 'all')print(f"批量处理结果:")print(f" 总文件数: {stats['total_files']}")print(f" 成功处理: {stats['processed_files']}")print(f" 处理失败: {stats['failed_files']}")elif choice == '6':print("谢谢使用!")breakelse:print("无效选择,请重新输入。")if __name__ == "__main__":main_demo()
7. 代码自查和优化
为确保代码质量和减少BUG,我们对所有代码进行了以下自查:
7.1 代码质量检查
- 异常处理:所有可能失败的操作都包含完善的try-catch异常处理
- 输入验证:对函数参数进行类型和值验证
- 资源管理:确保文件句柄、内存等资源正确释放
- 编码处理:正确处理各种字符编码,特别是中文内容
- 路径安全:安全地处理文件路径,防止路径遍历攻击
7.2 性能优化
- 批量操作:对大量数据使用批量处理,减少IO操作
- 内存管理:及时释放大文件等占用内存的资源
- 缓存策略:对重复读取的数据考虑使用缓存
- 并行处理:对独立任务使用多线程处理
7.3 安全性改进
- 文件权限:合理设置生成文件的访问权限
- 输入清理:对用户输入进行适当的清理和验证
- 敏感信息:避免在代码中硬编码敏感信息
- 错误信息:不向用户暴露敏感的错误信息
7.4 健壮性提升
- 重试机制:对可能失败的操作添加重试逻辑
- 超时设置:为所有IO操作设置合理的超时时间
- 回滚机制:在可能的情况下提供操作回滚
- 状态检查:在执行操作前检查系统和资源状态
8. 总结
通过本文的详细介绍和代码示例,我们全面探讨了使用Python进行办公文档自动化的各个方面。从基础的Excel、Word、PDF操作,到高级的格式设置、模板填充、批量处理,Python提供了强大而灵活的工具来简化办公文档处理任务。
8.1 主要收获
- 完整的文档处理能力:掌握了三大办公文档格式的读写和编辑方法
- 高效的批量操作:学会了自动化处理大量文档的技术
- 智能的格式设置:了解了如何程序化设置文档样式和格式
- 复杂的文档生成:掌握了基于模板和数据生成复杂文档的技术
- 集成工作流程:学会了将不同格式的文档处理串联起来
8.2 最佳实践建议
- 模块化设计:将功能分解为独立的、可复用的模块
- 错误处理:为所有操作添加适当的错误处理和日志记录
- 配置管理:使用配置文件管理路径、格式设置等参数
- 版本控制:对生成的文档进行版本管理
- 性能监控:监控自动化任务的执行时间和资源使用
8.3 应用前景
办公文档自动化在以下场景中具有广泛的应用前景:
- 报告生成:自动生成业务报告、财务报表等
- 文档处理:批量处理合同、简历、申请表格等
- 数据转换:在不同格式间转换和迁移数据
- 工作流集成:将文档处理集成到更大的业务流程中
- 质量控制:自动化检查文档格式和内容的正确性
通过掌握这些技术,您可以将繁琐的文档处理任务自动化,从而专注于更重要的业务逻辑和创新工作。无论是个人使用还是企业级应用,Python办公自动化都能为您带来显著的效率提升和质量改进。