DeepSeek辅助利用搬移底层xml实现快速编辑xlsx文件的python程序
我们知道xlsx文件实质上是包含众多底层xml文件的zip压缩文件,那么容易想到可以利用搬移底层xml文件实现一个快速编辑xlsx文件的python程序,实现(1)从多个xlsx文件中读取sheet,合并到一个新xlsx文件中,合并后的sheet名采用原始文件名_sheet名的形式(2)可以从某些xlsx文件中指定要保存到新文件的sheet。思路是:将xlsx解压缩到临时目录,将xml文件移动到同一个xl\worksheets 目录,修改workbook.xml中的sheets的引用。假定所有xlsx都未使用共享字符串。把这些需求发给DeepSeek,他给出了如下程序。
import os
import shutil
import tempfile
import zipfile
from pathlib import Path
import xml.etree.ElementTree as ET
from typing import List, Dict, Unionclass FastXlsxEditor:def __init__(self):self.namespace = {'ns': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}def merge_sheets(self, input_files: List[str], output_file: str,selected_sheets: Dict[str, List[str]] = None) -> None:"""合并多个xlsx文件的sheet到一个新文件中Args:input_files: 输入文件路径列表output_file: 输出文件路径selected_sheets: 可选,指定要从哪些文件保存哪些sheet{文件名: [sheet名列表]}"""# 创建临时工作目录with tempfile.TemporaryDirectory() as temp_dir:temp_path = Path(temp_dir)merged_dir = temp_path / "merged"merged_dir.mkdir()# 解压第一个文件作为基础模板base_file = input_files[0]self._extract_zip(base_file, merged_dir)# 处理workbook.xml和sheetsself._process_workbook_and_sheets(merged_dir, input_files, selected_sheets)# 重新打包为xlsx文件self._create_zip(merged_dir, output_file)def _extract_zip(self, zip_path: str, extract_dir: Path) -> None:"""解压xlsx文件"""with zipfile.ZipFile(zip_path, 'r') as zip_ref:zip_ref.extractall(extract_dir)def _create_zip(self, source_dir: Path, output_file: str) -> None:"""创建xlsx文件"""# 确保输出目录存在os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)with zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED) as zipf:for root, dirs, files in os.walk(source_dir):for file in files:file_path = os.path.join(root, file)# 在zip文件中保持相对路径arcname = os.path.relpath(file_path, source_dir)zipf.write(file_path, arcname)def _process_workbook_and_sheets(self, merged_dir: Path, input_files: List[str],selected_sheets: Dict[str, List[str]] = None) -> None:"""处理workbook.xml和所有sheet文件"""workbook_path = merged_dir / "xl" / "workbook.xml"worksheets_dir = merged_dir / "xl" / "worksheets"shared_strings_path = merged_dir / "xl" / "sharedStrings.xml"# 如果存在共享字符串文件,删除它(根据假设不使用共享字符串)if shared_strings_path.exists():shared_strings_path.unlink()# 解析基础workbook.xmltree = ET.parse(workbook_path)root = tree.getroot()# 找到sheets元素sheets_elem = root.find('.//ns:sheets', self.namespace)if sheets_elem is None:raise ValueError("在workbook.xml中找不到sheets元素")# 清空原有的sheetssheets_elem.clear()sheet_id = 1relationships = []# 处理每个输入文件for file_path in input_files:file_name = Path(file_path).stem# 创建临时目录处理当前文件with tempfile.TemporaryDirectory() as temp_dir:temp_path = Path(temp_dir)self._extract_zip(file_path, temp_path)# 获取该文件的所有sheet或选定的sheetfile_sheets = self._get_sheets_from_file(temp_path, file_name, selected_sheets)# 复制sheet文件并更新workbookfor sheet_name, sheet_file in file_sheets:new_sheet_name = f"{file_name}_{sheet_name}"new_sheet_file = f"sheet{sheet_id}.xml"# 复制sheet文件source_sheet_path = temp_path / "xl" / "worksheets" / sheet_filetarget_sheet_path = worksheets_dir / new_sheet_fileshutil.copy2(source_sheet_path, target_sheet_path)# 在workbook.xml中添加sheet引用sheet_elem = ET.SubElement(sheets_elem, 'sheet')sheet_elem.set('name', new_sheet_name)sheet_elem.set('sheetId', str(sheet_id))sheet_elem.set('r:id', f"rId{sheet_id}")# 记录关系relationships.append({'Id': f"rId{sheet_id}",'Type': "http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet",'Target': f"worksheets/{new_sheet_file}"})sheet_id += 1# 保存修改后的workbook.xmltree.write(workbook_path, encoding='UTF-8', xml_declaration=True)# 更新relationships文件self._update_relationships(merged_dir, relationships)def _get_sheets_from_file(self, extracted_dir: Path, file_name: str,selected_sheets: Dict[str, List[str]] = None) -> List[tuple]:"""从解压的文件中获取sheet信息"""workbook_path = extracted_dir / "xl" / "workbook.xml"tree = ET.parse(workbook_path)root = tree.getroot()sheets = []sheets_elem = root.find('.//ns:sheets', self.namespace)if sheets_elem is not None:for sheet_elem in sheets_elem.findall('ns:sheet', self.namespace):sheet_name = sheet_elem.get('name')sheet_r_id = sheet_elem.get('r:id')# 获取实际的sheet文件名sheet_file = self._get_sheet_filename(extracted_dir, sheet_r_id)# 检查是否在选定的sheet中if selected_sheets:if file_name in selected_sheets:if sheet_name in selected_sheets[file_name]:sheets.append((sheet_name, sheet_file))# 如果文件不在selected_sheets中,跳过所有sheetelse:continueelse:# 如果没有指定selected_sheets,包含所有sheetsheets.append((sheet_name, sheet_file))return sheetsdef _get_sheet_filename(self, extracted_dir: Path, r_id: str) -> str:"""通过关系ID获取sheet文件名"""rels_path = extracted_dir / "xl" / "_rels" / "workbook.xml.rels"tree = ET.parse(rels_path)root = tree.getroot()for rel in root.findall('.//{http://schemas.openxmlformats.org/package/2006/relationships}Relationship'):if rel.get('Id') == r_id:target = rel.get('Target')return os.path.basename(target)raise ValueError(f"找不到关系ID: {r_id}")def _update_relationships(self, merged_dir: Path, relationships: List[Dict]) -> None:"""更新workbook的关系文件"""rels_dir = merged_dir / "xl" / "_rels"rels_dir.mkdir(exist_ok=True)rels_path = rels_dir / "workbook.xml.rels"# 创建新的relationships根元素root = ET.Element('Relationships')root.set('xmlns', 'http://schemas.openxmlformats.org/package/2006/relationships')for rel in relationships:rel_elem = ET.SubElement(root, 'Relationship')rel_elem.set('Id', rel['Id'])rel_elem.set('Type', rel['Type'])rel_elem.set('Target', rel['Target'])# 写入relationships文件tree = ET.ElementTree(root)tree.write(rels_path, encoding='UTF-8', xml_declaration=True)# 测试代码
def create_sample_xlsx():"""创建测试用的xlsx文件(使用openpyxl创建简单示例)"""try:from openpyxl import Workbook# 创建测试文件1wb1 = Workbook()ws1 = wb1.activews1.title = "Data"ws1['A1'] = "File1_Data_A1"ws1['B1'] = "File1_Data_B1"ws2 = wb1.create_sheet("Summary")ws2['A1'] = "File1_Summary_A1"wb1.save("file1.xlsx")# 创建测试文件2wb2 = Workbook()ws1 = wb2.activews1.title = "Data"ws1['A1'] = "File2_Data_A1"ws2 = wb2.create_sheet("Report")ws2['A1'] = "File2_Report_A1"wb2.save("file2.xlsx")print("测试文件创建完成")except ImportError:print("请安装openpyxl来创建测试文件: pip install openpyxl")
# 使用示例
def main():create_sample_xlsx()editor = FastXlsxEditor()# 示例1: 合并所有文件的所有sheetinput_files = ["file1.xlsx", "file2.xlsx"]editor.merge_sheets(input_files, "merged_all.xlsx")# 示例2: 只合并特定文件的特定sheetselected_sheets = {"file1": ["Data"], # 从file1.xlsx只取Data"file2": ["Report"] # 从file2.xlsx只取Report# file3.xlsx不会被包含,因为没有在selected_sheets中指定}editor.merge_sheets(input_files, "merged_selected.xlsx", selected_sheets)if __name__ == "__main__":main()
结果执行报错
file_sheets = self._get_sheets_from_file(temp_path, file_name, selected_sheets)File "C:\d\mergexlsx1.py", line 152, in _get_sheets_from_filesheet_file = self._get_sheet_filename(extracted_dir, sheet_r_id)File "C:\d\mergexlsx1.py", line 179, in _get_sheet_filenameraise ValueError(f"找不到关系ID: {r_id}")
经过调试,发现r_id的值是None, 让他修改后,还是不行,于是让他用手工读取xml文件的方法获取r:id,他给出了如下代码,
def _get_sheets_from_file(self, extracted_dir: Path, file_name: str,selected_sheets: Dict[str, List[str]] = None) -> List[tuple]:"""从解压的文件中获取sheet信息"""workbook_path = extracted_dir / "xl" / "workbook.xml"# 手工读取XML文件内容with open(workbook_path, 'r', encoding='utf-8') as f:content = f.read()sheets = []# 手工解析sheet元素import re# 查找所有的sheet元素sheet_pattern = r'<sheet[^>]*name="([^"]*)"[^>]*r:id="([^"]*)"[^>]*>'matches = re.findall(sheet_pattern, content)for sheet_name, sheet_r_id in matches:print(f"找到sheet: name='{sheet_name}', r:id='{sheet_r_id}'")# 获取实际的sheet文件名sheet_file = self._get_sheet_filename(extracted_dir, sheet_r_id)# 检查是否在选定的sheet中if selected_sheets:if file_name in selected_sheets:if sheet_name in selected_sheets[file_name]:sheets.append((sheet_name, sheet_file))# 如果文件不在selected_sheets中,跳过所有sheetelse:continueelse:# 如果没有指定selected_sheets,包含所有sheetsheets.append((sheet_name, sheet_file))return sheets
替换原有的函数后,能够合并成功了,用rusty_sheet能正确读出。
python ../mergexlsx3.py
找到sheet: name='Data', r:id='rId1'
找到sheet: name='Summary', r:id='rId2'
找到sheet: name='Data', r:id='rId1'
找到sheet: name='Report', r:id='rId2'
找到sheet: name='Data', r:id='rId1'
找到sheet: name='Summary', r:id='rId2'
找到sheet: name='Data', r:id='rId1'
找到sheet: name='Report', r:id='rId2'\d\duckdb141 -unsigned -cmd "load '/d/2sh/rusty_sheet.duckdb_extension';"
D from read_sheets(['/d/mgxlsx/file*.xlsx'],header=0,sheet_name_column='sn',file_name_column='fn');
┌──────────────────┬───────────────┬─────────┬──────────────────────┐
│ A │ B │ sn │ fn │
│ varchar │ varchar │ varchar │ varchar │
├──────────────────┼───────────────┼─────────┼──────────────────────┤
│ File1_Data_A1 │ File1_Data_B1 │ Data │ /d\mgxlsx\file1.xlsx │
│ File1_Summary_A1 │ NULL │ Summary │ /d\mgxlsx\file1.xlsx │
│ File2_Data_A1 │ NULL │ Data │ /d\mgxlsx\file2.xlsx │
│ File2_Report_A1 │ NULL │ Report │ /d\mgxlsx\file2.xlsx │
└──────────────────┴───────────────┴─────────┴──────────────────────┘
D from read_sheets(['/d/mgxlsx/merged*.xlsx'],header=0,sheet_name_column='sn',file_name_column='fn');
┌──────────────────┬───────────────┬───────────────┬────────────────────────────────┐
│ A │ B │ sn │ fn │
│ varchar │ varchar │ varchar │ varchar │
├──────────────────┼───────────────┼───────────────┼────────────────────────────────┤
│ File1_Data_A1 │ File1_Data_B1 │ file1_Data │ /d\mgxlsx\merged_all.xlsx │
│ File1_Summary_A1 │ NULL │ file1_Summary │ /d\mgxlsx\merged_all.xlsx │
│ File2_Data_A1 │ NULL │ file2_Data │ /d\mgxlsx\merged_all.xlsx │
│ File2_Report_A1 │ NULL │ file2_Report │ /d\mgxlsx\merged_all.xlsx │
│ File1_Data_A1 │ File1_Data_B1 │ file1_Data │ /d\mgxlsx\merged_selected.xlsx │
│ File2_Report_A1 │ NULL │ file2_Report │ /d\mgxlsx\merged_selected.xlsx │
└──────────────────┴───────────────┴───────────────┴────────────────────────────────┘
但有个问题,用wps打开合并后的文件,看不到有数据的sheet, 而只能看到一个空白的sheet1,还需要进一步研究。