当前位置：首页 > news >正文

Python文本统计分析工具

news 2025/7/16 12:08:21

本文介绍了一个基于Python Tkinter的文本统计分析工具GUI程序。该工具能够分析文本文件，统计非空行数（界面中称为“标记行数”）和不同汉字数量，支持单个文件或整个文件夹（含子文件夹）处理。程序提供可视化结果展示，包括表格形式的数据统计和汉字详情查看功能，并可导出CSV格式结果。核心功能包括文件处理、正则匹配汉字等（注意：下方代码在主线程中同步处理文件，并未实际使用多线程），界面包含路径选择、处理选项、结果展示和操作按钮等区域，实现了完整的文本分析工作流程。

import os
import re
import tkinter as tk
import csv
from tkinter import filedialog, messagebox, ttk


class TextStatsAnalyzer:
    """Tkinter GUI that reports non-blank line counts and distinct hanzi for text files.

    The user picks a single file or a folder (optionally including
    sub-folders).  Every ``.txt`` file found is read as UTF-8, its non-blank
    lines are counted and the set of distinct Chinese characters is collected.
    Per-file results are listed in a table, totals are shown below it, and both
    the table and the combined character list can be exported.

    Files are processed synchronously on the Tk main thread, so the window
    does not repaint while a large folder is being analysed.
    """

    # Number of characters written per row in the detail view and in exports.
    HANZI_PER_LINE = 30

    # CJK Unified Ideographs (basic block and Extension A), CJK Compatibility
    # Ideographs, and the supplementary-plane ranges U+20000..U+2CEAF.
    # Compiled once here instead of on every process_file() call.
    _HANZI_PATTERN = re.compile(
        r'[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff'
        r'\U00020000-\U0002a6df'
        r'\U0002a700-\U0002b73f'
        r'\U0002b740-\U0002b81f'
        r'\U0002b820-\U0002ceaf]',
        flags=re.UNICODE,
    )

    _CSV_HEADER = ("文件名", "标记行数", "不同汉字数")

    def __init__(self, root):
        """Build the main window inside *root* (a ``tk.Tk`` instance)."""
        self.root = root
        self.root.title("文本统计分析工具")
        self.root.geometry("800x700")
        self.center_window()

        # Sorted list of distinct hanzi from the most recent analysis.  It
        # always exists (empty when there is nothing to show) so callbacks
        # never have to probe for the attribute.
        self.all_hanzi = []

        main_frame = tk.Frame(root, padx=20, pady=20)
        main_frame.pack(fill=tk.BOTH, expand=True)

        title_label = tk.Label(main_frame, text="文本文件统计分析工具",
                               font=("Arial", 16, "bold"), fg="#2c3e50")
        title_label.pack(pady=(0, 15))

        # The helpers pack their frames immediately, so the call order below
        # is the top-to-bottom layout order of the window.
        self._build_input_area(main_frame)
        self._build_options_area(main_frame)
        self._build_results_area(main_frame)
        self._build_summary_area(main_frame)
        self._build_button_area(main_frame)

    # ------------------------------------------------------------------
    # Window construction helpers
    # ------------------------------------------------------------------
    def _build_input_area(self, parent):
        """Create the path entry and the two browse buttons."""
        input_frame = tk.LabelFrame(parent, text="输入设置", padx=10, pady=10,
                                    font=("Arial", 10))
        input_frame.pack(fill=tk.X, pady=(0, 10))

        self.path_var = tk.StringVar()
        path_entry = tk.Entry(input_frame, textvariable=self.path_var, width=70,
                              font=("Arial", 10), bd=2, relief=tk.GROOVE)
        path_entry.pack(side=tk.LEFT, padx=(0, 10), fill=tk.X, expand=True)

        file_btn = tk.Button(input_frame, text="浏览文件", command=self.select_file,
                             bg="#3498db", fg="white", font=("Arial", 10), padx=8)
        file_btn.pack(side=tk.LEFT, padx=(0, 5))

        folder_btn = tk.Button(input_frame, text="浏览文件夹", command=self.select_folder,
                               bg="#3498db", fg="white", font=("Arial", 10), padx=8)
        folder_btn.pack(side=tk.LEFT)

    def _build_options_area(self, parent):
        """Create the "include sub-folders" checkbox."""
        options_frame = tk.LabelFrame(parent, text="处理选项", padx=10, pady=10,
                                      font=("Arial", 10))
        options_frame.pack(fill=tk.X, pady=(0, 10))

        self.recursive_var = tk.BooleanVar(value=True)
        tk.Checkbutton(options_frame, text="包含子文件夹", variable=self.recursive_var,
                       font=("Arial", 10)).pack(anchor=tk.W, padx=5, pady=2)

    def _build_results_area(self, parent):
        """Create the per-file results table and its vertical scrollbar."""
        results_frame = tk.LabelFrame(parent, text="统计结果", padx=10, pady=10,
                                      font=("Arial", 10))
        results_frame.pack(fill=tk.BOTH, expand=True)

        style = ttk.Style()
        style.configure("Custom.Treeview", font=('Arial', 10), rowheight=25)
        style.configure("Custom.Treeview.Heading", font=('Arial', 10, 'bold'))

        columns = ("filename", "mark_lines", "unique_chars")
        self.tree = ttk.Treeview(results_frame, columns=columns, show="headings",
                                 style="Custom.Treeview")
        self.tree.heading("filename", text="文件名")
        self.tree.heading("mark_lines", text="标记行数")
        self.tree.heading("unique_chars", text="不同汉字数")
        self.tree.column("filename", width=400)
        self.tree.column("mark_lines", width=150, anchor=tk.CENTER)
        self.tree.column("unique_chars", width=150, anchor=tk.CENTER)

        scrollbar = ttk.Scrollbar(results_frame, orient=tk.VERTICAL,
                                  command=self.tree.yview)
        self.tree.configure(yscrollcommand=scrollbar.set)
        self.tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

    def _build_summary_area(self, parent):
        """Create the totals row shown under the table."""
        summary_frame = tk.Frame(parent, pady=10)
        summary_frame.pack(fill=tk.X)

        tk.Label(summary_frame, text="文件总数:",
                 font=("Arial", 10, "bold")).pack(side=tk.LEFT, padx=(0, 10))
        self.total_files_var = tk.StringVar(value="0")
        tk.Label(summary_frame, textvariable=self.total_files_var, fg="#e74c3c",
                 font=("Arial", 10, "bold")).pack(side=tk.LEFT, padx=(0, 20))

        tk.Label(summary_frame, text="总标记行数:",
                 font=("Arial", 10, "bold")).pack(side=tk.LEFT, padx=(0, 10))
        self.total_lines_var = tk.StringVar(value="0")
        tk.Label(summary_frame, textvariable=self.total_lines_var, fg="#e74c3c",
                 font=("Arial", 10, "bold")).pack(side=tk.LEFT, padx=(0, 20))

        tk.Label(summary_frame, text="不同汉字总数:",
                 font=("Arial", 10, "bold")).pack(side=tk.LEFT, padx=(0, 10))
        self.total_chars_var = tk.StringVar(value="0")
        tk.Label(summary_frame, textvariable=self.total_chars_var, fg="#e74c3c",
                 font=("Arial", 10, "bold")).pack(side=tk.LEFT)

    def _build_button_area(self, parent):
        """Create the analyse / export / clear / details buttons."""
        button_frame = tk.Frame(parent)
        button_frame.pack(fill=tk.X, pady=(20, 0))

        analyze_btn = tk.Button(button_frame, text="开始分析", command=self.start_analysis,
                                bg="#27ae60", fg="white", font=("Arial", 10, "bold"),
                                padx=15, pady=5)
        analyze_btn.pack(side=tk.LEFT, padx=(0, 10))

        export_btn = tk.Button(button_frame, text="导出结果", command=self.export_results,
                               bg="#f39c12", fg="white", font=("Arial", 10),
                               padx=15, pady=5)
        export_btn.pack(side=tk.LEFT, padx=(0, 10))

        clear_btn = tk.Button(button_frame, text="清空结果", command=self.clear_results,
                              bg="#e74c3c", fg="white", font=("Arial", 10),
                              padx=15, pady=5)
        clear_btn.pack(side=tk.LEFT)

        hanzi_btn = tk.Button(button_frame, text="查看汉字详情",
                              command=self.show_hanzi_details,
                              bg="#9b59b6", fg="white", font=("Arial", 10),
                              padx=15, pady=5)
        hanzi_btn.pack(side=tk.RIGHT, padx=(0, 10))

    # ------------------------------------------------------------------
    # Window placement
    # ------------------------------------------------------------------
    def center_window(self):
        """Move the main window so that it is centred on the screen."""
        self.root.update_idletasks()
        width = self.root.winfo_width()
        height = self.root.winfo_height()
        screen_width = self.root.winfo_screenwidth()
        screen_height = self.root.winfo_screenheight()
        x = (screen_width // 2) - (width // 2)
        y = (screen_height // 2) - (height // 2)
        self.root.geometry(f'+{x}+{y}')

    def center_child_window(self, window):
        """Move *window* so that it is centred over the main window."""
        window.update_idletasks()
        width = window.winfo_width()
        height = window.winfo_height()
        parent_x = self.root.winfo_x()
        parent_y = self.root.winfo_y()
        parent_width = self.root.winfo_width()
        parent_height = self.root.winfo_height()
        x = parent_x + (parent_width // 2) - (width // 2)
        y = parent_y + (parent_height // 2) - (height // 2)
        window.geometry(f'+{x}+{y}')

    # ------------------------------------------------------------------
    # Path selection
    # ------------------------------------------------------------------
    def select_file(self):
        """Ask for a single file and store the chosen path in the entry."""
        file_path = filedialog.askopenfilename(
            title="选择文本文件",
            filetypes=[("文本文件", "*.txt"), ("所有文件", "*.*")])
        if file_path:
            self.path_var.set(file_path)

    def select_folder(self):
        """Ask for a folder and store the chosen path in the entry."""
        folder_path = filedialog.askdirectory(title="选择文件夹")
        if folder_path:
            self.path_var.set(folder_path)

    # ------------------------------------------------------------------
    # Analysis
    # ------------------------------------------------------------------
    def start_analysis(self):
        """Analyse the selected file or folder and populate the table and totals."""
        path = self.path_var.get()
        if not path:
            messagebox.showwarning("输入错误", "请选择文件或文件夹路径")
            return

        # Drop results from any previous run before collecting new ones.
        self.clear_results()

        if os.path.isfile(path):
            files = [path]
        else:
            files = self.get_text_files(path, self.recursive_var.get())
        if not files:
            messagebox.showinfo("无文件", "未找到任何文本文件")
            return

        total_files = 0
        total_mark_lines = 0
        all_hanzi = set()
        for file_path in files:
            result = self.process_file(file_path)
            if result is None:
                # Unreadable file: already reported by process_file().
                continue
            mark_lines, unique_hanzi = result
            total_files += 1
            total_mark_lines += mark_lines
            all_hanzi |= unique_hanzi
            filename = os.path.basename(file_path)
            self.tree.insert("", tk.END,
                             values=(filename, mark_lines, len(unique_hanzi)))

        self.total_files_var.set(str(total_files))
        self.total_lines_var.set(str(total_mark_lines))
        self.total_chars_var.set(str(len(all_hanzi)))

        # Keep the characters (sorted by code point) for the detail window.
        self.all_hanzi = sorted(all_hanzi)

        messagebox.showinfo("分析完成",
                            f"已处理 {total_files} 个文件\n"
                            f"总标记行数: {total_mark_lines}\n"
                            f"不同汉字数量: {len(all_hanzi)}")

    def get_text_files(self, folder, recursive=True):
        """Return the paths of all ``.txt`` files (case-insensitive) under *folder*.

        When *recursive* is false only files directly inside *folder* are
        returned.  A folder that does not exist yields an empty list.
        """
        text_files = []
        for current_dir, _, names in os.walk(folder):
            text_files.extend(
                os.path.join(current_dir, name)
                for name in names
                if name.lower().endswith('.txt')
            )
            if not recursive:
                # os.walk yields the top folder first; stop before descending.
                break
        return text_files

    def process_file(self, file_path):
        """Read one file and return ``(non_blank_line_count, set_of_hanzi)``.

        The file is decoded as UTF-8 with undecodable bytes dropped.  If the
        file cannot be opened or read, the error is printed and ``None`` is
        returned so the caller can skip it.
        """
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except OSError as e:
            print(f"读取文件失败 {file_path}: {e}")
            return None

        # A "marked" line is any line that is not empty or whitespace-only.
        mark_lines = sum(1 for line in content.splitlines() if line.strip())
        unique_hanzi = set(self._HANZI_PATTERN.findall(content))
        return mark_lines, unique_hanzi

    # ------------------------------------------------------------------
    # Hanzi detail window
    # ------------------------------------------------------------------
    def show_hanzi_details(self):
        """Open a window listing every distinct hanzi found by the last analysis."""
        if not self.all_hanzi:
            messagebox.showinfo("无汉字数据", "请先执行分析操作")
            return

        detail_win = tk.Toplevel(self.root)
        detail_win.title("汉字详情")
        detail_win.geometry("700x500")
        self.center_child_window(detail_win)

        frame = tk.Frame(detail_win)
        frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

        # Search bar: filters on every key release as well as on button click.
        search_frame = tk.Frame(frame)
        search_frame.pack(fill=tk.X, pady=(0, 10))
        tk.Label(search_frame, text="搜索汉字:",
                 font=("Arial", 10)).pack(side=tk.LEFT, padx=(0, 5))
        self.search_var = tk.StringVar()
        search_entry = tk.Entry(search_frame, textvariable=self.search_var,
                                font=("Arial", 10), width=20)
        search_entry.pack(side=tk.LEFT, padx=(0, 5))
        search_entry.bind("<KeyRelease>", lambda e: self.filter_hanzi())
        tk.Button(search_frame, text="搜索", command=self.filter_hanzi,
                  bg="#3498db", fg="white",
                  font=("Arial", 9)).pack(side=tk.LEFT, padx=(0, 5))
        tk.Button(search_frame, text="重置", command=self.reset_filter,
                  bg="#95a5a6", fg="white",
                  font=("Arial", 9)).pack(side=tk.LEFT)

        # Scrollable text area holding the characters.
        text_frame = tk.Frame(frame)
        text_frame.pack(fill=tk.BOTH, expand=True)
        scrollbar = tk.Scrollbar(text_frame)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.hanzi_text = tk.Text(text_frame, wrap=tk.WORD,
                                  yscrollcommand=scrollbar.set,
                                  font=("Arial", 12))
        self.hanzi_text.pack(fill=tk.BOTH, expand=True)
        scrollbar.config(command=self.hanzi_text.yview)

        self.display_hanzi(self.all_hanzi)

        # Footer: total count and an export button.
        stats_frame = tk.Frame(frame)
        stats_frame.pack(fill=tk.X, pady=(10, 0))
        tk.Label(stats_frame, text=f"总汉字数: {len(self.all_hanzi)}",
                 font=("Arial", 10, "bold")).pack(side=tk.LEFT)
        tk.Button(stats_frame, text="导出汉字列表", command=self.export_hanzi_list,
                  bg="#2c3e50", fg="white",
                  font=("Arial", 9)).pack(side=tk.RIGHT)

    @staticmethod
    def _format_hanzi_rows(hanzi_list, per_line=HANZI_PER_LINE):
        """Join *hanzi_list* into strings of at most *per_line* characters each."""
        return [
            ''.join(hanzi_list[start:start + per_line])
            for start in range(0, len(hanzi_list), per_line)
        ]

    def _set_hanzi_text(self, text):
        """Replace the content of the (normally read-only) detail text widget."""
        self.hanzi_text.config(state=tk.NORMAL)
        self.hanzi_text.delete("1.0", tk.END)
        self.hanzi_text.insert(tk.END, text)
        self.hanzi_text.config(state=tk.DISABLED)

    def display_hanzi(self, hanzi_list):
        """Show *hanzi_list* in the detail text widget, 30 characters per row."""
        rows = self._format_hanzi_rows(hanzi_list)
        self._set_hanzi_text(''.join(row + "\n" for row in rows))

    def _match_hanzi(self, search_term):
        """Return the collected hanzi that occur in *search_term*, in stored order.

        Every stored entry is a single character, so a substring test of the
        whole term against an entry can only succeed for one-character terms.
        Testing membership of each entry in the term gives the same result for
        one character and also supports typing several characters at once.
        """
        wanted = set(search_term)
        return [char for char in self.all_hanzi if char in wanted]

    def filter_hanzi(self):
        """Restrict the detail view to the characters typed in the search box."""
        search_term = self.search_var.get().strip()
        if not search_term:
            self.display_hanzi(self.all_hanzi)
            return

        filtered = self._match_hanzi(search_term)
        if filtered:
            self.display_hanzi(filtered)
        else:
            self._set_hanzi_text(f"未找到包含 '{search_term}' 的汉字")

    def reset_filter(self):
        """Clear the search box and show the full character list again."""
        self.search_var.set("")
        self.display_hanzi(self.all_hanzi)

    # ------------------------------------------------------------------
    # Export
    # ------------------------------------------------------------------
    @classmethod
    def _write_results_csv(cls, file_path, rows):
        """Write the header plus *rows* (filename, lines, hanzi count) as CSV.

        ``csv.writer`` quotes fields that contain commas, quotes or newlines,
        so unusual file names cannot shift columns.  The line terminator is
        kept as ``\\n`` so ordinary output is unchanged from the plain-text
        writer this replaces.
        """
        with open(file_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f, lineterminator="\n")
            writer.writerow(cls._CSV_HEADER)
            writer.writerows(rows)

    def export_results(self):
        """Save the results table to a CSV file chosen by the user."""
        children = self.tree.get_children()
        if not children:
            messagebox.showwarning("无数据", "没有可导出的结果")
            return

        file_path = filedialog.asksaveasfilename(
            defaultextension=".csv",
            filetypes=[("CSV文件", "*.csv"), ("文本文件", "*.txt"), ("所有文件", "*.*")])
        if not file_path:
            return

        rows = [tuple(self.tree.item(child)['values'][:3]) for child in children]
        try:
            self._write_results_csv(file_path, rows)
        except (OSError, UnicodeError, csv.Error) as e:
            messagebox.showerror("导出失败", f"导出时发生错误:\n{e}")
        else:
            messagebox.showinfo("导出成功", f"结果已保存到:\n{file_path}")

    def export_hanzi_list(self):
        """Save the collected characters to a text file, 30 characters per row."""
        if not self.all_hanzi:
            messagebox.showinfo("无汉字数据", "没有可导出的汉字列表")
            return

        file_path = filedialog.asksaveasfilename(
            defaultextension=".txt",
            filetypes=[("文本文件", "*.txt"), ("所有文件", "*.*")])
        if not file_path:
            return

        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(f"统计到的汉字列表 (共 {len(self.all_hanzi)} 个):\n")
                f.write("=" * 50 + "\n\n")
                f.writelines(row + "\n"
                             for row in self._format_hanzi_rows(self.all_hanzi))
        except OSError as e:
            messagebox.showerror("导出失败", f"导出时发生错误:\n{e}")
        else:
            messagebox.showinfo("导出成功", f"汉字列表已保存到:\n{file_path}")

    # ------------------------------------------------------------------
    # Reset
    # ------------------------------------------------------------------
    def clear_results(self):
        """Empty the table, zero the totals and forget the collected characters."""
        for item in self.tree.get_children():
            self.tree.delete(item)
        self.total_files_var.set("0")
        self.total_lines_var.set("0")
        self.total_chars_var.set("0")
        # Reset to an empty list rather than deleting the attribute, so an
        # already-open detail window keeps working after a clear.
        self.all_hanzi = []


if __name__ == "__main__":
    root = tk.Tk()
    app = TextStatsAnalyzer(root)
    root.mainloop()