3. Everyday Script: File Deduplication (Removing Duplicate Files)
Quickly check whether a folder contains files with duplicate content (even when the file names differ):
- Compute a hash (e.g., MD5 or SHA-1) of each file's contents
- Group files by hash; files that share the same hash have identical content (see the minimal sketch below)
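For intuition, here is a minimal sketch of that core idea. The folder name "photos" is a placeholder, and read_bytes() loads each file whole, which is fine for a sketch; the full script below reads in chunks instead:

import hashlib
from collections import defaultdict
from pathlib import Path

groups = defaultdict(list)
for path in Path("photos").rglob("*"):  # "photos" is a placeholder folder
    if path.is_file():
        digest = hashlib.md5(path.read_bytes()).hexdigest()
        groups[digest].append(path)  # same digest -> same content

duplicates = {h: ps for h, ps in groups.items() if len(ps) > 1}
print(duplicates)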
The full script builds on this: it uses hashlib to compute each file's MD5, shows progress while scanning, and can interactively delete the redundant copies:
import os
import sys
import hashlib
from collections import defaultdict


def get_file_hash(filepath, chunk_size=8192):
    """Compute the MD5 hash of a file, reading in chunks to keep memory use low."""
    hash_md5 = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def count_total_files(folder_path):
    """Count all files under folder_path, recursively."""
    total = 0
    for _, _, filenames in os.walk(folder_path):
        total += len(filenames)
    return total


def find_duplicate_files_with_progress(folder_path):
    """Find duplicate files, showing progress while hashing."""
    total_files = count_total_files(folder_path)
    if total_files == 0:
        print("No files to process in the target folder.")
        return {}
    hash_map = defaultdict(list)
    processed = 0
    for dirpath, _, filenames in os.walk(folder_path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            processed += 1
            basename = os.path.basename(filepath)
            display_name = (basename[:42] + "...") if len(basename) > 45 else basename
            print(f"\r[{processed}/{total_files}] Processing: {display_name:<48}", end="", flush=True)
            try:
                file_hash = get_file_hash(filepath)
                hash_map[file_hash].append(filepath)
            except OSError as e:  # unreadable files are reported and skipped
                print(f"\nSkipping unreadable file: {filepath} | error: {e}")
    print()
    # Only hashes shared by two or more files are duplicates
    return {h: paths for h, paths in hash_map.items() if len(paths) > 1}


def choose_keep_file(file_list, strategy="first"):
    """Pick the one file to keep from a group of duplicates.

    strategy:
        'first'  -- keep the first file in path order
        'newest' -- keep the most recently modified file
        'oldest' -- keep the earliest modified file
    """
    if strategy == "newest":
        return max(file_list, key=os.path.getmtime)
    if strategy == "oldest":
        return min(file_list, key=os.path.getmtime)
    return file_list[0]  # 'first', and the fallback for unknown strategies


def delete_duplicates(duplicates, strategy="first"):
    """Delete redundant copies, keeping exactly one file per group."""
    deleted_count = 0
    total_to_delete = sum(len(files) - 1 for files in duplicates.values())
    if total_to_delete == 0:
        print("No duplicate files to delete.")
        return 0
    print(f"\n{len(duplicates)} duplicate groups found; {total_to_delete} redundant files can be deleted.")
    confirm = input("Delete the redundant copies? (y/N): ").strip().lower()
    if confirm != "y":
        print("Deletion cancelled.")
        return 0
    print("\nDeleting duplicate files...")
    for i, (file_hash, file_list) in enumerate(duplicates.items(), 1):
        keep = choose_keep_file(file_list, strategy=strategy)
        to_delete = [f for f in file_list if f != keep]
        print(f"\n[Group {i}] keeping: {keep}")
        for f in to_delete:
            try:
                os.remove(f)
                print(f"  deleted: {f}")
                deleted_count += 1
            except OSError as e:
                print(f"  failed to delete: {f} | error: {e}")
    print(f"\nDone. {deleted_count} duplicate files deleted.")
    return deleted_count


# Main program
if __name__ == "__main__":
    print("Duplicate file finder and cleaner (content based)")
    folder = input("Enter the folder path to check: ").strip().strip('"')
    if not os.path.isdir(folder):
        print("Error: the path is not a valid folder!")
        sys.exit(1)
    print(f"Scanning: {os.path.abspath(folder)}")
    duplicates = find_duplicate_files_with_progress(folder)
    if not duplicates:
        print("\nNo duplicate files found.")
    else:
        print("\nDuplicate groups found (identical content):")
        for group_num, (file_hash, file_list) in enumerate(duplicates.items(), 1):
            print(f"\n[Duplicate group #{group_num}] ({len(file_list)} files)")
            for f in file_list:
                print(f"  - {f}")
        # Ask whether to delete the redundant copies
        print("\nYou can delete the redundant copies in each group, keeping one file.")
        print("Keep strategy:")
        print("  [1] keep the first file (path order)")
        print("  [2] keep the most recently modified file")
        print("  [3] keep the earliest modified file")
        choice = input("Choose a strategy (1/2/3, default 1): ").strip()
        strategy_map = {"1": "first", "2": "newest", "3": "oldest"}
        strategy = strategy_map.get(choice, "first")
        delete_duplicates(duplicates, strategy=strategy)
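One common refinement, not part of the script above: files with different sizes can never have identical content, so you can group by size first and hash only the sizes that occur more than once. This can skip most of the hashing work on large folders. A sketch of that pre-filter (the function name find_duplicates_size_first is mine; it reuses get_file_hash from the script above):

def find_duplicates_size_first(folder_path):
    """Group by file size first; hash only sizes shared by more than one file."""
    by_size = defaultdict(list)
    for dirpath, _, filenames in os.walk(folder_path):
        for name in filenames:
            path = os.path.join(dirpath, name)
            try:
                by_size[os.path.getsize(path)].append(path)
            except OSError:
                pass  # unreadable entry: skip, as the main script does

    hash_map = defaultdict(list)
    for size, paths in by_size.items():
        if len(paths) < 2:
            continue  # unique size -> cannot be a duplicate, no hashing needed
        for path in paths:
            try:
                hash_map[get_file_hash(path)].append(path)
            except OSError:
                pass
    return {h: p for h, p in hash_map.items() if len(p) > 1}

A note on the hash choice: MD5 is fine for spotting accidental duplicates, but if adversarial collisions are a concern, hashlib.sha256() is a drop-in replacement for hashlib.md5() in get_file_hash.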
