import os
from typing import Set, List, Tuple


class DatasetChecker:
    """Compare an image folder with a label folder by file stem.

    Two files are considered a pair when their names are equal once the
    final extension is removed (``cat.jpg`` pairs with ``cat.txt``).  Both
    folders are listed once, in ``__init__``; later method calls work on
    that snapshot.

    Attributes set by ``__init__``:
        img_dir / label_dir: the folder paths exactly as passed in.
        img_files / label_files: sorted file names (with extension) found
            directly inside each folder.
        img_basenames / label_basenames: the same names without extension,
            as sets.
    """

    def __init__(self, img_dir: str, label_dir: str):
        """Snapshot the contents of both folders.

        :param img_dir: path of the folder holding the images
        :param label_dir: path of the folder holding the label files
        :raises FileNotFoundError: if either path does not exist
        """
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.img_files = self._get_files(img_dir)
        self.label_files = self._get_files(label_dir)
        self.img_basenames = self._get_basenames(self.img_files)
        self.label_basenames = self._get_basenames(self.label_files)

    def _get_files(self, dir_path: str) -> List[str]:
        """Return the sorted names (with extension) of the files in *dir_path*.

        Sub-directories are skipped so that a folder named ``foo`` is never
        mistaken for an unmatched image or label.  Sorting makes the report
        order independent of the order ``os.listdir`` happens to return.

        :raises FileNotFoundError: if *dir_path* does not exist
        """
        if not os.path.exists(dir_path):
            raise FileNotFoundError(f"文件夹不存在: {dir_path}")
        return sorted(
            name
            for name in os.listdir(dir_path)
            if os.path.isfile(os.path.join(dir_path, name))
        )

    def _get_basename(self, filename: str) -> str:
        """Return *filename* without its final extension."""
        return os.path.splitext(filename)[0]

    def _get_basenames(self, files: List[str]) -> Set[str]:
        """Return the set of extension-less names for *files*."""
        return {self._get_basename(f) for f in files}

    def get_extra_files(self) -> Tuple[Set[str], Set[str]]:
        """Compute the stems that exist on one side only.

        :return: ``(stems only in the image folder, stems only in the label folder)``
        """
        extra_imgs = self.img_basenames - self.label_basenames
        extra_labels = self.label_basenames - self.img_basenames
        return extra_imgs, extra_labels

    def find_original_filenames(self, basenames: Set[str], source_files: List[str]) -> List[str]:
        """Map extension-less names back to the real file names.

        Every file in *source_files* whose stem is in *basenames* is
        returned, in the order of *source_files*.  That includes several
        files sharing one stem (``a.jpg`` and ``a.png``), so none of them is
        left out of the report.

        :param basenames: extension-less names to look up
        :param source_files: file names (with extension) to search
        :return: the matching file names
        """
        # One pass with a membership test instead of rescanning the whole
        # listing once per stem.
        return [f for f in source_files if self._get_basename(f) in basenames]

    def print_results(self):
        """Print the unmatched files of both folders to stdout."""
        extra_imgs, extra_labels = self.get_extra_files()

        img_original = self.find_original_filenames(extra_imgs, self.img_files)
        print(f"图片文件夹中多余的文件(无对应标注):{len(img_original)} 个")
        for file in img_original:
            print(f" - {file}")

        label_original = self.find_original_filenames(extra_labels, self.label_files)
        print(f"\n标注文件夹中多余的文件(无对应图片):{len(label_original)} 个")
        for file in label_original:
            print(f" - {file}")
if __name__ == "__main__":
    # Command-line entry point.
    #   python <script> [image_dir [label_dir]]
    # Arguments are optional; when omitted, the original hard-coded folders
    # below are used, so running the script with no arguments is unchanged.
    import sys

    img_directory = r'C:\Users\123\Desktop\fsdownload\images'
    label_directory = r'C:\Users\123\Desktop\fsdownload\labels'

    cli_args = sys.argv[1:]
    if len(cli_args) >= 1:
        img_directory = cli_args[0]
    if len(cli_args) >= 2:
        label_directory = cli_args[1]

    try:
        checker = DatasetChecker(img_directory, label_directory)
        checker.print_results()
    except Exception as e:
        # Top-level boundary: report any failure (missing folder, permission
        # problem, ...) as a one-line message instead of a traceback.
        print(f"错误: {e}")
# 带界面 — GUI (Tkinter) version of the checker above
import os
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
from typing import Set, List, Tuple
from datetime import datetime
from typing import Union


class DatasetCheckerGUI:
    """Tk front end that compares an image folder against a label folder.

    The user picks the two folders through directory dialogs.  Files are
    paired by name without extension; files that have no partner on the
    other side are written to a timestamped UTF-8 text report in the current
    working directory.
    """

    def __init__(self, root):
        """Configure the main window and build the widgets.

        :param root: the ``tk.Tk`` (or toplevel) window to populate
        """
        self.root = root
        self.root.title("数据集与标注文件比对工具")
        self.root.geometry("600x280")
        self.root.resizable(False, False)
        # Both stay empty until a folder is chosen in the matching dialog.
        self.img_dir = ""
        self.label_dir = ""
        self._init_ui()

    def _init_ui(self):
        """Create the two folder rows, the run button and the status line."""
        style = ttk.Style()
        style.configure("TButton", font=("微软雅黑", 10))
        style.configure("TLabel", font=("微软雅黑", 10))
        style.configure("TEntry", font=("微软雅黑", 10))

        # Row 1: image folder.
        img_frame = ttk.Frame(self.root, padding="10")
        img_frame.pack(fill=tk.X, padx=20, pady=15)
        ttk.Label(img_frame, text="图片文件夹:").pack(side=tk.LEFT, padx=5)
        self.img_entry = ttk.Entry(img_frame, width=40)
        self.img_entry.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
        ttk.Button(img_frame, text="选择", command=self._select_img_dir).pack(side=tk.LEFT, padx=5)

        # Row 2: label folder.
        label_frame = ttk.Frame(self.root, padding="10")
        label_frame.pack(fill=tk.X, padx=20, pady=5)
        ttk.Label(label_frame, text="标注文件夹:").pack(side=tk.LEFT, padx=5)
        self.label_entry = ttk.Entry(label_frame, width=40)
        self.label_entry.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
        ttk.Button(label_frame, text="选择", command=self._select_label_dir).pack(side=tk.LEFT, padx=5)

        # Run button; it stays disabled until both folders are chosen.
        # NOTE(review): "Accent.TButton" is not configured in this block;
        # presumably it is meant to come from an optional theme - confirm.
        btn_frame = ttk.Frame(self.root, padding="10")
        btn_frame.pack(pady=20)
        self.run_btn = ttk.Button(
            btn_frame,
            text="开始比对并保存结果",
            command=self._run_check,
            state=tk.DISABLED,
            style="Accent.TButton",
        )
        self.run_btn.pack(padx=5, pady=5, ipady=3)

        # Status line at the bottom of the window.
        self.status_var = tk.StringVar(value="请选择图片和标注文件夹")
        status_label = ttk.Label(
            self.root,
            textvariable=self.status_var,
            foreground="#666",
            font=("微软雅黑", 10),
        )
        status_label.pack(pady=10)

    def _select_img_dir(self):
        """Ask for the image folder and show the choice in its entry field."""
        dir_path = filedialog.askdirectory(title="选择图片文件夹")
        if dir_path:
            self.img_dir = dir_path
            self.img_entry.delete(0, tk.END)
            self.img_entry.insert(0, dir_path)
            self._check_btn_status()

    def _select_label_dir(self):
        """Ask for the label folder and show the choice in its entry field."""
        dir_path = filedialog.askdirectory(title="选择标注文件夹")
        if dir_path:
            self.label_dir = dir_path
            self.label_entry.delete(0, tk.END)
            self.label_entry.insert(0, dir_path)
            self._check_btn_status()

    def _check_btn_status(self):
        """Enable the run button only when both folders have been chosen."""
        if self.img_dir and self.label_dir:
            self.run_btn.config(state=tk.NORMAL)
            self.status_var.set("已选择文件夹,点击开始比对")
        else:
            self.run_btn.config(state=tk.DISABLED)
            self.status_var.set("请选择图片和标注文件夹")

    def _get_files(self, dir_path: str) -> List[str]:
        """Return the sorted names (with extension) of the files in *dir_path*.

        Sub-directories are skipped.  Sorting keeps the report order stable
        regardless of the order ``os.listdir`` returns.

        :raises FileNotFoundError: if *dir_path* does not exist
        """
        if not os.path.exists(dir_path):
            raise FileNotFoundError(f"文件夹不存在: {dir_path}")
        return sorted(
            f for f in os.listdir(dir_path)
            if os.path.isfile(os.path.join(dir_path, f))
        )

    def _get_basename(self, filename: str) -> str:
        """Return *filename* without its final extension."""
        return os.path.splitext(filename)[0]

    def _get_basenames(self, files: List[str]) -> Set[str]:
        """Return the set of extension-less names for *files*."""
        return {self._get_basename(f) for f in files}

    def _get_extra_files(self) -> Tuple[List[str], List[str]]:
        """List the files that have no partner in the other folder.

        Reads ``self.img_dir`` and ``self.label_dir`` afresh on every call.

        :return: ``(image files without a label, label files without an image)``,
            each as file names with extension
        :raises FileNotFoundError: if either folder does not exist
        """
        img_files = self._get_files(self.img_dir)
        label_files = self._get_files(self.label_dir)
        img_basenames = self._get_basenames(img_files)
        label_basenames = self._get_basenames(label_files)
        extra_imgs = [f for f in img_files if self._get_basename(f) not in label_basenames]
        extra_labels = [f for f in label_files if self._get_basename(f) not in img_basenames]
        return extra_imgs, extra_labels

    def _save_results_to_txt(self, extra_imgs: List[str], extra_labels: List[str]) -> Union[str, bool]:
        """Write the comparison report to a text file in the working directory.

        :param extra_imgs: image files that have no label
        :param extra_labels: label files that have no image
        :return: the path of the written report, or ``False`` when writing
            failed (an error dialog has already been shown in that case)
        """
        # One timestamp for both the file name and the header, so the two
        # cannot disagree when the clock ticks between them.
        now = datetime.now()
        timestamp = now.strftime("%Y%m%d_%H%M%S")
        save_path = os.path.join(os.getcwd(), f"数据集比对结果_{timestamp}.txt")
        try:
            with open(save_path, "w", encoding="utf-8") as f:
                f.write("=" * 50 + "\n")
                f.write(f"数据集比对结果 - 生成时间:{now.strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"图片文件夹路径:{self.img_dir}\n")
                f.write(f"标注文件夹路径:{self.label_dir}\n")
                f.write("=" * 50 + "\n\n")

                f.write(f"一、图片文件夹中多余的文件(无对应标注):共 {len(extra_imgs)} 个\n")
                if extra_imgs:
                    for i, file in enumerate(extra_imgs, 1):
                        f.write(f"{i:2d}. {file}\n")
                else:
                    f.write("无\n")

                f.write("\n" + "-" * 30 + "\n\n")

                f.write(f"二、标注文件夹中多余的文件(无对应图片):共 {len(extra_labels)} 个\n")
                if extra_labels:
                    for i, file in enumerate(extra_labels, 1):
                        f.write(f"{i:2d}. {file}\n")
                else:
                    f.write("无\n")

                f.write("\n" + "=" * 50 + "\n")
            return save_path
        except (OSError, UnicodeError) as e:
            # File-system failures and names that cannot be encoded as UTF-8.
            messagebox.showerror("保存失败", f"结果保存出错:{e}")
            return False

    def _run_check(self):
        """Run the comparison, save the report and tell the user the outcome."""
        self.status_var.set("正在比对...")
        # Repaint the status line without dispatching new input events, so a
        # second click cannot re-enter this handler while it is running.
        self.root.update_idletasks()
        try:
            extra_imgs, extra_labels = self._get_extra_files()
            save_path = self._save_results_to_txt(extra_imgs, extra_labels)
            if save_path:
                self.status_var.set("比对完成!结果已保存到当前目录")
                messagebox.showinfo(
                    "成功",
                    f"比对完成!\n\n统计结果:\n"
                    f"图片文件夹多余文件:{len(extra_imgs)} 个\n"
                    f"标注文件夹多余文件:{len(extra_labels)} 个\n\n"
                    f"结果文件路径:\n{save_path}",
                )
            else:
                self.status_var.set("比对完成,但保存失败")
        except Exception as e:
            # UI boundary: surface any failure in the status line and a dialog.
            error_msg = f"比对出错:{e}"
            self.status_var.set(error_msg)
            messagebox.showerror("错误", error_msg)


if __name__ == "__main__":
    root = tk.Tk()
    app = DatasetCheckerGUI(root)
    root.mainloop()
# 比对同一个文件夹内 JPG/PNG 和 XML 的不同 — GUI version that compares JPG/PNG images against XML labels inside one folder
import os
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
from typing import Set, List, Tuple
from datetime import datetime
from typing import Union


class DatasetCheckerGUI:
    """Tk tool that pairs images with XML labels inside a single folder.

    Files ending in ``.jpg``/``.jpeg``/``.png`` (case-insensitive) are treated
    as images and files ending in ``.xml`` as labels.  An image and a label
    form a pair when their names are equal without the extension.  Unpaired
    files are written to a timestamped UTF-8 text report inside the chosen
    folder.
    """

    def __init__(self, root):
        """Configure the main window and build the widgets.

        :param root: the ``tk.Tk`` (or toplevel) window to populate
        """
        self.root = root
        self.root.title("图片与XML标注文件名比对工具")
        self.root.geometry("600x250")
        self.root.resizable(False, False)
        # Stays empty until a folder is chosen in the dialog.
        self.target_dir = ""
        self._init_ui()

    def _init_ui(self):
        """Create the folder row, the run button and the status line."""
        style = ttk.Style()
        style.configure("TButton", font=("微软雅黑", 10))
        style.configure("TLabel", font=("微软雅黑", 10))
        style.configure("TEntry", font=("微软雅黑", 10))

        # Folder row.
        dir_frame = ttk.Frame(self.root, padding="10")
        dir_frame.pack(fill=tk.X, padx=20, pady=30)
        ttk.Label(dir_frame, text="目标文件夹:").pack(side=tk.LEFT, padx=5)
        self.dir_entry = ttk.Entry(dir_frame, width=45)
        self.dir_entry.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
        ttk.Button(dir_frame, text="选择", command=self._select_dir).pack(side=tk.LEFT, padx=5)

        # Run button; it stays disabled until a folder is chosen.
        # NOTE(review): "Accent.TButton" is not configured in this class;
        # presumably the optional theme loaded at start-up provides it - confirm.
        btn_frame = ttk.Frame(self.root, padding="10")
        btn_frame.pack(pady=10)
        self.run_btn = ttk.Button(
            btn_frame,
            text="开始比对并保存结果",
            command=self._run_check,
            state=tk.DISABLED,
            style="Accent.TButton",
        )
        self.run_btn.pack(padx=5, pady=5, ipady=3)

        # Status line at the bottom of the window.
        self.status_var = tk.StringVar(value="请选择包含图片和XML的文件夹")
        status_label = ttk.Label(
            self.root,
            textvariable=self.status_var,
            foreground="#666",
            font=("微软雅黑", 10),
        )
        status_label.pack(pady=10)

    def _select_dir(self):
        """Ask for the folder that holds both the images and the XML files.

        Cancelling the dialog leaves the window as it was: a folder chosen
        earlier stays selected and the run button stays enabled.  Before any
        folder has been chosen the button is still disabled and the prompt
        is still shown, so nothing needs resetting.
        """
        dir_path = filedialog.askdirectory(title="选择包含图片和XML标注的文件夹")
        if not dir_path:
            return
        self.target_dir = dir_path
        self.dir_entry.delete(0, tk.END)
        self.dir_entry.insert(0, dir_path)
        self.run_btn.config(state=tk.NORMAL)
        self.status_var.set("已选择文件夹,点击开始比对(仅检查JPG/PNG与XML)")

    def _get_files_by_ext(self, dir_path: str) -> Tuple[List[str], List[str]]:
        """Split the files directly inside *dir_path* into images and XML labels.

        Sub-directories and files with any other extension are ignored.  The
        extension test is case-insensitive.  Both lists are sorted so the
        report order does not depend on the order ``os.listdir`` returns.

        :return: ``(image file names, XML file names)``
        :raises FileNotFoundError: if *dir_path* does not exist
        """
        if not os.path.exists(dir_path):
            raise FileNotFoundError(f"文件夹不存在: {dir_path}")
        img_files = []
        xml_files = []
        for filename in sorted(os.listdir(dir_path)):
            if not os.path.isfile(os.path.join(dir_path, filename)):
                continue
            ext = os.path.splitext(filename)[1].lower()
            if ext in (".jpg", ".jpeg", ".png"):
                img_files.append(filename)
            elif ext == ".xml":
                xml_files.append(filename)
        return img_files, xml_files

    def _check_matching(self, img_files: List[str], xml_files: List[str]) -> Tuple[List[str], List[str]]:
        """Find files whose extension-less name has no partner on the other side.

        The comparison of names is case-sensitive; input order is preserved.

        :return: ``(images without an XML file, XML files without an image)``
        """
        img_basenames = {os.path.splitext(f)[0] for f in img_files}
        xml_basenames = {os.path.splitext(f)[0] for f in xml_files}
        img_without_xml = [f for f in img_files if os.path.splitext(f)[0] not in xml_basenames]
        xml_without_img = [f for f in xml_files if os.path.splitext(f)[0] not in img_basenames]
        return img_without_xml, xml_without_img

    def _save_results_to_txt(self, img_without_xml: List[str], xml_without_img: List[str]) -> Union[str, bool]:
        """Write the comparison report to a text file inside ``self.target_dir``.

        :param img_without_xml: image files that have no XML label
        :param xml_without_img: XML labels that have no image
        :return: the path of the written report, or ``False`` when writing
            failed (an error dialog has already been shown in that case)
        """
        # One timestamp for both the file name and the header, so the two
        # cannot disagree when the clock ticks between them.
        now = datetime.now()
        timestamp = now.strftime("%Y%m%d_%H%M%S")
        save_path = os.path.join(self.target_dir, f"文件名比对结果_{timestamp}.txt")
        try:
            with open(save_path, "w", encoding="utf-8") as f:
                f.write("=" * 60 + "\n")
                f.write("图片与XML标注文件名比对结果\n")
                f.write(f"生成时间:{now.strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"目标文件夹:{self.target_dir}\n")
                f.write("图片格式:JPG/JPEG/PNG | 标注格式:XML\n")
                f.write("=" * 60 + "\n\n")

                f.write(f"一、无对应XML标注的图片文件:共 {len(img_without_xml)} 个\n")
                if img_without_xml:
                    for i, file in enumerate(img_without_xml, 1):
                        f.write(f"{i:2d}. {file}\n")
                else:
                    f.write("无\n")

                f.write("\n" + "-" * 40 + "\n\n")

                f.write(f"二、无对应图片的XML标注文件:共 {len(xml_without_img)} 个\n")
                if xml_without_img:
                    for i, file in enumerate(xml_without_img, 1):
                        f.write(f"{i:2d}. {file}\n")
                else:
                    f.write("无\n")

                f.write("\n" + "=" * 60 + "\n")
            return save_path
        except (OSError, UnicodeError) as e:
            # File-system failures and names that cannot be encoded as UTF-8.
            messagebox.showerror("保存失败", f"结果保存出错:{e}")
            return False

    def _run_check(self):
        """Run the comparison, save the report and tell the user the outcome."""
        self.status_var.set("正在比对...")
        # Repaint the status line without dispatching new input events, so a
        # second click cannot re-enter this handler while it is running.
        self.root.update_idletasks()
        try:
            img_files, xml_files = self._get_files_by_ext(self.target_dir)
            if not img_files and not xml_files:
                messagebox.showwarning("无文件", "文件夹中未找到JPG/PNG图片或XML标注文件")
                self.status_var.set("比对完成:未找到目标文件")
                return

            img_without_xml, xml_without_img = self._check_matching(img_files, xml_files)
            save_path = self._save_results_to_txt(img_without_xml, xml_without_img)
            if save_path:
                self.status_var.set("比对完成!结果已保存到目标文件夹")
                msg = (
                    f"比对完成!\n\n"
                    f"统计结果:\n"
                    f"文件夹中图片总数(JPG/PNG):{len(img_files)} 个\n"
                    f"文件夹中XML标注总数:{len(xml_files)} 个\n"
                    f"无对应XML的图片:{len(img_without_xml)} 个\n"
                    f"无对应图片的XML:{len(xml_without_img)} 个\n\n"
                    f"结果文件已保存到:\n{save_path}"
                )
                messagebox.showinfo("成功", msg)
            else:
                # Without this the status line would stay on "正在比对..."
                # forever after a failed save.
                self.status_var.set("比对完成,但保存失败")
        except Exception as e:
            # UI boundary: surface any failure in the status line and a dialog.
            error_msg = f"比对出错:{e}"
            self.status_var.set(error_msg)
            messagebox.showerror("错误", error_msg)


if __name__ == "__main__":
    root = tk.Tk()
    try:
        # Optional theme: only applied when azure.tcl can be sourced from the
        # working directory.
        root.tk.call("source", "azure.tcl")
        root.tk.call("set_theme", "light")
    except tk.TclError:
        # Theme file missing or unusable - keep the default ttk look.
        pass
    app = DatasetCheckerGUI(root)
    root.mainloop()