
Preprocessing, previewing, and saving the VG150 groundtruth data used for scene graph generation


  • File downloads
  • Preprocessing script 1 (preview the first three images)
  • Preprocessing script 2 (export all of the data)
  • Baidu Netdisk share of all files used in this post

File downloads:

I first downloaded these two files from the KaihuaTang/Scene-Graph-Benchmark.pytorch repository's Public datasets/vg page:

VG-SGG-dicts-with-attri.json
image_data.json

Then, following Scene-Graph-Benchmark.pytorch/DATASET.md, I downloaded the data file VG-SGG-with-attri.h5.

Finally, I placed all of these files in the same folder.

Preprocessing script 1 (preview the first three images):

The script that prints the groundtruth of the first three images:

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import json
import sys
from typing import Any, Dict, List

try:
    import h5py
except ImportError:
    print("Missing dependency: h5py. Install it first: pip install h5py")
    sys.exit(1)


def load_json(path: str) -> Any:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def coerce_int_key_map(m: Dict[Any, Any]) -> Dict[int, str]:
    res: Dict[int, str] = {}
    for k, v in m.items():
        try:
            kk = int(k)
        except Exception:
            continue
        res[kk] = str(v)
    return res


def preview_image_full(
    f: h5py.File,
    image_data: List[Dict[str, Any]],
    idx2label_maps: Dict[str, Dict[int, str]],
    img_index: int,
    limit: int,
) -> Dict[str, Any]:
    """Build a dict holding the full groundtruth of one image."""
    # Basic info
    img_info = image_data[img_index] if img_index < len(image_data) else None
    img_id = img_info.get("image_id") if img_info else None
    # Box range for this image
    fb = int(f["img_to_first_box"][img_index])
    lb = int(f["img_to_last_box"][img_index])
    boxes = f["boxes_512"][fb : lb + 1].tolist() if "boxes_512" in f else []
    labels = f["labels"][fb : lb + 1].flatten().tolist()
    idx2obj = idx2label_maps.get("idx_to_label", {})
    decoded_labels = [idx2obj.get(int(x), f"<unk:{x}>") for x in labels]
    # Attributes
    attributes_rows = f["attributes"][fb : lb + 1].tolist() if "attributes" in f else []
    idx2attr = idx2label_maps.get("idx_to_attribute", {})
    decoded_attrs_rows = []
    for row in attributes_rows:
        decoded = [idx2attr.get(int(x), f"<unk:{x}>") for x in row if isinstance(x, int) and x > 0]
        decoded_attrs_rows.append(decoded)
    # all_boxes output
    all_boxes = []
    num_with_attr = 0
    for i in range(len(labels)):
        attrs = decoded_attrs_rows[i] if i < len(decoded_attrs_rows) else []
        if attrs:
            num_with_attr += 1
        all_boxes.append({
            "box": boxes[i] if i < len(boxes) else None,
            "label_idx": labels[i],
            "label": decoded_labels[i],
            "attributes": attrs,
        })
    # Relations
    fr = int(f["img_to_first_rel"][img_index])
    lr = int(f["img_to_last_rel"][img_index])
    rel_pairs = f["relationships"][fr : lr + 1].tolist() if "relationships" in f else []
    rel_labels = f["predicates"][fr : lr + 1].flatten().tolist() if "predicates" in f else []
    idx2pred = idx2label_maps.get("idx_to_predicate", {})
    decoded_rels = [idx2pred.get(int(x), f"<unk:{x}>") for x in rel_labels]
    # Decode relation triplets
    triplets = []
    for i in range(len(rel_pairs)):
        subj_global, obj_global = rel_pairs[i]
        subj_local = subj_global - fb
        obj_local = obj_global - fb
        subj_name = decoded_labels[subj_local] if 0 <= subj_local < len(decoded_labels) else f"<unk:{subj_global}>"
        obj_name = decoded_labels[obj_local] if 0 <= obj_local < len(decoded_labels) else f"<unk:{obj_global}>"
        pred_name = decoded_rels[i]
        triplets.append([subj_name, pred_name, obj_name])
    # De-duplicate while keeping first-occurrence order
    triplets_unique = []
    seen = set()
    for t in triplets:
        key = tuple(t)
        if key not in seen:
            seen.add(key)
            triplets_unique.append(t)
    out: Dict[str, Any] = {
        "image_index": img_index,
        "image_id": img_id,
        "image_info": {
            "width": img_info.get("width") if img_info else None,
            "height": img_info.get("height") if img_info else None,
            "url": img_info.get("url") if img_info else None,
        },
        "num_boxes": len(labels),
        "num_boxes_with_attr": num_with_attr,
        "all_boxes": all_boxes[:limit],  # only show the first `limit` boxes
        "num_relations": len(rel_labels),
        "triplets_raw": triplets[:limit],  # raw triplets (may contain duplicates)
        "triplets_unique": triplets_unique[:limit],  # de-duplicated triplets
    }
    return out


def main():
    parser = argparse.ArgumentParser(description="Structured VG150 groundtruth preview (enhanced)")
    parser.add_argument("--image_data", type=str, default="image_data.json")
    parser.add_argument("--dicts", type=str, default="VG-SGG-dicts-with-attri.json")
    parser.add_argument("--h5", type=str, default="VG-SGG-with-attri.h5")
    parser.add_argument("--num_images", type=int, default=3)
    parser.add_argument("--limit", type=int, default=5)
    args = parser.parse_args()
    image_data = load_json(args.image_data)
    dicts_data = load_json(args.dicts)
    idx2label_maps: Dict[str, Dict[int, str]] = {}
    for key in ["idx_to_label", "idx_to_predicate", "idx_to_attribute"]:
        if key in dicts_data:
            idx2label_maps[key] = coerce_int_key_map(dicts_data[key])
    with h5py.File(args.h5, "r") as f:
        from pprint import pprint
        for i in range(args.num_images):
            out = preview_image_full(f, image_data, idx2label_maps, img_index=i, limit=args.limit)
            print("\n==============================")
            print(f"## Full groundtruth for image index {i}")
            print("==============================")
            pprint(out, width=120)


if __name__ == "__main__":
    main()
```

I first ran this preview script; the output for the first three images is:

==============================
## Full groundtruth for image index 0
==============================
{'all_boxes': [{'attributes': [], 'box': [256, 178, 512, 357], 'label': 'tree', 'label_idx': 136},
               {'attributes': ['brick'], 'box': [280, 289, 463, 186], 'label': 'sidewalk', 'label_idx': 114},
               {'attributes': ['brick', 'tall'], 'box': [71, 172, 143, 345], 'label': 'building', 'label_idx': 22},
               {'attributes': ['clean'], 'box': [395, 263, 230, 166], 'label': 'street', 'label_idx': 124},
               {'attributes': ['green', 'tall'], 'box': [294, 155, 50, 233], 'label': 'clock', 'label_idx': 30}],
 'image_id': 1,
 'image_index': 0,
 'image_info': {'height': 600, 'url': 'https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg', 'width': 800},
 'num_boxes': 23,
 'num_boxes_with_attr': 15,
 'num_relations': 29,
 'triplets_raw': [['man', 'wears', 'sneaker'],
                  ['sign', 'on', 'building'],
                  ['man', 'has', 'shirt'],
                  ['sidewalk', 'near', 'street'],
                  ['man', 'has', 'glass']],
 'triplets_unique': [['man', 'wears', 'sneaker'],
                     ['sign', 'on', 'building'],
                     ['man', 'has', 'shirt'],
                     ['sidewalk', 'near', 'street'],
                     ['man', 'has', 'glass']]}

==============================
## Full groundtruth for image index 1
==============================
{'all_boxes': [{'attributes': ['brick', 'white'], 'box': [357, 302, 306, 162], 'label': 'sidewalk', 'label_idx': 114},
               {'attributes': ['orange', 'brown', 'tall'], 'box': [436, 132, 146, 265], 'label': 'building', 'label_idx': 22},
               {'attributes': ['red', 'brown'], 'box': [191, 102, 166, 205], 'label': 'building', 'label_idx': 22},
               {'attributes': ['white', 'walking'], 'box': [249, 286, 91, 158], 'label': 'man', 'label_idx': 78},
               {'attributes': [], 'box': [286, 171, 32, 342], 'label': 'pole', 'label_idx': 99}],
 'image_id': 2,
 'image_index': 1,
 'image_info': {'height': 600, 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/2.jpg', 'width': 800},
 'num_boxes': 16,
 'num_boxes_with_attr': 13,
 'num_relations': 6,
 'triplets_raw': [['building', 'has', 'window'],
                  ['building', 'has', 'window'],
                  ['building', 'has', 'window'],
                  ['building', 'has', 'window'],
                  ['building', 'has', 'window']],
 'triplets_unique': [['building', 'has', 'window'], ['bike', 'near', 'car']]}

==============================
## Full groundtruth for image index 2
==============================
{'all_boxes': [{'attributes': ['white', 'curved'], 'box': [306, 282, 406, 196], 'label': 'table', 'label_idx': 126},
               {'attributes': ['sitting'], 'box': [464, 130, 91, 111], 'label': 'girl', 'label_idx': 53},
               {'attributes': ['black', 'leather'], 'box': [484, 270, 53, 104], 'label': 'bag', 'label_idx': 4},
               {'attributes': ['long'], 'box': [488, 121, 45, 88], 'label': 'hair', 'label_idx': 57},
               {'attributes': [], 'box': [53, 355, 104, 35], 'label': 'drawer', 'label_idx': 39}],
 'image_id': 3,
 'image_index': 2,
 'image_info': {'height': 480, 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/3.jpg', 'width': 640},
 'num_boxes': 8,
 'num_boxes_with_attr': 4,
 'num_relations': 6,
 'triplets_raw': [['girl', 'has', 'hair'],
                  ['bag', 'on', 'table'],
                  ['girl', 'has', 'hair'],
                  ['girl', 'with', 'hair'],
                  ['girl', 'with', 'hair']],
 'triplets_unique': [['girl', 'has', 'hair'],
                     ['bag', 'on', 'table'],
                     ['girl', 'with', 'hair'],
                     ['drawer', 'has', 'handle']]}

These three samples give an initial feel for the shape of the data.
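Note that the box coordinates in the output are not original image pixels. Judging by the dataset name `boxes_512` and by the loader in Scene-Graph-Benchmark.pytorch, each entry appears to be (cx, cy, w, h) scaled so that the longer image side equals 512; this is an assumption, not something stated in this post. Under that assumption, converting back to (x1, y1, x2, y2) in original image coordinates can be sketched as:

```python
def box512_to_xyxy(box, img_w, img_h, box_scale=512):
    """Convert an assumed (cx, cy, w, h) boxes_512 entry back to clipped
    (x1, y1, x2, y2) in the original image's pixel coordinates."""
    # Undo the scaling that made max(img_w, img_h) equal box_scale.
    cx, cy, w, h = (v * max(img_w, img_h) / box_scale for v in box)
    x1 = max(cx - w / 2, 0.0)
    y1 = max(cy - h / 2, 0.0)
    x2 = min(cx + w / 2, float(img_w))
    y2 = min(cy + h / 2, float(img_h))
    return [round(v, 1) for v in (x1, y1, x2, y2)]

# The 'tree' box of image index 0 (an 800x600 image):
print(box512_to_xyxy([256, 178, 512, 357], 800, 600))  # → [0.0, 0.0, 800.0, 557.0]
```

Decoding to roughly the full image width is at least plausible for a scene-level "tree" region, but treat this conversion as a sketch until verified against the benchmark's own loader.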

Preprocessing script 2 (export all of the data):


```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
VG150 groundtruth structured export tool (JSON Lines version v2)
- Iterates over every image in the dataset
- Saves each image's information as a single-line JSON object in a .jsonl file
- Uses a more compact data structure that is easier to review
- New: counts and reports images with missing URLs
"""
import argparse
import json
import sys
from typing import Any, Dict, List
from urllib.parse import urlparse
import os

try:
    import h5py
except ImportError:
    print("Error: missing dependency h5py. Install it first: pip install h5py")
    sys.exit(1)

try:
    from tqdm import tqdm
except ImportError:
    print("Note: optional dependency tqdm is missing. Install it for a progress bar: pip install tqdm")
    def tqdm(iterable, **kwargs):
        return iterable


def load_json(path: str) -> Any:
    """Load a JSON file from the given path."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def coerce_int_key_map(m: Dict[Any, Any]) -> Dict[int, str]:
    """Coerce the dict's keys to integers."""
    res: Dict[int, str] = {}
    for k, v in m.items():
        try:
            kk = int(k)
        except (ValueError, TypeError):
            continue
        res[kk] = str(v)
    return res


def process_image_groundtruth(
    f: h5py.File,
    image_data: List[Dict[str, Any]],
    idx2label_maps: Dict[str, Dict[int, str]],
    img_index: int,
) -> Dict[str, Any]:
    """Build a dict holding the full groundtruth of one image (same as the previous version)."""
    img_info = image_data[img_index] if img_index < len(image_data) else {}
    fb = int(f["img_to_first_box"][img_index])
    lb = int(f["img_to_last_box"][img_index])
    boxes = f["boxes_512"][fb : lb + 1].tolist()
    labels = f["labels"][fb : lb + 1].flatten().tolist()
    idx2obj = idx2label_maps.get("idx_to_label", {})
    decoded_labels = [idx2obj.get(x, f"<unk:{x}>") for x in labels]
    attributes_rows = f["attributes"][fb : lb + 1].tolist()
    idx2attr = idx2label_maps.get("idx_to_attribute", {})
    decoded_attrs_rows = [
        [idx2attr.get(x, f"<unk:{x}>") for x in row if x > 0]
        for row in attributes_rows
    ]
    all_boxes = []
    for i in range(len(labels)):
        all_boxes.append({
            "label": decoded_labels[i],
            "attributes": decoded_attrs_rows[i],
            "box": boxes[i],
        })
    fr = int(f["img_to_first_rel"][img_index])
    lr = int(f["img_to_last_rel"][img_index])
    rel_pairs = f["relationships"][fr : lr + 1].tolist()
    rel_labels = f["predicates"][fr : lr + 1].flatten().tolist()
    idx2pred = idx2label_maps.get("idx_to_predicate", {})
    decoded_rels = [idx2pred.get(x, f"<unk:{x}>") for x in rel_labels]
    triplets = []
    for i in range(len(rel_pairs)):
        subj_global, obj_global = rel_pairs[i]
        subj_local = subj_global - fb
        obj_local = obj_global - fb
        subj_name = decoded_labels[subj_local] if 0 <= subj_local < len(decoded_labels) else "<out_of_bounds>"
        obj_name = decoded_labels[obj_local] if 0 <= obj_local < len(decoded_labels) else "<out_of_bounds>"
        pred_name = decoded_rels[i]
        triplets.append([subj_name, pred_name, obj_name])
    triplets_unique_set = set(tuple(t) for t in triplets)
    triplets_unique = [list(t) for t in triplets_unique_set]
    return {
        "image_info": img_info,
        "objects": all_boxes,
        "groundtruth_triplets": triplets_unique,
    }


def main():
    parser = argparse.ArgumentParser(description="Export VG150 groundtruth as JSON Lines (with statistics)")
    parser.add_argument("--image_data", type=str, default="image_data.json", help="path to the image metadata JSON file")
    parser.add_argument("--dicts", type=str, default="VG-SGG-dicts-with-attri.json", help="path to the dictionary JSON file")
    parser.add_argument("--h5", type=str, default="VG-SGG-with-attri.h5", help="path to the dataset HDF5 file")
    parser.add_argument("--output_file", type=str, default="vg150_groundtruth.jsonl", help="path of the output JSON Lines file")
    args = parser.parse_args()
    print("Loading metadata...")
    image_data = load_json(args.image_data)
    dicts_data = load_json(args.dicts)
    idx2label_maps: Dict[str, Dict[int, str]] = {
        key: coerce_int_key_map(dicts_data[key])
        for key in ["idx_to_label", "idx_to_predicate", "idx_to_attribute"]
        if key in dicts_data
    }
    # === New: initialize statistics ===
    missing_url_count = 0
    images_with_missing_urls = []
    print(f"Opening HDF5 file: {args.h5}")
    with h5py.File(args.h5, "r") as f, open(args.output_file, "w", encoding="utf-8") as out_f:
        num_images = f["img_to_first_box"].shape[0]
        print(f"Found {num_images} images; processing and writing to {args.output_file}...")
        for i in tqdm(range(num_images), desc="Processing images"):
            gt_data = process_image_groundtruth(f, image_data, idx2label_maps, img_index=i)
            url = gt_data["image_info"].get("url")  # .get() is safer here
            # === New: statistics logic ===
            if not url:
                missing_url_count += 1
                images_with_missing_urls.append({
                    "index": i,
                    "image_id": gt_data["image_info"].get("image_id"),
                })
            # (this part is unchanged)
            image_filename = os.path.basename(urlparse(url).path) if url else f"{gt_data['image_info'].get('image_id', i)}.jpg"
            final_line_object = {
                "image_id": image_filename,
                "groundtruth_triplets": gt_data["groundtruth_triplets"],
                "objects": gt_data["objects"],
                "image_info": {
                    "width": gt_data["image_info"].get("width"),
                    "height": gt_data["image_info"].get("height"),
                    "url": url,
                },
            }
            out_f.write(json.dumps(final_line_object, ensure_ascii=False) + "\n")
    print(f"\nDone! Data saved to: {args.output_file}")
    # === New: print the final statistics report ===
    print("\n--- Data integrity report ---")
    if missing_url_count == 0:
        print("✅ Every image has a valid URL; the data is complete!")
    else:
        print(f"⚠️ Found {missing_url_count} images with no URL.")
        print("   For these images, a fallback filename was generated from the numeric image_id (e.g. '123.jpg').")
        print(f"   Details of the first {min(10, missing_url_count)} images with missing URLs:")
        for item in images_with_missing_urls[:10]:
            print(f"     - image index: {item['index']}, numeric ID: {item['image_id']}")
    print("--------------------------")


if __name__ == "__main__":
    main()
```
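One caveat about script 2: it de-duplicates triplets with `set(tuple(t) for t in triplets)`, which discards the first-occurrence order, so the triplet order written to the .jsonl may differ from the raw order in the HDF5 file (and between Python runs). If a stable order matters downstream, an order-preserving variant, which is what script 1 does, is a small change:

```python
def dedup_preserve_order(triplets):
    """De-duplicate [subj, pred, obj] triplets while keeping first-occurrence order."""
    seen = set()
    out = []
    for t in triplets:
        key = tuple(t)
        if key not in seen:
            seen.add(key)
            out.append(list(t))
    return out

raw = [["girl", "has", "hair"], ["bag", "on", "table"],
       ["girl", "has", "hair"], ["girl", "with", "hair"]]
print(dedup_preserve_order(raw))
# → [['girl', 'has', 'hair'], ['bag', 'on', 'table'], ['girl', 'with', 'hair']]
```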

Script 2 preprocesses the entire dataset and saves the result to vg150_groundtruth.jsonl:

Loading metadata...
Opening HDF5 file: VG-SGG-with-attri.h5
Found 108073 images; processing and writing to vg150_groundtruth.jsonl...
Processing images: 100%|██████████| 108073/108073 [02:35<00:00, 695.98it/s]

Done! Data saved to: vg150_groundtruth.jsonl

--- Data integrity report ---
✅ Every image has a valid URL; the data is complete!

The first two lines of the generated vg150_groundtruth.jsonl look like this:

{"image_id": "1.jpg", "groundtruth_triplets": [["sign", "on", "building"], ["man", "has", "shirt"], ["man", "wearing", "glass"], ["bike", "behind", "man"], ["bike", "on", "sidewalk"], ["man", "has", "shoe"], ["man", "wears", "sneaker"], ["man", "in", "shirt"], ["sidewalk", "near", "street"], ["man", "wears", "pant"], ["tree", "near", "street"], ["tree", "near", "sidewalk"], ["man", "wearing", "pant"], ["man", "has", "glass"], ["man", "wearing", "shirt"], ["bike", "near", "tree"], ["man", "wearing", "shoe"], ["building", "with", "window"], ["shirt", "on", "man"], ["bike", "parked on", "sidewalk"], ["car", "parked on", "street"], ["man", "has", "pant"]], "objects": [{"label": "tree", "attributes": [], "box": [256, 178, 512, 357]}, {"label": "sidewalk", "attributes": ["brick"], "box": [280, 289, 463, 186]}, {"label": "building", "attributes": ["brick", "tall"], "box": [71, 172, 143, 345]}, {"label": "street", "attributes": ["clean"], "box": [395, 263, 230, 166]}, {"label": "clock", "attributes": ["green", "tall"], "box": [294, 155, 50, 233]}, {"label": "window", "attributes": [], "box": [447, 47, 127, 95]}, {"label": "man", "attributes": [], "box": [260, 248, 49, 160]}, {"label": "man", "attributes": [], "box": [170, 242, 38, 168]}, {"label": "sign", "attributes": ["black"], "box": [103, 64, 50, 115]}, {"label": "car", "attributes": ["white", "parked"], "box": [485, 270, 52, 105]}, {"label": "shirt", "attributes": ["grey"], "box": [260, 220, 53, 64]}, {"label": "car", "attributes": [], "box": [330, 233, 50, 61]}, {"label": "pant", "attributes": ["grey"], "box": [262, 276, 31, 82]}, {"label": "shirt", "attributes": ["orange", "red"], "box": [170, 216, 35, 66]}, {"label": "pant", "attributes": ["black"], "box": [170, 283, 29, 76]}, {"label": "shoe", "attributes": ["brown"], "box": [262, 318, 31, 18]}, {"label": "arm", "attributes": ["raised"], "box": [246, 194, 20, 27]}, {"label": "bike", "attributes": ["parked"], "box": [224, 215, 18, 24]}, {"label": "bike", 
"attributes": [], "box": [213, 211, 18, 27]}, {"label": "glass", "attributes": [], "box": [300, 209, 28, 15]}, {"label": "street", "attributes": ["brick"], "box": [276, 293, 457, 168]}, {"label": "sneaker", "attributes": ["grey"], "box": [171, 320, 34, 17]}, {"label": "bike", "attributes": [], "box": [217, 214, 26, 23]}], "image_info": {"width": 800, "height": 600, "url": "https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg"}}
{"image_id": "2.jpg", "groundtruth_triplets": [["building", "has", "window"], ["bike", "near", "car"]], "objects": [{"label": "sidewalk", "attributes": ["brick", "white"], "box": [357, 302, 306, 162]}, {"label": "building", "attributes": ["orange", "brown", "tall"], "box": [436, 132, 146, 265]}, {"label": "building", "attributes": ["red", "brown"], "box": [191, 102, 166, 205]}, {"label": "man", "attributes": ["white", "walking"], "box": [249, 286, 91, 158]}, {"label": "pole", "attributes": [], "box": [286, 171, 32, 342]}, {"label": "window", "attributes": ["glass"], "box": [444, 95, 61, 106]}, {"label": "car", "attributes": ["parked", "red"], "box": [193, 260, 82, 70]}, {"label": "tree", "attributes": ["green"], "box": [28, 162, 57, 141]}, {"label": "tree", "attributes": ["green"], "box": [98, 176, 68, 104]}, {"label": "tree", "attributes": ["green"], "box": [56, 164, 40, 132]}, {"label": "window", "attributes": ["glass"], "box": [442, 202, 63, 71]}, {"label": "window", "attributes": ["glass"], "box": [493, 96, 34, 104]}, {"label": "car", "attributes": ["white"], "box": [249, 235, 52, 40]}, {"label": "window", "attributes": ["glass"], "box": [495, 202, 32, 69]}, {"label": "window", "attributes": [], "box": [409, 100, 12, 109]}, {"label": "bike", "attributes": [], "box": [281, 278, 31, 41]}], "image_info": {"width": 800, "height": 600, "url": "https://cs.stanford.edu/people/rak248/VG_100K/2.jpg"}}
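A .jsonl file like the one above can be consumed one record at a time, without loading everything into memory. A minimal reader sketch (the demo record is abridged from the real output shown above):

```python
import json
import os
import tempfile

def iter_jsonl(path):
    """Yield one parsed record per non-empty line of a JSON Lines file."""
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

# Demo with a tiny stand-in file; the field layout mirrors vg150_groundtruth.jsonl.
path = os.path.join(tempfile.gettempdir(), "vg150_demo.jsonl")
with open(path, "w", encoding="utf-8") as f:
    f.write(json.dumps({"image_id": "2.jpg",
                        "groundtruth_triplets": [["building", "has", "window"]]}) + "\n")

for rec in iter_jsonl(path):
    print(rec["image_id"], len(rec["groundtruth_triplets"]))  # → 2.jpg 1
```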

Note that the image_id filename, e.g. "image_id": "1.jpg", is derived from the record's "url" field, here "https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg".
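That filename derivation in script 2 boils down to taking the basename of the URL's path component:

```python
import os
from urllib.parse import urlparse

url = "https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg"
# urlparse(url).path strips scheme, host, and any query string first.
filename = os.path.basename(urlparse(url).path)
print(filename)  # → 1.jpg
```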

Baidu Netdisk share of all files used in this post:

Files shared via Baidu Netdisk: image_data.json and 3 other files
Link: https://pan.baidu.com/s/1LzAUWZDeXTi_Xmk6hqvQRg  Extraction code: u3g5

