import os
import shutil

# Master class-ID remapping: {dataset_name: {original_id: master_id}}.
# Each source dataset uses its own YOLO class indices; these tables
# translate them into one shared ("master") index space.
CONFIG = {
    "suharjito_dataset": {"0": "0", "1": "1", "2": "2", "3": "3", "4": "4", "5": "5"},
    "darren_dataset": {"0": "4", "1": "5", "2": "3"},
    "fy_dataset": {"0": "1", "1": "3", "2": "5"},
    "nazwa_dataset": {"0": "2", "1": "0", "2": "5", "3": "3", "4": "1", "5": "4"},
}

BASE_DIR = "datasets"           # root directory holding the per-source datasets
OUTPUT_DIR = "unified_dataset"  # merged YOLO dataset is written here

# Image extensions probed (in order) when locating the picture that
# pairs with a label file.
IMG_EXTS = ('.jpg', '.jpeg', '.png', '.JPG')

# Prepare output directory tree (idempotent thanks to exist_ok).
for split in ('train', 'val'):
    os.makedirs(f"{OUTPUT_DIR}/images/{split}", exist_ok=True)
    os.makedirs(f"{OUTPUT_DIR}/labels/{split}", exist_ok=True)


def _remap_label_lines(path, mapping):
    """Read one YOLO label file and return its lines with class IDs remapped.

    Lines whose class ID has no entry in ``mapping`` are dropped, and blank
    lines are skipped (the previous version raised IndexError on them).
    Returns a list of remapped label lines (possibly empty).
    """
    new_lines = []
    with open(path, 'r') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Guard against blank/whitespace-only lines; parts[0]
                # would otherwise raise IndexError.
                continue
            if parts[0] in mapping:
                parts[0] = mapping[parts[0]]
                new_lines.append(" ".join(parts))
    return new_lines


def _copy_matching_image(img_dir, label_file, dest_dir, prefix):
    """Copy the image that pairs with ``label_file`` into ``dest_dir``.

    Tries each extension in IMG_EXTS; the first existing file wins. The
    copy is renamed with the ``prefix`` (dataset name) to avoid filename
    collisions across datasets. Returns True if an image was copied.
    """
    stem = label_file[:-len(".txt")]  # strip only the suffix, not interior '.txt'
    for ext in IMG_EXTS:
        img_name = stem + ext
        src_img = os.path.join(img_dir, img_name)
        if os.path.exists(src_img):
            shutil.copy(src_img, os.path.join(dest_dir, f"{prefix}_{img_name}"))
            return True
    return False  # no matching image found; the label is still kept


def process_and_merge():
    """Merge every dataset listed in CONFIG into one unified YOLO dataset.

    For each dataset: re-index label class IDs through its mapping table,
    prefix output file names with the dataset name (collision avoidance),
    and fold the 'valid' and 'test' source splits into the single 'val'
    target split. Labels with no usable (mapped) lines are skipped.
    """
    # Roboflow exports use 'valid'; Suharjito may ship 'valid' or 'test'.
    # Test images are added to validation for a more robust PoC.
    splits = {'train': 'train', 'valid': 'val', 'test': 'val'}

    for ds_name, mapping in CONFIG.items():
        print(f"Processing {ds_name}...")
        for src_split, target_split in splits.items():
            img_path = os.path.join(BASE_DIR, ds_name, src_split, "images")
            lbl_path = os.path.join(BASE_DIR, ds_name, src_split, "labels")
            if not os.path.exists(lbl_path):
                continue  # this dataset simply lacks that split

            for label_file in os.listdir(lbl_path):
                if not label_file.endswith(".txt"):
                    continue

                # 1. Re-index labels into the master class space.
                new_lines = _remap_label_lines(
                    os.path.join(lbl_path, label_file), mapping)
                if not new_lines:
                    continue  # nothing usable in this label file

                # 2. Save the label under a dataset-prefixed name.
                new_name = f"{ds_name}_{label_file}"
                out_lbl = os.path.join(OUTPUT_DIR, "labels", target_split, new_name)
                with open(out_lbl, 'w') as f:
                    f.write("\n".join(new_lines))

                # 3. Copy the matching image alongside it.
                _copy_matching_image(
                    img_path, label_file,
                    os.path.join(OUTPUT_DIR, "images", target_split), ds_name)


process_and_merge()
print("Successfully created unified_dataset!")