unify.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. import os
  2. import shutil
  3. # Master Configuration: Maps {Original_ID: Master_ID}
  4. CONFIG = {
  5. "suharjito_dataset": {"0": "0", "1": "1", "2": "2", "3": "3", "4": "4", "5": "5"},
  6. "darren_dataset": {"0": "4", "1": "5", "2": "3"},
  7. "fy_dataset": {"0": "1", "1": "3", "2": "5"},
  8. "nazwa_dataset": {"0": "2", "1": "0", "2": "5", "3": "3", "4": "1", "5": "4"}
  9. }
  10. BASE_DIR = "datasets"
  11. OUTPUT_DIR = "unified_dataset"
  12. # Prepare Directories
  13. for split in ['train', 'val']:
  14. os.makedirs(f"{OUTPUT_DIR}/images/{split}", exist_ok=True)
  15. os.makedirs(f"{OUTPUT_DIR}/labels/{split}", exist_ok=True)
  16. def process_and_merge():
  17. for ds_name, mapping in CONFIG.items():
  18. print(f"Processing {ds_name}...")
  19. # Roboflow uses 'valid', Suharjito might use 'valid' or 'test'
  20. # Adjusting splits to unify them into train/val
  21. splits = {
  22. 'train': 'train',
  23. 'valid': 'val',
  24. 'test': 'val' # Adding test images to validation for a more robust PoC
  25. }
  26. for src_split, target_split in splits.items():
  27. img_path = os.path.join(BASE_DIR, ds_name, src_split, "images")
  28. lbl_path = os.path.join(BASE_DIR, ds_name, src_split, "labels")
  29. if not os.path.exists(lbl_path): continue
  30. for label_file in os.listdir(lbl_path):
  31. if not label_file.endswith(".txt"): continue
  32. # 1. Re-index Labels
  33. new_lines = []
  34. with open(os.path.join(lbl_path, label_file), 'r') as f:
  35. for line in f:
  36. parts = line.split()
  37. if parts[0] in mapping:
  38. parts[0] = mapping[parts[0]]
  39. new_lines.append(" ".join(parts))
  40. if new_lines:
  41. # 2. Save new label with dataset prefix to avoid filename collisions
  42. new_name = f"{ds_name}_{label_file}"
  43. with open(os.path.join(OUTPUT_DIR, "labels", target_split, new_name), 'w') as f:
  44. f.write("\n".join(new_lines))
  45. # 3. Copy Image
  46. img_exts = ['.jpg', '.jpeg', '.png', '.JPG']
  47. for ext in img_exts:
  48. img_name = label_file.replace(".txt", ext)
  49. src_img = os.path.join(img_path, img_name)
  50. if os.path.exists(src_img):
  51. shutil.copy(src_img, os.path.join(OUTPUT_DIR, "images", target_split, f"{ds_name}_{img_name}"))
  52. break
  53. process_and_merge()
  54. print("Successfully created unified_dataset!")