# enzo / PalmOilRipenessDetection-Python
import os
import shutil

# Class re-indexing tables, one per source dataset.
# Each inner dict maps a dataset's own class id to the class id used in the
# unified dataset: {original_id: master_id}. Ids are kept as strings because
# they are compared against the first field of each label line.
CONFIG = {
    # Identity mapping: ids "0".."5" are kept as they are.
    "suharjito_dataset": {str(class_id): str(class_id) for class_id in range(6)},
    "darren_dataset": {
        "0": "4",
        "1": "5",
        "2": "3",
    },
    "fy_dataset": {
        "0": "1",
        "1": "3",
        "2": "5",
    },
    "nazwa_dataset": {
        "0": "2",
        "1": "0",
        "2": "5",
        "3": "3",
        "4": "1",
        "5": "4",
    },
}

# Folder that holds the source datasets, and folder the merged result goes to.
BASE_DIR = "datasets"
OUTPUT_DIR = "unified_dataset"

# Create the images/labels output folders for both splits up front.
for split in ('train', 'val'):
    for subdir in ("images", "labels"):
        os.makedirs(f"{OUTPUT_DIR}/{subdir}/{split}", exist_ok=True)

def _remap_label_lines(lines, mapping):
    """Return *lines* with their class id (first field) translated via *mapping*.

    Blank lines and lines whose class id is not a key of *mapping* are dropped.
    Each kept line is re-joined with single spaces and has no trailing newline.
    """
    remapped = []
    for line in lines:
        parts = line.split()
        # A blank line yields an empty list; indexing it would raise IndexError.
        if not parts or parts[0] not in mapping:
            continue
        parts[0] = mapping[parts[0]]
        remapped.append(" ".join(parts))
    return remapped


def _find_image(img_dir, stem, extensions):
    """Return the first file name ``stem + ext`` that exists in *img_dir*, else None."""
    for ext in extensions:
        candidate = stem + ext
        if os.path.exists(os.path.join(img_dir, candidate)):
            return candidate
    return None


def process_and_merge(config=None, base_dir=None, output_dir=None):
    """Merge the configured datasets into one train/val dataset with unified class ids.

    For every dataset in *config*, the ``train``, ``valid`` and ``test`` source
    splits are scanned (missing ones are skipped). Each ``.txt`` label file has
    the first field of every line translated through the dataset's mapping;
    lines with an unmapped class id and blank lines are dropped. If any lines
    remain and a matching image exists, the label file and the image are
    written to the output folder, both prefixed with ``<dataset>_`` to avoid
    file name collisions between datasets.

    Label files that end up empty, or that have no matching image, are skipped
    (the latter with a printed warning) so the output never contains a label
    without its image.

    Args:
        config: ``{dataset_name: {original_id: master_id}}`` with string ids.
            Defaults to the module-level ``CONFIG``.
        base_dir: Folder containing ``<dataset>/<split>/{images,labels}``.
            Defaults to the module-level ``BASE_DIR``.
        output_dir: Folder receiving ``{images,labels}/{train,val}``.
            Defaults to the module-level ``OUTPUT_DIR``.
    """
    if config is None:
        config = CONFIG
    if base_dir is None:
        base_dir = BASE_DIR
    if output_dir is None:
        output_dir = OUTPUT_DIR

    # Roboflow uses 'valid', Suharjito might use 'valid' or 'test'.
    # Source split -> unified split; test images are added to validation
    # for a more robust PoC.
    splits = {
        'train': 'train',
        'valid': 'val',
        'test': 'val',
    }
    # Tried in this order; the first existing file wins.
    img_exts = ('.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG')

    # Make sure the output layout exists even for a caller-supplied output_dir.
    for target_split in ('train', 'val'):
        os.makedirs(os.path.join(output_dir, "images", target_split), exist_ok=True)
        os.makedirs(os.path.join(output_dir, "labels", target_split), exist_ok=True)

    for ds_name, mapping in config.items():
        print(f"Processing {ds_name}...")

        for src_split, target_split in splits.items():
            img_path = os.path.join(base_dir, ds_name, src_split, "images")
            lbl_path = os.path.join(base_dir, ds_name, src_split, "labels")

            # Not every dataset ships every split.
            if not os.path.isdir(lbl_path):
                continue

            for label_file in os.listdir(lbl_path):
                if not label_file.endswith(".txt"):
                    continue

                # 1. Re-index labels.
                with open(os.path.join(lbl_path, label_file), 'r') as f:
                    new_lines = _remap_label_lines(f, mapping)
                if not new_lines:
                    continue

                # 2. Locate the image. Only the final extension is stripped,
                #    so names containing ".txt" elsewhere are handled.
                stem = os.path.splitext(label_file)[0]
                img_name = _find_image(img_path, stem, img_exts)
                if img_name is None:
                    print(f"Warning: no image found for {ds_name}/{src_split}/{label_file}, skipping")
                    continue

                # 3. Save the new label with a dataset prefix to avoid
                #    file name collisions, then copy the image next to it.
                new_name = f"{ds_name}_{label_file}"
                with open(os.path.join(output_dir, "labels", target_split, new_name), 'w') as f:
                    f.write("\n".join(new_lines))
                shutil.copy(
                    os.path.join(img_path, img_name),
                    os.path.join(output_dir, "images", target_split, f"{ds_name}_{img_name}"),
                )

# Run the merge only when this file is executed as a script, so that importing
# the module (e.g. to reuse process_and_merge) does not start copying files.
if __name__ == "__main__":
    process_and_merge()
    print("Successfully created unified_dataset!")