import * as dotenv from 'dotenv';
import * as path from 'path';
import * as fs from 'fs';
import { MongoClient, Db } from 'mongodb';
import { OllamaEmbeddings } from '@langchain/ollama';

// Load .env from backend root (one level above scripts/)
dotenv.config({ path: path.resolve(__dirname, '../.env') });

// ─── Config ───────────────────────────────────────────────────────────────────
const MONGO_URI = process.env.MONGO_URI;
const MONGO_DB_NAME = process.env.MONGO_DB_NAME;
const EMBEDDING_MODEL = process.env.EMBEDDING_MODEL || 'bge-large';
const OLLAMA_BASE_URL = 'http://localhost:11434';
// Number of texts sent to the embedder (and documents inserted) per batch.
const BATCH_SIZE = 50;

// Two levels up from scripts/ → workspace root → mongo stuff/
const MONGO_STUFF = path.resolve(__dirname, '../../mongo stuff');

// ─── Source JSON interfaces ────────────────────────────────────────────────────

/** One record of phaseData.json. */
interface RawPhase {
  phaseID: number;
  phaseCode: string;
  phaseName: string;
  phaseDesc: string;
}

/**
 * One record of blockData.json.
 * Key spellings such as `quater_planted` and `initalPlantedArea` are kept
 * as-is; presumably they mirror the keys of the source JSON — verify against
 * the data file before "correcting" them.
 */
interface RawBlock {
  blockID: number;
  blockCode: string;
  blockDesc: string;
  loc_type: string;
  entry_no: number;
  entry_year: number;
  quater_planted: number;
  month_planted: string;
  numOfTreesPlanted: string | number | null;
  totalTreeMatured: string | number | null;
  totalTreeImmatured: string | number | null;
  totalTreeDead: string | number | null;
  totalPlantedArea: string | number | null;
  initalPlantedArea: string | number | null;
  plantedLocUOM: string;
  loc_soil_condition: string;
}

/** One record of FFBProductionData.json. */
interface RawFFB {
  activityId: number;
  productionDate: string;
  siteId: string;
  phaseId: number;
  blockId: number;
  net_weight: string;
  act_uom: string;
  no_of_bunches: number;
  qty_uom: string;
}

// ─── Target document interfaces (match NestJS schemas exactly) ────────────────

/** Document shape written to the `Phase` collection. */
interface PhaseDoc {
  locId: number;
  phaseCode: string;
  description: string;
  locType: string;
  vector?: number[];
}

/** Document shape written to the `Block` collection. */
interface BlockDoc {
  locId: number;
  blockCode: string;
  blockDesc: string;
  locType: string;
  entryNo: number;
  entryYear: number;
  quarterPlanted: number;
  monthPlanted: string;
  totalTrees: number;
  totalMaturedTrees: number;
  totalImmaturedTrees: number;
  totalDeadTrees: number;
  plantedArea: number;
  initialPlantedArea: number;
  plantedLocUOM: string;
  soilCondition: string;
  vector?: number[];
}

// ─── Type-cast helpers ────────────────────────────────────────────────────────

/**
 * Converts a numeric string / number to a native number.
 *
 * @param v Value to convert; `null` and `undefined` yield 0.
 * @returns The `parseFloat` result, or 0 when that result is NaN (or 0).
 */
const toNum = (v: string | number | null | undefined): number =>
  v == null ? 0 : parseFloat(String(v)) || 0;

/**
 * Round to integer (avoids NaN from null/empty strings).
 *
 * @param v Value to convert; `null` and `undefined` yield 0.
 * @returns The `parseFloat` result rounded with `Math.round`, or 0 when that
 *          result is NaN (or 0).
 */
const toInt = (v: string | number | null | undefined): number =>
  v == null ? 0 : Math.round(parseFloat(String(v))) || 0;

/**
 * Reads and parses a JSON file from the `mongo stuff` directory.
 *
 * @typeParam T Shape the caller expects the parsed JSON to have. The cast is
 *              unchecked: no runtime validation of the file content is done.
 * @param filename File name relative to `MONGO_STUFF`.
 * @returns The parsed JSON, typed as `T`.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
function loadJson<T>(filename: string): T {
  const filePath = path.join(MONGO_STUFF, filename);
  return JSON.parse(fs.readFileSync(filePath, 'utf-8')) as T;
}

// ─── Transform: phaseData.json → Phase collection schema ─────────────────────
// phaseID → locId, phaseDesc → description, add locType

/**
 * Maps a raw phase record onto the Phase collection schema.
 *
 * @param raw Record from phaseData.json.
 * @returns Phase document without a vector; `description` falls back to
 *          `phaseName` when `phaseDesc` is empty, and `locType` is always
 *          `'PHASE'`.
 */
function transformPhase(raw: RawPhase): PhaseDoc {
  return {
    locId: raw.phaseID,
    phaseCode: raw.phaseCode,
    description: raw.phaseDesc || raw.phaseName,
    locType: 'PHASE',
  };
}

// ─── Transform: blockData.json → Block collection schema ─────────────────────
// All snake_case / mixed-case source fields → camelCase schema fields.
// Numeric strings (e.g. "13932", "88.7300") cast to native Number on write.
/**
 * Maps a raw block record onto the Block collection schema.
 *
 * Tree counts are cast with `toInt`, areas with `toNum`; missing string
 * fields become `''`, and a missing `loc_type` becomes `'BLOCK'`.
 *
 * @param raw Record from blockData.json.
 * @returns Block document without a vector.
 */
function transformBlock(raw: RawBlock): BlockDoc {
  return {
    locId: raw.blockID,
    blockCode: raw.blockCode,
    blockDesc: raw.blockDesc || '',
    locType: raw.loc_type || 'BLOCK',
    entryNo: toInt(raw.entry_no),
    entryYear: toInt(raw.entry_year),
    quarterPlanted: toInt(raw.quater_planted),
    monthPlanted: raw.month_planted || '',
    // "13932" → 13932
    totalTrees: toInt(raw.numOfTreesPlanted),
    totalMaturedTrees: toInt(raw.totalTreeMatured),
    totalImmaturedTrees: toInt(raw.totalTreeImmatured),
    totalDeadTrees: toInt(raw.totalTreeDead),
    // "88.7300" → 88.73
    plantedArea: toNum(raw.totalPlantedArea),
    initialPlantedArea: toNum(raw.initalPlantedArea),
    plantedLocUOM: raw.plantedLocUOM || '',
    // "PEAT" → soilCondition
    soilCondition: raw.loc_soil_condition || '',
  };
}

// ─── Phase embedding text ─────────────────────────────────────────────────────

/**
 * Builds the text that is embedded for a Phase document.
 *
 * NOTE(review): the labelled fields are separated by single spaces, exactly
 * as found in this file — confirm whether line breaks were intended.
 *
 * @param doc Phase document to describe.
 * @returns Trimmed, single-line description of the phase.
 */
function buildPhaseEmbeddingText(doc: PhaseDoc): string {
  return [
    'Phase Reference Entry:',
    `Phase Code: ${doc.phaseCode}`,
    `Phase Name: ${doc.description}`,
    `Location Type: ${doc.locType}`,
  ]
    .join(' ')
    .trim();
}

// ─── Block embedding text ─────────────────────────────────────────────────────

/**
 * Builds the text that is embedded for a Block document.
 *
 * An empty `blockDesc` is rendered as `No description.`; a
 * `Soil Condition` line is appended only when `soilCondition` is non-empty.
 *
 * NOTE(review): the labelled fields are separated by single spaces, exactly
 * as found in this file — confirm whether line breaks were intended.
 *
 * @param doc Block document to describe.
 * @returns Trimmed description of the block.
 */
function buildBlockEmbeddingText(doc: BlockDoc): string {
  let text = [
    'Block Reference Entry:',
    `Block Code: ${doc.blockCode}`,
    `Description: ${doc.blockDesc || 'No description.'}`,
    `Location Type: ${doc.locType}`,
    `Total Trees: ${doc.totalTrees} (${doc.totalMaturedTrees} matured, ${doc.totalImmaturedTrees} immature, ${doc.totalDeadTrees} dead)`,
    `Planted Area: ${doc.plantedArea} ${doc.plantedLocUOM}`,
    `Entry Year: ${doc.entryYear}, Quarter Planted: ${doc.quarterPlanted}, Month: ${doc.monthPlanted}`,
  ].join(' ');

  if (doc.soilCondition) {
    text += `\n Soil Condition: ${doc.soilCondition}`;
  }
  return text.trim();
}

// ─── FFB embedding text (mirrors recordToTextEnriched in ffb-vector.service.ts) ─
// Environmental Context line conditionally appended only when soilCondition exists,
// matching the production vector service behaviour exactly.
/**
 * Builds the text that is embedded for one FFB production record.
 *
 * The `Environmental Context` line is appended only when the block has a
 * non-empty `soilCondition`.
 *
 * NOTE(review): the labelled fields are separated by single spaces, exactly
 * as found in this file — confirm against the production vector service
 * whether line breaks were intended.
 *
 * @param raw Source FFB record.
 * @param phaseCode Code of the phase the record belongs to.
 * @param phaseName Name of the phase the record belongs to.
 * @param block Transformed block the record belongs to.
 * @returns Trimmed description of the production entry.
 */
function buildFFBEmbeddingText(
  raw: RawFFB,
  phaseCode: string,
  phaseName: string,
  block: BlockDoc,
): string {
  let text = [
    'FFB Production Log Entry:',
    `Project Code: ${raw.siteId} | Activity: FFB Harvesting`,
    `Organization: (${raw.siteId})`,
    `Location Details: Phase ${phaseName} (${phaseCode}), Block Code ${block.blockCode}`,
    `Harvest Output: ${raw.no_of_bunches} Bunches, Net Weight: ${toNum(raw.net_weight)} ${raw.act_uom}`,
    'Logistics: Transported via Truck to Mill',
    'Field Observations: No supervisor remarks recorded.',
    'Operational Issues: No production anomalies reported.',
  ].join(' ');

  if (block.soilCondition) {
    text += `\nEnvironmental Context: Cultivated on ${block.soilCondition} soil conditions with a total tree count of ${block.totalTrees || 0} trees.`;
  }
  return text.trim();
}

// ─── FFB Production document builder ─────────────────────────────────────────

/**
 * Builds the document written to the `FFB Production` collection.
 *
 * Fields that have no counterpart in the source record are written as fixed
 * placeholders (`''`, `0` or `null`).
 *
 * @param raw Source FFB record.
 * @param phaseCode Code of the phase the record belongs to.
 * @param phaseName Name of the phase; also used as `phaseDesc`.
 * @param block Transformed block; supplies code, description, area and UOM.
 * @param vector Embedding of the record's text.
 * @returns The document to insert.
 */
function buildFFBDoc(
  raw: RawFFB,
  phaseCode: string,
  phaseName: string,
  block: BlockDoc,
  vector: number[],
) {
  return {
    productionDate: new Date(raw.productionDate),
    prjCode: raw.siteId,
    actCode: 'FFB',
    actName: 'FFB Harvesting',
    entityCode: '',
    orgnId: 0,
    orgnCode: raw.siteId,
    orgnFullName: '',
    orgnAddress: '',
    orgnCompRegNo: '',
    phaseCode,
    phaseName,
    phaseDesc: phaseName,
    blockCode: block.blockCode,
    blockName: block.blockDesc || null,
    blockDesc: block.blockDesc || null,
    truckNo: '',
    millNo: '',
    actEntryNo: raw.activityId,
    actRound: 0,
    weightChitNo: '',
    ownNetWeight: null,
    // "2.2700" → 2.27
    netWeight: toNum(raw.net_weight),
    actUom: raw.act_uom,
    noOfBunches: raw.no_of_bunches,
    qtyUom: raw.qty_uom,
    docActQty: 0,
    locArea: block.plantedArea,
    locUom: block.plantedLocUOM,
    budgetedFfb: null,
    remarks: '',
    issues: null,
    vector,
  };
}

// ─── Batch embedder helper ────────────────────────────────────────────────────

/**
 * Embeds `texts` in one call and verifies one vector came back per text, so
 * no document is ever written with an undefined `vector`.
 */
async function embedChecked(
  embedder: OllamaEmbeddings,
  texts: string[],
): Promise<number[][]> {
  const vectors = await embedder.embedDocuments(texts);
  if (vectors.length !== texts.length) {
    throw new Error(
      `Embedder returned ${vectors.length} vectors for ${texts.length} texts`,
    );
  }
  return vectors;
}

/**
 * Embeds `texts` in slices of `BATCH_SIZE`, logging progress per slice.
 *
 * @param embedder Embedding client to call.
 * @param texts Texts to embed, in order.
 * @param label Name used in the progress output.
 * @returns One vector per input text, in input order.
 * @throws If the embedder fails or returns a different number of vectors
 *         than texts for a slice.
 */
async function embedInBatches(
  embedder: OllamaEmbeddings,
  texts: string[],
  label: string,
): Promise<number[][]> {
  const results: number[][] = [];
  const total = Math.ceil(texts.length / BATCH_SIZE);

  for (let i = 0; i < texts.length; i += BATCH_SIZE) {
    const batchNum = Math.floor(i / BATCH_SIZE) + 1;
    process.stdout.write(` [${batchNum}/${total}] embedding ${label}...`);
    const slice = texts.slice(i, i + BATCH_SIZE);
    const vecs = await embedChecked(embedder, slice);
    results.push(...vecs);
    console.log(` ✅ ${slice.length} done.`);
  }
  return results;
}

// ─── Main ─────────────────────────────────────────────────────────────────────

/** An FFB record together with the master data resolved for it. */
interface ResolvedFFB {
  raw: RawFFB;
  phaseCode: string;
  phaseName: string;
  block: BlockDoc;
}

/**
 * Rebuilds the `Phase`, `Block` and `FFB Production` collections from the
 * source JSON files, attaching an embedding vector to every document.
 *
 * Each collection is cleared with `deleteMany({})` before it is refilled.
 * FFB records whose `phaseId` or `blockId` cannot be resolved against the
 * master data are skipped and counted.
 *
 * @throws If `MONGO_URI` or `MONGO_DB_NAME` is not set, or if reading the
 *         source files, embedding, or any database call fails.
 */
async function seed(): Promise<void> {
  if (!MONGO_URI) throw new Error('MONGO_URI not set in .env');
  if (!MONGO_DB_NAME) throw new Error('MONGO_DB_NAME not set in .env');

  console.log('');
  console.log('═══════════════════════════════════════════════════════');
  console.log(' RAG Warehouse Seed Script');
  console.log(` DB: ${MONGO_DB_NAME}`);
  console.log(` Model: ${EMBEDDING_MODEL} @ ${OLLAMA_BASE_URL}`);
  console.log('═══════════════════════════════════════════════════════');
  console.log('');

  // 1. Load source JSON
  console.log('📂 Loading source JSON files...');
  const rawPhases = loadJson<RawPhase[]>('phaseData.json');
  const rawBlocks = loadJson<RawBlock[]>('blockData.json');
  const rawFFBs = loadJson<RawFFB[]>('FFBProductionData.json');
  console.log(` Phases: ${rawPhases.length}`);
  console.log(` Blocks: ${rawBlocks.length}`);
  console.log(` FFB Activities: ${rawFFBs.length}`);
  console.log('');

  // 2. Transform master data
  console.log('🔄 Transforming master data (field rename + type cast)...');
  const phases = rawPhases.map(transformPhase);
  const blocks = rawBlocks.map(transformBlock);

  // 3. Build in-memory lookup maps — avoids per-record Atlas round trips
  const phaseIdToCode = new Map<number, string>(
    rawPhases.map((p): [number, string] => [p.phaseID, p.phaseCode]),
  );
  const phaseIdToName = new Map<number, string>(
    rawPhases.map((p): [number, string] => [p.phaseID, p.phaseName]),
  );
  // Reuse the already-transformed blocks (locId is the source blockID)
  // instead of transforming every raw block a second time.
  const blockIdToDoc = new Map<number, BlockDoc>(
    blocks.map((b): [number, BlockDoc] => [b.locId, b]),
  );
  console.log(` Phase lookup map: ${phaseIdToCode.size} entries`);
  console.log(` Block lookup map: ${blockIdToDoc.size} entries`);
  console.log('');

  // 4. Connect to MongoDB Atlas
  console.log('🔗 Connecting to MongoDB Atlas...');
  const client = new MongoClient(MONGO_URI);

  // Everything after client creation runs inside try/finally so the client
  // is closed even if connecting or creating the embedder fails.
  try {
    await client.connect();
    const db: Db = client.db(MONGO_DB_NAME);
    console.log(' Connected.\n');

    // 5. Initialize Ollama embedder (shared across all three collections)
    console.log('🤖 Initializing Ollama embedder...');
    const embedder = new OllamaEmbeddings({
      model: EMBEDDING_MODEL,
      baseUrl: OLLAMA_BASE_URL,
    });
    console.log(' Embedder ready.\n');

    // ── Phase collection ────────────────────────────────────────────────────
    console.log(`📦 Vectorizing ${phases.length} phases...`);
    const phaseTexts = phases.map(buildPhaseEmbeddingText);
    const phaseVectors = await embedInBatches(embedder, phaseTexts, 'phase');
    console.log('🗑️ Phase: clearing...');
    await db.collection('Phase').deleteMany({});
    const phaseDocs = phases.map((p, i) => ({ ...p, vector: phaseVectors[i] }));
    // insertMany rejects an empty batch, so skip it when there is nothing to write.
    if (phaseDocs.length > 0) {
      await db.collection('Phase').insertMany(phaseDocs);
    }
    console.log(` ✅ ${phaseDocs.length} phases inserted with vectors.\n`);

    // ── Block collection ────────────────────────────────────────────────────
    console.log(`📦 Vectorizing ${blocks.length} blocks...`);
    const blockTexts = blocks.map(buildBlockEmbeddingText);
    const blockVectors = await embedInBatches(embedder, blockTexts, 'block');
    console.log('🗑️ Block: clearing...');
    await db.collection('Block').deleteMany({});
    const blockDocs = blocks.map((b, i) => ({ ...b, vector: blockVectors[i] }));
    if (blockDocs.length > 0) {
      await db.collection('Block').insertMany(blockDocs);
    }
    console.log(` ✅ ${blockDocs.length} blocks inserted with vectors.\n`);

    // ── FFB Production collection ───────────────────────────────────────────
    console.log('🗑️ FFB Production: clearing...');
    await db.collection('FFB Production').deleteMany({});
    console.log(' Collection cleared.\n');

    const totalBatches = Math.ceil(rawFFBs.length / BATCH_SIZE);
    let insertedCount = 0;
    let skippedCount = 0;

    console.log(`📦 Processing ${rawFFBs.length} FFB records in ${totalBatches} batches of ${BATCH_SIZE}...`);
    console.log('');

    for (let i = 0; i < rawFFBs.length; i += BATCH_SIZE) {
      const batch = rawFFBs.slice(i, i + BATCH_SIZE);
      const batchNum = Math.floor(i / BATCH_SIZE) + 1;
      process.stdout.write(` [${batchNum}/${totalBatches}] resolving...`);

      // Resolve phaseCode/block for each record using in-memory maps
      const resolved: ResolvedFFB[] = [];
      let batchSkip = 0;

      for (const raw of batch) {
        const phaseCode = phaseIdToCode.get(raw.phaseId);
        const phaseName = phaseIdToName.get(raw.phaseId) || '';
        const block = blockIdToDoc.get(raw.blockId);

        if (!phaseCode || !block) {
          batchSkip++;
        } else {
          resolved.push({ raw, phaseCode, phaseName, block });
        }
      }

      skippedCount += batchSkip;

      if (resolved.length === 0) {
        console.log(` all ${batchSkip} records have unresolvable IDs — skipped.`);
        continue;
      }

      // Generate embeddings for the entire resolved batch in one Ollama call
      process.stdout.write(` embedding ${resolved.length}...`);
      const texts = resolved.map(r =>
        buildFFBEmbeddingText(r.raw, r.phaseCode, r.phaseName, r.block),
      );
      const vectors = await embedChecked(embedder, texts);

      // Build final documents and insert
      const docs = resolved.map((r, idx) =>
        buildFFBDoc(r.raw, r.phaseCode, r.phaseName, r.block, vectors[idx]),
      );
      await db.collection('FFB Production').insertMany(docs);
      insertedCount += docs.length;

      const skipNote = batchSkip > 0 ? ` (${batchSkip} skipped)` : '';
      console.log(` ✅ ${docs.length} inserted.${skipNote}`);
    }

    // ── Final summary ───────────────────────────────────────────────────────
    console.log('');
    console.log('═══════════════════════════════════════════════════════');
    console.log(' SEED COMPLETE');
    console.log(` Phases inserted (with vectors): ${phaseDocs.length}`);
    console.log(` Blocks inserted (with vectors): ${blockDocs.length}`);
    console.log(` FFB records inserted: ${insertedCount}`);
    if (skippedCount > 0) {
      console.log(` FFB records skipped: ${skippedCount} (phaseId/blockId not in master)`);
    }
    console.log('═══════════════════════════════════════════════════════');
    console.log('');
  } finally {
    await client.close();
  }
}

// Entry point: report the failure and exit non-zero so callers can detect it.
seed().catch((err: unknown) => {
  const detail = err instanceof Error && err.message ? err.message : err;
  console.error('\n❌ Seed failed:', detail);
  process.exit(1);
});