|
|
@@ -0,0 +1,414 @@
|
|
|
+import * as dotenv from 'dotenv';
|
|
|
+import * as path from 'path';
|
|
|
+import * as fs from 'fs';
|
|
|
+import { MongoClient, Db } from 'mongodb';
|
|
|
+import { OllamaEmbeddings } from '@langchain/ollama';
|
|
|
+
|
|
|
// Load .env from the backend root (one level above scripts/). This must run
// before the process.env lookups below, which are evaluated at module load.
dotenv.config({ path: path.resolve(__dirname, '../.env') });

// ─── Config ───────────────────────────────────────────────────────────────────

/** MongoDB connection string; seed() throws when it is unset. */
const MONGO_URI = process.env.MONGO_URI;
/** Target database name; seed() throws when it is unset. */
const MONGO_DB_NAME = process.env.MONGO_DB_NAME;
/** Embedding model passed to the Ollama embedder; defaults to 'bge-large'. */
const EMBEDDING_MODEL = process.env.EMBEDDING_MODEL || 'bge-large';
/**
 * Base URL of the Ollama server. Can be overridden with OLLAMA_BASE_URL so the
 * script can target a non-local instance; falls back to the previous
 * hard-coded local endpoint when the variable is unset or empty.
 */
const OLLAMA_BASE_URL = process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
/** Number of texts/records sent per embedding call and per FFB insert. */
const BATCH_SIZE = 50;

// Two levels up from scripts/ → workspace root → mongo stuff/
/** Directory that loadJson() reads the source JSON files from. */
const MONGO_STUFF = path.resolve(__dirname, '../../mongo stuff');
|
|
|
+
|
|
|
+// ─── Source JSON interfaces ────────────────────────────────────────────────────
|
|
|
+
|
|
|
/**
 * One record of `phaseData.json` as loaded by seed().
 */
interface RawPhase {
  /** Source identifier; copied to `PhaseDoc.locId` and used to resolve `RawFFB.phaseId`. */
  phaseID: number;
  /** Phase code, carried through unchanged. */
  phaseCode: string;
  /** Phase name; used as the description when `phaseDesc` is empty. */
  phaseName: string;
  /** Phase description; preferred source for `PhaseDoc.description`. */
  phaseDesc: string;
}
|
|
|
+
|
|
|
/**
 * One record of `blockData.json` as loaded by seed().
 *
 * Property names (including the spellings `quater_planted` and
 * `initalPlantedArea`) are the keys transformBlock() reads, so they must not
 * be "corrected" here.
 */
interface RawBlock {
  /** Source identifier; copied to `BlockDoc.locId` and used to resolve `RawFFB.blockId`. */
  blockID: number;
  blockCode: string;
  blockDesc: string;
  /** Location type; transformBlock() substitutes 'BLOCK' when this is empty. */
  loc_type: string;
  entry_no: number;
  entry_year: number;
  quater_planted: number;
  month_planted: string;

  // The fields below may be numbers, numeric strings, or null; they are
  // coerced to native numbers by transformBlock().
  /** Tree count, e.g. the string "13932". */
  numOfTreesPlanted: string | number | null;
  totalTreeMatured: string | number | null;
  totalTreeImmatured: string | number | null;
  totalTreeDead: string | number | null;
  /** Planted area, e.g. the string "88.7300". */
  totalPlantedArea: string | number | null;
  initalPlantedArea: string | number | null;

  /** Unit of measure for the planted-area values. */
  plantedLocUOM: string;
  /** Soil condition label, e.g. "PEAT". */
  loc_soil_condition: string;
}
|
|
|
+
|
|
|
/**
 * One record of `FFBProductionData.json` as loaded by seed().
 */
interface RawFFB {
  /** Stored as `actEntryNo` on the generated document. */
  activityId: number;
  /** Date string handed to `new Date(...)` when the document is built. */
  productionDate: string;
  /** Written to both `prjCode` and `orgnCode` on the generated document. */
  siteId: string;
  /** Resolved against the phase lookup map keyed by `RawPhase.phaseID`. */
  phaseId: number;
  /** Resolved against the block lookup map keyed by `RawBlock.blockID`. */
  blockId: number;
  /** Numeric string such as "2.2700"; converted with toNum(). */
  net_weight: string;
  /** Unit of measure for `net_weight`. */
  act_uom: string;
  no_of_bunches: number;
  /** Unit of measure for `no_of_bunches`. */
  qty_uom: string;
}
|
|
|
+
|
|
|
+// ─── Target document interfaces (match NestJS schemas exactly) ────────────────
|
|
|
+
|
|
|
/**
 * Document shape written to the `Phase` collection.
 */
interface PhaseDoc {
  /** Taken from `RawPhase.phaseID`. */
  locId: number;
  phaseCode: string;
  /** `RawPhase.phaseDesc`, or `phaseName` when the description is empty. */
  description: string;
  /** transformPhase() always sets this to 'PHASE'. */
  locType: string;
  /** Embedding vector; attached by seed() just before insertion. */
  vector?: number[];
}
|
|
|
+
|
|
|
/**
 * Document shape written to the `Block` collection. All numeric fields are
 * native numbers; transformBlock() performs the coercion from the raw record.
 */
interface BlockDoc {
  /** Taken from `RawBlock.blockID`. */
  locId: number;
  blockCode: string;
  blockDesc: string;
  /** `RawBlock.loc_type`, or 'BLOCK' when that is empty. */
  locType: string;

  // Planting metadata.
  entryNo: number;
  entryYear: number;
  quarterPlanted: number;
  monthPlanted: string;

  // Tree counts (rounded to integers by transformBlock()).
  totalTrees: number;
  totalMaturedTrees: number;
  totalImmaturedTrees: number;
  totalDeadTrees: number;

  // Areas (decimals preserved) and their unit of measure.
  plantedArea: number;
  initialPlantedArea: number;
  plantedLocUOM: string;

  /** Taken from `RawBlock.loc_soil_condition`; empty string when absent. */
  soilCondition: string;
  /** Embedding vector; attached by seed() just before insertion. */
  vector?: number[];
}
|
|
|
+
|
|
|
+// ─── Type-cast helpers ────────────────────────────────────────────────────────
|
|
|
+
|
|
|
/**
 * Coerces a raw JSON value to a finite number, keeping decimals.
 *
 * @param v - A number, a numeric string such as "88.7300", or null/undefined.
 * @returns The parsed number, or 0 when the value is null, undefined, empty,
 *   unparsable, or non-finite ("Infinity", "-Infinity").
 */
const toNum = (v: string | number | null | undefined): number => {
  if (v == null) return 0;
  const parsed = parseFloat(String(v));
  // Non-finite values must not leak into stored documents; `|| 0` additionally
  // normalises -0 to 0.
  return Number.isFinite(parsed) ? parsed || 0 : 0;
};
|
|
|
+
|
|
|
/**
 * Coerces a raw JSON value to a finite integer using Math.round.
 *
 * @param v - A number, a numeric string such as "13932", or null/undefined.
 * @returns The rounded value, or 0 when the value is null, undefined, empty,
 *   unparsable, or non-finite.
 */
const toInt = (v: string | number | null | undefined): number => {
  if (v == null) return 0;
  const rounded = Math.round(parseFloat(String(v)));
  // Guards against NaN (empty/garbage strings) and ±Infinity; `|| 0` also
  // normalises the -0 that Math.round yields for small negative inputs.
  return Number.isFinite(rounded) ? rounded || 0 : 0;
};
|
|
|
+
|
|
|
/**
 * Reads and parses a JSON file from the MONGO_STUFF directory.
 *
 * The parsed value is returned as `T` without runtime validation, so callers
 * are trusting the file to have the expected shape.
 *
 * @param filename - File name relative to MONGO_STUFF.
 * @returns The parsed JSON value, typed as `T`.
 * @throws The fs error when the file cannot be read, or an Error naming the
 *   file when its contents are not valid JSON.
 */
function loadJson<T>(filename: string): T {
  const filePath = path.join(MONGO_STUFF, filename);
  let contents = fs.readFileSync(filePath, 'utf-8');

  // readFileSync keeps a leading UTF-8 byte-order mark, which JSON.parse
  // rejects; strip it so BOM-prefixed exports still load.
  if (contents.charCodeAt(0) === 0xfeff) {
    contents = contents.slice(1);
  }

  try {
    return JSON.parse(contents) as T;
  } catch (err) {
    // A bare SyntaxError does not say which of the input files is broken.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Invalid JSON in ${filePath}: ${reason}`);
  }
}
|
|
|
+
|
|
|
+// ─── Transform: phaseData.json → Phase collection schema ─────────────────────
|
|
|
+// phaseID → locId, phaseDesc → description, add locType
|
|
|
+
|
|
|
/**
 * Maps a raw phase record onto the Phase collection document shape.
 *
 * @param raw - Record from the phase source JSON.
 * @returns A document without a vector; `locType` is always 'PHASE'.
 */
function transformPhase(raw: RawPhase): PhaseDoc {
  const { phaseID, phaseCode, phaseDesc, phaseName } = raw;
  const doc: PhaseDoc = {
    locId: phaseID,
    phaseCode,
    // An empty description falls back to the phase name.
    description: phaseDesc || phaseName,
    locType: 'PHASE',
  };
  return doc;
}
|
|
|
+
|
|
|
+// ─── Transform: blockData.json → Block collection schema ─────────────────────
|
|
|
+// All snake_case / mixed-case source fields → camelCase schema fields.
|
|
|
+// Numeric strings (e.g. "13932", "88.7300") cast to native Number on write.
|
|
|
+
|
|
|
/**
 * Maps a raw block record onto the Block collection document shape, renaming
 * fields to camelCase and coercing numeric values to native numbers.
 *
 * @param raw - Record from the block source JSON.
 * @returns A document without a vector. Empty string fields become '' and an
 *   empty `loc_type` becomes 'BLOCK'.
 */
function transformBlock(raw: RawBlock): BlockDoc {
  const identity = {
    locId: raw.blockID,
    blockCode: raw.blockCode,
    blockDesc: raw.blockDesc || '',
    locType: raw.loc_type || 'BLOCK',
  };

  const planting = {
    entryNo: toInt(raw.entry_no),
    entryYear: toInt(raw.entry_year),
    quarterPlanted: toInt(raw.quater_planted),
    monthPlanted: raw.month_planted || '',
  };

  // Tree counts are rounded to integers, e.g. "13932" → 13932.
  const treeCounts = {
    totalTrees: toInt(raw.numOfTreesPlanted),
    totalMaturedTrees: toInt(raw.totalTreeMatured),
    totalImmaturedTrees: toInt(raw.totalTreeImmatured),
    totalDeadTrees: toInt(raw.totalTreeDead),
  };

  // Areas keep their decimals, e.g. "88.7300" → 88.73.
  const area = {
    plantedArea: toNum(raw.totalPlantedArea),
    initialPlantedArea: toNum(raw.initalPlantedArea),
    plantedLocUOM: raw.plantedLocUOM || '',
  };

  // Spread order fixes the key order of the resulting document.
  const doc: BlockDoc = {
    ...identity,
    ...planting,
    ...treeCounts,
    ...area,
    soilCondition: raw.loc_soil_condition || '',
  };
  return doc;
}
|
|
|
+
|
|
|
+// ─── Phase embedding text ─────────────────────────────────────────────────────
|
|
|
+
|
|
|
/**
 * Renders the text that is embedded for a phase document.
 *
 * @param doc - Transformed phase document.
 * @returns A newline-separated summary with surrounding whitespace trimmed.
 */
function buildPhaseEmbeddingText(doc: PhaseDoc): string {
  const lines = [
    'Phase Reference Entry:',
    ` Phase Code: ${doc.phaseCode}`,
    ` Phase Name: ${doc.description}`,
    ` Location Type: ${doc.locType}`,
  ];
  return lines.join('\n').trim();
}
|
|
|
+
|
|
|
+// ─── Block embedding text ─────────────────────────────────────────────────────
|
|
|
+
|
|
|
/**
 * Renders the text that is embedded for a block document.
 *
 * @param doc - Transformed block document.
 * @returns A newline-separated summary, trimmed. A "Soil Condition" line is
 *   appended only when `doc.soilCondition` is non-empty.
 */
function buildBlockEmbeddingText(doc: BlockDoc): string {
  const treeBreakdown =
    `${doc.totalMaturedTrees} matured, ${doc.totalImmaturedTrees} immature, ${doc.totalDeadTrees} dead`;

  const lines = [
    'Block Reference Entry:',
    ` Block Code: ${doc.blockCode}`,
    ` Description: ${doc.blockDesc || 'No description.'}`,
    ` Location Type: ${doc.locType}`,
    ` Total Trees: ${doc.totalTrees} (${treeBreakdown})`,
    ` Planted Area: ${doc.plantedArea} ${doc.plantedLocUOM}`,
    ` Entry Year: ${doc.entryYear}, Quarter Planted: ${doc.quarterPlanted}, Month: ${doc.monthPlanted}`,
  ];

  if (doc.soilCondition) {
    lines.push(` Soil Condition: ${doc.soilCondition}`);
  }

  return lines.join('\n').trim();
}
|
|
|
+
|
|
|
+// ─── FFB embedding text (mirrors recordToTextEnriched in ffb-vector.service.ts) ─
|
|
|
+// Environmental Context line conditionally appended only when soilCondition exists,
|
|
|
+// matching the production vector service behaviour exactly.
|
|
|
+
|
|
|
/**
 * Renders the text that is embedded for one FFB production record.
 *
 * @param raw - Source FFB record.
 * @param phaseCode - Code of the phase the record resolved to.
 * @param phaseName - Name of the phase the record resolved to.
 * @param block - Transformed block the record resolved to.
 * @returns A newline-separated log entry, trimmed. The "Environmental Context"
 *   line is appended only when the block has a soil condition.
 */
function buildFFBEmbeddingText(
  raw: RawFFB,
  phaseCode: string,
  phaseName: string,
  block: BlockDoc,
): string {
  const netWeight = toNum(raw.net_weight);

  const lines = [
    'FFB Production Log Entry:',
    ` Project Code: ${raw.siteId} | Activity: FFB Harvesting`,
    ` Organization: (${raw.siteId})`,
    ` Location Details: Phase ${phaseName} (${phaseCode}), Block Code ${block.blockCode}`,
    ` Harvest Output: ${raw.no_of_bunches} Bunches, Net Weight: ${netWeight} ${raw.act_uom}`,
    ' Logistics: Transported via Truck to Mill',
    ' Field Observations: No supervisor remarks recorded.',
    ' Operational Issues: No production anomalies reported.',
  ];

  if (block.soilCondition) {
    // Unlike the lines above, this one carries no leading space.
    lines.push(
      `Environmental Context: Cultivated on ${block.soilCondition} soil conditions with a total tree count of ${block.totalTrees || 0} trees.`,
    );
  }

  return lines.join('\n').trim();
}
|
|
|
+
|
|
|
+// ─── FFB Production document builder ─────────────────────────────────────────
|
|
|
+
|
|
|
/**
 * Assembles the document stored in the FFB Production collection for one
 * source record.
 *
 * Fields with no counterpart in the source record are written as fixed
 * placeholders ('', 0 or null). Key order is kept stable because it determines
 * the field order of the inserted document.
 *
 * @param raw - Source FFB record.
 * @param phaseCode - Code of the resolved phase.
 * @param phaseName - Name of the resolved phase; also written as `phaseDesc`.
 * @param block - Transformed block the record resolved to.
 * @param vector - Embedding generated for this record.
 * @returns The document to insert.
 */
function buildFFBDoc(
  raw: RawFFB,
  phaseCode: string,
  phaseName: string,
  block: BlockDoc,
  vector: number[],
) {
  const productionDate = new Date(raw.productionDate);
  const netWeight = toNum(raw.net_weight); // e.g. "2.2700" → 2.27
  // An empty block description is stored as null rather than ''.
  const blockLabel = block.blockDesc || null;

  const activity = {
    productionDate,
    prjCode: raw.siteId,
    actCode: 'FFB',
    actName: 'FFB Harvesting',
  };

  const organisation = {
    entityCode: '',
    orgnId: 0,
    orgnCode: raw.siteId,
    orgnFullName: '',
    orgnAddress: '',
    orgnCompRegNo: '',
  };

  const location = {
    phaseCode,
    phaseName,
    phaseDesc: phaseName,
    blockCode: block.blockCode,
    blockName: blockLabel,
    blockDesc: blockLabel,
  };

  const output = {
    truckNo: '',
    millNo: '',
    actEntryNo: raw.activityId,
    actRound: 0,
    weightChitNo: '',
    ownNetWeight: null,
    netWeight,
    actUom: raw.act_uom,
    noOfBunches: raw.no_of_bunches,
    qtyUom: raw.qty_uom,
    docActQty: 0,
  };

  return {
    ...activity,
    ...organisation,
    ...location,
    ...output,
    locArea: block.plantedArea,
    locUom: block.plantedLocUOM,
    budgetedFfb: null,
    remarks: '',
    issues: null,
    vector,
  };
}
|
|
|
+
|
|
|
+// ─── Batch embedder helper ────────────────────────────────────────────────────
|
|
|
+
|
|
|
/**
 * Embeds a list of texts in sequential batches of BATCH_SIZE, logging progress
 * for each batch.
 *
 * @param embedder - Embedder used for every batch.
 * @param texts - Texts to embed; an empty list yields an empty result.
 * @param label - Name shown in the progress output (e.g. 'phase').
 * @returns One vector per input text, in input order.
 * @throws Error when a batch returns a different number of vectors than texts
 *   were sent, since callers pair vectors with documents by index.
 */
async function embedInBatches(
  embedder: OllamaEmbeddings,
  texts: string[],
  label: string,
): Promise<number[][]> {
  const results: number[][] = [];
  const total = Math.ceil(texts.length / BATCH_SIZE);
  for (let i = 0; i < texts.length; i += BATCH_SIZE) {
    const batchNum = Math.floor(i / BATCH_SIZE) + 1;
    process.stdout.write(` [${batchNum}/${total}] embedding ${label}...`);
    const slice = texts.slice(i, i + BATCH_SIZE);
    const vecs = await embedder.embedDocuments(slice);
    // A short or long response would silently shift every later vector onto
    // the wrong document, so fail loudly instead.
    if (vecs.length !== slice.length) {
      throw new Error(
        `Embedding count mismatch for ${label} batch ${batchNum}: expected ${slice.length}, got ${vecs.length}`,
      );
    }
    results.push(...vecs);
    console.log(` ✅ ${slice.length} done.`);
  }
  return results;
}
|
|
|
+
|
|
|
+// ─── Main ─────────────────────────────────────────────────────────────────────
|
|
|
+
|
|
|
/**
 * Rebuilds the Phase, Block and FFB Production collections from the source
 * JSON files, attaching an embedding vector to every inserted document.
 *
 * Steps: load JSON → transform master data → build in-memory lookup maps →
 * connect → embed and replace Phase and Block → clear FFB Production and
 * refill it batch by batch. FFB records whose phaseId or blockId is not in the
 * master data are skipped and counted.
 *
 * This is destructive: each target collection is emptied with deleteMany({})
 * before it is refilled.
 *
 * @throws Error when MONGO_URI or MONGO_DB_NAME is unset, when an embedding
 *   call returns the wrong number of vectors, or when any file, database or
 *   embedding operation fails.
 */
async function seed(): Promise<void> {
  if (!MONGO_URI) throw new Error('MONGO_URI not set in .env');
  if (!MONGO_DB_NAME) throw new Error('MONGO_DB_NAME not set in .env');

  // Vectors are paired with documents by index, so a count mismatch must abort.
  const assertVectorCount = (what: string, expected: number, actual: number): void => {
    if (expected !== actual) {
      throw new Error(`Embedding count mismatch for ${what}: expected ${expected}, got ${actual}`);
    }
  };

  console.log('');
  console.log('═══════════════════════════════════════════════════════');
  console.log(' RAG Warehouse Seed Script');
  console.log(` DB: ${MONGO_DB_NAME}`);
  console.log(` Model: ${EMBEDDING_MODEL} @ ${OLLAMA_BASE_URL}`);
  console.log('═══════════════════════════════════════════════════════');
  console.log('');

  // 1. Load source JSON
  console.log('📂 Loading source JSON files...');
  const rawPhases = loadJson<RawPhase[]>('phaseData.json');
  const rawBlocks = loadJson<RawBlock[]>('blockData.json');
  const rawFFBs = loadJson<RawFFB[]>('FFBProductionData.json');
  console.log(` Phases: ${rawPhases.length}`);
  console.log(` Blocks: ${rawBlocks.length}`);
  console.log(` FFB Activities: ${rawFFBs.length}`);
  console.log('');

  // 2. Transform master data
  console.log('🔄 Transforming master data (field rename + type cast)...');
  const phases = rawPhases.map(transformPhase);
  const blocks = rawBlocks.map(transformBlock);

  // 3. Build in-memory lookup maps — avoids per-record Atlas round trips.
  //    The block map reuses the documents transformed above (locId is the
  //    source blockID) instead of transforming every block a second time.
  const phaseIdToCode = new Map<number, string>(rawPhases.map(p => [p.phaseID, p.phaseCode]));
  const phaseIdToName = new Map<number, string>(rawPhases.map(p => [p.phaseID, p.phaseName]));
  const blockIdToDoc = new Map<number, BlockDoc>(blocks.map(b => [b.locId, b]));
  console.log(` Phase lookup map: ${phaseIdToCode.size} entries`);
  console.log(` Block lookup map: ${blockIdToDoc.size} entries`);
  console.log('');

  // 4. Connect to MongoDB Atlas
  console.log('🔗 Connecting to MongoDB Atlas...');
  const client = new MongoClient(MONGO_URI);

  try {
    // Connecting inside the try guarantees close() runs even if connect fails.
    await client.connect();
    const db: Db = client.db(MONGO_DB_NAME);
    console.log(' Connected.\n');

    // 5. Initialize Ollama embedder (shared across all three collections)
    console.log('🤖 Initializing Ollama embedder...');
    const embedder = new OllamaEmbeddings({
      model: EMBEDDING_MODEL,
      baseUrl: OLLAMA_BASE_URL,
    });
    console.log(' Embedder ready.\n');

    // ── Phase collection ────────────────────────────────────────────────────
    // Embeddings are generated before the collection is cleared so an
    // embedding failure leaves the existing data untouched.
    console.log(`📦 Vectorizing ${phases.length} phases...`);
    const phaseTexts = phases.map(buildPhaseEmbeddingText);
    const phaseVectors = await embedInBatches(embedder, phaseTexts, 'phase');
    assertVectorCount('phases', phases.length, phaseVectors.length);

    console.log('🗑️ Phase: clearing...');
    await db.collection('Phase').deleteMany({});
    const phaseDocs = phases.map((p, i) => ({ ...p, vector: phaseVectors[i] }));
    // insertMany rejects an empty batch, so skip it when there is nothing to insert.
    if (phaseDocs.length > 0) {
      await db.collection('Phase').insertMany(phaseDocs);
    }
    console.log(` ✅ ${phaseDocs.length} phases inserted with vectors.\n`);

    // ── Block collection ────────────────────────────────────────────────────
    console.log(`📦 Vectorizing ${blocks.length} blocks...`);
    const blockTexts = blocks.map(buildBlockEmbeddingText);
    const blockVectors = await embedInBatches(embedder, blockTexts, 'block');
    assertVectorCount('blocks', blocks.length, blockVectors.length);

    console.log('🗑️ Block: clearing...');
    await db.collection('Block').deleteMany({});
    const blockDocs = blocks.map((b, i) => ({ ...b, vector: blockVectors[i] }));
    if (blockDocs.length > 0) {
      await db.collection('Block').insertMany(blockDocs);
    }
    console.log(` ✅ ${blockDocs.length} blocks inserted with vectors.\n`);

    // ── FFB Production collection ───────────────────────────────────────────
    console.log('🗑️ FFB Production: clearing...');
    await db.collection('FFB Production').deleteMany({});
    console.log(' Collection cleared.\n');

    const totalBatches = Math.ceil(rawFFBs.length / BATCH_SIZE);
    let insertedCount = 0;
    let skippedCount = 0;

    console.log(`📦 Processing ${rawFFBs.length} FFB records in ${totalBatches} batches of ${BATCH_SIZE}...`);
    console.log('');

    for (let i = 0; i < rawFFBs.length; i += BATCH_SIZE) {
      const batch = rawFFBs.slice(i, i + BATCH_SIZE);
      const batchNum = Math.floor(i / BATCH_SIZE) + 1;
      process.stdout.write(` [${batchNum}/${totalBatches}] resolving...`);

      // Resolve phaseCode/block for each record using in-memory maps
      const resolved: Array<{ raw: RawFFB; phaseCode: string; phaseName: string; block: BlockDoc }> = [];
      let batchSkip = 0;

      for (const raw of batch) {
        const phaseCode = phaseIdToCode.get(raw.phaseId);
        const phaseName = phaseIdToName.get(raw.phaseId) || '';
        const block = blockIdToDoc.get(raw.blockId);
        if (!phaseCode || !block) {
          batchSkip++;
        } else {
          resolved.push({ raw, phaseCode, phaseName, block });
        }
      }

      skippedCount += batchSkip;

      if (resolved.length === 0) {
        console.log(` all ${batchSkip} records have unresolvable IDs — skipped.`);
        continue;
      }

      // Generate embeddings for the entire resolved batch in one Ollama call
      process.stdout.write(` embedding ${resolved.length}...`);
      const texts = resolved.map(r =>
        buildFFBEmbeddingText(r.raw, r.phaseCode, r.phaseName, r.block)
      );
      const vectors = await embedder.embedDocuments(texts);
      assertVectorCount(`FFB batch ${batchNum}`, texts.length, vectors.length);

      // Build final documents and insert (resolved is non-empty here, so the
      // batch handed to insertMany is never empty).
      const docs = resolved.map((r, idx) =>
        buildFFBDoc(r.raw, r.phaseCode, r.phaseName, r.block, vectors[idx])
      );
      await db.collection('FFB Production').insertMany(docs);
      insertedCount += docs.length;

      const skipNote = batchSkip > 0 ? ` (${batchSkip} skipped)` : '';
      console.log(` ✅ ${docs.length} inserted.${skipNote}`);
    }

    // ── Final summary ───────────────────────────────────────────────────────
    console.log('');
    console.log('═══════════════════════════════════════════════════════');
    console.log(' SEED COMPLETE');
    console.log(` Phases inserted (with vectors): ${phaseDocs.length}`);
    console.log(` Blocks inserted (with vectors): ${blockDocs.length}`);
    console.log(` FFB records inserted: ${insertedCount}`);
    if (skippedCount > 0) {
      console.log(` FFB records skipped: ${skippedCount} (phaseId/blockId not in master)`);
    }
    console.log('═══════════════════════════════════════════════════════');
    console.log('');

  } finally {
    await client.close();
  }
}
|
|
|
+
|
|
|
// Entry point: run the seed and exit non-zero on any failure.
seed().catch((err: unknown) => {
  // Read `message` defensively: a null/undefined rejection reason would
  // otherwise throw inside this handler and hide the real failure.
  const message =
    typeof err === 'object' && err !== null && 'message' in err
      ? (err as { message?: unknown }).message
      : undefined;
  console.error('\n❌ Seed failed:', message || err);
  process.exit(1);
});
|