| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414 |
- import * as dotenv from 'dotenv';
- import * as path from 'path';
- import * as fs from 'fs';
- import { MongoClient, Db } from 'mongodb';
- import { OllamaEmbeddings } from '@langchain/ollama';
// The backend's .env lives in the backend root, one directory above scripts/.
const ENV_FILE_PATH = path.resolve(__dirname, '../.env');
dotenv.config({ path: ENV_FILE_PATH });

// ─── Config ───────────────────────────────────────────────────────────────────
// Connection settings read from the environment; seed() refuses to run when
// either one is missing.
const { MONGO_URI, MONGO_DB_NAME } = process.env;

// Embedding model name; 'bge-large' is used when the variable is unset or empty.
const EMBEDDING_MODEL = process.env.EMBEDDING_MODEL || 'bge-large';

// Fixed address of the Ollama server used for embeddings.
const OLLAMA_BASE_URL = 'http://localhost:11434';

// Number of texts sent to the embedder (and FFB documents inserted) per batch.
const BATCH_SIZE = 50;

// Directory holding the source JSON files: two levels up from scripts/
// (the workspace root), then "mongo stuff/".
const MONGO_STUFF = path.resolve(__dirname, '../../mongo stuff');
// ─── Source JSON interfaces ────────────────────────────────────────────────────

/** One record of phaseData.json, with the field names used by the source file. */
interface RawPhase {
  /** Numeric phase key; stored as `locId` and matched against `RawFFB.phaseId`. */
  phaseID: number;
  /** Phase code, copied unchanged into the Phase document. */
  phaseCode: string;
  /** Phase name; used as the description whenever `phaseDesc` is empty. */
  phaseName: string;
  /** Phase description; preferred source for the document's `description`. */
  phaseDesc: string;
}
/** A numeric value that the source JSON may deliver as a string, a number, or null. */
type RawNumeric = string | number | null;

/**
 * One record of blockData.json, with the field names used by the source file.
 * The misspelled keys (`quater_planted`, `initalPlantedArea`) are the source's
 * own spelling and must stay as they are.
 */
interface RawBlock {
  /** Numeric block key; stored as `locId` and matched against `RawFFB.blockId`. */
  blockID: number;
  blockCode: string;
  blockDesc: string;
  /** Location type; 'BLOCK' is substituted when empty. */
  loc_type: string;
  entry_no: number;
  entry_year: number;
  /** Source spelling (sic); becomes `quarterPlanted`. */
  quater_planted: number;
  month_planted: string;
  /** Tree count, e.g. "13932"; becomes the integer `totalTrees`. */
  numOfTreesPlanted: RawNumeric;
  totalTreeMatured: RawNumeric;
  totalTreeImmatured: RawNumeric;
  totalTreeDead: RawNumeric;
  /** Area, e.g. "88.7300"; becomes the number `plantedArea`. */
  totalPlantedArea: RawNumeric;
  /** Source spelling (sic); becomes `initialPlantedArea`. */
  initalPlantedArea: RawNumeric;
  plantedLocUOM: string;
  /** Soil condition, e.g. "PEAT"; becomes `soilCondition`. */
  loc_soil_condition: string;
}
/** One record of FFBProductionData.json, with the field names used by the source file. */
interface RawFFB {
  /** Activity key; stored as `actEntryNo`. */
  activityId: number;
  /** Date string; converted with `new Date(...)` when the document is built. */
  productionDate: string;
  /** Site identifier; stored as both `prjCode` and `orgnCode`. */
  siteId: string;
  /** References `RawPhase.phaseID`. */
  phaseId: number;
  /** References `RawBlock.blockID`. */
  blockId: number;
  /** Numeric string, e.g. "2.2700"; stored as the number `netWeight`. */
  net_weight: string;
  /** Unit of measure for `net_weight`; stored as `actUom`. */
  act_uom: string;
  /** Bunch count; stored as `noOfBunches`. */
  no_of_bunches: number;
  /** Unit of measure for the bunch quantity; stored as `qtyUom`. */
  qty_uom: string;
}
// ─── Target document interfaces (match NestJS schemas exactly) ────────────────

/** Document written to the `Phase` collection. */
interface PhaseDoc {
  /** Taken from the source `phaseID`. */
  locId: number;
  phaseCode: string;
  /** Source `phaseDesc`, or `phaseName` when the description is empty. */
  description: string;
  /** Set to 'PHASE' by transformPhase. */
  locType: string;
  /** Embedding vector; absent until it is attached just before insertion. */
  vector?: number[];
}
/** Document written to the `Block` collection (camelCase form of a source block record). */
interface BlockDoc {
  /** Taken from the source `blockID`. */
  locId: number;
  blockCode: string;
  blockDesc: string;
  locType: string;
  entryNo: number;
  entryYear: number;
  quarterPlanted: number;
  monthPlanted: string;
  /** Tree counts are stored as integers. */
  totalTrees: number;
  totalMaturedTrees: number;
  totalImmaturedTrees: number;
  totalDeadTrees: number;
  /** Areas keep their fractional part. */
  plantedArea: number;
  initialPlantedArea: number;
  /** Unit of measure for the planted-area figures. */
  plantedLocUOM: string;
  /** Empty string when the source has no soil condition. */
  soilCondition: string;
  /** Embedding vector; absent until it is attached just before insertion. */
  vector?: number[];
}
// ─── Type-cast helpers ────────────────────────────────────────────────────────

/**
 * Converts a loosely typed source value to a number.
 *
 * @param v Value from the source JSON (numeric string, number, null, or undefined).
 * @returns The `parseFloat` result of the value's string form, or 0 when the
 *          value is null/undefined or does not parse to a number.
 */
const toNum = (v: string | number | null | undefined): number => {
  if (v == null) {
    return 0;
  }
  const parsed = parseFloat(String(v));
  // `||` turns NaN (unparseable or empty input) into 0; it also normalizes -0 to 0.
  return parsed || 0;
};
/**
 * Converts a loosely typed source value to the nearest integer.
 *
 * @param v Value from the source JSON (numeric string, number, null, or undefined).
 * @returns `Math.round` of the parsed value, or 0 when the value is
 *          null/undefined or does not parse to a number (never NaN).
 */
const toInt = (v: string | number | null | undefined): number => {
  if (v == null) {
    return 0;
  }
  const rounded = Math.round(parseFloat(String(v)));
  // `||` turns NaN (unparseable or empty input) into 0; it also normalizes -0 to 0.
  return rounded || 0;
};
/**
 * Reads and parses one JSON file from the MONGO_STUFF directory.
 *
 * The parsed value is asserted to be `T`; its shape is not validated at runtime.
 * Errors raised by `fs.readFileSync` (e.g. missing file) or `JSON.parse`
 * (malformed JSON) propagate to the caller.
 *
 * @param filename File name relative to MONGO_STUFF.
 * @returns The parsed JSON content, typed as `T`.
 */
function loadJson<T>(filename: string): T {
  const absolutePath = path.join(MONGO_STUFF, filename);
  const contents = fs.readFileSync(absolutePath, 'utf-8');
  const parsed = JSON.parse(contents);
  return parsed as T;
}
// ─── Transform: phaseData.json → Phase collection schema ─────────────────────

/**
 * Maps a source phase record onto the Phase document shape.
 *
 * `phaseID` becomes `locId`, the description falls back to `phaseName` when
 * `phaseDesc` is empty, and `locType` is always 'PHASE'. No vector is set here.
 *
 * @param raw Record from phaseData.json.
 * @returns The Phase document without its embedding vector.
 */
function transformPhase(raw: RawPhase): PhaseDoc {
  const { phaseID, phaseCode, phaseDesc, phaseName } = raw;
  const doc: PhaseDoc = {
    locId: phaseID,
    phaseCode,
    description: phaseDesc || phaseName,
    locType: 'PHASE',
  };
  return doc;
}
// ─── Transform: blockData.json → Block collection schema ─────────────────────

/**
 * Maps a source block record onto the Block document shape.
 *
 * Source keys (snake_case / mixed case) are renamed to the camelCase schema
 * fields. Counts go through `toInt`, areas through `toNum`, so numeric strings
 * such as "13932" or "88.7300" are stored as native numbers. Empty text fields
 * become '' and an empty `loc_type` becomes 'BLOCK'. No vector is set here.
 *
 * @param raw Record from blockData.json.
 * @returns The Block document without its embedding vector.
 */
function transformBlock(raw: RawBlock): BlockDoc {
  // Whole-number fields.
  const entryNo = toInt(raw.entry_no);
  const entryYear = toInt(raw.entry_year);
  const quarterPlanted = toInt(raw.quater_planted);
  const totalTrees = toInt(raw.numOfTreesPlanted); // "13932" → 13932
  const totalMaturedTrees = toInt(raw.totalTreeMatured);
  const totalImmaturedTrees = toInt(raw.totalTreeImmatured);
  const totalDeadTrees = toInt(raw.totalTreeDead);

  // Fractional fields.
  const plantedArea = toNum(raw.totalPlantedArea); // "88.7300" → 88.73
  const initialPlantedArea = toNum(raw.initalPlantedArea);

  // Property order below is the stored field order; keep it stable.
  const doc: BlockDoc = {
    locId: raw.blockID,
    blockCode: raw.blockCode,
    blockDesc: raw.blockDesc || '',
    locType: raw.loc_type || 'BLOCK',
    entryNo,
    entryYear,
    quarterPlanted,
    monthPlanted: raw.month_planted || '',
    totalTrees,
    totalMaturedTrees,
    totalImmaturedTrees,
    totalDeadTrees,
    plantedArea,
    initialPlantedArea,
    plantedLocUOM: raw.plantedLocUOM || '',
    soilCondition: raw.loc_soil_condition || '', // "PEAT" → soilCondition
  };
  return doc;
}
// ─── Phase embedding text ─────────────────────────────────────────────────────

/**
 * Renders a Phase document as the newline-separated text that gets embedded.
 *
 * @param doc Phase document (its `description` is printed under "Phase Name").
 * @returns The text, trimmed of leading and trailing whitespace.
 */
function buildPhaseEmbeddingText(doc: PhaseDoc): string {
  const lines = [
    'Phase Reference Entry:',
    `Phase Code: ${doc.phaseCode}`,
    `Phase Name: ${doc.description}`,
    `Location Type: ${doc.locType}`,
  ];
  return lines.join('\n').trim();
}
// ─── Block embedding text ─────────────────────────────────────────────────────

/**
 * Renders a Block document as the newline-separated text that gets embedded.
 *
 * An empty description is printed as "No description."; the soil-condition
 * line is added only when `soilCondition` is non-empty.
 *
 * @param doc Block document.
 * @returns The text, trimmed of leading and trailing whitespace.
 */
function buildBlockEmbeddingText(doc: BlockDoc): string {
  const treeBreakdown = `(${doc.totalMaturedTrees} matured, ${doc.totalImmaturedTrees} immature, ${doc.totalDeadTrees} dead)`;
  const lines = [
    'Block Reference Entry:',
    `Block Code: ${doc.blockCode}`,
    `Description: ${doc.blockDesc || 'No description.'}`,
    `Location Type: ${doc.locType}`,
    `Total Trees: ${doc.totalTrees} ${treeBreakdown}`,
    `Planted Area: ${doc.plantedArea} ${doc.plantedLocUOM}`,
    `Entry Year: ${doc.entryYear}, Quarter Planted: ${doc.quarterPlanted}, Month: ${doc.monthPlanted}`,
  ];
  if (doc.soilCondition) {
    lines.push(` Soil Condition: ${doc.soilCondition}`);
  }
  return lines.join('\n').trim();
}
// ─── FFB embedding text (mirrors recordToTextEnriched in ffb-vector.service.ts) ─

/**
 * Renders one FFB production record as the newline-separated text that gets
 * embedded. The layout is intended to match the production vector service, so
 * wording and line order must not drift.
 *
 * The "Environmental Context" line is appended only when the block has a
 * non-empty `soilCondition`. The logistics, observations and issues lines are
 * fixed text.
 *
 * @param raw       Source FFB record.
 * @param phaseCode Code of the phase the record resolved to.
 * @param phaseName Name of that phase.
 * @param block     Block document the record resolved to.
 * @returns The text, trimmed of leading and trailing whitespace.
 */
function buildFFBEmbeddingText(
  raw: RawFFB,
  phaseCode: string,
  phaseName: string,
  block: BlockDoc,
): string {
  const netWeight = toNum(raw.net_weight);
  const lines = [
    'FFB Production Log Entry:',
    `Project Code: ${raw.siteId} | Activity: FFB Harvesting`,
    `Organization: (${raw.siteId})`,
    `Location Details: Phase ${phaseName} (${phaseCode}), Block Code ${block.blockCode}`,
    `Harvest Output: ${raw.no_of_bunches} Bunches, Net Weight: ${netWeight} ${raw.act_uom}`,
    'Logistics: Transported via Truck to Mill',
    'Field Observations: No supervisor remarks recorded.',
    'Operational Issues: No production anomalies reported.',
  ];
  if (block.soilCondition) {
    lines.push(
      `Environmental Context: Cultivated on ${block.soilCondition} soil conditions with a total tree count of ${block.totalTrees || 0} trees.`,
    );
  }
  return lines.join('\n').trim();
}
// ─── FFB Production document builder ─────────────────────────────────────────

/**
 * Assembles the document stored in the `FFB Production` collection for one
 * source record.
 *
 * Fields with no counterpart in the source JSON are filled with fixed
 * placeholders ('' / 0 / null). `phaseName` is stored as both `phaseName` and
 * `phaseDesc`; the block description is stored as both `blockName` and
 * `blockDesc` (null when empty).
 *
 * @param raw       Source FFB record.
 * @param phaseCode Code of the phase the record resolved to.
 * @param phaseName Name of that phase.
 * @param block     Block document the record resolved to.
 * @param vector    Embedding vector for the record.
 * @returns The document to insert.
 */
function buildFFBDoc(
  raw: RawFFB,
  phaseCode: string,
  phaseName: string,
  block: BlockDoc,
  vector: number[],
) {
  const { blockCode, blockDesc, plantedArea, plantedLocUOM } = block;
  const blockLabel = blockDesc || null;
  const netWeight = toNum(raw.net_weight); // "2.2700" → 2.27

  // Property order below is the stored field order; keep it stable.
  const ffbDoc = {
    productionDate: new Date(raw.productionDate),
    prjCode: raw.siteId,
    actCode: 'FFB',
    actName: 'FFB Harvesting',
    entityCode: '',
    orgnId: 0,
    orgnCode: raw.siteId,
    orgnFullName: '',
    orgnAddress: '',
    orgnCompRegNo: '',
    phaseCode,
    phaseName,
    phaseDesc: phaseName,
    blockCode,
    blockName: blockLabel,
    blockDesc: blockLabel,
    truckNo: '',
    millNo: '',
    actEntryNo: raw.activityId,
    actRound: 0,
    weightChitNo: '',
    ownNetWeight: null,
    netWeight,
    actUom: raw.act_uom,
    noOfBunches: raw.no_of_bunches,
    qtyUom: raw.qty_uom,
    docActQty: 0,
    locArea: plantedArea,
    locUom: plantedLocUOM,
    budgetedFfb: null,
    remarks: '',
    issues: null,
    vector,
  };
  return ffbDoc;
}
// ─── Batch embedder helper ────────────────────────────────────────────────────

/**
 * Embeds a list of texts in consecutive chunks of BATCH_SIZE, one
 * `embedDocuments` call per chunk, printing a progress line for each chunk.
 *
 * @param embedder Embedding client.
 * @param texts    Texts to embed; an empty list makes no embedder calls.
 * @param label    Word used in the progress output (e.g. 'phase').
 * @returns All vectors returned by the embedder, concatenated in chunk order.
 */
async function embedInBatches(
  embedder: OllamaEmbeddings,
  texts: string[],
  label: string,
): Promise<number[][]> {
  const batchCount = Math.ceil(texts.length / BATCH_SIZE);
  const collected: number[][] = [];
  let batchNumber = 0;

  for (let start = 0; start < texts.length; start += BATCH_SIZE) {
    batchNumber += 1;
    process.stdout.write(` [${batchNumber}/${batchCount}] embedding ${label}...`);

    const chunk = texts.slice(start, start + BATCH_SIZE);
    const chunkVectors = await embedder.embedDocuments(chunk);
    for (const vec of chunkVectors) {
      collected.push(vec);
    }

    console.log(` ✅ ${chunk.length} done.`);
  }

  return collected;
}
// ─── Main ─────────────────────────────────────────────────────────────────────

/**
 * Rebuilds the `Phase`, `Block` and `FFB Production` collections from the
 * source JSON files, attaching an embedding vector to every document.
 *
 * Steps:
 *  1. Load phaseData.json, blockData.json and FFBProductionData.json.
 *  2. Transform phases and blocks to their document shapes.
 *  3. Build in-memory id → phase/block lookups for resolving FFB records.
 *  4. Connect to MongoDB and create the embedder.
 *  5. Phase / Block: embed everything first, then clear the collection and
 *     insert, so an embedding failure leaves the existing data untouched.
 *  6. FFB Production: clear the collection, then embed and insert in batches
 *     of BATCH_SIZE. Records whose phaseId or blockId cannot be resolved are
 *     skipped and counted.
 *
 * The MongoDB client is closed on every exit path once it has been created.
 *
 * @throws Error when MONGO_URI or MONGO_DB_NAME is not set; file, embedding
 *         and database errors propagate unchanged.
 */
async function seed(): Promise<void> {
  if (!MONGO_URI) throw new Error('MONGO_URI not set in .env');
  if (!MONGO_DB_NAME) throw new Error('MONGO_DB_NAME not set in .env');

  const banner = '═══════════════════════════════════════════════════════';

  console.log('');
  console.log(banner);
  console.log(' RAG Warehouse Seed Script');
  console.log(` DB: ${MONGO_DB_NAME}`);
  console.log(` Model: ${EMBEDDING_MODEL} @ ${OLLAMA_BASE_URL}`);
  console.log(banner);
  console.log('');

  // 1. Load source JSON
  console.log('📂 Loading source JSON files...');
  const rawPhases = loadJson<RawPhase[]>('phaseData.json');
  const rawBlocks = loadJson<RawBlock[]>('blockData.json');
  const rawFFBs = loadJson<RawFFB[]>('FFBProductionData.json');
  console.log(` Phases: ${rawPhases.length}`);
  console.log(` Blocks: ${rawBlocks.length}`);
  console.log(` FFB Activities: ${rawFFBs.length}`);
  console.log('');

  // 2. Transform master data
  console.log('🔄 Transforming master data (field rename + type cast)...');
  const phases = rawPhases.map(transformPhase);
  const blocks = rawBlocks.map(transformBlock);

  // 3. Build in-memory lookup maps — avoids per-record Atlas round trips.
  const phaseIdToCode = new Map<number, string>(rawPhases.map(p => [p.phaseID, p.phaseCode]));
  const phaseIdToName = new Map<number, string>(rawPhases.map(p => [p.phaseID, p.phaseName]));
  // Reuse the already-transformed blocks (locId is the source blockID) instead
  // of transforming every block a second time. These objects are only read
  // afterwards; the documents sent to MongoDB are separate copies.
  const blockIdToDoc = new Map<number, BlockDoc>(blocks.map(b => [b.locId, b]));
  console.log(` Phase lookup map: ${phaseIdToCode.size} entries`);
  console.log(` Block lookup map: ${blockIdToDoc.size} entries`);
  console.log('');

  // 4. Connect to MongoDB Atlas
  console.log('🔗 Connecting to MongoDB Atlas...');
  const client = new MongoClient(MONGO_URI);

  // Everything after client construction runs inside try/finally so the client
  // is closed even when connecting or creating the embedder fails.
  try {
    await client.connect();
    const db: Db = client.db(MONGO_DB_NAME);
    console.log(' Connected.\n');

    // 5. Initialize Ollama embedder (shared across all three collections)
    console.log('🤖 Initializing Ollama embedder...');
    const embedder = new OllamaEmbeddings({
      model: EMBEDDING_MODEL,
      baseUrl: OLLAMA_BASE_URL,
    });
    console.log(' Embedder ready.\n');

    // ── Phase collection ────────────────────────────────────────────────────
    console.log(`📦 Vectorizing ${phases.length} phases...`);
    const phaseTexts = phases.map(buildPhaseEmbeddingText);
    const phaseVectors = await embedInBatches(embedder, phaseTexts, 'phase');
    console.log('🗑️ Phase: clearing...');
    await db.collection('Phase').deleteMany({});
    const phaseDocs = phases.map((p, i) => ({ ...p, vector: phaseVectors[i] }));
    // The driver rejects insertMany([]); with no phases the collection simply
    // stays empty after the clear.
    if (phaseDocs.length > 0) {
      await db.collection('Phase').insertMany(phaseDocs);
    }
    console.log(` ✅ ${phaseDocs.length} phases inserted with vectors.\n`);

    // ── Block collection ────────────────────────────────────────────────────
    console.log(`📦 Vectorizing ${blocks.length} blocks...`);
    const blockTexts = blocks.map(buildBlockEmbeddingText);
    const blockVectors = await embedInBatches(embedder, blockTexts, 'block');
    console.log('🗑️ Block: clearing...');
    await db.collection('Block').deleteMany({});
    const blockDocs = blocks.map((b, i) => ({ ...b, vector: blockVectors[i] }));
    // Same empty-batch guard as for phases.
    if (blockDocs.length > 0) {
      await db.collection('Block').insertMany(blockDocs);
    }
    console.log(` ✅ ${blockDocs.length} blocks inserted with vectors.\n`);

    // ── FFB Production collection ───────────────────────────────────────────
    console.log('🗑️ FFB Production: clearing...');
    await db.collection('FFB Production').deleteMany({});
    console.log(' Collection cleared.\n');

    const totalBatches = Math.ceil(rawFFBs.length / BATCH_SIZE);
    let insertedCount = 0;
    let skippedCount = 0;

    console.log(`📦 Processing ${rawFFBs.length} FFB records in ${totalBatches} batches of ${BATCH_SIZE}...`);
    console.log('');

    for (let i = 0; i < rawFFBs.length; i += BATCH_SIZE) {
      const batch = rawFFBs.slice(i, i + BATCH_SIZE);
      const batchNum = Math.floor(i / BATCH_SIZE) + 1;
      process.stdout.write(` [${batchNum}/${totalBatches}] resolving...`);

      // Resolve phaseCode/block for each record using the in-memory maps.
      const resolved: Array<{ raw: RawFFB; phaseCode: string; phaseName: string; block: BlockDoc }> = [];
      let batchSkip = 0;
      for (const raw of batch) {
        const phaseCode = phaseIdToCode.get(raw.phaseId);
        const phaseName = phaseIdToName.get(raw.phaseId) || '';
        const block = blockIdToDoc.get(raw.blockId);
        if (!phaseCode || !block) {
          batchSkip++;
        } else {
          resolved.push({ raw, phaseCode, phaseName, block });
        }
      }
      skippedCount += batchSkip;

      // Nothing to embed or insert for this batch (also avoids insertMany([])).
      if (resolved.length === 0) {
        console.log(` all ${batchSkip} records have unresolvable IDs — skipped.`);
        continue;
      }

      // Generate embeddings for the entire resolved batch in one Ollama call.
      process.stdout.write(` embedding ${resolved.length}...`);
      const texts = resolved.map(r =>
        buildFFBEmbeddingText(r.raw, r.phaseCode, r.phaseName, r.block)
      );
      const vectors = await embedder.embedDocuments(texts);

      // Build final documents and insert.
      const docs = resolved.map((r, idx) =>
        buildFFBDoc(r.raw, r.phaseCode, r.phaseName, r.block, vectors[idx])
      );
      await db.collection('FFB Production').insertMany(docs);
      insertedCount += docs.length;

      const skipNote = batchSkip > 0 ? ` (${batchSkip} skipped)` : '';
      console.log(` ✅ ${docs.length} inserted.${skipNote}`);
    }

    // ── Final summary ───────────────────────────────────────────────────────
    console.log('');
    console.log(banner);
    console.log(' SEED COMPLETE');
    console.log(` Phases inserted (with vectors): ${phaseDocs.length}`);
    console.log(` Blocks inserted (with vectors): ${blockDocs.length}`);
    console.log(` FFB records inserted: ${insertedCount}`);
    if (skippedCount > 0) {
      console.log(` FFB records skipped: ${skippedCount} (phaseId/blockId not in master)`);
    }
    console.log(banner);
    console.log('');
  } finally {
    await client.close();
  }
}
// Script entry point: run the seed and, on any failure, print the reason
// (the error's message when it has one, otherwise the thrown value itself)
// and exit with status 1.
seed().catch((err) => {
  const reason = err.message || err;
  console.error('\n❌ Seed failed:', reason);
  process.exit(1);
});
|