seed-rag-warehouse.ts 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414
  1. import * as dotenv from 'dotenv';
  2. import * as path from 'path';
  3. import * as fs from 'fs';
  4. import { MongoClient, Db } from 'mongodb';
  5. import { OllamaEmbeddings } from '@langchain/ollama';
  // Load .env from the backend root (one level above scripts/) before any
  // process.env value is read below.
  dotenv.config({ path: path.resolve(__dirname, '../.env') });

  // ─── Config ───────────────────────────────────────────────────────────────────
  // MongoDB connection settings. Both may be undefined here; seed() checks them
  // before connecting.
  const MONGO_URI = process.env.MONGO_URI;
  const MONGO_DB_NAME = process.env.MONGO_DB_NAME;
  // Name of the embedding model requested from Ollama.
  const EMBEDDING_MODEL = process.env.EMBEDDING_MODEL || 'bge-large';
  // Ollama endpoint. Overridable through OLLAMA_BASE_URL so the script can target
  // a non-local Ollama host; falls back to the previous hard-coded address.
  const OLLAMA_BASE_URL = process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
  // Number of texts sent to the embedder (and documents inserted) per batch.
  const BATCH_SIZE = 50;
  // Two levels up from scripts/ → workspace root → "mongo stuff" directory that
  // holds the source JSON files.
  const MONGO_STUFF = path.resolve(__dirname, '../../mongo stuff');
  16. // ─── Source JSON interfaces ────────────────────────────────────────────────────
  /** One record of phaseData.json, with the field names used by the source file. */
  interface RawPhase {
    /** Numeric phase key; FFB records are matched to a phase through it. */
    phaseID: number;
    /** Short phase code copied unchanged into the Phase document. */
    phaseCode: string;
    /** Phase name; used as the description when `phaseDesc` is empty. */
    phaseName: string;
    /** Phase description; preferred source for the Phase document description. */
    phaseDesc: string;
  }
  /**
   * One record of blockData.json, with the field names used by the source file.
   * Several names are misspelled in the source data (`quater_planted`,
   * `initalPlantedArea`); they are kept as-is here and renamed by transformBlock.
   */
  interface RawBlock {
    /** Numeric block key; FFB records are matched to a block through it. */
    blockID: number;
    blockCode: string;
    blockDesc: string;
    /** Location type; transformBlock falls back to 'BLOCK' when empty. */
    loc_type: string;
    entry_no: number;
    entry_year: number;
    /** Planting quarter (source spelling). */
    quater_planted: number;
    month_planted: string;

    // Tree counts: may be numeric strings (e.g. "13932"), numbers, or null.
    numOfTreesPlanted: string | number | null;
    totalTreeMatured: string | number | null;
    totalTreeImmatured: string | number | null;
    totalTreeDead: string | number | null;

    // Areas: may be decimal strings (e.g. "88.7300"), numbers, or null.
    totalPlantedArea: string | number | null;
    /** Initial planted area (source spelling). */
    initalPlantedArea: string | number | null;
    /** Unit of measure that accompanies the planted-area values. */
    plantedLocUOM: string;

    /** Soil condition label, e.g. "PEAT". */
    loc_soil_condition: string;
  }
  /** One record of FFBProductionData.json, with the source file's field names. */
  interface RawFFB {
    /** Activity identifier; stored on the target document as `actEntryNo`. */
    activityId: number;
    /** Production date as a string; passed to `new Date(...)` when the document is built. */
    productionDate: string;
    /** Site identifier; reused as both project code and organisation code. */
    siteId: string;
    /** Reference to RawPhase.phaseID. */
    phaseId: number;
    /** Reference to RawBlock.blockID. */
    blockId: number;
    /** Net weight as a numeric string (e.g. "2.2700"). */
    net_weight: string;
    /** Unit of measure for `net_weight`. */
    act_uom: string;
    no_of_bunches: number;
    /** Unit of measure for the bunch quantity. */
    qty_uom: string;
  }
  52. // ─── Target document interfaces (match NestJS schemas exactly) ────────────────
  /** Shape of a document written to the 'Phase' collection. */
  interface PhaseDoc {
    /** Taken from the source `phaseID`. */
    locId: number;
    phaseCode: string;
    /** Source `phaseDesc`, or `phaseName` when the description is empty. */
    description: string;
    /** Always 'PHASE' for documents produced by transformPhase. */
    locType: string;
    /** Embedding vector; absent until it is attached just before insert. */
    vector?: number[];
  }
  /** Shape of a document written to the 'Block' collection (camelCase field names). */
  interface BlockDoc {
    /** Taken from the source `blockID`. */
    locId: number;
    blockCode: string;
    blockDesc: string;
    locType: string;

    // Planting record.
    entryNo: number;
    entryYear: number;
    quarterPlanted: number;
    monthPlanted: string;

    // Tree counts, already cast to integers.
    totalTrees: number;
    totalMaturedTrees: number;
    totalImmaturedTrees: number;
    totalDeadTrees: number;

    // Areas, already cast to numbers, plus their unit of measure.
    plantedArea: number;
    initialPlantedArea: number;
    plantedLocUOM: string;

    /** Soil condition label; an empty string when the source has none. */
    soilCondition: string;
    /** Embedding vector; absent until it is attached just before insert. */
    vector?: number[];
  }
  79. // ─── Type-cast helpers ────────────────────────────────────────────────────────
  /**
   * Casts a raw JSON value to a finite number.
   *
   * @param v - Numeric string (e.g. "88.7300"), number, null, or undefined.
   * @returns The parsed value, or 0 when the input is null/undefined, cannot be
   *          parsed, or parses to a non-finite value (NaN, ±Infinity).
   */
  const toNum = (v: string | number | null | undefined): number => {
    if (v == null) return 0;
    const parsed = parseFloat(String(v));
    // `parsed !== 0` also maps -0 to +0, matching the previous `|| 0` fallback.
    return Number.isFinite(parsed) && parsed !== 0 ? parsed : 0;
  };
  /**
   * Casts a raw JSON value to an integer by rounding to the nearest whole number.
   *
   * @param v - Numeric string (e.g. "13932"), number, null, or undefined.
   * @returns The rounded value, or 0 when the input is null/undefined, cannot be
   *          parsed (e.g. an empty string), or is non-finite (NaN, ±Infinity).
   */
  const toInt = (v: string | number | null | undefined): number => {
    if (v == null) return 0;
    const rounded = Math.round(parseFloat(String(v)));
    // `rounded !== 0` also maps -0 to +0, matching the previous `|| 0` fallback.
    return Number.isFinite(rounded) && rounded !== 0 ? rounded : 0;
  };
  /**
   * Reads and parses one JSON file from the MONGO_STUFF directory.
   *
   * The parsed value is cast to `T` without runtime validation, so the caller
   * is responsible for the file actually having that shape.
   *
   * @param filename - File name relative to MONGO_STUFF.
   * @returns The parsed JSON content, typed as `T`.
   * @throws Error when the file cannot be read, or when its content is not
   *         valid JSON (the message then names the offending file).
   */
  function loadJson<T>(filename: string): T {
    const filePath = path.join(MONGO_STUFF, filename);
    const contents = fs.readFileSync(filePath, 'utf-8');
    try {
      return JSON.parse(contents) as T;
    } catch (err) {
      // JSON.parse errors carry no file name; add it so a bad source file can be
      // identified when several files are loaded back to back.
      const detail = err instanceof Error ? err.message : String(err);
      throw new Error(`Invalid JSON in ${filePath}: ${detail}`);
    }
  }
  // ─── Transform: phaseData.json → Phase collection schema ─────────────────────
  /**
   * Converts a raw phase record into a Phase document.
   *
   * Field mapping: phaseID → locId, phaseDesc → description (falling back to
   * phaseName when the description is empty), and locType is always 'PHASE'.
   *
   * @param raw - Record as read from phaseData.json.
   * @returns The Phase document, without an embedding vector.
   */
  function transformPhase(raw: RawPhase): PhaseDoc {
    const { phaseID, phaseCode, phaseDesc, phaseName } = raw;
    const description = phaseDesc || phaseName;
    return { locId: phaseID, phaseCode, description, locType: 'PHASE' };
  }
  // ─── Transform: blockData.json → Block collection schema ─────────────────────
  /**
   * Converts a raw block record into a Block document.
   *
   * Renames the snake_case / mixed-case source fields to the camelCase schema
   * fields and casts numeric strings (e.g. "13932", "88.7300") to numbers.
   * Missing strings become '' and a missing location type becomes 'BLOCK'.
   *
   * @param raw - Record as read from blockData.json.
   * @returns The Block document, without an embedding vector.
   */
  function transformBlock(raw: RawBlock): BlockDoc {
    // Tree counts are whole numbers: "13932" → 13932.
    const totalTrees = toInt(raw.numOfTreesPlanted);
    const totalMaturedTrees = toInt(raw.totalTreeMatured);
    const totalImmaturedTrees = toInt(raw.totalTreeImmatured);
    const totalDeadTrees = toInt(raw.totalTreeDead);

    // Areas keep their decimals: "88.7300" → 88.73.
    const plantedArea = toNum(raw.totalPlantedArea);
    const initialPlantedArea = toNum(raw.initalPlantedArea);

    const doc: BlockDoc = {
      locId: raw.blockID,
      blockCode: raw.blockCode,
      blockDesc: raw.blockDesc || '',
      locType: raw.loc_type || 'BLOCK',
      entryNo: toInt(raw.entry_no),
      entryYear: toInt(raw.entry_year),
      quarterPlanted: toInt(raw.quater_planted),
      monthPlanted: raw.month_planted || '',
      totalTrees,
      totalMaturedTrees,
      totalImmaturedTrees,
      totalDeadTrees,
      plantedArea,
      initialPlantedArea,
      plantedLocUOM: raw.plantedLocUOM || '',
      soilCondition: raw.loc_soil_condition || '',
    };
    return doc;
  }
  // ─── Phase embedding text ─────────────────────────────────────────────────────
  /**
   * Renders a Phase document as the multi-line text that gets embedded.
   *
   * Note that the "Phase Name" line is filled from `doc.description`.
   *
   * @param doc - Phase document to describe.
   * @returns Newline-separated text, trimmed of leading/trailing whitespace.
   */
  function buildPhaseEmbeddingText(doc: PhaseDoc): string {
    const lines = [
      'Phase Reference Entry:',
      `Phase Code: ${doc.phaseCode}`,
      `Phase Name: ${doc.description}`,
      `Location Type: ${doc.locType}`,
    ];
    return lines.join('\n').trim();
  }
  // ─── Block embedding text ─────────────────────────────────────────────────────
  /**
   * Renders a Block document as the multi-line text that gets embedded.
   *
   * An empty description is rendered as "No description.", and the soil
   * condition line is appended only when a soil condition is present.
   *
   * @param doc - Block document to describe.
   * @returns Newline-separated text, trimmed of leading/trailing whitespace.
   */
  function buildBlockEmbeddingText(doc: BlockDoc): string {
    const treeBreakdown =
      `${doc.totalMaturedTrees} matured, ${doc.totalImmaturedTrees} immature, ${doc.totalDeadTrees} dead`;
    const lines = [
      'Block Reference Entry:',
      `Block Code: ${doc.blockCode}`,
      `Description: ${doc.blockDesc || 'No description.'}`,
      `Location Type: ${doc.locType}`,
      `Total Trees: ${doc.totalTrees} (${treeBreakdown})`,
      `Planted Area: ${doc.plantedArea} ${doc.plantedLocUOM}`,
      `Entry Year: ${doc.entryYear}, Quarter Planted: ${doc.quarterPlanted}, Month: ${doc.monthPlanted}`,
    ];
    if (doc.soilCondition) {
      // The leading space is part of the emitted text.
      lines.push(` Soil Condition: ${doc.soilCondition}`);
    }
    return lines.join('\n').trim();
  }
  // ─── FFB embedding text (mirrors recordToTextEnriched in ffb-vector.service.ts) ─
  /**
   * Renders one FFB production record as the multi-line text that gets embedded.
   *
   * The "Environmental Context" line is appended only when the block has a soil
   * condition; per the original author's note this matches the production
   * vector service.
   *
   * @param raw - Source FFB record.
   * @param phaseCode - Code of the phase the record belongs to.
   * @param phaseName - Name of that phase.
   * @param block - Block document the record belongs to.
   * @returns Newline-separated text, trimmed of leading/trailing whitespace.
   */
  function buildFFBEmbeddingText(
    raw: RawFFB,
    phaseCode: string,
    phaseName: string,
    block: BlockDoc,
  ): string {
    const netWeight = toNum(raw.net_weight);
    const lines = [
      'FFB Production Log Entry:',
      `Project Code: ${raw.siteId} | Activity: FFB Harvesting`,
      `Organization: (${raw.siteId})`,
      `Location Details: Phase ${phaseName} (${phaseCode}), Block Code ${block.blockCode}`,
      `Harvest Output: ${raw.no_of_bunches} Bunches, Net Weight: ${netWeight} ${raw.act_uom}`,
      'Logistics: Transported via Truck to Mill',
      'Field Observations: No supervisor remarks recorded.',
      'Operational Issues: No production anomalies reported.',
    ];
    if (block.soilCondition) {
      lines.push(
        `Environmental Context: Cultivated on ${block.soilCondition} soil conditions with a total tree count of ${block.totalTrees || 0} trees.`,
      );
    }
    return lines.join('\n').trim();
  }
  // ─── FFB Production document builder ─────────────────────────────────────────
  /**
   * Assembles the document stored in the 'FFB Production' collection for one
   * source record.
   *
   * Fields with no counterpart in the source data are filled with fixed
   * placeholders ('' / 0 / null). Property order is kept stable because it is
   * the order in which the fields are written to the document.
   *
   * @param raw - Source FFB record.
   * @param phaseCode - Code of the phase the record belongs to.
   * @param phaseName - Name of that phase; also stored as `phaseDesc`.
   * @param block - Block document the record belongs to.
   * @param vector - Embedding vector for this record.
   * @returns The document to insert.
   */
  function buildFFBDoc(
    raw: RawFFB,
    phaseCode: string,
    phaseName: string,
    block: BlockDoc,
    vector: number[],
  ) {
    const siteCode = raw.siteId;
    // An empty block description is stored as null rather than ''.
    const blockLabel = block.blockDesc || null;
    // "2.2700" → 2.27
    const netWeight = toNum(raw.net_weight);

    return {
      productionDate: new Date(raw.productionDate),
      // Project / activity
      prjCode: siteCode,
      actCode: 'FFB',
      actName: 'FFB Harvesting',
      // Organisation (only the code is available from the source data)
      entityCode: '',
      orgnId: 0,
      orgnCode: siteCode,
      orgnFullName: '',
      orgnAddress: '',
      orgnCompRegNo: '',
      // Location
      phaseCode,
      phaseName,
      phaseDesc: phaseName,
      blockCode: block.blockCode,
      blockName: blockLabel,
      blockDesc: blockLabel,
      // Logistics
      truckNo: '',
      millNo: '',
      actEntryNo: raw.activityId,
      actRound: 0,
      weightChitNo: '',
      // Quantities
      ownNetWeight: null,
      netWeight,
      actUom: raw.act_uom,
      noOfBunches: raw.no_of_bunches,
      qtyUom: raw.qty_uom,
      docActQty: 0,
      locArea: block.plantedArea,
      locUom: block.plantedLocUOM,
      budgetedFfb: null,
      // Free-text fields
      remarks: '',
      issues: null,
      vector,
    };
  }
  // ─── Batch embedder helper ────────────────────────────────────────────────────
  /**
   * Embeds a list of texts in consecutive chunks of BATCH_SIZE, one
   * `embedDocuments` call per chunk, printing a progress line for each chunk.
   *
   * @param embedder - Embedder used for every chunk.
   * @param texts - Texts to embed, in order.
   * @param label - Noun shown in the progress output (e.g. 'phase').
   * @returns The vectors returned by the embedder, concatenated in chunk order.
   */
  async function embedInBatches(
    embedder: OllamaEmbeddings,
    texts: string[],
    label: string,
  ): Promise<number[][]> {
    const batchCount = Math.ceil(texts.length / BATCH_SIZE);
    const vectors: number[][] = [];
    let offset = 0;
    let batchIndex = 1;
    while (offset < texts.length) {
      process.stdout.write(` [${batchIndex}/${batchCount}] embedding ${label}...`);
      const chunk = texts.slice(offset, offset + BATCH_SIZE);
      const embedded = await embedder.embedDocuments(chunk);
      for (const vector of embedded) {
        vectors.push(vector);
      }
      console.log(` ✅ ${chunk.length} done.`);
      offset += BATCH_SIZE;
      batchIndex += 1;
    }
    return vectors;
  }
  // ─── Main ─────────────────────────────────────────────────────────────────────
  /**
   * Rebuilds the 'Phase', 'Block' and 'FFB Production' collections from the
   * source JSON files, attaching an embedding vector to every document.
   *
   * Steps:
   *  1. Load phaseData.json, blockData.json and FFBProductionData.json.
   *  2. Transform phases and blocks to their collection shapes.
   *  3. Build in-memory lookup maps (phase id → code/name, block id → document).
   *  4. Connect to MongoDB and create the embedder.
   *  5. For Phase and Block: embed, clear the collection, insert.
   *  6. For FFB Production: clear the collection, then resolve, embed and insert
   *     in batches of BATCH_SIZE. Records whose phaseId/blockId is not present
   *     in the master data are skipped and counted.
   *
   * Each collection is cleared before its new documents are inserted, so a
   * failure part-way through leaves that collection empty or partially filled.
   *
   * @throws Error when MONGO_URI or MONGO_DB_NAME is not set; errors from file
   *         loading, the embedder, or MongoDB propagate to the caller.
   */
  async function seed() {
    if (!MONGO_URI) throw new Error('MONGO_URI not set in .env');
    if (!MONGO_DB_NAME) throw new Error('MONGO_DB_NAME not set in .env');

    const rule = '═══════════════════════════════════════════════════════';

    console.log('');
    console.log(rule);
    console.log(' RAG Warehouse Seed Script');
    console.log(` DB: ${MONGO_DB_NAME}`);
    console.log(` Model: ${EMBEDDING_MODEL} @ ${OLLAMA_BASE_URL}`);
    console.log(rule);
    console.log('');

    // 1. Load source JSON
    console.log('📂 Loading source JSON files...');
    const rawPhases = loadJson<RawPhase[]>('phaseData.json');
    const rawBlocks = loadJson<RawBlock[]>('blockData.json');
    const rawFFBs = loadJson<RawFFB[]>('FFBProductionData.json');
    console.log(` Phases: ${rawPhases.length}`);
    console.log(` Blocks: ${rawBlocks.length}`);
    console.log(` FFB Activities: ${rawFFBs.length}`);
    console.log('');

    // 2. Transform master data
    console.log('🔄 Transforming master data (field rename + type cast)...');
    const phases = rawPhases.map(transformPhase);
    const blocks = rawBlocks.map(transformBlock);

    // 3. Build in-memory lookup maps — avoids per-record Atlas round trips.
    const phaseIdToCode = new Map<number, string>(rawPhases.map(p => [p.phaseID, p.phaseCode]));
    const phaseIdToName = new Map<number, string>(rawPhases.map(p => [p.phaseID, p.phaseName]));
    // Reuse the already-transformed blocks (locId is the source blockID) instead
    // of transforming every block a second time.
    const blockIdToDoc = new Map<number, BlockDoc>(blocks.map(b => [b.locId, b]));
    console.log(` Phase lookup map: ${phaseIdToCode.size} entries`);
    console.log(` Block lookup map: ${blockIdToDoc.size} entries`);
    console.log('');

    // 4. Connect to MongoDB Atlas. connect() runs inside the try so the client
    //    is closed even when connecting or embedder setup fails.
    console.log('🔗 Connecting to MongoDB Atlas...');
    const client = new MongoClient(MONGO_URI);
    try {
      await client.connect();
      const db: Db = client.db(MONGO_DB_NAME);
      console.log(' Connected.\n');

      // 5. Initialize Ollama embedder (shared across all three collections)
      console.log('🤖 Initializing Ollama embedder...');
      const embedder = new OllamaEmbeddings({
        model: EMBEDDING_MODEL,
        baseUrl: OLLAMA_BASE_URL,
      });
      console.log(' Embedder ready.\n');

      // ── Phase collection ────────────────────────────────────────────────────
      console.log(`📦 Vectorizing ${phases.length} phases...`);
      const phaseTexts = phases.map(buildPhaseEmbeddingText);
      const phaseVectors = await embedInBatches(embedder, phaseTexts, 'phase');
      console.log('🗑️ Phase: clearing...');
      await db.collection('Phase').deleteMany({});
      const phaseDocs = phases.map((p, i) => ({ ...p, vector: phaseVectors[i] }));
      // insertMany rejects an empty batch, so skip it when there is nothing to write.
      if (phaseDocs.length > 0) {
        await db.collection('Phase').insertMany(phaseDocs);
      }
      console.log(` ✅ ${phaseDocs.length} phases inserted with vectors.\n`);

      // ── Block collection ────────────────────────────────────────────────────
      console.log(`📦 Vectorizing ${blocks.length} blocks...`);
      const blockTexts = blocks.map(buildBlockEmbeddingText);
      const blockVectors = await embedInBatches(embedder, blockTexts, 'block');
      console.log('🗑️ Block: clearing...');
      await db.collection('Block').deleteMany({});
      const blockDocs = blocks.map((b, i) => ({ ...b, vector: blockVectors[i] }));
      if (blockDocs.length > 0) {
        await db.collection('Block').insertMany(blockDocs);
      }
      console.log(` ✅ ${blockDocs.length} blocks inserted with vectors.\n`);

      // ── FFB Production collection ───────────────────────────────────────────
      console.log('🗑️ FFB Production: clearing...');
      await db.collection('FFB Production').deleteMany({});
      console.log(' Collection cleared.\n');

      const totalBatches = Math.ceil(rawFFBs.length / BATCH_SIZE);
      let insertedCount = 0;
      let skippedCount = 0;
      console.log(`📦 Processing ${rawFFBs.length} FFB records in ${totalBatches} batches of ${BATCH_SIZE}...`);
      console.log('');

      for (let i = 0; i < rawFFBs.length; i += BATCH_SIZE) {
        const batch = rawFFBs.slice(i, i + BATCH_SIZE);
        const batchNum = Math.floor(i / BATCH_SIZE) + 1;
        process.stdout.write(` [${batchNum}/${totalBatches}] resolving...`);

        // Resolve phaseCode/block for each record using the in-memory maps.
        const resolved: Array<{ raw: RawFFB; phaseCode: string; phaseName: string; block: BlockDoc }> = [];
        let batchSkip = 0;
        for (const raw of batch) {
          const phaseCode = phaseIdToCode.get(raw.phaseId);
          const phaseName = phaseIdToName.get(raw.phaseId) || '';
          const block = blockIdToDoc.get(raw.blockId);
          if (!phaseCode || !block) {
            // Unknown (or empty-coded) phase, or unknown block: cannot be placed.
            batchSkip++;
          } else {
            resolved.push({ raw, phaseCode, phaseName, block });
          }
        }
        skippedCount += batchSkip;

        if (resolved.length === 0) {
          console.log(` all ${batchSkip} records have unresolvable IDs — skipped.`);
          continue;
        }

        // Generate embeddings for the entire resolved batch in one Ollama call.
        process.stdout.write(` embedding ${resolved.length}...`);
        const texts = resolved.map(r =>
          buildFFBEmbeddingText(r.raw, r.phaseCode, r.phaseName, r.block)
        );
        const vectors = await embedder.embedDocuments(texts);

        // Build final documents and insert (resolved is non-empty here).
        const docs = resolved.map((r, idx) =>
          buildFFBDoc(r.raw, r.phaseCode, r.phaseName, r.block, vectors[idx])
        );
        await db.collection('FFB Production').insertMany(docs);
        insertedCount += docs.length;

        const skipNote = batchSkip > 0 ? ` (${batchSkip} skipped)` : '';
        console.log(` ✅ ${docs.length} inserted.${skipNote}`);
      }

      // ── Final summary ───────────────────────────────────────────────────────
      console.log('');
      console.log(rule);
      console.log(' SEED COMPLETE');
      console.log(` Phases inserted (with vectors): ${phaseDocs.length}`);
      console.log(` Blocks inserted (with vectors): ${blockDocs.length}`);
      console.log(` FFB records inserted: ${insertedCount}`);
      if (skippedCount > 0) {
        console.log(` FFB records skipped: ${skippedCount} (phaseId/blockId not in master)`);
      }
      console.log(rule);
      console.log('');
    } finally {
      await client.close();
    }
  }
  // Script entry point: run the seed and, on any failure, report it on stderr
  // and exit with status 1.
  seed().catch(reason => {
    // Prefer the error's message; fall back to the raw rejection value when the
    // message is missing or empty.
    const detail = reason.message || reason;
    console.error('\n❌ Seed failed:', detail);
    process.exit(1);
  });