seed-ffb-production.ts 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. /**
  2. * Seeds FFBProductionData_processed.json into the 'FFB Production' collection.
  3. *
  4. * For each record:
  5. * - Verifies Phase (by locId) and Block (by locId) exist in Atlas
  6. * - Skips the record if either is missing
  7. * - Resolves their MongoDB _id and embeds into phase.id / block.id
  8. * - Generates a 1024-dim vector via Ollama bge-large
  9. * - Inserts into 'FFB Production'
  10. *
  11. * NOTE: Site collection not yet seeded — site.siteId kept as plain string for now.
  12. *
  13. * Run: npx ts-node --transpile-only scripts/seed-ffb-production.ts
  14. */
  15. import * as dotenv from 'dotenv';
  16. import * as path from 'path';
  17. import * as fs from 'fs';
  18. import { MongoClient, ObjectId, Db } from 'mongodb';
  19. import { OllamaEmbeddings } from '@langchain/ollama';
  20. dotenv.config({ path: path.resolve(__dirname, '../.env') });
  21. const MONGO_URI = process.env.MONGO_URI!;
  22. const MONGO_DB_NAME = process.env.MONGO_DB_NAME!;
  23. const EMBEDDING_MODEL = process.env.EMBEDDING_MODEL || 'bge-large';
  24. const OLLAMA_BASE_URL = 'http://localhost:11434';
  25. const BATCH_SIZE = 20;
  26. const MONGO_STUFF = path.resolve(__dirname, '../../mongo stuff');
  27. // ─── Processed record shape (matches FFBProductionData_processed.json) ────────
  28. interface ProcessedFFB {
  29. activityId: number;
  30. productionDate: string;
  31. site: { siteId: string };
  32. phase: { phaseId: number };
  33. block: { blockId: number };
  34. weight: number;
  35. weightUom: string;
  36. quantity: number;
  37. quantityUom: string;
  38. remarks: string;
  39. vector: number[];
  40. }
  41. // ─── Retry wrapper for Ollama (handles transient connection drops) ────────────
  42. async function embedWithRetry(
  43. embedder: OllamaEmbeddings,
  44. texts: string[],
  45. retries = 3,
  46. delayMs = 3000,
  47. ): Promise<number[][]> {
  48. for (let attempt = 1; attempt <= retries; attempt++) {
  49. try {
  50. return await embedder.embedDocuments(texts);
  51. } catch (err: any) {
  52. if (attempt === retries) throw err;
  53. console.log(`\n ⚠️ Ollama error (attempt ${attempt}/${retries}): ${err.message} — retrying in ${delayMs / 1000}s...`);
  54. await new Promise(r => setTimeout(r, delayMs));
  55. }
  56. }
  57. throw new Error('embedWithRetry exhausted');
  58. }
  59. // ─── Embedding text builder ───────────────────────────────────────────────────
  60. function buildEmbeddingText(r: ProcessedFFB): string {
  61. const date = new Date(r.productionDate).toISOString().split('T')[0];
  62. let text = `FFB Production Entry
  63. Date: ${date}
  64. Site: ${r.site.siteId} | Phase ID: ${r.phase.phaseId} | Block ID: ${r.block.blockId}
  65. Harvest: ${r.quantity} ${r.quantityUom}, Weight: ${r.weight} ${r.weightUom}`;
  66. if (r.remarks) text += `\n Remarks: ${r.remarks}`;
  67. return text.trim();
  68. }
  69. // ─── Main ─────────────────────────────────────────────────────────────────────
  70. async function seed() {
  71. if (!MONGO_URI) throw new Error('MONGO_URI not set in .env');
  72. if (!MONGO_DB_NAME) throw new Error('MONGO_DB_NAME not set in .env');
  73. console.log('\n═══════════════════════════════════════════════════════');
  74. console.log(' Seed: FFB Production');
  75. console.log(` DB: ${MONGO_DB_NAME}`);
  76. console.log(` Model: ${EMBEDDING_MODEL} @ ${OLLAMA_BASE_URL}`);
  77. console.log('═══════════════════════════════════════════════════════\n');
  78. // Load processed JSON
  79. console.log('📂 Loading FFBProductionData_processed.json...');
  80. const records: ProcessedFFB[] = JSON.parse(
  81. fs.readFileSync(path.join(MONGO_STUFF, 'FFBProductionData_processed.json'), 'utf-8')
  82. );
  83. console.log(` ${records.length} records loaded.\n`);
  84. // Connect to Atlas
  85. console.log('🔗 Connecting to MongoDB Atlas...');
  86. const client = new MongoClient(MONGO_URI);
  87. await client.connect();
  88. const db: Db = client.db(MONGO_DB_NAME);
  89. console.log(' Connected.\n');
  90. // Load Phase and Block reference maps from Atlas (locId → _id)
  91. console.log('📋 Loading Phase and Block reference maps from Atlas...');
  92. const phaseDocs = await db.collection('Phase').find({}, { projection: { _id: 1, locId: 1 } }).toArray();
  93. const blockDocs = await db.collection('Block').find({}, { projection: { _id: 1, locId: 1 } }).toArray();
  94. const phaseLocIdToId = new Map<number, ObjectId>(
  95. phaseDocs.map(d => [d.locId as number, d._id as ObjectId])
  96. );
  97. const blockLocIdToId = new Map<number, ObjectId>(
  98. blockDocs.map(d => [d.locId as number, d._id as ObjectId])
  99. );
  100. console.log(` Phases in Atlas : ${phaseLocIdToId.size}`);
  101. console.log(` Blocks in Atlas : ${blockLocIdToId.size}\n`);
  102. // Initialize embedder
  103. console.log('🤖 Initializing Ollama embedder...');
  104. const embedder = new OllamaEmbeddings({ model: EMBEDDING_MODEL, baseUrl: OLLAMA_BASE_URL });
  105. console.log(' Ready.\n');
  106. try {
  107. // Wipe existing FFB Production collection
  108. console.log('🗑️ Clearing FFB Production collection...');
  109. await db.collection('FFB Production').deleteMany({});
  110. console.log(' Cleared.\n');
  111. const totalBatches = Math.ceil(records.length / BATCH_SIZE);
  112. let inserted = 0;
  113. let skipped = 0;
  114. console.log(`📦 Processing ${records.length} records in ${totalBatches} batches of ${BATCH_SIZE}...\n`);
  115. for (let i = 0; i < records.length; i += BATCH_SIZE) {
  116. const batch = records.slice(i, i + BATCH_SIZE);
  117. const batchNum = Math.floor(i / BATCH_SIZE) + 1;
  118. process.stdout.write(` [${batchNum}/${totalBatches}] validating...`);
  119. // Validate Phase and Block existence; resolve ObjectIds
  120. const valid: Array<{ record: ProcessedFFB; phaseId: ObjectId; blockId: ObjectId }> = [];
  121. for (const r of batch) {
  122. const phaseObjectId = phaseLocIdToId.get(r.phase.phaseId);
  123. const blockObjectId = blockLocIdToId.get(r.block.blockId);
  124. if (!phaseObjectId || !blockObjectId) {
  125. skipped++;
  126. continue;
  127. }
  128. valid.push({ record: r, phaseId: phaseObjectId, blockId: blockObjectId });
  129. }
  130. if (valid.length === 0) {
  131. console.log(` all ${batch.length - valid.length} skipped (missing Phase/Block).`);
  132. continue;
  133. }
  134. // Generate embeddings for the valid batch
  135. process.stdout.write(` embedding ${valid.length}...`);
  136. const texts = valid.map(v => buildEmbeddingText(v.record));
  137. const vectors = await embedWithRetry(embedder, texts);
  138. // Build final documents with resolved ObjectIds
  139. const docs = valid.map((v, idx) => ({
  140. activityId: v.record.activityId,
  141. productionDate: new Date(v.record.productionDate),
  142. site: {
  143. siteId: v.record.site.siteId,
  144. },
  145. phase: {
  146. id: v.phaseId, // ObjectId ref to Phase._id
  147. phaseId: v.record.phase.phaseId,
  148. },
  149. block: {
  150. id: v.blockId, // ObjectId ref to Block._id
  151. blockId: v.record.block.blockId,
  152. },
  153. weight: v.record.weight,
  154. weightUom: v.record.weightUom,
  155. quantity: v.record.quantity,
  156. quantityUom: v.record.quantityUom,
  157. remarks: v.record.remarks,
  158. vector: vectors[idx],
  159. }));
  160. await db.collection('FFB Production').insertMany(docs);
  161. inserted += docs.length;
  162. skipped += batch.length - valid.length;
  163. const skipNote = batch.length - valid.length > 0 ? ` (${batch.length - valid.length} skipped)` : '';
  164. console.log(` ✅ ${docs.length} inserted.${skipNote}`);
  165. }
  166. console.log('\n═══════════════════════════════════════════════════════');
  167. console.log(' SEED COMPLETE');
  168. console.log(` Inserted : ${inserted}`);
  169. console.log(` Skipped : ${skipped} (Phase or Block not found in Atlas)`);
  170. console.log('═══════════════════════════════════════════════════════\n');
  171. } finally {
  172. await client.close();
  173. }
  174. }
  175. seed().catch(err => {
  176. console.error('\n❌ Seed failed:', err.message || err);
  177. process.exit(1);
  178. });