| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212 |
/**
 * Seeds FFBProductionData_processed.json into the 'FFB Production' collection.
 *
 * WARNING: the 'FFB Production' collection is cleared (deleteMany) before any
 * record is inserted, so every run replaces the collection's previous contents.
 *
 * For each record:
 * - Verifies Phase (by locId) and Block (by locId) exist in Atlas
 * - Skips the record if either is missing
 * - Resolves their MongoDB _id and embeds into phase.id / block.id
 * - Generates a 1024-dim vector via Ollama (model 'bge-large' unless EMBEDDING_MODEL is set)
 * - Inserts into 'FFB Production'
 *
 * NOTE: Site collection not yet seeded — site.siteId kept as plain string for now.
 *
 * Run: npx ts-node --transpile-only scripts/seed-ffb-production.ts
 */
- import * as dotenv from 'dotenv';
- import * as path from 'path';
- import * as fs from 'fs';
- import { MongoClient, ObjectId, Db } from 'mongodb';
- import { OllamaEmbeddings } from '@langchain/ollama';
// Load environment variables from the .env file one directory above this script.
dotenv.config({ path: path.resolve(__dirname, '../.env') });

/** MongoDB connection string. Its presence is verified at the start of seed(). */
const MONGO_URI = process.env.MONGO_URI!;

/** Target database name. Its presence is verified at the start of seed(). */
const MONGO_DB_NAME = process.env.MONGO_DB_NAME!;

/** Embedding model name passed to Ollama; falls back to 'bge-large' when unset or empty. */
const EMBEDDING_MODEL = process.env.EMBEDDING_MODEL || 'bge-large';

/**
 * Base URL of the Ollama server. Overridable through the OLLAMA_BASE_URL
 * environment variable (mirroring EMBEDDING_MODEL); falls back to the local
 * default when unset or empty.
 */
const OLLAMA_BASE_URL = process.env.OLLAMA_BASE_URL || 'http://localhost:11434';

/** Number of records validated, embedded and inserted per round trip. */
const BATCH_SIZE = 20;

/** Directory that holds the processed JSON input file. */
const MONGO_STUFF = path.resolve(__dirname, '../../mongo stuff');
// ─── Processed record shape (matches FFBProductionData_processed.json) ────────
/**
 * One FFB production entry as read from the processed JSON file.
 */
interface ProcessedFFB {
  /** Activity identifier, copied unchanged into the inserted document. */
  activityId: number;
  /** Date string; the script parses it with `new Date(...)`. */
  productionDate: string;
  /** Site reference, kept as a plain string id. */
  site: { siteId: string };
  /** Phase reference; `phaseId` is looked up against `Phase.locId`. */
  phase: { phaseId: number };
  /** Block reference; `blockId` is looked up against `Block.locId`. */
  block: { blockId: number };
  /** Harvest weight, expressed in `weightUom`. */
  weight: number;
  weightUom: string;
  /** Harvest quantity, expressed in `quantityUom`. */
  quantity: number;
  quantityUom: string;
  /** Free-text remarks; appended to the embedding text only when non-empty. */
  remarks: string;
  /** Vector field of the input record; the seeder stores a freshly generated embedding instead. */
  vector: number[];
}
// ─── Retry wrapper for Ollama (handles transient connection drops) ────────────
/**
 * Embeds `texts` with `embedder`, retrying when the call rejects.
 *
 * @param embedder Embedding client whose `embedDocuments` is invoked.
 * @param texts    Texts to embed in a single call.
 * @param retries  Maximum number of attempts (default 3).
 * @param delayMs  Fixed pause between attempts, in milliseconds (default 3000).
 * @returns The vectors resolved by `embedDocuments`.
 * @throws The rejection of the final attempt, unchanged; or
 *         `Error('embedWithRetry exhausted')` when `retries` is below 1 and
 *         no attempt is made at all.
 */
async function embedWithRetry(
  embedder: OllamaEmbeddings,
  texts: string[],
  retries = 3,
  delayMs = 3000,
): Promise<number[][]> {
  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      return await embedder.embedDocuments(texts);
    } catch (err: unknown) {
      // Last attempt: surface the original rejection to the caller.
      if (attempt === retries) throw err;
      // A rejection is not guaranteed to be an Error (it may be a string or
      // even null), so narrow before reading `.message`; otherwise the log
      // line itself could throw and abort the retry loop.
      const reason = err instanceof Error ? err.message : String(err);
      console.log(`\n ⚠️ Ollama error (attempt ${attempt}/${retries}): ${reason} — retrying in ${delayMs / 1000}s...`);
      await new Promise<void>(resolve => setTimeout(resolve, delayMs));
    }
  }
  // Only reachable when the loop body never ran (retries < 1).
  throw new Error('embedWithRetry exhausted');
}
// ─── Embedding text builder ───────────────────────────────────────────────────
/**
 * Renders a production record as the multi-line text that gets embedded.
 *
 * The date is the `YYYY-MM-DD` part of the record's `productionDate` after
 * conversion through `Date#toISOString()` (i.e. the UTC calendar day). A
 * remarks line is appended only when `remarks` is non-empty. The result is
 * trimmed of leading and trailing whitespace.
 *
 * @param r Record to describe.
 * @returns The text to embed.
 */
function buildEmbeddingText(r: ProcessedFFB): string {
  const [isoDay] = new Date(r.productionDate).toISOString().split('T');
  const lines = [
    'FFB Production Entry',
    `Date: ${isoDay}`,
    `Site: ${r.site.siteId} | Phase ID: ${r.phase.phaseId} | Block ID: ${r.block.blockId}`,
    `Harvest: ${r.quantity} ${r.quantityUom}, Weight: ${r.weight} ${r.weightUom}`,
  ];
  if (r.remarks) {
    lines.push(` Remarks: ${r.remarks}`);
  }
  return lines.join('\n').trim();
}
// ─── Main ─────────────────────────────────────────────────────────────────────
/**
 * Reads `_id` and `locId` of every document in `collectionName` and returns a
 * lookup from `locId` to `_id`.
 */
async function loadLocIdMap(db: Db, collectionName: string): Promise<Map<number, ObjectId>> {
  const docs = await db
    .collection(collectionName)
    .find({}, { projection: { _id: 1, locId: 1 } })
    .toArray();
  return new Map<number, ObjectId>(docs.map(d => [d.locId as number, d._id as ObjectId]));
}

/**
 * Rebuilds the 'FFB Production' collection from FFBProductionData_processed.json.
 *
 * Steps:
 *  1. Load the processed records from disk.
 *  2. Connect to MongoDB and load locId → _id maps for 'Phase' and 'Block'.
 *  3. Delete every existing document in 'FFB Production'.
 *  4. In batches of BATCH_SIZE: drop records whose phase or block is not in
 *     the maps, embed the remaining records, and insert them with the
 *     resolved ObjectIds and vectors.
 *  5. Print inserted / skipped totals.
 *
 * The MongoDB client is closed on every exit path once it has been created.
 *
 * @throws Error when MONGO_URI or MONGO_DB_NAME is missing, when the input
 *         file is not a JSON array, or when the embedder returns a different
 *         number of vectors than texts; file-system, MongoDB and embedding
 *         errors propagate unchanged.
 */
async function seed(): Promise<void> {
  if (!MONGO_URI) throw new Error('MONGO_URI not set in .env');
  if (!MONGO_DB_NAME) throw new Error('MONGO_DB_NAME not set in .env');

  console.log('\n═══════════════════════════════════════════════════════');
  console.log(' Seed: FFB Production');
  console.log(` DB: ${MONGO_DB_NAME}`);
  console.log(` Model: ${EMBEDDING_MODEL} @ ${OLLAMA_BASE_URL}`);
  console.log('═══════════════════════════════════════════════════════\n');

  // Load processed JSON
  console.log('📂 Loading FFBProductionData_processed.json...');
  const parsed: unknown = JSON.parse(
    fs.readFileSync(path.join(MONGO_STUFF, 'FFBProductionData_processed.json'), 'utf-8')
  );
  // A non-array payload has no usable `.length`; without this guard the
  // script would wipe the collection and then insert nothing.
  if (!Array.isArray(parsed)) {
    throw new Error('FFBProductionData_processed.json must contain a JSON array of records');
  }
  const records = parsed as ProcessedFFB[];
  console.log(` ${records.length} records loaded.\n`);

  // Connect to Atlas
  console.log('🔗 Connecting to MongoDB Atlas...');
  const client = new MongoClient(MONGO_URI);
  try {
    // Everything that uses the client lives inside this try so the
    // connection is released even when connecting or the reference-map
    // queries fail.
    await client.connect();
    const db: Db = client.db(MONGO_DB_NAME);
    console.log(' Connected.\n');

    // Load Phase and Block reference maps from Atlas (locId → _id)
    console.log('📋 Loading Phase and Block reference maps from Atlas...');
    const phaseLocIdToId = await loadLocIdMap(db, 'Phase');
    const blockLocIdToId = await loadLocIdMap(db, 'Block');
    console.log(` Phases in Atlas : ${phaseLocIdToId.size}`);
    console.log(` Blocks in Atlas : ${blockLocIdToId.size}\n`);

    // Initialize embedder
    console.log('🤖 Initializing Ollama embedder...');
    const embedder = new OllamaEmbeddings({ model: EMBEDDING_MODEL, baseUrl: OLLAMA_BASE_URL });
    console.log(' Ready.\n');

    const ffbCollection = db.collection('FFB Production');

    // Wipe existing FFB Production collection.
    // NOTE(review): this happens before the first embedding call, so an
    // unreachable Ollama server leaves the collection empty.
    console.log('🗑️ Clearing FFB Production collection...');
    await ffbCollection.deleteMany({});
    console.log(' Cleared.\n');

    const totalBatches = Math.ceil(records.length / BATCH_SIZE);
    let inserted = 0;
    let skipped = 0;
    console.log(`📦 Processing ${records.length} records in ${totalBatches} batches of ${BATCH_SIZE}...\n`);

    for (let i = 0; i < records.length; i += BATCH_SIZE) {
      const batch = records.slice(i, i + BATCH_SIZE);
      const batchNum = Math.floor(i / BATCH_SIZE) + 1;
      process.stdout.write(` [${batchNum}/${totalBatches}] validating...`);

      // Validate Phase and Block existence; resolve ObjectIds
      const valid: Array<{ record: ProcessedFFB; phaseId: ObjectId; blockId: ObjectId }> = [];
      for (const r of batch) {
        const phaseObjectId = phaseLocIdToId.get(r.phase.phaseId);
        const blockObjectId = blockLocIdToId.get(r.block.blockId);
        if (!phaseObjectId || !blockObjectId) continue;
        valid.push({ record: r, phaseId: phaseObjectId, blockId: blockObjectId });
      }

      // Count skips exactly once per batch, covering both the "all skipped"
      // and the "partially skipped" paths.
      const skippedInBatch = batch.length - valid.length;
      skipped += skippedInBatch;

      if (valid.length === 0) {
        console.log(` all ${skippedInBatch} skipped (missing Phase/Block).`);
        continue;
      }

      // Generate embeddings for the valid batch
      process.stdout.write(` embedding ${valid.length}...`);
      const texts = valid.map(v => buildEmbeddingText(v.record));
      const vectors = await embedWithRetry(embedder, texts);
      // Vectors are paired with records by index; a short response would
      // otherwise store documents without a vector.
      if (vectors.length !== valid.length) {
        throw new Error(`Embedder returned ${vectors.length} vectors for ${valid.length} texts`);
      }

      // Build final documents with resolved ObjectIds
      const docs = valid.map((v, idx) => ({
        activityId: v.record.activityId,
        productionDate: new Date(v.record.productionDate),
        site: {
          siteId: v.record.site.siteId,
        },
        phase: {
          id: v.phaseId, // ObjectId ref to Phase._id
          phaseId: v.record.phase.phaseId,
        },
        block: {
          id: v.blockId, // ObjectId ref to Block._id
          blockId: v.record.block.blockId,
        },
        weight: v.record.weight,
        weightUom: v.record.weightUom,
        quantity: v.record.quantity,
        quantityUom: v.record.quantityUom,
        remarks: v.record.remarks,
        vector: vectors[idx],
      }));

      await ffbCollection.insertMany(docs);
      inserted += docs.length;

      const skipNote = skippedInBatch > 0 ? ` (${skippedInBatch} skipped)` : '';
      console.log(` ✅ ${docs.length} inserted.${skipNote}`);
    }

    console.log('\n═══════════════════════════════════════════════════════');
    console.log(' SEED COMPLETE');
    console.log(` Inserted : ${inserted}`);
    console.log(` Skipped : ${skipped} (Phase or Block not found in Atlas)`);
    console.log('═══════════════════════════════════════════════════════\n');
  } finally {
    await client.close();
  }
}
// Script entry point: run the seeder; on failure print the reason and exit
// with a non-zero status code.
seed().catch((failure) => {
  const detail = failure.message || failure;
  console.error('\n❌ Seed failed:', detail);
  process.exit(1);
});
|