Browse Source

some test scripts to populate mongodb

Dr-Swopt 2 weeks ago
parent
commit
4dd5976782

+ 2 - 1
package.json

@@ -17,7 +17,8 @@
     "test:watch": "jest --watch",
     "test:cov": "jest --coverage",
     "test:debug": "node --inspect-brk -r tsconfig-paths/register -r ts-node/register node_modules/.bin/jest --runInBand",
-    "test:e2e": "jest --config ./test/jest-e2e.json"
+    "test:e2e": "jest --config ./test/jest-e2e.json",
+    "seed": "ts-node --transpile-only scripts/seed-rag-warehouse.ts"
   },
   "dependencies": {
     "@grpc/grpc-js": "^1.14.2",

+ 163 - 0
scripts/build-ffb-processed-json.ts

@@ -0,0 +1,163 @@
+/**
+ * Reads FFBProductionData.json and writes FFBProductionData_processed.json
+ * conforming to the updated Data Manual schema.
+ *
+ * - No MongoDB connection (no _id refs in output)
+ * - Generates remarks via Ollama qwen3:0.6b (one per unique phaseId+blockId pair)
+ * - vector field left as [] — to be filled at seed time
+ *
+ * Run: npx ts-node --transpile-only scripts/build-ffb-processed-json.ts
+ */
+
+import * as path from 'path';
+import * as fs from 'fs';
+import axios from 'axios';
+
+const OLLAMA_BASE_URL = 'http://localhost:11434';
+const REMARK_MODEL = 'qwen3:0.6b';
+const MONGO_STUFF = path.resolve(__dirname, '../../mongo stuff');
+
+// ─── Interfaces ───────────────────────────────────────────────────────────────
+
+interface RawPhase { phaseID: number; phaseCode: string; phaseName: string; phaseDesc: string; }
+interface RawBlock {
+  blockID: number; blockCode: string; blockDesc: string;
+  loc_soil_condition: string; numOfTreesPlanted: string | number | null;
+}
+interface RawFFB {
+  activityId: number; productionDate: string; siteId: string;
+  phaseId: number; blockId: number;
+  net_weight: string; act_uom: string; no_of_bunches: number; qty_uom: string;
+}
+
+// ─── Remark topics — cycle through for variety across blocks ─────────────────
+
+const TOPICS = [
+  'soil or ground conditions',
+  'weather during harvesting',
+  'worker performance or fatigue',
+  'harvesting equipment condition',
+  'pest or disease observation on fronds',
+];
+
+/**
+ * Asks the local Ollama model for a one-sentence harvest remark for a block.
+ *
+ * @param blockCode     Block code interpolated into the prompt context.
+ * @param soilCondition Soil description; an empty value is replaced by 'mineral'.
+ * @param phaseName     Phase name interpolated into the prompt context.
+ * @param topicIndex    Non-negative integer; wrapped with modulo to pick an entry of TOPICS.
+ * @returns The generated sentence with <think> blocks removed and whitespace trimmed.
+ * @throws If the HTTP request fails, or if the model returns no usable text.
+ */
+async function generateRemark(
+  blockCode: string,
+  soilCondition: string,
+  phaseName: string,
+  topicIndex: number,
+): Promise<string> {
+  const topic = TOPICS[topicIndex % TOPICS.length];
+  const prompt = `You are an oil palm plantation field supervisor writing a brief harvest log entry.
+Write ONE sentence (max 25 words) about ${topic} observed during FFB harvesting.
+Context: Block ${blockCode}, Phase: ${phaseName}, Soil: ${soilCondition || 'mineral'}.
+Reply with ONLY the sentence. No quotes, no labels. /no_think`;
+
+  const res = await axios.post(`${OLLAMA_BASE_URL}/api/generate`, {
+    model: REMARK_MODEL,
+    prompt,
+    stream: false,
+  });
+
+  // Guard against a missing or non-string `response` field before doing string work.
+  const raw: unknown = res.data?.response;
+  const text = typeof raw === 'string' ? raw : '';
+
+  // Strip any residual <think>...</think> tags qwen3 may emit despite /no_think
+  const remark = text.replace(/<think>[\s\S]*?<\/think>/gi, '').trim();
+
+  // An empty remark is a failed generation, not a valid result: throw so the
+  // caller can substitute its fallback text instead of storing ''.
+  if (!remark) {
+    throw new Error(`Ollama returned an empty remark for block ${blockCode}`);
+  }
+  return remark;
+}
+
+// ─── Main ─────────────────────────────────────────────────────────────────────
+
+/**
+ * Entry point: builds FFBProductionData_processed.json from the raw JSON exports.
+ *
+ * Steps: load the phase/block/FFB source files, generate one remark per unique
+ * (phaseId, blockId) pair, reshape every FFB record into the nested output
+ * form, write the result next to the sources, and print a two-record preview.
+ * A failed remark generation does not abort the run; a fixed fallback sentence
+ * is stored for that pair and the failure reason is logged.
+ */
+async function main() {
+  console.log('\n═══════════════════════════════════════════════════════');
+  console.log('  Build FFBProductionData_processed.json');
+  console.log('═══════════════════════════════════════════════════════\n');
+
+  // Load source files
+  const rawPhases: RawPhase[] = JSON.parse(fs.readFileSync(path.join(MONGO_STUFF, 'phaseData.json'), 'utf-8'));
+  const rawBlocks: RawBlock[] = JSON.parse(fs.readFileSync(path.join(MONGO_STUFF, 'blockData.json'), 'utf-8'));
+  const rawFFBs: RawFFB[] = JSON.parse(fs.readFileSync(path.join(MONGO_STUFF, 'FFBProductionData.json'), 'utf-8'));
+
+  console.log(`📂 Loaded: ${rawPhases.length} phases, ${rawBlocks.length} blocks, ${rawFFBs.length} FFB records\n`);
+
+  // Lookup maps
+  const phaseById = new Map<number, RawPhase>(rawPhases.map(p => [p.phaseID, p]));
+  const blockById = new Map<number, RawBlock>(rawBlocks.map(b => [b.blockID, b]));
+
+  // Collect unique (phaseId, blockId) pairs
+  const uniquePairs = new Map<string, { phaseId: number; blockId: number }>();
+  for (const r of rawFFBs) {
+    const key = `${r.phaseId}:${r.blockId}`;
+    if (!uniquePairs.has(key)) uniquePairs.set(key, { phaseId: r.phaseId, blockId: r.blockId });
+  }
+  console.log(`🔑 Unique (phaseId, blockId) pairs: ${uniquePairs.size}`);
+  console.log(`🤖 Generating one remark per pair via ${REMARK_MODEL}...\n`);
+
+  // Generate one remark per unique pair; pairIndex also rotates the remark topic.
+  const remarkCache = new Map<string, string>();
+  let pairIndex = 0;
+  for (const [key, { phaseId, blockId }] of uniquePairs) {
+    const phase = phaseById.get(phaseId);
+    const block = blockById.get(blockId);
+    // Placeholder labels keep the prompt meaningful when the lookup data is missing.
+    const blockCode = block?.blockCode ?? `BLK-${blockId}`;
+    const soilCondition = block?.loc_soil_condition ?? '';
+    const phaseName = phase?.phaseName ?? `Phase ${phaseId}`;
+
+    process.stdout.write(`   [${pairIndex + 1}/${uniquePairs.size}] ${blockCode} (${phaseName})...`);
+    try {
+      const remark = await generateRemark(blockCode, soilCondition, phaseName, pairIndex);
+      remarkCache.set(key, remark);
+      console.log(` ✅`);
+    } catch (err: unknown) {
+      // Best-effort: keep going with a fallback, but surface why generation failed.
+      const reason = err instanceof Error ? err.message : String(err);
+      console.log(` ⚠️  fallback (${reason})`);
+      remarkCache.set(key, 'Field conditions were normal during harvesting operations.');
+    }
+    pairIndex++;
+  }
+
+  console.log('\n🔄 Transforming all FFB records...');
+
+  // Transform all records
+  const output: object[] = [];
+
+  for (const raw of rawFFBs) {
+    const key = `${raw.phaseId}:${raw.blockId}`;
+    const remark = remarkCache.get(key) ?? 'No field observations recorded.';
+
+    output.push({
+      activityId: raw.activityId,
+      // toISOString throws a RangeError on an unparseable date, which aborts the run.
+      productionDate: new Date(raw.productionDate).toISOString(),
+      site: {
+        siteId: raw.siteId,
+      },
+      phase: {
+        phaseId: raw.phaseId,
+      },
+      block: {
+        blockId: raw.blockId,
+      },
+      // Unparseable weights are stored as 0.
+      weight: parseFloat(raw.net_weight) || 0,
+      weightUom: raw.act_uom,
+      quantity: raw.no_of_bunches,
+      quantityUom: raw.qty_uom,
+      remarks: remark,
+      vector: [],
+    });
+  }
+
+  const outPath = path.join(MONGO_STUFF, 'FFBProductionData_processed.json');
+  fs.writeFileSync(outPath, JSON.stringify(output, null, 2), 'utf-8');
+
+  console.log('\n═══════════════════════════════════════════════════════');
+  console.log('  DONE');
+  console.log(`  Records written : ${output.length}`);
+  console.log(`  Output file     : mongo stuff/FFBProductionData_processed.json`);
+  console.log('═══════════════════════════════════════════════════════\n');
+
+  // Print first 2 records as a preview
+  console.log('── Sample (first 2 records) ──────────────────────────\n');
+  console.log(JSON.stringify(output.slice(0, 2), null, 2));
+}
+
+// Run the build; on any rejection print the reason and exit with status 1.
+main().catch((failure) => {
+  const detail = failure.message || failure;
+  console.error('\n❌ Failed:', detail);
+  process.exit(1);
+});

+ 156 - 0
scripts/sample-ffb-processed.ts

@@ -0,0 +1,156 @@
+/**
+ * Generates a sample of 3 processed FFB Production records to validate:
+ *   - New nested schema shape (site/phase/block objects)
+ *   - Ollama qwen3:0.6b remark generation quality
+ *
+ * Run: npx ts-node --transpile-only scripts/sample-ffb-processed.ts
+ */
+
+import * as dotenv from 'dotenv';
+import * as path from 'path';
+import * as fs from 'fs';
+import { MongoClient, ObjectId } from 'mongodb';
+
+dotenv.config({ path: path.resolve(__dirname, '../.env') });
+
+const MONGO_URI = process.env.MONGO_URI!;
+const MONGO_DB_NAME = process.env.MONGO_DB_NAME!;
+const OLLAMA_BASE_URL = 'http://localhost:11434';
+const REMARK_MODEL = 'qwen3:0.6b';
+const MONGO_STUFF = path.resolve(__dirname, '../../mongo stuff');
+const SAMPLE_SIZE = 3;
+
+// ─── Source interfaces ────────────────────────────────────────────────────────
+
+interface RawPhase { phaseID: number; phaseCode: string; phaseName: string; phaseDesc: string; }
+interface RawBlock {
+  blockID: number; blockCode: string; blockDesc: string; loc_type: string;
+  numOfTreesPlanted: string | number | null; totalPlantedArea: string | number | null;
+  loc_soil_condition: string; plantedLocUOM: string;
+}
+interface RawFFB {
+  activityId: number; productionDate: string; siteId: string;
+  phaseId: number; blockId: number;
+  net_weight: string; act_uom: string; no_of_bunches: number; qty_uom: string;
+}
+
+// ─── Ollama generate (non-streaming) ─────────────────────────────────────────
+
+/**
+ * Requests a one-sentence field observation from the local Ollama model
+ * using a non-streaming call to /api/generate.
+ *
+ * @param blockCode     Block code interpolated into the prompt context.
+ * @param soilCondition Soil type; an empty value is replaced by 'mineral'.
+ * @param phaseName     Phase name interpolated into the prompt context.
+ * @returns The model's sentence with <think> blocks removed and whitespace trimmed.
+ * @throws If the HTTP status is not OK or the reply carries no `response` text.
+ */
+async function generateRemark(blockCode: string, soilCondition: string, phaseName: string): Promise<string> {
+  const prompt = `You are an oil palm plantation field supervisor writing a brief harvest observation note.
+Write ONE short sentence (max 25 words) about field conditions observed during FFB harvesting today.
+Context: Block ${blockCode}, Phase: ${phaseName}, Soil type: ${soilCondition || 'mineral'}.
+Your sentence must mention one of: soil/ground conditions, weather, worker performance, equipment, or pest/disease observation.
+Reply with ONLY the observation sentence. No quotes, no labels, no preamble. /no_think`;
+
+  const res = await fetch(`${OLLAMA_BASE_URL}/api/generate`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({ model: REMARK_MODEL, prompt, stream: false }),
+  });
+
+  if (!res.ok) throw new Error(`Ollama generate failed: ${res.status} ${res.statusText}`);
+
+  // Validate the payload so a malformed reply fails with a clear message
+  // rather than a TypeError from calling a string method on undefined.
+  const json = (await res.json()) as { response?: unknown };
+  if (typeof json.response !== 'string') {
+    throw new Error('Ollama generate returned no response text');
+  }
+
+  // qwen3 may still emit <think>...</think> despite /no_think; drop it so the
+  // sample shows only the observation sentence.
+  return json.response.replace(/<think>[\s\S]*?<\/think>/gi, '').trim();
+}
+
+// ─── Main ─────────────────────────────────────────────────────────────────────
+
+/**
+ * Entry point: produces a small preview of processed FFB Production records.
+ *
+ * Loads the raw JSON sources, reads Phase and Block documents from MongoDB to
+ * map locId → _id, picks the first SAMPLE_SIZE FFB records whose phase and
+ * block resolve in both the JSON data and MongoDB, generates a remark for each,
+ * then prints the result and writes it to FFBProductionData_sample.json.
+ *
+ * @throws If MONGO_URI / MONGO_DB_NAME are unset, or if file loading, a
+ *         database operation, or remark generation fails.
+ */
+async function main() {
+  // Fail fast with a clear message instead of an opaque driver error.
+  if (!MONGO_URI) throw new Error('MONGO_URI not set in .env');
+  if (!MONGO_DB_NAME) throw new Error('MONGO_DB_NAME not set in .env');
+
+  console.log('\n═══════════════════════════════════════════════');
+  console.log('  FFB Processed JSON — Sample Preview (3 recs)');
+  console.log('═══════════════════════════════════════════════\n');
+
+  // Load source files
+  const rawPhases: RawPhase[] = JSON.parse(fs.readFileSync(path.join(MONGO_STUFF, 'phaseData.json'), 'utf-8'));
+  const rawBlocks: RawBlock[] = JSON.parse(fs.readFileSync(path.join(MONGO_STUFF, 'blockData.json'), 'utf-8'));
+  const rawFFBs: RawFFB[] = JSON.parse(fs.readFileSync(path.join(MONGO_STUFF, 'FFBProductionData.json'), 'utf-8'));
+
+  // In-memory lookup maps (integer ID → raw data)
+  const phaseById = new Map<number, RawPhase>(rawPhases.map(p => [p.phaseID, p]));
+  const blockById = new Map<number, RawBlock>(rawBlocks.map(b => [b.blockID, b]));
+
+  // Map locId (== original phaseID / blockID) → MongoDB _id
+  const phaseLocIdToMongoId = new Map<number, ObjectId>();
+  const blockLocIdToMongoId = new Map<number, ObjectId>();
+
+  // Connect to Atlas to get actual _id ObjectIds from Phase and Block collections
+  console.log('🔗 Connecting to Atlas to resolve ObjectIds...');
+  const client = new MongoClient(MONGO_URI);
+  try {
+    await client.connect();
+    const db = client.db(MONGO_DB_NAME);
+
+    // Fetch all Phase and Block docs (small collections — 13 phases, 598 blocks)
+    const phaseDocs = await db.collection('Phase').find({}, { projection: { _id: 1, locId: 1, phaseCode: 1 } }).toArray();
+    const blockDocs = await db.collection('Block').find({}, { projection: { _id: 1, locId: 1, blockCode: 1 } }).toArray();
+
+    for (const d of phaseDocs) phaseLocIdToMongoId.set(d.locId as number, d._id as ObjectId);
+    for (const d of blockDocs) blockLocIdToMongoId.set(d.locId as number, d._id as ObjectId);
+  } finally {
+    // Always release the connection, even when connect or a query throws.
+    await client.close();
+  }
+
+  console.log(`   Phase ObjectIds resolved: ${phaseLocIdToMongoId.size}`);
+  console.log(`   Block ObjectIds resolved: ${blockLocIdToMongoId.size}\n`);
+
+  // Take first SAMPLE_SIZE records that are fully resolvable
+  const sample: RawFFB[] = [];
+  for (const raw of rawFFBs) {
+    if (sample.length >= SAMPLE_SIZE) break;
+    if (phaseById.has(raw.phaseId) && blockById.has(raw.blockId) &&
+        phaseLocIdToMongoId.has(raw.phaseId) && blockLocIdToMongoId.has(raw.blockId)) {
+      sample.push(raw);
+    }
+  }
+
+  console.log(`📋 Generating remarks for ${sample.length} sample records via ${REMARK_MODEL}...\n`);
+
+  const output: object[] = [];
+
+  for (let i = 0; i < sample.length; i++) {
+    const raw = sample[i];
+    // Non-null assertions are safe: the sample filter above checked all four lookups.
+    const rawPhase = phaseById.get(raw.phaseId)!;
+    const rawBlock = blockById.get(raw.blockId)!;
+    const phaseMongoId = phaseLocIdToMongoId.get(raw.phaseId)!;
+    const blockMongoId = blockLocIdToMongoId.get(raw.blockId)!;
+
+    process.stdout.write(`   [${i + 1}/${sample.length}] activityId=${raw.activityId} → generating remark...`);
+    const remark = await generateRemark(rawBlock.blockCode, rawBlock.loc_soil_condition, rawPhase.phaseName);
+    console.log(` ✅`);
+    console.log(`           "${remark}"\n`);
+
+    output.push({
+      activityId: raw.activityId,
+      productionDate: new Date(raw.productionDate).toISOString(),
+      site: {
+        _id: null,          // Site collection not yet seeded; placeholder
+        siteId: raw.siteId,
+      },
+      phase: {
+        id: phaseMongoId.toHexString(),   // actual ObjectId from Phase collection
+        phaseId: raw.phaseId,
+      },
+      block: {
+        id: blockMongoId.toHexString(),   // actual ObjectId from Block collection
+        blockId: raw.blockId,
+      },
+      weight: parseFloat(raw.net_weight) || 0,
+      weightUom: raw.act_uom,
+      quantity: raw.no_of_bunches,
+      quantityUom: raw.qty_uom,
+      remarks: remark,
+      vector: [],   // to be filled during full seed run
+    });
+  }
+
+  // Pretty-print to console and write sample file
+  const outPath = path.join(MONGO_STUFF, 'FFBProductionData_sample.json');
+  const pretty = JSON.stringify(output, null, 2);
+  fs.writeFileSync(outPath, pretty, 'utf-8');
+
+  console.log('═══════════════════════════════════════════════');
+  console.log('  SAMPLE OUTPUT');
+  console.log('═══════════════════════════════════════════════\n');
+  console.log(pretty);
+  console.log(`\n✅ Written to: mongo stuff/FFBProductionData_sample.json`);
+}
+
+// Run the sample; on any rejection print the reason and exit with status 1.
+main().catch((failure) => {
+  const detail = failure.message || failure;
+  console.error('\n❌ Sample failed:', detail);
+  process.exit(1);
+});

+ 212 - 0
scripts/seed-ffb-production.ts

@@ -0,0 +1,212 @@
+/**
+ * Seeds FFBProductionData_processed.json into the 'FFB Production' collection.
+ *
+ * For each record:
+ *   - Verifies Phase (by locId) and Block (by locId) exist in Atlas
+ *   - Skips the record if either is missing
+ *   - Resolves their MongoDB _id and embeds into phase.id / block.id
+ *   - Generates a 1024-dim vector via Ollama bge-large
+ *   - Inserts into 'FFB Production'
+ *
+ * NOTE: Site collection not yet seeded — site.siteId kept as plain string for now.
+ *
+ * Run: npx ts-node --transpile-only scripts/seed-ffb-production.ts
+ */
+
+import * as dotenv from 'dotenv';
+import * as path from 'path';
+import * as fs from 'fs';
+import { MongoClient, ObjectId, Db } from 'mongodb';
+import { OllamaEmbeddings } from '@langchain/ollama';
+
+dotenv.config({ path: path.resolve(__dirname, '../.env') });
+
+const MONGO_URI = process.env.MONGO_URI!;
+const MONGO_DB_NAME = process.env.MONGO_DB_NAME!;
+const EMBEDDING_MODEL = process.env.EMBEDDING_MODEL || 'bge-large';
+const OLLAMA_BASE_URL = 'http://localhost:11434';
+const BATCH_SIZE = 20;
+const MONGO_STUFF = path.resolve(__dirname, '../../mongo stuff');
+
+// ─── Processed record shape (matches FFBProductionData_processed.json) ────────
+
+interface ProcessedFFB {
+  activityId: number;
+  productionDate: string;
+  site: { siteId: string };
+  phase: { phaseId: number };
+  block: { blockId: number };
+  weight: number;
+  weightUom: string;
+  quantity: number;
+  quantityUom: string;
+  remarks: string;
+  vector: number[];
+}
+
+// ─── Retry wrapper for Ollama (handles transient connection drops) ────────────
+
+/**
+ * Embeds `texts` with the given embedder, retrying after a failure.
+ *
+ * @param embedder Embedder whose embedDocuments() is called.
+ * @param texts    Texts to embed in a single call.
+ * @param retries  Maximum number of attempts (default 3).
+ * @param delayMs  Pause between attempts in milliseconds (default 3000).
+ * @returns One vector per input text, as returned by the embedder.
+ * @throws The last embedder error once the final attempt fails, or
+ *         'embedWithRetry exhausted' if no attempt returned or rethrew.
+ */
+async function embedWithRetry(
+  embedder: OllamaEmbeddings,
+  texts: string[],
+  retries = 3,
+  delayMs = 3000,
+): Promise<number[][]> {
+  let attempt = 1;
+  while (attempt <= retries) {
+    try {
+      const vectors = await embedder.embedDocuments(texts);
+      return vectors;
+    } catch (err: any) {
+      // The final attempt propagates the original error to the caller.
+      const isLastAttempt = attempt === retries;
+      if (isLastAttempt) {
+        throw err;
+      }
+      console.log(`\n   ⚠️  Ollama error (attempt ${attempt}/${retries}): ${err.message} — retrying in ${delayMs / 1000}s...`);
+      await new Promise(resolve => setTimeout(resolve, delayMs));
+    }
+    attempt += 1;
+  }
+  throw new Error('embedWithRetry exhausted');
+}
+
+// ─── Embedding text builder ───────────────────────────────────────────────────
+
+/**
+ * Renders a processed FFB record as the multi-line text that gets embedded.
+ * The Remarks line is appended only when the record has a non-empty remark.
+ *
+ * @param r Processed FFB record.
+ * @returns The trimmed embedding text.
+ */
+function buildEmbeddingText(r: ProcessedFFB): string {
+  // Keep only the YYYY-MM-DD part of the ISO timestamp.
+  const isoDay = new Date(r.productionDate).toISOString().split('T')[0];
+  const lines = [
+    'FFB Production Entry',
+    `  Date: ${isoDay}`,
+    `  Site: ${r.site.siteId} | Phase ID: ${r.phase.phaseId} | Block ID: ${r.block.blockId}`,
+    `  Harvest: ${r.quantity} ${r.quantityUom}, Weight: ${r.weight} ${r.weightUom}`,
+  ];
+  if (r.remarks) {
+    lines.push(`  Remarks: ${r.remarks}`);
+  }
+  return lines.join('\n').trim();
+}
+
+// ─── Main ─────────────────────────────────────────────────────────────────────
+
+/**
+ * Seeds the 'FFB Production' collection from FFBProductionData_processed.json.
+ *
+ * Loads the processed records, resolves each record's Phase and Block ObjectId
+ * through locId lookups loaded from MongoDB, embeds the resolvable records
+ * batch by batch and inserts them. Records whose Phase or Block cannot be
+ * resolved are skipped and counted exactly once. Existing documents in the
+ * collection are deleted before inserting.
+ *
+ * @throws If MONGO_URI / MONGO_DB_NAME are unset, or if file loading, embedding
+ *         (after retries) or a database operation fails.
+ */
+async function seed() {
+  if (!MONGO_URI) throw new Error('MONGO_URI not set in .env');
+  if (!MONGO_DB_NAME) throw new Error('MONGO_DB_NAME not set in .env');
+
+  console.log('\n═══════════════════════════════════════════════════════');
+  console.log('  Seed: FFB Production');
+  console.log(`  DB:    ${MONGO_DB_NAME}`);
+  console.log(`  Model: ${EMBEDDING_MODEL} @ ${OLLAMA_BASE_URL}`);
+  console.log('═══════════════════════════════════════════════════════\n');
+
+  // Load processed JSON
+  console.log('📂 Loading FFBProductionData_processed.json...');
+  const records: ProcessedFFB[] = JSON.parse(
+    fs.readFileSync(path.join(MONGO_STUFF, 'FFBProductionData_processed.json'), 'utf-8')
+  );
+  console.log(`   ${records.length} records loaded.\n`);
+
+  // Connect to Atlas
+  console.log('🔗 Connecting to MongoDB Atlas...');
+  const client = new MongoClient(MONGO_URI);
+  await client.connect();
+  const db: Db = client.db(MONGO_DB_NAME);
+  console.log('   Connected.\n');
+
+  // Load Phase and Block reference maps from Atlas (locId → _id)
+  console.log('📋 Loading Phase and Block reference maps from Atlas...');
+  const phaseDocs = await db.collection('Phase').find({}, { projection: { _id: 1, locId: 1 } }).toArray();
+  const blockDocs = await db.collection('Block').find({}, { projection: { _id: 1, locId: 1 } }).toArray();
+
+  const phaseLocIdToId = new Map<number, ObjectId>(
+    phaseDocs.map(d => [d.locId as number, d._id as ObjectId])
+  );
+  const blockLocIdToId = new Map<number, ObjectId>(
+    blockDocs.map(d => [d.locId as number, d._id as ObjectId])
+  );
+
+  console.log(`   Phases in Atlas : ${phaseLocIdToId.size}`);
+  console.log(`   Blocks in Atlas : ${blockLocIdToId.size}\n`);
+
+  // Initialize embedder
+  console.log('🤖 Initializing Ollama embedder...');
+  const embedder = new OllamaEmbeddings({ model: EMBEDDING_MODEL, baseUrl: OLLAMA_BASE_URL });
+  console.log('   Ready.\n');
+
+  try {
+    // Wipe existing FFB Production collection
+    console.log('🗑️  Clearing FFB Production collection...');
+    await db.collection('FFB Production').deleteMany({});
+    console.log('   Cleared.\n');
+
+    const totalBatches = Math.ceil(records.length / BATCH_SIZE);
+    let inserted = 0;
+    let skipped = 0;
+
+    console.log(`📦 Processing ${records.length} records in ${totalBatches} batches of ${BATCH_SIZE}...\n`);
+
+    for (let i = 0; i < records.length; i += BATCH_SIZE) {
+      const batch = records.slice(i, i + BATCH_SIZE);
+      const batchNum = Math.floor(i / BATCH_SIZE) + 1;
+      process.stdout.write(`   [${batchNum}/${totalBatches}] validating...`);
+
+      // Validate Phase and Block existence; resolve ObjectIds
+      const valid: Array<{ record: ProcessedFFB; phaseId: ObjectId; blockId: ObjectId }> = [];
+
+      for (const r of batch) {
+        const phaseObjectId = phaseLocIdToId.get(r.phase.phaseId);
+        const blockObjectId = blockLocIdToId.get(r.block.blockId);
+
+        // Unresolvable records are left out here and tallied once per batch below.
+        if (!phaseObjectId || !blockObjectId) continue;
+
+        valid.push({ record: r, phaseId: phaseObjectId, blockId: blockObjectId });
+      }
+
+      // Single place where skips are counted, so a record is never counted twice.
+      const batchSkipped = batch.length - valid.length;
+      skipped += batchSkipped;
+
+      if (valid.length === 0) {
+        console.log(` all ${batchSkipped} skipped (missing Phase/Block).`);
+        continue;
+      }
+
+      // Generate embeddings for the valid batch
+      process.stdout.write(` embedding ${valid.length}...`);
+      const texts = valid.map(v => buildEmbeddingText(v.record));
+      const vectors = await embedWithRetry(embedder, texts);
+
+      // Build final documents with resolved ObjectIds
+      const docs = valid.map((v, idx) => ({
+        activityId: v.record.activityId,
+        productionDate: new Date(v.record.productionDate),
+        site: {
+          siteId: v.record.site.siteId,
+        },
+        phase: {
+          id: v.phaseId,          // ObjectId ref to Phase._id
+          phaseId: v.record.phase.phaseId,
+        },
+        block: {
+          id: v.blockId,          // ObjectId ref to Block._id
+          blockId: v.record.block.blockId,
+        },
+        weight: v.record.weight,
+        weightUom: v.record.weightUom,
+        quantity: v.record.quantity,
+        quantityUom: v.record.quantityUom,
+        remarks: v.record.remarks,
+        vector: vectors[idx],
+      }));
+
+      await db.collection('FFB Production').insertMany(docs);
+      inserted += docs.length;
+
+      const skipNote = batchSkipped > 0 ? ` (${batchSkipped} skipped)` : '';
+      console.log(` ✅ ${docs.length} inserted.${skipNote}`);
+    }
+
+    console.log('\n═══════════════════════════════════════════════════════');
+    console.log('  SEED COMPLETE');
+    console.log(`  Inserted : ${inserted}`);
+    console.log(`  Skipped  : ${skipped}  (Phase or Block not found in Atlas)`);
+    console.log('═══════════════════════════════════════════════════════\n');
+
+  } finally {
+    await client.close();
+  }
+}
+
+// Run the seed; on any rejection print the reason and exit with status 1.
+seed().catch((failure) => {
+  const detail = failure.message || failure;
+  console.error('\n❌ Seed failed:', detail);
+  process.exit(1);
+});

+ 414 - 0
scripts/seed-rag-warehouse.ts

@@ -0,0 +1,414 @@
+import * as dotenv from 'dotenv';
+import * as path from 'path';
+import * as fs from 'fs';
+import { MongoClient, Db } from 'mongodb';
+import { OllamaEmbeddings } from '@langchain/ollama';
+
+// Load .env from backend root (one level above scripts/)
+dotenv.config({ path: path.resolve(__dirname, '../.env') });
+
+// ─── Config ───────────────────────────────────────────────────────────────────
+
+const MONGO_URI = process.env.MONGO_URI;
+const MONGO_DB_NAME = process.env.MONGO_DB_NAME;
+const EMBEDDING_MODEL = process.env.EMBEDDING_MODEL || 'bge-large';
+const OLLAMA_BASE_URL = 'http://localhost:11434';
+const BATCH_SIZE = 50;
+
+// Two levels up from scripts/ → workspace root → mongo stuff/
+const MONGO_STUFF = path.resolve(__dirname, '../../mongo stuff');
+
+// ─── Source JSON interfaces ────────────────────────────────────────────────────
+
+interface RawPhase {
+  phaseID: number;
+  phaseCode: string;
+  phaseName: string;
+  phaseDesc: string;
+}
+
+interface RawBlock {
+  blockID: number;
+  blockCode: string;
+  blockDesc: string;
+  loc_type: string;
+  entry_no: number;
+  entry_year: number;
+  quater_planted: number;
+  month_planted: string;
+  numOfTreesPlanted: string | number | null;
+  totalTreeMatured: string | number | null;
+  totalTreeImmatured: string | number | null;
+  totalTreeDead: string | number | null;
+  totalPlantedArea: string | number | null;
+  initalPlantedArea: string | number | null;
+  plantedLocUOM: string;
+  loc_soil_condition: string;
+}
+
+interface RawFFB {
+  activityId: number;
+  productionDate: string;
+  siteId: string;
+  phaseId: number;
+  blockId: number;
+  net_weight: string;
+  act_uom: string;
+  no_of_bunches: number;
+  qty_uom: string;
+}
+
+// ─── Target document interfaces (match NestJS schemas exactly) ────────────────
+
+interface PhaseDoc {
+  locId: number;
+  phaseCode: string;
+  description: string;
+  locType: string;
+  vector?: number[];
+}
+
+interface BlockDoc {
+  locId: number;
+  blockCode: string;
+  blockDesc: string;
+  locType: string;
+  entryNo: number;
+  entryYear: number;
+  quarterPlanted: number;
+  monthPlanted: string;
+  totalTrees: number;
+  totalMaturedTrees: number;
+  totalImmaturedTrees: number;
+  totalDeadTrees: number;
+  plantedArea: number;
+  initialPlantedArea: number;
+  plantedLocUOM: string;
+  soilCondition: string;
+  vector?: number[];
+}
+
+// ─── Type-cast helpers ────────────────────────────────────────────────────────
+
+/** Parses a value as a float; null, undefined and unparseable input give 0. */
+const toNum = (v: string | number | null | undefined): number => {
+  if (v == null) {
+    return 0;
+  }
+  const parsed = parseFloat(String(v));
+  return parsed || 0;
+};
+
+/** Parses a value as a float and rounds it; null, undefined and unparseable input give 0. */
+const toInt = (v: string | number | null | undefined): number => {
+  if (v == null) {
+    return 0;
+  }
+  const rounded = Math.round(parseFloat(String(v)));
+  return rounded || 0;
+};
+
+/**
+ * Reads a UTF-8 JSON file from the MONGO_STUFF directory and parses it.
+ * The result is cast to T without runtime validation.
+ *
+ * @param filename File name relative to MONGO_STUFF.
+ * @returns The parsed JSON, typed as T.
+ */
+function loadJson<T>(filename: string): T {
+  const contents = fs.readFileSync(path.join(MONGO_STUFF, filename), 'utf-8');
+  const parsed = JSON.parse(contents) as T;
+  return parsed;
+}
+
+// ─── Transform: phaseData.json → Phase collection schema ─────────────────────
+// phaseID → locId, phaseDesc → description, add locType
+
+/**
+ * Converts a raw phase record into the Phase document shape.
+ * The description falls back to phaseName when phaseDesc is empty.
+ *
+ * @param raw Raw phase record from phaseData.json.
+ * @returns Phase document without a vector.
+ */
+function transformPhase(raw: RawPhase): PhaseDoc {
+  const { phaseID, phaseCode, phaseDesc, phaseName } = raw;
+  const doc: PhaseDoc = {
+    locId: phaseID,
+    phaseCode,
+    description: phaseDesc || phaseName,
+    locType: 'PHASE',
+  };
+  return doc;
+}
+
+// ─── Transform: blockData.json → Block collection schema ─────────────────────
+// All snake_case / mixed-case source fields → camelCase schema fields.
+// Numeric strings (e.g. "13932", "88.7300") cast to native Number on write.
+
+/**
+ * Converts a raw block record into the Block document shape: source field
+ * names are mapped to camelCase and numeric strings are cast to numbers.
+ *
+ * @param raw Raw block record from blockData.json.
+ * @returns Block document without a vector.
+ */
+function transformBlock(raw: RawBlock): BlockDoc {
+  // Identity and descriptive fields; missing text becomes '' (locType → 'BLOCK').
+  const identity = {
+    locId: raw.blockID,
+    blockCode: raw.blockCode,
+    blockDesc: raw.blockDesc || '',
+    locType: raw.loc_type || 'BLOCK',
+  };
+
+  // Planting entry details.
+  const planting = {
+    entryNo: toInt(raw.entry_no),
+    entryYear: toInt(raw.entry_year),
+    quarterPlanted: toInt(raw.quater_planted),
+    monthPlanted: raw.month_planted || '',
+  };
+
+  // Tree counts, rounded to integers (e.g. "13932" → 13932).
+  const treeCounts = {
+    totalTrees: toInt(raw.numOfTreesPlanted),
+    totalMaturedTrees: toInt(raw.totalTreeMatured),
+    totalImmaturedTrees: toInt(raw.totalTreeImmatured),
+    totalDeadTrees: toInt(raw.totalTreeDead),
+  };
+
+  // Areas keep their decimals (e.g. "88.7300" → 88.73).
+  const area = {
+    plantedArea: toNum(raw.totalPlantedArea),
+    initialPlantedArea: toNum(raw.initalPlantedArea),
+    plantedLocUOM: raw.plantedLocUOM || '',
+  };
+
+  return {
+    ...identity,
+    ...planting,
+    ...treeCounts,
+    ...area,
+    soilCondition: raw.loc_soil_condition || '',
+  };
+}
+
+// ─── Phase embedding text ─────────────────────────────────────────────────────
+
+/**
+ * Renders a Phase document as the text that gets embedded.
+ *
+ * @param doc Phase document.
+ * @returns The trimmed multi-line embedding text.
+ */
+function buildPhaseEmbeddingText(doc: PhaseDoc): string {
+  const lines = [
+    'Phase Reference Entry:',
+    `  Phase Code: ${doc.phaseCode}`,
+    `  Phase Name: ${doc.description}`,
+    `  Location Type: ${doc.locType}`,
+  ];
+  return lines.join('\n').trim();
+}
+
+// ─── Block embedding text ─────────────────────────────────────────────────────
+
+/**
+ * Renders a Block document as the text that gets embedded.
+ * The Soil Condition line is appended only when soilCondition is non-empty.
+ *
+ * @param doc Block document.
+ * @returns The trimmed multi-line embedding text.
+ */
+function buildBlockEmbeddingText(doc: BlockDoc): string {
+  const treeBreakdown = `${doc.totalMaturedTrees} matured, ${doc.totalImmaturedTrees} immature, ${doc.totalDeadTrees} dead`;
+  const lines = [
+    'Block Reference Entry:',
+    `  Block Code: ${doc.blockCode}`,
+    `  Description: ${doc.blockDesc || 'No description.'}`,
+    `  Location Type: ${doc.locType}`,
+    `  Total Trees: ${doc.totalTrees} (${treeBreakdown})`,
+    `  Planted Area: ${doc.plantedArea} ${doc.plantedLocUOM}`,
+    `  Entry Year: ${doc.entryYear}, Quarter Planted: ${doc.quarterPlanted}, Month: ${doc.monthPlanted}`,
+  ];
+
+  if (doc.soilCondition) {
+    lines.push(`  Soil Condition: ${doc.soilCondition}`);
+  }
+
+  return lines.join('\n').trim();
+}
+
+// ─── FFB embedding text (mirrors recordToTextEnriched in ffb-vector.service.ts) ─
+// Environmental Context line conditionally appended only when soilCondition exists,
+// matching the production vector service behaviour exactly.
+
+/**
+ * Renders an FFB production record as the text that gets embedded.
+ * The Environmental Context line is appended only when the block has a
+ * non-empty soilCondition.
+ *
+ * @param raw       Raw FFB record.
+ * @param phaseCode Code of the record's phase.
+ * @param phaseName Name of the record's phase.
+ * @param block     Transformed block the record belongs to.
+ * @returns The trimmed multi-line embedding text.
+ */
+function buildFFBEmbeddingText(
+  raw: RawFFB,
+  phaseCode: string,
+  phaseName: string,
+  block: BlockDoc,
+): string {
+  // Spacing inside these literals (including the double spaces) is intentional
+  // and must stay as-is so the text keeps its exact shape.
+  const lines = [
+    'FFB Production Log Entry:',
+    `  Project Code: ${raw.siteId} | Activity: FFB Harvesting`,
+    `  Organization:  (${raw.siteId})`,
+    `  Location Details: Phase ${phaseName} (${phaseCode}), Block Code ${block.blockCode}`,
+    `  Harvest Output: ${raw.no_of_bunches} Bunches, Net Weight: ${toNum(raw.net_weight)} ${raw.act_uom}`,
+    '  Logistics: Transported via Truck  to Mill',
+    '  Field Observations: No supervisor remarks recorded.',
+    '  Operational Issues: No production anomalies reported.',
+  ];
+
+  // This line is not indented, unlike the ones above.
+  if (block.soilCondition) {
+    lines.push(
+      `Environmental Context: Cultivated on ${block.soilCondition} soil conditions with a total tree count of ${block.totalTrees || 0} trees.`,
+    );
+  }
+
+  return lines.join('\n').trim();
+}
+
+// ─── FFB Production document builder ─────────────────────────────────────────
+
+/**
+ * Builds the FFB Production document for one raw record.
+ * Fields with no counterpart in the raw data are filled with fixed
+ * placeholders ('', 0 or null).
+ *
+ * @param raw       Raw FFB record.
+ * @param phaseCode Code of the record's phase.
+ * @param phaseName Name of the record's phase (also used as phaseDesc).
+ * @param block     Transformed block the record belongs to.
+ * @param vector    Embedding vector stored on the document.
+ * @returns The document, with keys in the same order as listed below.
+ */
+function buildFFBDoc(
+  raw: RawFFB,
+  phaseCode: string,
+  phaseName: string,
+  block: BlockDoc,
+  vector: number[],
+) {
+  const activity = {
+    productionDate: new Date(raw.productionDate),
+    prjCode: raw.siteId,
+    actCode: 'FFB',
+    actName: 'FFB Harvesting',
+  };
+
+  const organisation = {
+    entityCode: '',
+    orgnId: 0,
+    orgnCode: raw.siteId,
+    orgnFullName: '',
+    orgnAddress: '',
+    orgnCompRegNo: '',
+  };
+
+  // blockName and blockDesc both come from block.blockDesc; empty becomes null.
+  const location = {
+    phaseCode,
+    phaseName,
+    phaseDesc: phaseName,
+    blockCode: block.blockCode,
+    blockName: block.blockDesc || null,
+    blockDesc: block.blockDesc || null,
+  };
+
+  const logistics = {
+    truckNo: '',
+    millNo: '',
+    actEntryNo: raw.activityId,
+    actRound: 0,
+    weightChitNo: '',
+  };
+
+  const harvest = {
+    ownNetWeight: null,
+    netWeight: toNum(raw.net_weight),   // "2.2700" → 2.27
+    actUom: raw.act_uom,
+    noOfBunches: raw.no_of_bunches,
+    qtyUom: raw.qty_uom,
+    docActQty: 0,
+  };
+
+  const area = {
+    locArea: block.plantedArea,
+    locUom: block.plantedLocUOM,
+  };
+
+  // Spread order fixes the key order of the resulting document.
+  return {
+    ...activity,
+    ...organisation,
+    ...location,
+    ...logistics,
+    ...harvest,
+    ...area,
+    budgetedFfb: null,
+    remarks: '',
+    issues: null,
+    vector,
+  };
+}
+
+// ─── Batch embedder helper ────────────────────────────────────────────────────
+
+/**
+ * Embeds `texts` in consecutive slices of BATCH_SIZE, one embedder call per
+ * slice, printing progress for each slice.
+ *
+ * @param embedder Embedder whose embedDocuments() is called per slice.
+ * @param texts    Texts to embed.
+ * @param label    Label shown in the progress output.
+ * @returns The vectors from all slices, concatenated in slice order.
+ */
+async function embedInBatches(
+  embedder: OllamaEmbeddings,
+  texts: string[],
+  label: string,
+): Promise<number[][]> {
+  const batchCount = Math.ceil(texts.length / BATCH_SIZE);
+  const vectors: number[][] = [];
+
+  let batchNo = 0;
+  let offset = 0;
+  while (offset < texts.length) {
+    batchNo += 1;
+    process.stdout.write(`   [${batchNo}/${batchCount}] embedding ${label}...`);
+
+    const chunk = texts.slice(offset, offset + BATCH_SIZE);
+    const chunkVectors = await embedder.embedDocuments(chunk);
+    for (const vec of chunkVectors) {
+      vectors.push(vec);
+    }
+
+    console.log(` ✅ ${chunk.length} done.`);
+    offset += BATCH_SIZE;
+  }
+
+  return vectors;
+}
+
+// ─── Main ─────────────────────────────────────────────────────────────────────
+
+/**
+ * Seeds the `Phase`, `Block` and `FFB Production` collections with documents
+ * and their embedding vectors.
+ *
+ * Steps:
+ *  1. Load the three source JSON files and transform the master data.
+ *  2. Build in-memory id → phase/block lookup maps.
+ *  3. Connect to MongoDB and create one Ollama embedder shared by all collections.
+ *  4. Phase / Block: embed first, then clear the collection and re-insert.
+ *  5. FFB Production: clear the collection, then resolve, embed and insert the
+ *     records in batches of BATCH_SIZE. Records whose phaseId/blockId cannot be
+ *     resolved against the master data are counted and skipped.
+ *
+ * This is destructive: every target collection is emptied with
+ * `deleteMany({})` before the new documents are inserted.
+ *
+ * @throws Error if MONGO_URI or MONGO_DB_NAME is not set, or if the embedder
+ *   returns a different number of vectors than the number of texts it was given.
+ */
+async function seed(): Promise<void> {
+  if (!MONGO_URI) throw new Error('MONGO_URI not set in .env');
+  if (!MONGO_DB_NAME) throw new Error('MONGO_DB_NAME not set in .env');
+
+  // Guards against a short/long embedder response, which would otherwise
+  // silently store documents with `vector: undefined`.
+  const assertVectorCount = (label: string, expected: number, actual: number): void => {
+    if (actual !== expected) {
+      throw new Error(`Embedding count mismatch for ${label}: expected ${expected}, got ${actual}`);
+    }
+  };
+
+  console.log('');
+  console.log('═══════════════════════════════════════════════════════');
+  console.log('  RAG Warehouse Seed Script');
+  console.log(`  DB:    ${MONGO_DB_NAME}`);
+  console.log(`  Model: ${EMBEDDING_MODEL} @ ${OLLAMA_BASE_URL}`);
+  console.log('═══════════════════════════════════════════════════════');
+  console.log('');
+
+  // 1. Load source JSON
+  console.log('📂 Loading source JSON files...');
+  const rawPhases = loadJson<RawPhase[]>('phaseData.json');
+  const rawBlocks = loadJson<RawBlock[]>('blockData.json');
+  const rawFFBs = loadJson<RawFFB[]>('FFBProductionData.json');
+  console.log(`   Phases:         ${rawPhases.length}`);
+  console.log(`   Blocks:         ${rawBlocks.length}`);
+  console.log(`   FFB Activities: ${rawFFBs.length}`);
+  console.log('');
+
+  // 2. Transform master data
+  console.log('🔄 Transforming master data (field rename + type cast)...');
+  const phases = rawPhases.map(transformPhase);
+  const blocks = rawBlocks.map(transformBlock);
+
+  // 3. Build in-memory lookup maps — avoids per-record Atlas round trips
+  const phaseIdToCode = new Map<number, string>(rawPhases.map(p => [p.phaseID, p.phaseCode]));
+  const phaseIdToName = new Map<number, string>(rawPhases.map(p => [p.phaseID, p.phaseName]));
+  const blockIdToDoc = new Map<number, BlockDoc>(rawBlocks.map(b => [b.blockID, transformBlock(b)]));
+  console.log(`   Phase lookup map: ${phaseIdToCode.size} entries`);
+  console.log(`   Block lookup map: ${blockIdToDoc.size} entries`);
+  console.log('');
+
+  const client = new MongoClient(MONGO_URI);
+
+  // Everything that can fail after the client exists runs inside the try so
+  // the client is always closed, including when connect() itself rejects.
+  try {
+    // 4. Connect to MongoDB Atlas
+    console.log('🔗 Connecting to MongoDB Atlas...');
+    await client.connect();
+    const db: Db = client.db(MONGO_DB_NAME);
+    console.log('   Connected.\n');
+
+    // 5. Initialize Ollama embedder (shared across all three collections)
+    console.log('🤖 Initializing Ollama embedder...');
+    const embedder = new OllamaEmbeddings({
+      model: EMBEDDING_MODEL,
+      baseUrl: OLLAMA_BASE_URL,
+    });
+    console.log('   Embedder ready.\n');
+
+    // ── Phase collection ────────────────────────────────────────────────────
+    console.log(`📦 Vectorizing ${phases.length} phases...`);
+    const phaseTexts = phases.map(buildPhaseEmbeddingText);
+    const phaseVectors = await embedInBatches(embedder, phaseTexts, 'phase');
+    // Validate before clearing so a bad embedding run leaves existing data intact.
+    assertVectorCount('phase', phaseTexts.length, phaseVectors.length);
+
+    console.log('🗑️  Phase: clearing...');
+    await db.collection('Phase').deleteMany({});
+    const phaseDocs = phases.map((p, i) => ({ ...p, vector: phaseVectors[i] }));
+    // insertMany rejects an empty batch, so only insert when there is data.
+    if (phaseDocs.length > 0) {
+      await db.collection('Phase').insertMany(phaseDocs);
+    }
+    console.log(`   ✅ ${phaseDocs.length} phases inserted with vectors.\n`);
+
+    // ── Block collection ────────────────────────────────────────────────────
+    console.log(`📦 Vectorizing ${blocks.length} blocks...`);
+    const blockTexts = blocks.map(buildBlockEmbeddingText);
+    const blockVectors = await embedInBatches(embedder, blockTexts, 'block');
+    assertVectorCount('block', blockTexts.length, blockVectors.length);
+
+    console.log('🗑️  Block: clearing...');
+    await db.collection('Block').deleteMany({});
+    const blockDocs = blocks.map((b, i) => ({ ...b, vector: blockVectors[i] }));
+    if (blockDocs.length > 0) {
+      await db.collection('Block').insertMany(blockDocs);
+    }
+    console.log(`   ✅ ${blockDocs.length} blocks inserted with vectors.\n`);
+
+    // ── FFB Production collection ───────────────────────────────────────────
+    // NOTE: unlike Phase/Block, this collection is cleared BEFORE its
+    // embeddings are computed, so a failure mid-run leaves it partially seeded.
+    console.log('🗑️  FFB Production: clearing...');
+    await db.collection('FFB Production').deleteMany({});
+    console.log('   Collection cleared.\n');
+
+    const totalBatches = Math.ceil(rawFFBs.length / BATCH_SIZE);
+    let insertedCount = 0;
+    let skippedCount = 0;
+
+    console.log(`📦 Processing ${rawFFBs.length} FFB records in ${totalBatches} batches of ${BATCH_SIZE}...`);
+    console.log('');
+
+    for (let i = 0; i < rawFFBs.length; i += BATCH_SIZE) {
+      const batch = rawFFBs.slice(i, i + BATCH_SIZE);
+      const batchNum = Math.floor(i / BATCH_SIZE) + 1;
+      process.stdout.write(`   [${batchNum}/${totalBatches}] resolving...`);
+
+      // Resolve phaseCode/block for each record using in-memory maps
+      const resolved: Array<{ raw: RawFFB; phaseCode: string; phaseName: string; block: BlockDoc }> = [];
+      let batchSkip = 0;
+
+      for (const raw of batch) {
+        const phaseCode = phaseIdToCode.get(raw.phaseId);
+        const phaseName = phaseIdToName.get(raw.phaseId) ?? '';
+        const block = blockIdToDoc.get(raw.blockId);
+        // A record is only usable when both its phase and block exist in master data.
+        if (!phaseCode || !block) {
+          batchSkip++;
+        } else {
+          resolved.push({ raw, phaseCode, phaseName, block });
+        }
+      }
+
+      skippedCount += batchSkip;
+
+      if (resolved.length === 0) {
+        console.log(` all ${batchSkip} records have unresolvable IDs — skipped.`);
+        continue;
+      }
+
+      // Generate embeddings for the entire resolved batch in one Ollama call
+      process.stdout.write(` embedding ${resolved.length}...`);
+      const texts = resolved.map(r =>
+        buildFFBEmbeddingText(r.raw, r.phaseCode, r.phaseName, r.block)
+      );
+      const vectors = await embedder.embedDocuments(texts);
+      assertVectorCount(`FFB batch ${batchNum}`, texts.length, vectors.length);
+
+      // Build final documents and insert
+      const docs = resolved.map((r, idx) =>
+        buildFFBDoc(r.raw, r.phaseCode, r.phaseName, r.block, vectors[idx])
+      );
+      await db.collection('FFB Production').insertMany(docs);
+      insertedCount += docs.length;
+
+      const skipNote = batchSkip > 0 ? ` (${batchSkip} skipped)` : '';
+      console.log(` ✅ ${docs.length} inserted.${skipNote}`);
+    }
+
+    // ── Final summary ───────────────────────────────────────────────────────
+    console.log('');
+    console.log('═══════════════════════════════════════════════════════');
+    console.log('  SEED COMPLETE');
+    console.log(`  Phases inserted (with vectors):  ${phaseDocs.length}`);
+    console.log(`  Blocks inserted (with vectors):  ${blockDocs.length}`);
+    console.log(`  FFB records inserted:            ${insertedCount}`);
+    if (skippedCount > 0) {
+      console.log(`  FFB records skipped:             ${skippedCount}  (phaseId/blockId not in master)`);
+    }
+    console.log('═══════════════════════════════════════════════════════');
+    console.log('');
+
+  } finally {
+    await client.close();
+  }
+}
+
+// Script entry point: run the seed and exit non-zero on any failure.
+// The rejection value is treated as `unknown` because a promise can reject
+// with anything (including null/undefined), and dereferencing `.message`
+// on such a value would throw inside this handler and skip process.exit(1).
+seed().catch((err: unknown) => {
+  const detail = err instanceof Error && err.message ? err.message : err;
+  console.error('\n❌ Seed failed:', detail);
+  process.exit(1);
+});