{
  "name": "xlsx-ingest",
  "settings": {
    "executionOrder": "v1",
    "saveExecutionProgress": true,
    "saveDataErrorExecution": "all",
    "saveDataSuccessExecution": "all"
  },
  "nodes": [
    {
      "id": "manual",
      "name": "Manual run",
      "type": "n8n-nodes-base.manualTrigger",
      "typeVersion": 1,
      "position": [
        240,
        220
      ],
      "parameters": {}
    },
    {
      "id": "schedule",
      "name": "Every minute",
      "type": "n8n-nodes-base.scheduleTrigger",
      "typeVersion": 1.2,
      "position": [
        240,
        420
      ],
      "parameters": {
        "rule": {
          "interval": [
            {
              "field": "minutes",
              "minutesInterval": 1
            }
          ]
        }
      }
    },
    {
      "id": "list",
      "name": "List incoming spreadsheets",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [
        480,
        320
      ],
      "parameters": {
        "language": "javaScript",
        "jsCode": "// Walk /data/spreadsheets/incoming/<assignment>/*.xlsx and atomically claim each\n// file by renaming it into a .working/ subfolder. fs.renameSync is atomic on\n// the same filesystem, so two concurrent scheduler ticks can never claim the\n// same file -> no duplicate processing, no duplicate points.\nconst fs = require('fs');\nconst path = require('path');\nconst root = '/data/spreadsheets/incoming';\nconst out = [];\nlet entries = [];\ntry { entries = fs.readdirSync(root, { withFileTypes: true }); } catch (e) { return out; }\nfor (const e of entries) {\n  if (!e.isDirectory()) continue;\n  const assignment = e.name;\n  const sub = path.join(root, assignment);\n  const working = path.join(sub, '.working');\n  let files = [];\n  try { files = fs.readdirSync(sub); } catch (_) { continue; }\n  for (const f of files) {\n    if (f.startsWith('.')) continue; // skip dotfiles incl. .working\n    const lo = f.toLowerCase();\n    if (!lo.endsWith('.xlsx') && !lo.endsWith('.xlsm')) continue;\n    const src = path.join(sub, f);\n    const dst = path.join(working, f);\n    try { fs.mkdirSync(working, { recursive: true }); } catch (_) {}\n    try {\n      fs.renameSync(src, dst); // atomic claim\n    } catch (_) {\n      continue; // someone else (or a previous tick) already grabbed it\n    }\n    out.push({ json: { assignment, filename: f, path: dst } });\n  }\n}\nreturn out;\n"
      }
    },
    {
      "id": "process",
      "name": "Ingest spreadsheet",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [
        720,
        320
      ],
      "parameters": {
        "mode": "runOnceForEachItem",
        "language": "javaScript",
        "jsCode": "/**\n * Per-file pipeline:\n *   1. POST raw bytes to spreadsheet-mcp parser  -> {sheets, profile, sha256}\n *   2. Ask Qwen to summarize the file (used for retrieval and SQL hinting)\n *   3. UPSERT into postgres.spreadsheets\n *   4. Embed the summary with bge-m3 and upsert into qdrant 'spreadsheets-meta'\n *   5. Move source file to processed/<assignment>/  (or failed/<assignment>/)\n */\nconst fs   = require('fs');\nconst path = require('path');\nconst http = require('http');\nconst https = require('https');\nconst { URL } = require('url');\n\nconst PARSER_URL  = ($env.PARSER_URL  || 'http://spreadsheet-mcp:8090').replace(/\\/$/, '');\nconst LLM_BASE    = ($env.LLM_BASE_URL || 'https://chat.example.com/v1').replace(/\\/$/, '');\nconst LLM_MODEL   = $env.LLM_MODEL || 'Qwen3.6-35B-A3B-Q5_K_M';\nconst EMBED_BASE  = ($env.EMBEDDINGS_BASE_URL || 'https://embed.example.com/v1').replace(/\\/$/, '');\nconst EMBED_MODEL = $env.EMBEDDINGS_MODEL || 'bge-m3';\nconst QDRANT_URL  = ($env.QDRANT_URL || 'http://qdrant:6333').replace(/\\/$/, '');\nconst QDRANT_COL  = 'spreadsheets-meta';\nconst PG_HOST     = $env.POSTGRES_HOST || 'postgres';\nconst PG_PORT     = Number($env.POSTGRES_PORT || 5432);\nconst PG_DB       = $env.POSTGRES_DB   || 'boxai';\nconst PG_USER     = $env.POSTGRES_USER || 'boxai';\nconst PG_PASS     = $env.POSTGRES_PASSWORD || '';\n\n// --- minimal http(s) JSON helper -------------------------------------------\nfunction httpJson(method, urlStr, body, headers) {\n  return new Promise((resolve, reject) => {\n    const u = new URL(urlStr);\n    const lib = u.protocol === 'https:' ? https : http;\n    const h = Object.assign({ 'Accept': 'application/json' }, headers || {});\n    let payload = null;\n    if (body != null) {\n      payload = Buffer.isBuffer(body) ? body : Buffer.from(JSON.stringify(body));\n      if (!h['Content-Type']) h['Content-Type'] = 'application/json';\n      h['Content-Length'] = payload.length;\n    }\n    const req = lib.request({\n      method,\n      hostname: u.hostname,\n      port: u.port || (u.protocol === 'https:' ? 443 : 80),\n      path: u.pathname + u.search,\n      headers: h,\n    }, (res) => {\n      const chunks = [];\n      res.on('data', (c) => chunks.push(c));\n      res.on('end', () => {\n        const text = Buffer.concat(chunks).toString('utf8');\n        if (res.statusCode >= 200 && res.statusCode < 300) {\n          try { resolve(JSON.parse(text)); } catch (_) { resolve(text); }\n        } else {\n          reject(new Error(`${method} ${urlStr} -> ${res.statusCode}: ${text.slice(0, 400)}`));\n        }\n      });\n    });\n    req.on('error', reject);\n    if (payload) req.write(payload);\n    req.end();\n  });\n}\n\nfunction safeMove(src, destDir) {\n  fs.mkdirSync(destDir, { recursive: true });\n  const dest = path.join(destDir, path.basename(src));\n  try { fs.renameSync(src, dest); }\n  catch (_) { fs.copyFileSync(src, dest); fs.unlinkSync(src); }\n  return dest;\n}\n\n// --- LLM summarizer --------------------------------------------------------\nasync function summarize(parsed, assignment) {\n  // Send the model a compact view: per-sheet column names, types, samples, and\n  // first 5 rows. Never the full data.\n  const compact = parsed.profile.sheets.map((s) => ({\n    sheet: s.name,\n    row_count: s.row_count,\n    columns: s.columns.map((c) => ({\n      name: c.name, type: c.type,\n      samples: (c.samples || []).slice(0, 5),\n      distinct_estimate: c.distinct_estimate,\n      null_count: c.null_count,\n    })),\n    head: (parsed.sheets[s.name] || []).slice(0, 5),\n  }));\n\n  const prompt = `You are profiling an uploaded spreadsheet so another LLM can later answer questions about it via SQL.\n\nFile: ${parsed.filename}\nAssignment: ${assignment}\nSheets and schemas (samples included):\n${JSON.stringify(compact, null, 2)}\n\nRespond ONLY with a single JSON object, no prose, no code fences. Schema:\n{\n  \"summary\": string,                 // 2-4 sentences: what this file contains, time range if any, units, language\n  \"topics\": string[],                // 3-8 short tags, lowercase\n  \"suggested_questions\": string[],   // 3-6 questions a user might ask\n  \"column_descriptions\": { \"<sheet>.<column>\": string }   // one short line per non-obvious column\n}\nWrite the summary in the same language as the data when possible.`;\n\n  const j = await httpJson('POST', `${LLM_BASE}/chat/completions`, {\n    model: LLM_MODEL,\n    temperature: 0.2,\n    messages: [\n      { role: 'system', content: 'You output strictly valid JSON.' },\n      { role: 'user',   content: prompt },\n    ],\n  });\n  const txt = (j.choices && j.choices[0] && j.choices[0].message && j.choices[0].message.content) || '';\n  // Tolerate ```json fences\n  const cleaned = txt.replace(/^```(?:json)?\\s*/i, '').replace(/```\\s*$/i, '').trim();\n  try { return JSON.parse(cleaned); }\n  catch (_) { return { summary: txt.slice(0, 800), topics: [], suggested_questions: [], column_descriptions: {} }; }\n}\n\n// --- Postgres via tiny TCP wire (use n8n Postgres node? we'd rather avoid an\n// extra node + credential setup, so use the pg module via a side-car HTTP).\n// Simpler: do the upsert through a generic SQL endpoint... we don't have one.\n// We DO have spreadsheet-mcp which has pg in it, but it's read-only. The\n// cleanest, smallest-surface approach is to add an internal HTTP route on the\n// parser sidecar to perform the UPSERT. But to keep this single-node-flow we\n// instead require n8n's Postgres node. That node lives outside the Code node.\n//\n// To stay in pure-JS Code, we go through Qdrant + a postgres-write-via-MCP\n// internal endpoint. To avoid adding more endpoints, we use the n8n Postgres\n// node downstream. This Code node just emits the prepared row; the next node\n// does the UPSERT.\n\nconst pdfPath  = $json.path;\nconst assignment = $json.assignment;\nconst filename = $json.filename;\n\ntry {\n  // 1. parse\n  const buf = fs.readFileSync(pdfPath);\n  const parsed = await httpJson('POST', `${PARSER_URL}/parse`,\n    { filename, base64: buf.toString('base64') });\n\n  // 2. summarize via Qwen\n  const meta = await summarize(parsed, assignment);\n\n  // 3. emit a row for the next (Postgres) node to UPSERT\n  const downloadPath = `processed/${assignment}/${filename}`;\n\n  return {\n    status: 'parsed',\n    assignment,\n    filename,\n    sha256: parsed.sha256,\n    download_path: downloadPath,\n    sheets:   parsed.sheets,\n    profile:  parsed.profile,\n    metadata: meta,\n    src_path: pdfPath,\n    // text used for the qdrant point\n    embed_text:\n      `Filename: ${filename}\\nAssignment: ${assignment}\\n` +\n      `Sheets: ${parsed.profile.sheets.map(s => s.name + ' (' + s.row_count + ' rows, cols: ' + s.columns.map(c => c.name).join(', ') + ')').join(' | ')}\\n` +\n      `Summary: ${meta.summary || ''}\\n` +\n      `Topics: ${(meta.topics || []).join(', ')}\\n` +\n      `Suggested questions: ${(meta.suggested_questions || []).join(' | ')}`\n  };\n} catch (err) {\n  let dest = null;\n  try { dest = safeMove(pdfPath, `/data/spreadsheets/failed/${assignment}`); } catch (_) {}\n  return {\n    status: 'error',\n    assignment, filename,\n    error: String(err && err.message || err),\n    moved_to: dest,\n  };\n}\n"
      }
    },
    {
      "id": "skip-errors",
      "name": "Continue if parsed",
      "type": "n8n-nodes-base.if",
      "typeVersion": 2.2,
      "position": [
        960,
        320
      ],
      "parameters": {
        "conditions": {
          "options": {
            "caseSensitive": true,
            "leftValue": "",
            "typeValidation": "strict"
          },
          "combinator": "and",
          "conditions": [
            {
              "leftValue": "={{ $json.status }}",
              "rightValue": "parsed",
              "operator": {
                "type": "string",
                "operation": "equals"
              }
            }
          ]
        }
      }
    },
    {
      "id": "upsert-pg",
      "name": "UPSERT spreadsheet",
      "type": "n8n-nodes-base.postgres",
      "typeVersion": 2.5,
      "position": [
        1200,
        220
      ],
      "parameters": {
        "operation": "executeQuery",
        "query": "INSERT INTO spreadsheets (assignment, filename, sha256, download_path, sheets, profile, metadata)\nVALUES ($1, $2, $3, $4, $5::jsonb, $6::jsonb, $7::jsonb)\nON CONFLICT (assignment, filename) DO UPDATE\n  SET sha256        = EXCLUDED.sha256,\n      download_path = EXCLUDED.download_path,\n      sheets        = EXCLUDED.sheets,\n      profile       = EXCLUDED.profile,\n      metadata      = EXCLUDED.metadata,\n      uploaded_at   = now()\nRETURNING id::text AS id;",
        "options": {
          "queryReplacement": "={{ $json.assignment }},{{ $json.filename }},{{ $json.sha256 }},{{ $json.download_path }},{{ JSON.stringify($json.sheets) }},{{ JSON.stringify($json.profile) }},{{ JSON.stringify($json.metadata) }}"
        }
      },
      "credentials": {
        "postgres": {
          "name": "<your credential>"
        }
      }
    },
    {
      "id": "qdrant-upsert",
      "name": "Embed + upsert qdrant + move",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [
        1440,
        220
      ],
      "parameters": {
        "mode": "runOnceForEachItem",
        "language": "javaScript",
        "jsCode": "const fs   = require('fs');\nconst path = require('path');\nconst http = require('http');\nconst https = require('https');\nconst crypto = require('crypto');\nconst { URL } = require('url');\n\nconst EMBED_BASE  = ($env.EMBEDDINGS_BASE_URL || 'https://embed.example.com/v1').replace(/\\/$/, '');\nconst EMBED_MODEL = $env.EMBEDDINGS_MODEL || 'bge-m3';\nconst QDRANT_URL  = ($env.QDRANT_URL || 'http://qdrant:6333').replace(/\\/$/, '');\nconst QDRANT_COL  = 'spreadsheets-meta';\n\n// Deterministic UUIDv5 from a string. Using a fixed namespace UUID so that\n// (assignment, filename) -> always the same qdrant point id. Re-ingesting the\n// same file overwrites in place; no duplicates can ever exist in qdrant.\nconst NS = '6ba7b810-9dad-11d1-80b4-00c04fd430c8'; // RFC 4122 DNS namespace\nfunction uuidv5(name, nsHex) {\n  const ns = Buffer.from(nsHex.replace(/-/g, ''), 'hex');\n  const h = crypto.createHash('sha1').update(ns).update(Buffer.from(name, 'utf8')).digest();\n  const b = Buffer.from(h.subarray(0, 16));\n  b[6] = (b[6] & 0x0f) | 0x50; // version 5\n  b[8] = (b[8] & 0x3f) | 0x80; // RFC 4122 variant\n  const hex = b.toString('hex');\n  return `${hex.slice(0,8)}-${hex.slice(8,12)}-${hex.slice(12,16)}-${hex.slice(16,20)}-${hex.slice(20,32)}`;\n}\n\nfunction httpJson(method, urlStr, body) {\n  return new Promise((resolve, reject) => {\n    const u = new URL(urlStr);\n    const lib = u.protocol === 'https:' ? https : http;\n    const h = { 'Content-Type': 'application/json', 'Accept': 'application/json' };\n    const payload = body == null ? null : Buffer.from(JSON.stringify(body));\n    if (payload) h['Content-Length'] = payload.length;\n    const req = lib.request({\n      method, hostname: u.hostname,\n      port: u.port || (u.protocol === 'https:' ? 443 : 80),\n      path: u.pathname + u.search, headers: h,\n    }, (res) => {\n      const chunks = [];\n      res.on('data', (c) => chunks.push(c));\n      res.on('end', () => {\n        const text = Buffer.concat(chunks).toString('utf8');\n        if (res.statusCode >= 200 && res.statusCode < 300) {\n          try { resolve(JSON.parse(text)); } catch (_) { resolve(text); }\n        } else reject(new Error(`${method} ${urlStr} -> ${res.statusCode}: ${text.slice(0, 400)}`));\n      });\n    });\n    req.on('error', reject);\n    if (payload) req.write(payload);\n    req.end();\n  });\n}\n\nfunction safeMove(src, destDir) {\n  fs.mkdirSync(destDir, { recursive: true });\n  const dest = path.join(destDir, path.basename(src));\n  try { fs.renameSync(src, dest); } catch (_) { fs.copyFileSync(src, dest); fs.unlinkSync(src); }\n  return dest;\n}\n\n// Pull the upserted id from the previous (Postgres) node, and the metadata\n// from the Code node before it.\nconst pgItem = $input.item.json;\nconst id = pgItem.id;\nconst meta = $('Ingest spreadsheet').item.json;\nconst assignment = meta.assignment;\nconst filename   = meta.filename;\nconst srcPath    = meta.src_path;\n\n// Stable point id keyed on (assignment, filename) -- guarantees uniqueness.\nconst pointId = uuidv5(`${assignment}/${filename}`, NS);\n\ntry {\n  // ensure collection exists (idempotent)\n  try { await httpJson('GET', `${QDRANT_URL}/collections/${QDRANT_COL}`); }\n  catch (_) {\n    await httpJson('PUT', `${QDRANT_URL}/collections/${QDRANT_COL}`,\n      { vectors: { size: 1024, distance: 'Cosine' } });\n    try { await httpJson('PUT', `${QDRANT_URL}/collections/${QDRANT_COL}/index`,\n      { field_name: 'assignment', field_schema: 'keyword' }); } catch (_) {}\n  }\n\n  // embed the summary text\n  const e = await httpJson('POST', `${EMBED_BASE}/embeddings`,\n    { model: EMBED_MODEL, input: meta.embed_text });\n  const vector = e.data[0].embedding;\n\n  // upsert by deterministic id -- if the file is re-ingested the same point\n  // is overwritten in place. No delete step needed; no duplicates possible.\n  await httpJson('PUT', `${QDRANT_URL}/collections/${QDRANT_COL}/points?wait=true`, {\n    points: [{\n      id: pointId,\n      vector,\n      payload: {\n        spreadsheet_id: id,\n        assignment,\n        filename,\n        download_path: meta.download_path,\n        summary: (meta.metadata && meta.metadata.summary) || '',\n      },\n    }],\n  });\n\n  const dest = safeMove(srcPath, `/data/spreadsheets/processed/${assignment}`);\n  return {\n    status: 'ok', assignment, filename, spreadsheet_id: id, point_id: pointId, moved_to: dest,\n  };\n} catch (err) {\n  let dest = null;\n  try { dest = safeMove(srcPath, `/data/spreadsheets/failed/${assignment}`); } catch (_) {}\n  return {\n    status: 'error', assignment, filename, spreadsheet_id: id, point_id: pointId,\n    error: String(err && err.message || err), moved_to: dest,\n  };\n}\n"
      }
    }
  ],
  "connections": {
    "Manual run": {
      "main": [
        [
          {
            "node": "List incoming spreadsheets",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Every minute": {
      "main": [
        [
          {
            "node": "List incoming spreadsheets",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "List incoming spreadsheets": {
      "main": [
        [
          {
            "node": "Ingest spreadsheet",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Ingest spreadsheet": {
      "main": [
        [
          {
            "node": "Continue if parsed",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Continue if parsed": {
      "main": [
        [
          {
            "node": "UPSERT spreadsheet",
            "type": "main",
            "index": 0
          }
        ],
        []
      ]
    },
    "UPSERT spreadsheet": {
      "main": [
        [
          {
            "node": "Embed + upsert qdrant + move",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}