{
  "name": "Document Ingestion Pipeline",
  "nodes": [
    {
      "parameters": {
        "content": "## Document Ingestion Pipeline\n\nThis workflow processes PDF documents and stores them as vector embeddings in PostgreSQL.\n\n**Steps:**\n1. Trigger manually or detect new files\n2. Read and extract text from PDFs\n3. Split text into chunks\n4. Generate embeddings via Ollama\n5. Store in pgvector database",
        "height": 280,
        "width": 320
      },
      "id": "sticky-note",
      "name": "Instructions",
      "type": "n8n-nodes-base.stickyNote",
      "typeVersion": 1,
      "position": [
        -200,
        -100
      ]
    },
    {
      "parameters": {},
      "id": "manual-trigger",
      "name": "Manual Trigger",
      "type": "n8n-nodes-base.manualTrigger",
      "typeVersion": 1,
      "position": [
        0,
        200
      ]
    },
    {
      "parameters": {
        "command": "ls -1 /home/node/.n8n-files/*.pdf /home/node/.n8n-files/*.txt /home/node/.n8n-files/*.md 2>/dev/null || echo ''"
      },
      "id": "list-files",
      "name": "List PDF Files",
      "type": "n8n-nodes-base.executeCommand",
      "typeVersion": 1,
      "position": [
        220,
        200
      ]
    },
    {
      "parameters": {
        "jsCode": "// Turn the newline-separated stdout listing into one item per supported file.\nconst SUPPORTED = ['.pdf', '.txt', '.md'];\nconst listing = $input.first().json.stdout || '';\n\n// True when the path ends with one of the supported extensions (case-insensitive)\nconst isSupported = (path) => {\n  const lowered = path.toLowerCase();\n  return SUPPORTED.some((suffix) => lowered.endsWith(suffix));\n};\n\nconst items = [];\nfor (const filePath of listing.trim().split('\\n')) {\n  if (!filePath || !isSupported(filePath)) continue;\n  const fileName = filePath.split('/').pop();\n  const fileType = fileName.split('.').pop().toLowerCase();\n  items.push({ json: { filePath, fileName, fileType } });\n}\n\nif (items.length === 0) {\n  throw new Error('No supported files found in /home/node/.n8n-files/ (supported: PDF, TXT, MD)');\n}\n\nreturn items;"
      },
      "id": "parse-file-list",
      "name": "Parse File List",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [
        440,
        200
      ]
    },
    {
      "parameters": {
        "filePath": "={{ $json.filePath }}"
      },
      "id": "read-binary",
      "name": "Read Binary File",
      "type": "n8n-nodes-base.readBinaryFile",
      "typeVersion": 1,
      "position": [
        660,
        200
      ]
    },
    {
      "parameters": {},
      "id": "read-pdf",
      "name": "Extract PDF Text",
      "type": "n8n-nodes-base.readPDF",
      "typeVersion": 1,
      "position": [
        880,
        200
      ]
    },
    {
      "parameters": {
        "jsCode": "// Split the extracted text of every input item (one item per file) into overlapping chunks.\nconst chunkSize = 1000;\nconst chunkOverlap = 200;\n\n// Find the file name for an input item: the linked 'Parse File List' item first,\n// then any binary attachment's fileName, then the first listed file as a last resort.\nfunction resolveFileName(item, itemIndex) {\n  try {\n    const linked = $('Parse File List').itemMatching(itemIndex);\n    if (linked?.json?.fileName) return linked.json.fileName;\n  } catch (error) {\n    // Item linking is unavailable here; fall through to the other sources.\n  }\n  const attachment = Object.values(item.binary || {}).find(entry => entry?.fileName);\n  if (attachment) return attachment.fileName;\n  return $('Parse File List').first().json.fileName;\n}\n\nconst chunks = [];\n\n$input.all().forEach((item, itemIndex) => {\n  const text = item.json.text || '';\n  const fileName = resolveFileName(item, itemIndex);\n\n  if (!text.trim()) {\n    throw new Error(`No text extracted from ${fileName}`);\n  }\n\n  let start = 0;\n  let chunkIndex = 0;\n\n  while (start < text.length) {\n    const end = Math.min(start + chunkSize, text.length);\n    let chunk = text.slice(start, end);\n\n    // Try to break at a sentence or line boundary in the second half of the window\n    if (end < text.length) {\n      const breakPoint = Math.max(chunk.lastIndexOf('.'), chunk.lastIndexOf('\\n'));\n      if (breakPoint > chunkSize * 0.5) {\n        chunk = chunk.slice(0, breakPoint + 1);\n      }\n    }\n\n    const chunkEnd = start + chunk.length;\n    const content = chunk.trim();\n\n    // Whitespace-only chunks have nothing to embed, so they are skipped\n    if (content) {\n      chunks.push({\n        json: {\n          documentName: fileName,\n          chunkIndex,\n          content,\n          metadata: {\n            fileName,\n            chunkIndex,\n            charStart: start,\n            charEnd: chunkEnd\n          }\n        }\n      });\n      chunkIndex++;\n    }\n\n    // Once a chunk reaches the end of the text, stepping back by the overlap\n    // would only emit a tail that the chunk already contains\n    if (chunkEnd >= text.length) break;\n\n    // Advance while keeping the overlap, but never move backwards or stall\n    const next = chunkEnd - chunkOverlap;\n    start = next > start ? next : chunkEnd;\n  }\n});\n\nreturn chunks;"
      },
      "id": "text-splitter",
      "name": "Split into Chunks",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [
        1100,
        200
      ]
    },
    {
      "parameters": {
        "method": "POST",
        "url": "http://192.168.50.49:11434/api/embeddings",
        "sendBody": true,
        "specifyBody": "json",
        "jsonBody": "={{ JSON.stringify({ model: 'mxbai-embed-large:latest', prompt: $json.content }) }}",
        "options": {
          "timeout": 60000
        }
      },
      "id": "generate-embedding",
      "name": "Generate Embedding",
      "type": "n8n-nodes-base.httpRequest",
      "typeVersion": 4.2,
      "position": [
        1320,
        200
      ]
    },
    {
      "parameters": {
        "jsCode": "// Build one database row per embedding response instead of only the first one.\nreturn $input.all().map((response, index) => {\n  const embedding = response.json.embedding;\n\n  // An empty array cannot be turned into a usable vector literal, so it is rejected as well\n  if (!Array.isArray(embedding) || embedding.length === 0) {\n    throw new Error('Invalid embedding response from Ollama');\n  }\n\n  // Trace this response back to the chunk it was generated from\n  const chunk = $('Split into Chunks').itemMatching(index);\n\n  return {\n    json: {\n      documentName: chunk.json.documentName,\n      chunkIndex: chunk.json.chunkIndex,\n      content: chunk.json.content,\n      // Format embedding as PostgreSQL vector string\n      embedding: '[' + embedding.join(',') + ']',\n      metadata: JSON.stringify(chunk.json.metadata)\n    }\n  };\n});"
      },
      "id": "format-for-db",
      "name": "Format for Database",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [
        1540,
        200
      ]
    },
    {
      "parameters": {
        "operation": "executeQuery",
        "query": "SELECT ingest_document_chunk(\n  '{{ $json.documentName.replace(/'/g, \"''\") }}',\n  {{ $json.chunkIndex }},\n  '{{ $json.content.replace(/'/g, \"''\") }}',\n  '{{ $json.embedding }}'::vector,\n  '{{ $json.metadata.replace(/'/g, \"''\") }}'::jsonb\n) as id;",
        "options": {}
      },
      "id": "store-in-pgvector",
      "name": "Store in PGVector",
      "type": "n8n-nodes-base.postgres",
      "typeVersion": 2.5,
      "position": [
        1760,
        200
      ],
      "credentials": {
        "postgres": {
          "name": "<your credential>"
        }
      }
    },
    {
      "parameters": {
        "aggregate": "aggregateAllItemData",
        "destinationFieldName": "results",
        "options": {}
      },
      "id": "aggregate-results",
      "name": "Aggregate Results",
      "type": "n8n-nodes-base.aggregate",
      "typeVersion": 1,
      "position": [
        1980,
        200
      ]
    },
    {
      "parameters": {
        "jsCode": "// Summarize the run: number of stored chunks and the distinct documents they came from.\nconst results = $input.first().json.results || [];\nconst totalChunks = results.length;\n\n// The stored rows only contain the id returned by the query, so the document\n// names are read from the rows that were prepared for the database.\nconst documentNames = $('Format for Database').all().map(row => row.json.documentName);\nconst uniqueDocs = [...new Set(documentNames)].filter(Boolean);\n\nreturn [{\n  json: {\n    success: true,\n    message: `Successfully processed ${uniqueDocs.length} document(s) with ${totalChunks} chunks`,\n    documents: uniqueDocs,\n    totalChunks\n  }\n}];"
      },
      "id": "summary",
      "name": "Generate Summary",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [
        2200,
        200
      ]
    }
  ],
  "connections": {
    "Manual Trigger": {
      "main": [
        [
          {
            "node": "List PDF Files",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "List PDF Files": {
      "main": [
        [
          {
            "node": "Parse File List",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Parse File List": {
      "main": [
        [
          {
            "node": "Read Binary File",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Read Binary File": {
      "main": [
        [
          {
            "node": "Extract PDF Text",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract PDF Text": {
      "main": [
        [
          {
            "node": "Split into Chunks",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Split into Chunks": {
      "main": [
        [
          {
            "node": "Generate Embedding",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Generate Embedding": {
      "main": [
        [
          {
            "node": "Format for Database",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Format for Database": {
      "main": [
        [
          {
            "node": "Store in PGVector",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Store in PGVector": {
      "main": [
        [
          {
            "node": "Aggregate Results",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Aggregate Results": {
      "main": [
        [
          {
            "node": "Generate Summary",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  },
  "settings": {
    "executionOrder": "v1"
  },
  "staticData": null,
  "tags": [
    {
      "name": "RAG"
    },
    {
      "name": "Document Processing"
    }
  ],
  "triggerCount": 1
}