{
  "name": "Web Scraper with AI Extraction",
  "nodes": [
    {
      "parameters": {
        "httpMethod": "POST",
        "path": "scrape-url",
        "responseMode": "responseNode",
        "options": {}
      },
      "id": "b2c3d4e5-2222-4000-8000-000000000001",
      "name": "Webhook - Receive URL",
      "type": "n8n-nodes-base.webhook",
      "typeVersion": 2,
      "position": [
        240,
        300
      ]
    },
    {
      "parameters": {
        "url": "={{ $json.body.url }}",
        "options": {
          "response": {
            "response": {
              "fullResponse": false
            }
          },
          "timeout": 15000
        }
      },
      "id": "b2c3d4e5-2222-4000-8000-000000000002",
      "name": "HTTP Request - Fetch Page",
      "type": "n8n-nodes-base.httpRequest",
      "typeVersion": 4.2,
      "position": [
        480,
        300
      ]
    },
    {
      "parameters": {
        "jsCode": "const html = $input.first().json.data || $input.first().json.body || '';\nconst textContent = html\n  .replace(/<script[^>]*>[\\s\\S]*?<\\/script>/gi, '')\n  .replace(/<style[^>]*>[\\s\\S]*?<\\/style>/gi, '')\n  .replace(/<[^>]+>/g, ' ')\n  .replace(/\\s+/g, ' ')\n  .trim()\n  .substring(0, 8000);\nreturn [{ json: { textContent, url: $('Webhook - Receive URL').first().json.body.url, schema: $('Webhook - Receive URL').first().json.body.schema || 'auto' } }];"
      },
      "id": "b2c3d4e5-2222-4000-8000-000000000003",
      "name": "Code - Strip HTML",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [
        720,
        300
      ]
    },
    {
      "parameters": {
        "resource": "chat",
        "model": "gpt-4o",
        "messages": {
          "values": [
            {
              "content": "=You are a web data extraction specialist. Extract structured data from the following webpage content.\n\nURL: {{ $json.url }}\nRequested schema: {{ $json.schema }}\n\nExtract ALL relevant entities and data points. Return a JSON object with these fields:\n- title: page title\n- description: brief description\n- entities: array of extracted entities with type, name, and attributes\n- metadata: any relevant metadata (dates, authors, categories)\n- structured_data: key-value pairs of all important data found\n\nReturn ONLY valid JSON, no markdown formatting.\n\nPage content:\n{{ $json.textContent }}"
            }
          ]
        },
        "options": {
          "temperature": 0.2,
          "maxTokens": 4096,
          "responseFormat": "json_object"
        }
      },
      "id": "b2c3d4e5-2222-4000-8000-000000000004",
      "name": "OpenAI - Extract Structured Data",
      "type": "@n8n/n8n-nodes-langchain.openAi",
      "typeVersion": 1.4,
      "position": [
        960,
        300
      ],
      "credentials": {
        "openAiApi": {
          "name": "<your credential>"
        }
      }
    },
    {
      "parameters": {
        "jsCode": "const aiResponse = $input.first().json.message.content;\nlet parsed;\ntry {\n  parsed = JSON.parse(aiResponse);\n} catch (e) {\n  parsed = { raw: aiResponse, parseError: true };\n}\nconst url = $('Code - Strip HTML').first().json.url;\nreturn [{ json: { url, extractedAt: new Date().toISOString(), ...parsed } }];"
      },
      "id": "b2c3d4e5-2222-4000-8000-000000000005",
      "name": "Code - Parse AI Response",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [
        1200,
        300
      ]
    },
    {
      "parameters": {
        "operation": "executeQuery",
        "query": "INSERT INTO scraped_data (url, title, description, entities, metadata, structured_data, extracted_at) VALUES ($1, $2, $3, $4::jsonb, $5::jsonb, $6::jsonb, $7) RETURNING id",
        "options": {
          "queryParams": "={{ $json.url }},={{ $json.title }},={{ $json.description }},={{ JSON.stringify($json.entities || []) }},={{ JSON.stringify($json.metadata || {}) }},={{ JSON.stringify($json.structured_data || {}) }},={{ $json.extractedAt }}"
        }
      },
      "id": "b2c3d4e5-2222-4000-8000-000000000006",
      "name": "PostgreSQL - Store Results",
      "type": "n8n-nodes-base.postgres",
      "typeVersion": 2.5,
      "position": [
        1440,
        300
      ],
      "credentials": {
        "postgres": {
          "name": "<your credential>"
        }
      }
    },
    {
      "parameters": {
        "respondWith": "json",
        "responseBody": "={{ JSON.stringify({ success: true, url: $('Code - Parse AI Response').first().json.url, title: $('Code - Parse AI Response').first().json.title, entities: $('Code - Parse AI Response').first().json.entities, structured_data: $('Code - Parse AI Response').first().json.structured_data }) }}"
      },
      "id": "b2c3d4e5-2222-4000-8000-000000000007",
      "name": "Respond with Results",
      "type": "n8n-nodes-base.respondToWebhook",
      "typeVersion": 1.1,
      "position": [
        1680,
        300
      ]
    }
  ],
  "connections": {
    "Webhook - Receive URL": {
      "main": [
        [
          {
            "node": "HTTP Request - Fetch Page",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "HTTP Request - Fetch Page": {
      "main": [
        [
          {
            "node": "Code - Strip HTML",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Code - Strip HTML": {
      "main": [
        [
          {
            "node": "OpenAI - Extract Structured Data",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "OpenAI - Extract Structured Data": {
      "main": [
        [
          {
            "node": "Code - Parse AI Response",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Code - Parse AI Response": {
      "main": [
        [
          {
            "node": "PostgreSQL - Store Results",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "PostgreSQL - Store Results": {
      "main": [
        [
          {
            "node": "Respond with Results",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  },
  "settings": {
    "executionOrder": "v1"
  },
  "staticData": null,
  "tags": [
    {
      "name": "ai-data"
    }
  ]
}