AutomationFlows › General › Scrape Multipage Websites with Jina.ai

Scrape Multipage Websites with Jina.ai

Original n8n title: 💡🌐 Essential Multipage Website Scraper with Jina.ai

💡🌐 Essential Multipage Website Scraper with Jina.ai. Uses stickyNote, manualTrigger, splitInBatches, limit. Event-driven trigger; 16 nodes.

Event trigger · ★★★★☆ complexity · 16 nodes · HTTP Request · XML · Google Drive
General · Trigger: Event · Nodes: 16 · Complexity: ★★★★☆ · Added: (not specified)

This workflow follows the Google Drive → HTTP Request recipe pattern — see all workflows that pair these two integrations.

The workflow JSON

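Copy or download the full n8n JSON below. Paste it into a new n8n workflow, add your credentials, and run it. This workflow starts from a manual trigger, so click "Test workflow" to execute it rather than activating it. Full import guide →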

Download .json
{
  "id": "xEij0kj2I1DHbL3I",
  "meta": {
    "templateCredsSetupCompleted": true
  },
  "name": "\ud83d\udca1\ud83c\udf10 Essential Multipage Website Scraper with Jina.ai",
  "tags": [],
  "nodes": [
    {
      "id": "3a503859-ef0a-492d-81c6-37e4f0c4c25e",
      "name": "Sticky Note",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -840,
        0
      ],
      "parameters": {
        "color": 3,
        "width": 340,
        "height": 320,
        "content": "## Jina.ai Web Scraper\n### No API Key Required\n"
      },
      "typeVersion": 1
    },
    {
      "id": "c5217a1a-f074-409b-8340-72afdc5fc8b5",
      "name": "When clicking \u2018Test workflow\u2019",
      "type": "n8n-nodes-base.manualTrigger",
      "position": [
        -1500,
        -300
      ],
      "parameters": {},
      "typeVersion": 1
    },
    {
      "id": "72af3b00-2632-4877-a0b6-7477e2f468f7",
      "name": "Loop Over Items",
      "type": "n8n-nodes-base.splitInBatches",
      "position": [
        -1080,
        20
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 3
    },
    {
      "id": "11f0fa02-51f8-41cc-b789-5c452b6899aa",
      "name": "Wait",
      "type": "n8n-nodes-base.wait",
      "position": [
        80,
        220
      ],
      "parameters": {},
      "typeVersion": 1.1
    },
    {
      "id": "cf3b5887-8ff2-46e0-ab33-384ab0987cbb",
      "name": "Limit",
      "type": "n8n-nodes-base.limit",
      "position": [
        80,
        -300
      ],
      "parameters": {
        "maxItems": 20
      },
      "typeVersion": 1
    },
    {
      "id": "c4f04d82-aa33-46cf-a8e2-0b4e717e754a",
      "name": "Get List of Website URLs",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        -780,
        -300
      ],
      "parameters": {
        "url": "={{ $json.sitemap_url }}",
        "options": {}
      },
      "typeVersion": 4.2
    },
    {
      "id": "7f507c38-1e9e-4c46-8dea-bd6daf65dc55",
      "name": "Convert to JSON",
      "type": "n8n-nodes-base.xml",
      "position": [
        -560,
        -300
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 1
    },
    {
      "id": "e21b55c2-8b0d-4c7c-ba91-a2d563a4c966",
      "name": "Create List of Website URLs",
      "type": "n8n-nodes-base.splitOut",
      "position": [
        -340,
        -300
      ],
      "parameters": {
        "options": {},
        "fieldToSplitOut": "urlset.url"
      },
      "typeVersion": 1
    },
    {
      "id": "61555239-8a16-424e-8a60-700f6ebaa270",
      "name": "Filter By Topics or Pages",
      "type": "n8n-nodes-base.filter",
      "position": [
        -120,
        -300
      ],
      "parameters": {
        "options": {},
        "conditions": {
          "options": {
            "version": 2,
            "leftValue": "",
            "caseSensitive": true,
            "typeValidation": "strict"
          },
          "combinator": "or",
          "conditions": [
            {
              "id": "d66c304d-879a-4dc4-908f-ab0665093672",
              "operator": {
                "name": "filter.operator.equals",
                "type": "string",
                "operation": "equals"
              },
              "leftValue": "={{ $json.loc }}",
              "rightValue": "=https://ai.pydantic.dev/"
            },
            {
              "id": "3c930950-bee4-442b-82e6-4437fd39a933",
              "operator": {
                "type": "string",
                "operation": "contains"
              },
              "leftValue": "={{ $json.loc.toLowerCase() }}",
              "rightValue": "agent"
            },
            {
              "id": "aaeaf34e-ad5a-4673-b3bd-8bddf3500988",
              "operator": {
                "type": "string",
                "operation": "contains"
              },
              "leftValue": "={{ $json.loc.toLowerCase() }}",
              "rightValue": "tool"
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "dd25fb57-64a3-4c47-be04-6eb66d16520a",
      "name": "Set Website URL",
      "type": "n8n-nodes-base.set",
      "position": [
        -1080,
        -300
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "1601dc3e-8024-4e19-b592-93a4e4f77641",
              "name": "sitemap_url",
              "type": "string",
              "value": "https://ai.pydantic.dev/sitemap.xml"
            }
          ]
        }
      },
      "typeVersion": 3.4
    },
    {
      "id": "14ac1c87-29fe-44c8-9c1e-f247a292dde5",
      "name": "Jina.ai Web Scraper",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        -720,
        120
      ],
      "parameters": {
        "url": "=https://r.jina.ai/{{ $json.loc }}",
        "options": {}
      },
      "typeVersion": 4.2
    },
    {
      "id": "be253ec2-f088-4895-8ef2-61a3720cf68b",
      "name": "Save Webpage Contents to Google Drive",
      "type": "n8n-nodes-base.googleDrive",
      "position": [
        -120,
        120
      ],
      "parameters": {
        "name": "={{ $('Loop Over Items').item.json.loc }} - {{ $json.title }}",
        "content": "={{ $json.markdown }}",
        "driveId": {
          "__rl": true,
          "mode": "list",
          "value": "My Drive"
        },
        "options": {},
        "folderId": {
          "__rl": true,
          "mode": "list",
          "value": "root",
          "cachedResultName": "/ (Root folder)"
        },
        "operation": "createFromText"
      },
      "credentials": {
        "googleDriveOAuth2Api": {
          "name": "<your credential>"
        }
      },
      "typeVersion": 3
    },
    {
      "id": "95d808c7-a3ca-4f59-a385-cc77bdff322e",
      "name": "Extract Title & Markdown Content",
      "type": "n8n-nodes-base.code",
      "position": [
        -380,
        120
      ],
      "parameters": {
        "jsCode": "// Get the text output from the previous node\nconst data = $input.first().json.data;\n\n// Regular expression to capture the title line\nconst titleRegex = /^Title:\\s*(.+)$/m;\n// Regular expression to capture everything after \"Markdown Content:\"\nconst markdownRegex = /Markdown Content:\\n([\\s\\S]+)/;\n\n// Extract the title using the first capture group\nconst titleMatch = data.match(titleRegex);\nconst title = titleMatch ? titleMatch[1].trim() : '';\n\n// Extract the markdown content using the first capture group\nconst markdownMatch = data.match(markdownRegex);\nconst markdown = markdownMatch ? markdownMatch[1].trim() : '';\n\n// Return a single object with title and markdown as unique values\nreturn { title, markdown };"
      },
      "typeVersion": 2
    },
    {
      "id": "2fb86c81-c144-4450-908c-559855deadef",
      "name": "Sticky Note1",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1240,
        -580
      ],
      "parameters": {
        "color": 7,
        "width": 1540,
        "height": 1080,
        "content": "# \ud83d\udca1\ud83c\udf10 Essential Multipage Website Scraper with Jina.ai\n## Scrape entire websites with this workflow\n**Use responsibly and follow local rules and regulations**"
      },
      "typeVersion": 1
    },
    {
      "id": "b470b294-95d0-4e51-a9cc-2fe17316a771",
      "name": "Sticky Note2",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1580,
        -400
      ],
      "parameters": {
        "color": 4,
        "width": 280,
        "height": 300,
        "content": "## \ud83d\udc4dTry Me!"
      },
      "typeVersion": 1
    },
    {
      "id": "fafd0623-a423-4e73-9609-cee8e81f5c13",
      "name": "Sticky Note3",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1180,
        -400
      ],
      "parameters": {
        "width": 300,
        "height": 300,
        "content": "## \ud83d\udc47Add Website Sitemap URL"
      },
      "typeVersion": 1
    }
  ],
  "active": false,
  "settings": {
    "executionOrder": "v1"
  },
  "versionId": "2e815787-d83b-4ab7-a959-2f33006a37a5",
  "connections": {
    "Wait": {
      "main": [
        [
          {
            "node": "Loop Over Items",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Limit": {
      "main": [
        [
          {
            "node": "Loop Over Items",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Convert to JSON": {
      "main": [
        [
          {
            "node": "Create List of Website URLs",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Loop Over Items": {
      "main": [
        [],
        [
          {
            "node": "Jina.ai Web Scraper",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Set Website URL": {
      "main": [
        [
          {
            "node": "Get List of Website URLs",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Jina.ai Web Scraper": {
      "main": [
        [
          {
            "node": "Extract Title & Markdown Content",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Get List of Website URLs": {
      "main": [
        [
          {
            "node": "Convert to JSON",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Filter By Topics or Pages": {
      "main": [
        [
          {
            "node": "Limit",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Create List of Website URLs": {
      "main": [
        [
          {
            "node": "Filter By Topics or Pages",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract Title & Markdown Content": {
      "main": [
        [
          {
            "node": "Save Webpage Contents to Google Drive",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "When clicking \u2018Test workflow\u2019": {
      "main": [
        [
          {
            "node": "Set Website URL",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Save Webpage Contents to Google Drive": {
      "main": [
        [
          {
            "node": "Wait",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}

Credentials you'll need

Each integration node will prompt for credentials when you import. We strip credential IDs before publishing — you'll add your own.

Pro

For the full experience, including quality scoring and batch-install features for each workflow, upgrade to Pro.

How this works

Efficiently extract and organise content from entire websites without manual browsing, saving hours of tedious data collection for researchers, marketers, or analysts needing comprehensive insights. This workflow targets multipage sites: starting from the site's sitemap URL, it fetches the sitemap XML with an HTTP Request, converts it to structured JSON, and splits it into a list of page URLs, which are then filtered by topic and capped at 20 pages. The key step involves looping through the remaining URLs in batches, scraping each page through Jina.ai (no API key required), extracting the page title and Markdown content, and saving the result as a text file in Google Drive, with controlled waits between iterations to respect site limits.

Use this when scraping structured data from news sites, blogs, or directories that publish an XML sitemap and have clear URL patterns, especially for ongoing monitoring tasks triggered by events. Avoid it for single-page sites, where a single direct HTTP request suffices, and for sites with heavy JavaScript rendering, where browser automation alternatives are a better fit. Common variations include adding filters for specific content types or integrating with email notifications for scraped updates.

About this workflow

💡🌐 Essential Multipage Website Scraper with Jina.ai. Uses stickyNote, manualTrigger, splitInBatches, limit. Event-driven trigger; 16 nodes.

Source: https://github.com/Zie619/n8n-workflows — original creator credit. Request a take-down →

More General workflows → · Browse all categories →

Related workflows

Workflows that share integrations, category, or trigger type with this one. All free to copy and import.

General

AutoQoutesV2_template. Uses manualTrigger, httpRequest, stickyNote, googleSheets. Event-driven trigger; 28 nodes.

HTTP Request, Google Sheets, Google Drive +2
General

AutoClip – Automatically Generate Video Clips and Upload to YouTube. Uses manualTrigger, googleSheets, googleDrive, stickyNote. Event-driven trigger; 23 nodes.

Google Sheets, Google Drive, Read Write File +2
General

n8n workflow deployer. Uses httpRequest, stickyNote, manualTrigger, extractFromFile. Event-driven trigger; 21 nodes.

HTTP Request, Google Drive, Google Drive Trigger
General

Http Stickynote. Uses httpRequest, manualTrigger, googleDrive, stickyNote. Event-driven trigger; 21 nodes.

HTTP Request, Google Drive
General

Image-to-3D. Uses manualTrigger, httpRequest, scheduleTrigger, stickyNote. Event-driven trigger; 17 nodes.

HTTP Request, Google Sheets, Google Drive