{
  "id": "QZHRRSELf6rrYIYc",
  "meta": {
    "templateCredsSetupCompleted": true
  },
  "name": "Sitemap Link Extractor",
  "tags": [],
  "nodes": [
    {
      "id": "cc29176e-71a0-435d-b880-4c972642b6dd",
      "name": "Load the xml file as JSON",
      "type": "n8n-nodes-base.extractFromFile",
      "position": [
        1792,
        48
      ],
      "parameters": {
        "options": {},
        "operation": "xml",
        "binaryPropertyName": "=data"
      },
      "typeVersion": 1
    },
    {
      "id": "2671b101-8196-4c2a-881b-2326af6f05d7",
      "name": "If it's a binary file",
      "type": "n8n-nodes-base.if",
      "position": [
        896,
        144
      ],
      "parameters": {
        "options": {},
        "conditions": {
          "options": {
            "version": 2,
            "leftValue": "",
            "caseSensitive": true,
            "typeValidation": "strict"
          },
          "combinator": "and",
          "conditions": [
            {
              "id": "2b526d63-52be-48a1-b354-3454d6878c6c",
              "operator": {
                "type": "object",
                "operation": "notEmpty",
                "singleValue": true
              },
              "leftValue": "={{ $binary }}",
              "rightValue": ""
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "f0a656d3-31e3-4a83-a5b9-ea7e6647f9eb",
      "name": "Domain to scrape",
      "type": "n8n-nodes-base.webhook",
      "position": [
        0,
        264
      ],
      "parameters": {
        "path": "1da30868-fbca-4e8e-8580-485afb3fd956",
        "options": {}
      },
      "typeVersion": 2.1
    },
    {
      "id": "66346ed3-4ce8-4a50-a379-61280b7dcd9d",
      "name": "Scrape robots.txt file",
      "type": "n8n-nodes-scrapingbee.ScrapingBee",
      "onError": "continueErrorOutput",
      "position": [
        224,
        264
      ],
      "parameters": {
        "url": "=https://{{ $('Domain to scrape').item.json.query.domain }}/robots.txt",
        "additionalFields": {
          "renderJs": false
        }
      },
      "credentials": {
        "ScrapingBeeApi": {
          "name": "<your credential>"
        }
      },
      "notesInFlow": false,
      "retryOnFail": true,
      "typeVersion": 1,
      "alwaysOutputData": false
    },
    {
      "id": "2fe3fd6d-1105-49e2-8c13-aae809b9a745",
      "name": "Scrape sitemap.xml file",
      "type": "n8n-nodes-scrapingbee.ScrapingBee",
      "position": [
        672,
        144
      ],
      "parameters": {
        "url": "=https://{{ $('Domain to scrape').item.json.query.domain }}/sitemap.xml",
        "additionalFields": {
          "renderJs": false
        }
      },
      "credentials": {
        "ScrapingBeeApi": {
          "name": "<your credential>"
        }
      },
      "notesInFlow": false,
      "retryOnFail": true,
      "typeVersion": 1,
      "alwaysOutputData": false
    },
    {
      "id": "51f47d4e-f7f3-4578-8516-8a8bb03f2123",
      "name": "If sitemap links are found",
      "type": "n8n-nodes-base.if",
      "position": [
        448,
        480
      ],
      "parameters": {
        "options": {},
        "conditions": {
          "options": {
            "version": 2,
            "leftValue": "",
            "caseSensitive": true,
            "typeValidation": "strict"
          },
          "combinator": "and",
          "conditions": [
            {
              "id": "04bb5cbc-fe28-48fe-9b2f-0e390dc278a2",
              "operator": {
                "type": "string",
                "operation": "contains"
              },
              "leftValue": "={{ $json.data }}",
              "rightValue": "Sitemap:"
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "bbd9751f-3f8f-4873-bd5b-54d519b1738a",
      "name": "If it's a .gz file",
      "type": "n8n-nodes-base.if",
      "position": [
        1120,
        48
      ],
      "parameters": {
        "options": {},
        "conditions": {
          "options": {
            "version": 2,
            "leftValue": "",
            "caseSensitive": true,
            "typeValidation": "strict"
          },
          "combinator": "and",
          "conditions": [
            {
              "id": "caba8b1e-efc9-4234-88d5-c45621989717",
              "operator": {
                "name": "filter.operator.equals",
                "type": "string",
                "operation": "equals"
              },
              "leftValue": "={{ $binary.data.fileExtension }}",
              "rightValue": "gz"
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "bfa892a6-7215-4503-99ae-6da5ed012029",
      "name": "Decompress .gz file",
      "type": "n8n-nodes-base.compression",
      "onError": "continueRegularOutput",
      "position": [
        1344,
        144
      ],
      "parameters": {},
      "typeVersion": 1.1
    },
    {
      "id": "85e6ae16-6b30-4241-a762-5fdc3381b6f8",
      "name": "Store the file to data key for easy handling",
      "type": "n8n-nodes-base.code",
      "position": [
        1568,
        144
      ],
      "parameters": {
        "mode": "runOnceForEachItem",
        "jsCode": "const item = $input.item;\n\nitem.binary.data = item.binary.file_0;\ndelete item.binary.file_0;\n\nreturn item;"
      },
      "typeVersion": 2
    },
    {
      "id": "fc8d55c1-ad13-4384-bd7a-06c05c83bb42",
      "name": "Extract non-xml links",
      "type": "n8n-nodes-base.code",
      "position": [
        2016,
        -96
      ],
      "parameters": {
        "jsCode": "// Function node: return only non-XML links as separate items { \"link\": \"...\" }\n// Excludes *.xml, *.xml.gz, any link to www.sitemaps.org, and www.w3.org\n// Robust against escaped \\n, mixed HTML/XML/JSON, and trailing punctuation\n\nconst items = $input.all();\nconst seen = new Set();\nconst results = [];\n\n// Add only non-XML URLs, excluding sitemaps.org and w3.org\nfunction add(url) {\n  if (!url) return;\n\n  // Decode common HTML entity for &\n  url = url.replace(/&amp;/gi, \"&\");\n\n  // Trim trailing punctuation/brackets that often stick to URLs\n  url = url.replace(/[)\\]\\},.;!?'\"\\u00BB\\u203A>]+$/g, \"\");\n\n  // Ignore query/hash when checking extension\n  const noQuery = url.replace(/[#?].*$/, \"\");\n\n  // Skip *.xml and *.xml.gz\n  if (/\\.xml(\\.gz)?$/i.test(noQuery)) return;\n\n  // Skip www.sitemaps.org links\n  if (/^https?:\\/\\/www\\.sitemaps\\.org\\//i.test(noQuery)) return;\n\n  // Skip www.w3.org links\n  if (/^https?:\\/\\/www\\.w3\\.org\\//i.test(noQuery)) return;\n\n  if (!seen.has(url)) {\n    seen.add(url);\n    results.push({ json: { link: url } });\n  }\n}\n\n// Recursively collect all string values from any object/array shape\nfunction collectStrings(value, out) {\n  if (value == null) return;\n  if (typeof value === 'string') { out.push(value); return; }\n  if (Array.isArray(value)) { for (const v of value) collectStrings(v, out); return; }\n  if (typeof value === 'object') {\n    for (const k of Object.keys(value)) collectStrings(value[k], out);\n  }\n}\n\n// Normalize both real control chars and escaped sequences like \"\\n\"\nfunction normalizeText(text) {\n  return String(text)\n    .replace(/\\\\[nrvtf]/g, ' ')   // turn escaped \\n \\r \\t \\v \\f into spaces\n    .replace(/[\\n\\r\\t\\f\\v]+/g, ' '); // collapse real control whitespace\n}\n\n// URL extractor that stops at whitespace AND backslash (handles leftover \"\\n\")\nfunction extractFromText(text) {\n  if (!text) return;\n  const t = normalizeText(text);\n  const urlRe = /https?:\\/\\/[^\\s\\\\<>\"')]+/gi;\n  let m;\n  while ((m = urlRe.exec(t)) !== null) add(m[0]);\n}\n\n// Process all input items\nfor (const item of items) {\n  // Merge any pre-existing links if present\n  if (Array.isArray(item.json?.links)) {\n    for (const l of item.json.links) add(String(l));\n  }\n\n  // Extract from every string field (any key/shape)\n  const strings = [];\n  collectStrings(item.json ?? {}, strings);\n  for (const s of strings) extractFromText(s);\n}\n\nreturn results;\n"
      },
      "typeVersion": 2
    },
    {
      "id": "813a432d-1ad5-4e52-9319-9f76352f03d4",
      "name": "Extract xml links",
      "type": "n8n-nodes-base.code",
      "position": [
        2016,
        240
      ],
      "parameters": {
        "jsCode": "// Function node: extract .xml and .xml.gz from ANY input shape\n// Output: [{ json: { xml: \"https://...\" } }, ... ]\n\nconst items = $input.all();\nconst seen = new Set();\nconst results = [];\n\n// Add a URL if it ends with .xml or .xml.gz (ignoring ?query/#hash)\nfunction add(url) {\n  if (!url) return;\n\n  // Decode common HTML entity for &\n  url = url.replace(/&amp;/gi, \"&\");\n\n  // Trim trailing punctuation/brackets that often stick to URLs\n  url = url.replace(/[)\\]\\},.;!?'\"\\u00BB\\u203A>]+$/g, \"\");\n\n  // Check extension on the path only\n  const noQuery = url.replace(/[#?].*$/, \"\");\n  if (!/\\.xml(\\.gz)?$/i.test(noQuery)) return;\n\n  if (!seen.has(url)) {\n    seen.add(url);\n    results.push({ json: { xml: url } });\n  }\n}\n\n// Recursively collect all string values from any object/array shape\nfunction collectStrings(value, out) {\n  if (value == null) return;\n  if (typeof value === 'string') { out.push(value); return; }\n  if (Array.isArray(value)) { for (const v of value) collectStrings(v, out); return; }\n  if (typeof value === 'object') {\n    for (const k of Object.keys(value)) collectStrings(value[k], out);\n  }\n}\n\n// Normalize both real control chars and escaped sequences like \"\\n\"\nfunction normalizeText(text) {\n  return String(text)\n    .replace(/\\\\[nrvtf]/g, ' ')      // turn escaped \\n \\r \\t \\v \\f into spaces\n    .replace(/[\\n\\r\\t\\f\\v]+/g, ' '); // collapse real control whitespace\n}\n\n// URL extractor that stops at whitespace AND backslash (handles leftover \"\\n\")\nfunction extractFromText(text) {\n  if (!text) return;\n  const t = normalizeText(text);\n  const urlRe = /https?:\\/\\/[^\\s\\\\<>\"')]+/gi; // note the extra '\\\\' stop char\n  let m;\n  while ((m = urlRe.exec(t)) !== null) add(m[0]);\n}\n\n// Process all input items (schema-agnostic)\nfor (const item of items) {\n  // Merge any pre-existing links arrays if present\n  if (Array.isArray(item.json?.links)) {\n    for (const l of item.json.links) extractFromText(String(l));\n  }\n\n  // Extract from every string field (any key/shape, nested included)\n  const strings = [];\n  collectStrings(item.json ?? {}, strings);\n  for (const s of strings) extractFromText(s);\n}\n\nreturn results;"
      },
      "typeVersion": 2
    },
    {
      "id": "d0e312fd-d25e-4af9-998e-819e80d6689a",
      "name": "Scrape xml file",
      "type": "n8n-nodes-scrapingbee.ScrapingBee",
      "onError": "continueErrorOutput",
      "position": [
        2240,
        336
      ],
      "parameters": {
        "url": "={{ $json.xml }}",
        "additionalFields": {
          "renderJs": false
        }
      },
      "credentials": {
        "ScrapingBeeApi": {
          "name": "<your credential>"
        }
      },
      "retryOnFail": true,
      "typeVersion": 1
    },
    {
      "id": "e2a98270-8996-4321-94ff-9b9dfef8cb7b",
      "name": "Append links to sheet",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        2240,
        -96
      ],
      "parameters": {
        "columns": {
          "value": {
            "links": "={{ $json.link }}"
          },
          "schema": [
            {
              "id": "links",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "links",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            }
          ],
          "mappingMode": "defineBelow",
          "matchingColumns": [
            "links"
          ],
          "attemptToConvertTypes": false,
          "convertFieldsToString": false
        },
        "options": {
          "useAppend": true
        },
        "operation": "append",
        "sheetName": {
          "__rl": true,
          "mode": "list",
          "value": "gid=0",
          "cachedResultUrl": "https://docs.google.com/spreadsheets/d/1xRUx3hsaAEJoh3MNUS5iEZLJsIr0Q3_J-h0dNiLZBDo/edit#gid=0",
          "cachedResultName": "Sheet1"
        },
        "documentId": {
          "__rl": true,
          "mode": "url",
          "value": "https://docs.google.com/spreadsheets/d/1xRUx3hsaAEJoh3MNUS5iEZLJsIr0Q3_J-h0dNiLZBDo/edit?usp=sharing"
        }
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "name": "<your credential>"
        }
      },
      "typeVersion": 4.7
    },
    {
      "id": "a8d2c3f8-a711-43bf-96d5-6bbbb01ac9c5",
      "name": "Sticky Note",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -336,
        208
      ],
      "parameters": {
        "width": 272,
        "height": 208,
        "content": "## Input\n\nYou need to send a webhook request with domain as query parameter.\n\nFor example:\n`https://<webhook_link>?domain=n8n.io`"
      },
      "typeVersion": 1
    },
    {
      "id": "5851dd9e-24a6-4842-9699-cbbd322d6f2f",
      "name": "Sticky Note1",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        144,
        432
      ],
      "parameters": {
        "height": 144,
        "content": "## Scrape Robots.txt\n\nMost websites provide sitemap links in robots.txt file so we will scrape it first"
      },
      "typeVersion": 1
    },
    {
      "id": "09c2c465-ff2a-44a4-9536-8dc59290c07d",
      "name": "Sticky Note2",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        608,
        -80
      ],
      "parameters": {
        "height": 176,
        "content": "## Scrape Sitemap.xml\n\nIn case sitemap links are missing in robots.txt file, we will try to scrape sitemap.xml file"
      },
      "typeVersion": 1
    },
    {
      "id": "8939e3bc-458c-414b-94c1-032c229ce369",
      "name": "Sticky Note3",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        384,
        640
      ],
      "parameters": {
        "height": 80,
        "content": "If sitemap links are are available, we will directly extract the xml links"
      },
      "typeVersion": 1
    },
    {
      "id": "82e794a5-3e7d-4d33-951f-06930423b647",
      "name": "Sticky Note4",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        880,
        288
      ],
      "parameters": {
        "height": 96,
        "content": "Sometimes links are received as text content and sometimes they are received as binary, so we need to check for that."
      },
      "typeVersion": 1
    },
    {
      "id": "a16d3039-8c33-4a2f-908c-1d0c8b2504d1",
      "name": "Sticky Note5",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        1408,
        -192
      ],
      "parameters": {
        "height": 176,
        "content": "If it's a .xml.gz file, we need to decompress it. We are also renaming the key because by default they are named `file_0` and we need it to be named as `data` so that we can use a single extraction logic for both `.xml.gz` and `.xml` files"
      },
      "typeVersion": 1
    },
    {
      "id": "6917c1a1-5b0a-4e68-a9d2-f3f7e4bf0de2",
      "name": "Sticky Note6",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        2192,
        64
      ],
      "parameters": {
        "height": 256,
        "content": "## Add Links to Sheet and Scrape XML Links\n\nIf the xml file contains normal links they are extracted and added to sheet. And if it contains other `.xml` links, we will scrape them. Basically, this is a recursive workflow."
      },
      "typeVersion": 1
    },
    {
      "id": "c5c6585a-efe5-44ef-9a78-43651308e5ce",
      "name": "Sticky Note7",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        2192,
        -208
      ],
      "parameters": {
        "height": 80,
        "content": "Connect to a Google Sheet and add `links` as column name"
      },
      "typeVersion": 1
    },
    {
      "id": "ffba7362-954c-4274-ac21-4ecf67e9a536",
      "name": "Sticky Note8",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -304,
        -80
      ],
      "parameters": {
        "color": 3,
        "width": 800,
        "height": 112,
        "content": "## NOTE\nSome heavy sitemaps could result in a crash if the workflow consumes more memory than what is available in your n8n plan or self-hosted system. If this happens, we would recommend you to either upgrade your plan or use a self-hosted solution with a higher memory."
      },
      "typeVersion": 1
    }
  ],
  "active": false,
  "settings": {
    "executionOrder": "v1"
  },
  "versionId": "a0d2792b-22a4-4cf3-9982-e819553258ab",
  "connections": {
    "Scrape xml file": {
      "main": [
        [
          {
            "node": "If it's a binary file",
            "type": "main",
            "index": 0
          }
        ],
        []
      ]
    },
    "Domain to scrape": {
      "main": [
        [
          {
            "node": "Scrape robots.txt file",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract xml links": {
      "main": [
        [
          {
            "node": "Scrape xml file",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "If it's a .gz file": {
      "main": [
        [
          {
            "node": "Decompress .gz file",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Load the xml file as JSON",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Decompress .gz file": {
      "main": [
        [
          {
            "node": "Store the file to data key for easy handling",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Append links to sheet": {
      "main": [
        []
      ]
    },
    "Extract non-xml links": {
      "main": [
        [
          {
            "node": "Append links to sheet",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "If it's a binary file": {
      "main": [
        [
          {
            "node": "If it's a .gz file",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Extract non-xml links",
            "type": "main",
            "index": 0
          },
          {
            "node": "Extract xml links",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Scrape robots.txt file": {
      "main": [
        [
          {
            "node": "If sitemap links are found",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Scrape sitemap.xml file",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Scrape sitemap.xml file": {
      "main": [
        [
          {
            "node": "If it's a binary file",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Load the xml file as JSON": {
      "main": [
        [
          {
            "node": "Extract non-xml links",
            "type": "main",
            "index": 0
          },
          {
            "node": "Extract xml links",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "If sitemap links are found": {
      "main": [
        [
          {
            "node": "Extract xml links",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Scrape sitemap.xml file",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Store the file to data key for easy handling": {
      "main": [
        [
          {
            "node": "Load the xml file as JSON",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}