This workflow corresponds to n8n.io template #7186 — we link there as the canonical source.
This workflow follows the Google Drive → HTTP Request recipe pattern — see all workflows that pair these two integrations.
The workflow JSON
Copy or download the full n8n JSON below. Paste it into a new n8n workflow, add your credentials, activate. Full import guide →
{
"id": "YOUR_WORKFLOW_ID_HERE",
"name": "Simple Working Scraper",
"tags": [],
"nodes": [
{
"id": "86320ce4-511d-4c6d-84c9-5e98de444130",
"name": "Working Scraper Note",
"type": "n8n-nodes-base.stickyNote",
"position": [
-304,
-32
],
"parameters": {
"color": 2,
"width": 400,
"height": 300,
"content": "## \u2705 Simple Working Scraper\n### Now with Full Sitemap Support\n**Uses direct HTTP requests that we know work**\n\n**Features:**\n- Fetches complete sitemap\n- Processes all pages sequentially\n- Respectful 3-second delays\n- Robust error handling"
},
"typeVersion": 1
},
{
"id": "0d6bb0ea-4090-43ce-828f-f05b5571d2e8",
"name": "Start Working Scraper",
"type": "n8n-nodes-base.manualTrigger",
"position": [
-384,
128
],
"parameters": {},
"typeVersion": 1
},
{
"id": "3bee3b45-8b82-4017-bae3-fb1026fd9792",
"name": "Set Sitemap URL",
"type": "n8n-nodes-base.set",
"position": [
-192,
128
],
"parameters": {
"options": {},
"assignments": {
"assignments": [
{
"id": "sitemap-url-id",
"name": "sitemap_url",
"type": "string",
"value": "https://yourwebsitehere.com/page-sitemap.xml"
}
]
}
},
"typeVersion": 3.4
},
{
"id": "92a93278-e61b-4860-b0a6-73b2821dacbb",
"name": "Get Sitemap",
"type": "n8n-nodes-base.httpRequest",
"position": [
16,
128
],
"parameters": {
"url": "={{ $json.sitemap_url }}",
"options": {
"timeout": 15000,
"response": {
"response": {
"neverError": true
}
}
}
},
"typeVersion": 4.2
},
{
"id": "3aa79508-fe19-428b-91d8-77c1e7a7e012",
"name": "Parse Sitemap XML",
"type": "n8n-nodes-base.xml",
"position": [
224,
128
],
"parameters": {
"options": {}
},
"typeVersion": 1
},
{
"id": "f797b739-60eb-478e-b4d8-958dfe51b37a",
"name": "Split Sitemap URLs",
"type": "n8n-nodes-base.splitOut",
"position": [
416,
128
],
"parameters": {
"options": {},
"fieldToSplitOut": "urlset.url"
},
"typeVersion": 1
},
{
"id": "a311732e-68fa-4e96-989e-207565a499d5",
"name": "Limit URLs (Optional)",
"type": "n8n-nodes-base.limit",
"disabled": true,
"position": [
752,
80
],
"parameters": {
"maxItems": 20
},
"typeVersion": 1
},
{
"id": "cfb6df8a-a753-41c2-9983-22fea4bb0b2e",
"name": "Loop URLs",
"type": "n8n-nodes-base.splitInBatches",
"position": [
-192,
272
],
"parameters": {
"options": {}
},
"typeVersion": 3
},
{
"id": "7ae29bfa-f197-447b-a5de-283db54aedf9",
"name": "Scrape Page",
"type": "n8n-nodes-base.httpRequest",
"position": [
16,
272
],
"parameters": {
"url": "={{ $json.loc }}",
"options": {
"timeout": 20000,
"response": {
"response": {
"neverError": true
}
}
},
"sendHeaders": true,
"headerParameters": {
"parameters": [
{
"name": "User-Agent",
"value": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/0.0.0.0 Safari/537.36"
},
{
"name": "Accept",
"value": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
},
{
"name": "Accept-Language",
"value": "en-US,en;q=0.9"
},
{
"name": "Accept-Encoding",
"value": "gzip, deflate, br"
},
{
"name": "Cache-Control",
"value": "no-cache"
},
{
"name": "Pragma",
"value": "no-cache"
}
]
}
},
"typeVersion": 4.2
},
{
"id": "4581c231-86ab-42ae-b57a-a9656a22cf1a",
"name": "Extract Content",
"type": "n8n-nodes-base.code",
"position": [
224,
272
],
"parameters": {
"jsCode": "// Extract content from HTML - handle all response types safely\nconst url = $('Loop URLs').item.json.loc;\nlet html = '';\nlet statusCode = 'Unknown';\n\n// Handle different response formats\nif (typeof $json === 'string') {\n html = $json;\n statusCode = 200; // Assume success if we got a string\n} else if ($json && $json.body) {\n html = $json.body;\n statusCode = $json.statusCode || 200;\n} else if ($json && typeof $json === 'object') {\n // Try to find HTML content in the object\n const possibleHtml = Object.values($json).find(val => \n typeof val === 'string' && (val.includes('<html') || val.includes('<div') || val.length > 100)\n );\n html = possibleHtml || '';\n statusCode = $json.statusCode || 'Unknown';\n}\n\nconsole.log('Processing URL:', url);\nconsole.log('Response type:', typeof $json);\nconsole.log('HTML length:', html ? html.length : 0);\nconsole.log('Status code:', statusCode);\n\nfunction extractTitle(html) {\n if (!html) return 'No Title';\n \n // Try multiple methods\n const titlePatterns = [\n /<title[^>]*>([^<]+)<\\/title>/i,\n /<meta[^>]*property=[\"']og:title[\"'][^>]*content=[\"']([^\"']+)[\"']/i,\n /<h1[^>]*>([^<]+)<\\/h1>/i\n ];\n \n for (const pattern of titlePatterns) {\n const match = html.match(pattern);\n if (match && match[1]) {\n return match[1].trim().replace(/\\s+/g, ' ');\n }\n }\n \n return 'Untitled Page';\n}\n\nfunction extractContent(html) {\n if (!html) return 'No content available';\n \n // Remove scripts, styles, and comments\n let content = html\n .replace(/<script[^>]*>[\\s\\S]*?<\\/script>/gi, '')\n .replace(/<style[^>]*>[\\s\\S]*?<\\/style>/gi, '')\n .replace(/<!--[\\s\\S]*?-->/g, '')\n .replace(/<noscript[^>]*>[\\s\\S]*?<\\/noscript>/gi, '');\n \n // Convert HTML to readable text\n content = content\n .replace(/<h[1-6][^>]*>([^<]+)<\\/h[1-6]>/gi, '\\n\\n## $1\\n\\n')\n .replace(/<p[^>]*>([^<]+)<\\/p>/gi, '\\n\\n$1\\n\\n')\n .replace(/<br\\s*\\/?>/gi, '\\n')\n .replace(/<div[^>]*>([^<]*)<\\/div>/gi, '\\n$1\\n')\n .replace(/<a[^>]*href=[\"']([^\"']*)[\"'][^>]*>([^<]+)<\\/a>/gi, '$2 [$1]')\n .replace(/<li[^>]*>([^<]+)<\\/li>/gi, '\\n\u2022 $1')\n .replace(/<[^>]+>/g, ' ')\n .replace(/\\s+/g, ' ')\n .replace(/\\n\\s+/g, '\\n')\n .replace(/\\s+\\n/g, '\\n')\n .replace(/\\n{3,}/g, '\\n\\n')\n .trim();\n \n return content || 'No readable content found';\n}\n\nconst title = extractTitle(html);\nconst content = extractContent(html);\nconst wordCount = content.split(/\\s+/).filter(w => w.length > 0).length;\n\n// Check for framework indicators\nconst isDivi = html.toLowerCase().includes('divi') || html.includes('et_pb_');\nconst isWordPress = html.toLowerCase().includes('wp-') || html.includes('wp_');\nconst hasJavaScript = html.includes('<script');\n\nreturn {\n url: url,\n title: title,\n content: content,\n wordCount: wordCount,\n htmlLength: html.length,\n statusCode: statusCode,\n isDivi: isDivi,\n isWordPress: isWordPress,\n hasJavaScript: hasJavaScript,\n timestamp: new Date().toISOString(),\n success: html.length > 0\n};"
},
"typeVersion": 2
},
{
"id": "f48d870e-9566-4b7c-bd8d-f1b3cb310325",
"name": "Format Content",
"type": "n8n-nodes-base.code",
"position": [
416,
272
],
"parameters": {
"jsCode": "// Format content for Google Drive\nconst data = $json;\n\nif (!data.success) {\n return {\n title: 'Scraping Failed',\n formatted: `# Scraping Failed\\n\\n**URL:** ${data.url}\\n**Error:** No content retrieved\\n**Status Code:** ${data.statusCode}\\n**Timestamp:** ${data.timestamp}\\n\\nThe page may be JavaScript-dependent or blocking automated requests.`\n };\n}\n\nconst sections = [];\n\n// YAML frontmatter\nsections.push('---');\nsections.push(`title: \"${data.title.replace(/\"/g, '\\\\\"')}\"`); \nsections.push(`url: ${data.url}`);\nsections.push(`scraped_at: ${data.timestamp}`);\nsections.push(`word_count: ${data.wordCount}`);\nsections.push(`html_length: ${data.htmlLength}`);\nsections.push(`status_code: ${data.statusCode}`);\nsections.push(`is_divi: ${data.isDivi}`);\nsections.push(`is_wordpress: ${data.isWordPress}`);\nsections.push(`has_javascript: ${data.hasJavaScript}`);\nsections.push('scraper_type: \"simple_direct_sitemap\"');\nsections.push('---');\nsections.push('');\n\n// Main content\nsections.push(`# ${data.title}`);\nsections.push('');\nsections.push(`**Source URL:** ${data.url}`);\nsections.push(`**Scraped:** ${new Date(data.timestamp).toLocaleString()}`);\nsections.push(`**Word Count:** ${data.wordCount.toLocaleString()}`);\nsections.push(`**HTML Size:** ${data.htmlLength.toLocaleString()} characters`);\nsections.push(`**Status Code:** ${data.statusCode}`);\nsections.push('');\n\nif (data.isDivi) {\n sections.push('\ud83c\udfa8 **Divi Theme Detected** - This WordPress site uses dynamic content loading');\n sections.push('');\n}\n\nif (data.hasJavaScript && data.wordCount < 100) {\n sections.push('\u26a0\ufe0f **Limited Content** - This page likely requires JavaScript rendering for full content');\n sections.push('');\n}\n\nsections.push('---');\nsections.push('');\nsections.push('## Content');\nsections.push('');\n\nif (data.content && data.content !== 'No readable content found') {\n sections.push(data.content);\n} else {\n sections.push('*No readable content could be extracted from this page.*');\n sections.push('');\n sections.push('**Possible reasons:**');\n sections.push('- Page content is loaded dynamically with JavaScript');\n sections.push('- Site has anti-bot protection');\n sections.push('- Content is behind authentication');\n sections.push('- Page structure is complex or non-standard');\n}\n\nsections.push('');\nsections.push('---');\nsections.push('');\nsections.push('**Technical Details:**');\nsections.push(`- Framework detected: ${data.isDivi ? 'Divi' : (data.isWordPress ? 'WordPress' : 'Unknown')}`);\nsections.push(`- JavaScript present: ${data.hasJavaScript ? 'Yes' : 'No'}`);\nsections.push(`- Content extraction: ${data.wordCount > 50 ? 'Successful' : 'Limited'}`);\nsections.push('');\nsections.push('*Scraped with n8n Simple Working Scraper (Full Sitemap)*');\n\nreturn {\n title: data.title,\n formatted: sections.join('\\n')\n};"
},
"typeVersion": 2
},
{
"id": "306d986f-6e37-4726-a17d-64f867d51d3e",
"name": "Save to Google Drive",
"type": "n8n-nodes-base.googleDrive",
"position": [
624,
272
],
"parameters": {
"name": "={{ $('Loop URLs').item.json.loc.replace(/https?:\\/\\//, '').replace(/\\//g, '_') }}_sitemap.md",
"content": "={{ $json.formatted }}",
"driveId": {
"__rl": true,
"mode": "list",
"value": "My Drive"
},
"options": {},
"folderId": {
"__rl": true,
"mode": "list",
"value": "YOUR_GOOGLE_DRIVE_FOLDER_ID_HERE",
"cachedResultUrl": "https://drive.google.com/drive/folders/YOUR_GOOGLE_DRIVE_FOLDER_ID_HERE",
"cachedResultName": "YOUR_FOLDER_NAME_HERE"
},
"operation": "createFromText"
},
"credentials": {
"googleDriveOAuth2Api": {
"name": "<your credential>"
}
},
"typeVersion": 3
},
{
"id": "e62e8040-db5c-4a59-8b31-c589aa2c7d0e",
"name": "Wait Between Pages",
"type": "n8n-nodes-base.wait",
"position": [
816,
272
],
"parameters": {
"amount": 3
},
"typeVersion": 1.1
}
],
"active": false,
"settings": {
"executionOrder": "v1"
},
"versionId": "YOUR_VERSION_ID_HERE",
"connections": {
"Loop URLs": {
"main": [
[],
[
{
"node": "Scrape Page",
"type": "main",
"index": 0
}
]
]
},
"Get Sitemap": {
"main": [
[
{
"node": "Parse Sitemap XML",
"type": "main",
"index": 0
}
]
]
},
"Scrape Page": {
"main": [
[
{
"node": "Extract Content",
"type": "main",
"index": 0
}
]
]
},
"Format Content": {
"main": [
[
{
"node": "Save to Google Drive",
"type": "main",
"index": 0
}
]
]
},
"Extract Content": {
"main": [
[
{
"node": "Format Content",
"type": "main",
"index": 0
}
]
]
},
"Set Sitemap URL": {
"main": [
[
{
"node": "Get Sitemap",
"type": "main",
"index": 0
}
]
]
},
"Parse Sitemap XML": {
"main": [
[
{
"node": "Split Sitemap URLs",
"type": "main",
"index": 0
}
]
]
},
"Split Sitemap URLs": {
"main": [
[
{
"node": "Limit URLs (Optional)",
"type": "main",
"index": 0
}
]
]
},
"Wait Between Pages": {
"main": [
[
{
"node": "Loop URLs",
"type": "main",
"index": 0
}
]
]
},
"Save to Google Drive": {
"main": [
[
{
"node": "Wait Between Pages",
"type": "main",
"index": 0
}
]
]
},
"Limit URLs (Optional)": {
"main": [
[
{
"node": "Loop URLs",
"type": "main",
"index": 0
}
]
]
},
"Start Working Scraper": {
"main": [
[
{
"node": "Set Sitemap URL",
"type": "main",
"index": 0
}
]
]
}
}
}
Credentials you'll need
Each integration node will prompt for credentials when you import. We strip credential IDs before publishing — you'll add your own.
googleDriveOAuth2Api
For the full experience including quality scoring and batch install features for each workflow upgrade to Pro
About this workflow
A reliable, no-frills web scraper that extracts content directly from websites using their sitemaps. Perfect for content audits, migrations, and research when you need straightforward HTML extraction without external dependencies.
Source: https://n8n.io/workflows/7186/ — original creator credit. Request a take-down →
Related workflows
Workflows that share integrations, category, or trigger type with this one. All free to copy and import.
Use responsibly and follow local rules and regulations
Transform static product images from any online store into engaging animated videos using Google's Veo 3.1 AI. Simply submit a catalog page URL and automatically generate professional product showcase
This n8n template demonstrates how to automatically clone and adapt competitor Facebook ads for your own product using AI. Simply provide a Facebook Ad Library URL and your product image, and the work
This tutorial details an end-to-end automation solution for streamlining the lien filing process for Homeowners Associations (HOAs) using an n8n workflow. It significantly reduces manual effort and po
📚 Learners and educators who want a fast overview of a creator’s entire catalog. 🧩 Research, SEO, and content ops teams building an intelligence layer on top of YouTube channels.