This workflow follows the Agent → Google Sheets recipe pattern — see all workflows that pair these two integrations.
The workflow JSON
Copy or download the full n8n JSON below. Paste it into a new n8n workflow, add your credentials, activate. Full import guide →
{
"nodes": [
{
"parameters": {
"rules": {
"values": [
{
"conditions": {
"options": {
"caseSensitive": true,
"leftValue": "",
"typeValidation": "loose",
"version": 2
},
"conditions": [
{
"leftValue": "={{ $json.content.isNotEmpty() && $json.links.length > 0}}",
"rightValue": "",
"operator": {
"type": "boolean",
"operation": "true",
"singleValue": true
},
"id": "a33e576f-7b88-4c5c-8590-fa7f36c1991c"
}
],
"combinator": "and"
},
"renameOutput": true,
"outputKey": "OK"
},
{
"conditions": {
"options": {
"caseSensitive": true,
"leftValue": "",
"typeValidation": "loose",
"version": 2
},
"conditions": [
{
"id": "2872a05a-8f62-4a4a-aad5-6954f2d3747c",
"leftValue": "={{ $json.links}}",
"rightValue": "0",
"operator": {
"type": "array",
"operation": "empty",
"singleValue": true
}
}
],
"combinator": "and"
},
"renameOutput": true,
"outputKey": "NullHyperlink"
},
{
"conditions": {
"options": {
"caseSensitive": true,
"leftValue": "",
"typeValidation": "loose",
"version": 2
},
"conditions": [
{
"id": "73f7f831-5ad9-42d8-95fb-ba65552ff91c",
"leftValue": "={{ $json.content.isEmpty() }}",
"rightValue": "",
"operator": {
"type": "boolean",
"operation": "true",
"singleValue": true
}
}
],
"combinator": "and"
},
"renameOutput": true,
"outputKey": "NullContent"
}
]
},
"looseTypeValidation": true,
"options": {}
},
"type": "n8n-nodes-base.switch",
"typeVersion": 3.3,
"position": [
2928,
2176
],
"id": "8458420a-709b-41ed-8cc8-eadc5f70cd39",
"name": "Switch",
"alwaysOutputData": false
},
{
"parameters": {
"mode": "runOnceForEachItem",
"jsCode": "const puppeteer = require('puppeteer-extra');\nconst stealthPlugin = require('puppeteer-extra-plugin-stealth');\n\n// Aplica o plugin Stealth\npuppeteer.use(stealthPlugin());\n\nasync function scrapeSubpage(url) {\n // ... (toda a sua fun\u00e7\u00e3o scrapeSubpage at\u00e9 o try...catch) ...\n // ... (o browser, page, setRequestInterception, etc. continuam iguais) ...\n let browser = null;\n\n try {\n console.log(`\ud83d\ude80 Iniciando browser...`);\n browser = await puppeteer.launch({\n headless: true,\n args: [\n '--no-sandbox',\n '--disable-setuid-sandbox',\n '--disable-dev-shm-usage',\n '--disable-gpu',\n ],\n });\n\n const page = await browser.newPage();\n page.setDefaultNavigationTimeout(30000);\n\n // Bloqueia recursos desnecess\u00e1rios\n await page.setRequestInterception(true);\n page.on('request', (req) => {\n if (['image', 'stylesheet', 'font'].includes(req.resourceType())) {\n req.abort();\n } else {\n req.continue();\n }\n });\n \n console.log(`\ud83d\udd78\ufe0f Scraping subpage: ${url}`);\n \n // Adiciona uma verifica\u00e7\u00e3o para garantir que a URL \u00e9 uma string\n if (typeof url !== 'string' || !url.startsWith('http')) {\n throw new Error(`URL inv\u00e1lida ou n\u00e3o \u00e9 uma string: ${url}`);\n }\n\n await page.goto(url, { waitUntil: 'networkidle2' });\n\n // Extrai texto limpo\n const content = await page.evaluate(() => {\n //const text = document.body?.innerText || '';\n //return text.replace(/\\s+/g, ' ').trim();\n return document.documentElement.outerHTML;\n });\n\n await browser.close();\n\n return {\n json: {\n success: true,\n url,\n content,\n length: content.length,\n timestamp: new Date().toISOString(),\n }\n };\n } catch (error) {\n // Garante que o browser feche mesmo se o goto() falhar\n if (browser) {\n await browser.close();\n }\n \n // ****** MUDAN\u00c7A PRINCIPAL AQUI ******\n // Use 'return' em vez de 'throw'\n // Isso passa o erro adiante sem parar o workflow do n8n\n return {\n json: { \n success: false, \n url, \n error: error.message,\n timestamp: new Date().toISOString(),\n }\n };\n }\n}\n\n// ===========================\n// Execu\u00e7\u00e3o principal do node\n// ===========================\n\n// O Split Out envia o objeto no formato: { \"links\": \"https://...\" }\nconst url = $json.url;\n\n// ****** MUDAN\u00c7A PRINCIPAL AQUI ******\n// O resultado da fun\u00e7\u00e3o j\u00e1 \u00e9 o objeto que queremos.\n// N\u00e3o precisamos envolv\u00ea-lo em { json: ... }\nreturn scrapeSubpage(url);"
},
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
4224,
1712
],
"id": "0b2b1ed3-c3d8-4e72-a71d-4f2aca6424a1",
"name": "Subpage Scrapper",
"onError": "continueErrorOutput"
},
{
"parameters": {
"jsCode": "/**\n * Puppeteer Scraper v3 (Corrigido)\n *\n * - N\u00e3o usa 'puppeteer-extra-plugin-user-preferences'.\n * - Usa page.evaluate() para extrair texto e links absolutos (corrigindo o problema original).\n * - Corrige a inicializa\u00e7\u00e3o do Puppeteer com Stealth.\n * - Limpa o fluxo de retorno e o fechamento do browser.\n */\n\nconst puppeteer = require('puppeteer-extra');\nconst stealthPlugin = require('puppeteer-extra-plugin-stealth');\n\n// Aplica o plugin Stealth\npuppeteer.use(stealthPlugin());\n\nasync function scrapePage(urlToScrape) {\n let browser = null;\n\n try {\n console.log(`\ud83d\ude80 Iniciando browser...`);\n browser = await puppeteer.launch({\n headless: true,\n args: [\n '--no-sandbox',\n '--disable-setuid-sandbox',\n '--disable-dev-shm-usage',\n '--disable-gpu',\n ],\n });\n\n const page = await browser.newPage();\n page.setDefaultNavigationTimeout(30000);\n\n // Bloqueia recursos desnecess\u00e1rios\n await page.setRequestInterception(true);\n page.on('request', (req) => {\n if (['image', 'stylesheet', 'font'].includes(req.resourceType())) {\n req.abort();\n } else {\n req.continue();\n }\n });\n\n console.log(`\ud83d\udd78\ufe0f Acessando p\u00e1gina: ${urlToScrape}`);\n await page.goto(urlToScrape, { waitUntil: 'networkidle2' });\n\n // 1. Verifica\u00e7\u00e3o de CAPTCHA (baseada no HTML bruto)\n const pageContent = await page.content();\n if (pageContent.toLowerCase().includes('captcha') || pageContent.toLowerCase().includes('are you a robot')) {\n return { success: false, url: urlToScrape, content: null, links: [], error: 'Captcha detectado' };\n }\n\n // 2. Extra\u00e7\u00e3o de Dados (via page.evaluate)\n // 'a.href' dentro do evaluate() retorna o link ABSOLUTO.\n const result = await page.evaluate(() => {\n // Extrai o texto limpo\n //const content = (document.body?.innerText || '').replace(/\\s+/g, ' ').trim();\n const content = document.documentElement.outerHTML;\n \n // Extrai links, garante que s\u00e3o \u00fanicos e pega os 10 primeiros\n const allLinks = Array.from(document.querySelectorAll('a[href]'));\n const uniqueAbsoluteLinks = [...new Set(\n allLinks\n .map(a => a.href) // 'a.href' resolve caminhos relativos!\n .filter(href => href.startsWith('http')) // Garante que s\u00e3o links web\n )];\n \n const links = uniqueAbsoluteLinks.slice(0, 10);\n\n return { content, links };\n });\n\n // 3. Verifica\u00e7\u00f5es de Sucesso (baseadas no 'result')\n if (!result.content || result.content.length < 50) {\n return { success: false, url: urlToScrape, content: result.content, links: result.links, error: 'Conte\u00fado n\u00e3o encontrado ou muito curto' };\n }\n\n if (result.links.length === 0) {\n return { success: false, url: urlToScrape, content: result.content, links: result.links, error: 'Nenhum hyperlink encontrado' };\n }\n\n // 4. Sucesso!\n return {\n success: true,\n url: urlToScrape,\n content: result.content,\n links: result.links,\n timestamp: new Date().toISOString(),\n };\n\n } catch (error) {\n console.error(`\u274c Erro no scrape: ${error.message}`);\n return {\n success: false,\n url: urlToScrape,\n error: error.message,\n timestamp: new Date().toISOString(),\n };\n } finally {\n // 5. Garante que o browser sempre ser\u00e1 fechado\n if (browser) {\n console.log(`\ud83d\udd12 Fechando browser...`);\n await browser.close();\n }\n }\n}\n\n// ===========================\n// Execu\u00e7\u00e3o principal do node\n// ===========================\nconst url = $input.first().json.url\n\nif (!url) {\n return { json: { success: false, error: \"Nenhuma 'url' fornecida no input do Webhook.\" } };\n}\n\nconst output = await scrapePage(url);\n\n// Retorna um \u00fanico objeto JSON\nreturn { json: output };"
},
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
2752,
2192
],
"id": "0570d10f-9aad-4bff-b851-944e9cc1cb25",
"name": "PuppeteerScraper"
},
{
"parameters": {
"aggregate": "aggregateAllItemData",
"options": {}
},
"type": "n8n-nodes-base.aggregate",
"typeVersion": 1,
"position": [
4336,
2080
],
"id": "3eb2ca5b-6fcd-4146-b5ac-b13c894e4cc7",
"name": "AggregateSubpages"
},
{
"parameters": {
"url": "=http://api.scrape.do",
"authentication": "genericCredentialType",
"genericAuthType": "httpQueryAuth",
"sendQuery": true,
"queryParameters": {
"parameters": [
{
"name": "url",
"value": "={{ $json.url }}"
}
]
},
"options": {
"timeout": 120000
}
},
"id": "bbc24172-aee7-4d16-b853-fa02af9c95ae",
"name": "Scrape.do1",
"type": "n8n-nodes-base.httpRequest",
"position": [
3104,
2304
],
"retryOnFail": true,
"typeVersion": 4.2,
"waitBetweenTries": 5000,
"credentials": {
"httpQueryAuth": {
"name": "<your credential>"
}
}
},
{
"parameters": {
"url": "=http://api.scrape.do",
"authentication": "genericCredentialType",
"genericAuthType": "httpQueryAuth",
"sendQuery": true,
"queryParameters": {
"parameters": [
{
"name": "url",
"value": "={{ $('Webhook').item.json.body.url }}"
}
]
},
"options": {
"timeout": 120000
}
},
"id": "d4fed834-0122-41d0-a7bf-600c7ca50e90",
"name": "Scrape.do2",
"type": "n8n-nodes-base.httpRequest",
"position": [
4416,
1920
],
"retryOnFail": true,
"typeVersion": 4.2,
"waitBetweenTries": 5000,
"credentials": {
"httpQueryAuth": {
"name": "<your credential>"
}
},
"onError": "continueErrorOutput"
},
{
"parameters": {
"fieldToSplitOut": "links",
"include": "allOtherFields",
"options": {}
},
"type": "n8n-nodes-base.splitOut",
"typeVersion": 1,
"position": [
3616,
1712
],
"id": "b3f192a7-19a8-4322-b658-a17278404493",
"name": "Split Out"
},
{
"parameters": {},
"type": "n8n-nodes-base.manualTrigger",
"typeVersion": 1,
"position": [
2432,
2192
],
"id": "8256e5c9-12c1-4c20-8c4e-96d125dd5a55",
"name": "When clicking \u2018Execute workflow\u2019"
},
{
"parameters": {
"documentId": {
"__rl": true,
"value": "18ERlfLcL5JITmWumxFxwBFcgMCYcRXDR84T6BTgrMP4",
"mode": "list",
"cachedResultName": "sheets-tegra-n8n",
"cachedResultUrl": "https://docs.google.com/spreadsheets/d/18ERlfLcL5JITmWumxFxwBFcgMCYcRXDR84T6BTgrMP4/edit?usp=drivesdk"
},
"sheetName": {
"__rl": true,
"value": "gid=0",
"mode": "list",
"cachedResultName": "webpages_links",
"cachedResultUrl": "https://docs.google.com/spreadsheets/d/18ERlfLcL5JITmWumxFxwBFcgMCYcRXDR84T6BTgrMP4/edit#gid=0"
},
"filtersUI": {
"values": [
{
"lookupColumn": "status",
"lookupValue": "pending"
}
]
},
"options": {
"returnFirstMatch": true
}
},
"type": "n8n-nodes-base.googleSheets",
"typeVersion": 4.7,
"position": [
2592,
2192
],
"id": "fdb18e5d-9750-4bda-9d5c-5a82d3433e38",
"name": "GetPendingURL",
"credentials": {
"googleSheetsOAuth2Api": {
"name": "<your credential>"
}
}
},
{
"parameters": {
"operation": "append",
"documentId": {
"__rl": true,
"value": "18ERlfLcL5JITmWumxFxwBFcgMCYcRXDR84T6BTgrMP4",
"mode": "list",
"cachedResultName": "sheets-tegra-n8n",
"cachedResultUrl": "https://docs.google.com/spreadsheets/d/18ERlfLcL5JITmWumxFxwBFcgMCYcRXDR84T6BTgrMP4/edit?usp=drivesdk"
},
"sheetName": {
"__rl": true,
"value": "gid=0",
"mode": "list",
"cachedResultName": "webpages_links",
"cachedResultUrl": "https://docs.google.com/spreadsheets/d/18ERlfLcL5JITmWumxFxwBFcgMCYcRXDR84T6BTgrMP4/edit#gid=0"
},
"columns": {
"mappingMode": "defineBelow",
"value": {
"url": "={{ $json.url }}",
"instructions": "={{ $json.instructions }}",
"status": "={{ $json.status }}",
"last_updated": "={{ $json.last_updated }}",
"error_notes": "={{ $json.error_notes }}"
},
"matchingColumns": [],
"schema": [
{
"id": "url",
"displayName": "url",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true
},
{
"id": "instructions",
"displayName": "instructions",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true
},
{
"id": "status",
"displayName": "status",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true
},
{
"id": "last_updated",
"displayName": "last_updated",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true
},
{
"id": "error_notes",
"displayName": "error_notes",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true
}
],
"attemptToConvertTypes": false,
"convertFieldsToString": false
},
"options": {}
},
"type": "n8n-nodes-base.googleSheets",
"typeVersion": 4.7,
"position": [
3952,
1712
],
"id": "2b257242-7fb4-4277-b3d0-15cbc5793a70",
"name": "Append row in sheet",
"credentials": {
"googleSheetsOAuth2Api": {
"name": "<your credential>"
}
}
},
{
"parameters": {
"jsCode": "/*\n * SCRIPT: Code in JavaScript1 (Fallback Subp\u00e1gina)\n * * INPUT: HTML bruto do n\u00f3 Scrape.do2.\n * OUTPUT: Formato id\u00eantico ao Subpage Scraper { success, url, content, error_notes }\n */\n\n// --- Execu\u00e7\u00e3o Principal ---\n\nconst scrapeDoResult = $input.first().json; // Recebe o HTML (ou erro)\n// A URL da subp\u00e1gina vem do item que entrou neste loop\n// Assumindo que o Split Out passou { links: \"url_subpagina\" }\n// Se o seu Split Out passou { url: \"url_subpagina\" }, mude abaixo\nconst subpageUrl = $input.first().json.links || $input.first().json.url; \n\nif (!subpageUrl) {\n throw new Error(\"N\u00e3o foi poss\u00edvel encontrar a URL da subp\u00e1gina no input do loop.\");\n}\n\n// Verifica se o Scrape.do2 retornou um erro\nif (typeof scrapeDoResult !== 'string') {\n return {\n json: {\n success: false,\n url: subpageUrl,\n content: null,\n error_notes: `Fallback Scrape.do2 falhou: ${JSON.stringify(scrapeDoResult)}`\n }\n };\n}\n\nconst htmlContent = scrapeDoResult;\n\n// Verifica conte\u00fado m\u00ednimo\nif (!htmlContent || htmlContent.length < 50) {\n return {\n json: {\n success: false,\n url: subpageUrl,\n content: htmlContent,\n error_notes: 'Fallback Scrape.do2 retornou conte\u00fado muito curto ou vazio.'\n }\n };\n}\n\n// Sucesso no Fallback\nreturn {\n json: {\n success: true,\n url: subpageUrl,\n content: htmlContent, // O HTML bruto\n error_notes: 'Used Scrape.do2 Fallback' // Nota opcional\n }\n};"
},
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
4656,
1920
],
"id": "aec92ce1-8804-40a5-8151-f31044bed7a8",
"name": "HTML Extractor",
"onError": "continueErrorOutput"
},
{
"parameters": {
"assignments": {
"assignments": [
{
"id": "d23c65be-ba06-46f4-8d1f-cf478e6e58b2",
"name": "url",
"value": "={{ $json.links }}",
"type": "string"
},
{
"id": "6b753e76-ba03-4bc6-b780-cbbafead681f",
"name": "instructions",
"value": "={{ $('GetPendingURL').item.json.instructions }}",
"type": "string"
},
{
"id": "959001e5-e4bf-41ec-b5e3-262abb358018",
"name": "status",
"value": "pending",
"type": "string"
},
{
"id": "15f45a34-6eb4-44d6-ba5b-bc30b7954b74",
"name": "last_updated",
"value": "={{ $now.toISO() }}",
"type": "string"
},
{
"id": "ac36c06a-4505-4516-b3a2-577ed19a0465",
"name": "error_notes",
"value": "",
"type": "string"
}
]
},
"options": {}
},
"type": "n8n-nodes-base.set",
"typeVersion": 3.4,
"position": [
3776,
1712
],
"id": "7e8af143-84eb-40c2-a61a-9391bb9a05d4",
"name": "Set hyperlink fields"
},
{
"parameters": {
"operation": "update",
"documentId": {
"__rl": true,
"value": "18ERlfLcL5JITmWumxFxwBFcgMCYcRXDR84T6BTgrMP4",
"mode": "list",
"cachedResultName": "sheets-tegra-n8n",
"cachedResultUrl": "https://docs.google.com/spreadsheets/d/18ERlfLcL5JITmWumxFxwBFcgMCYcRXDR84T6BTgrMP4/edit?usp=drivesdk"
},
"sheetName": {
"__rl": true,
"value": "gid=0",
"mode": "list",
"cachedResultName": "webpages_links",
"cachedResultUrl": "https://docs.google.com/spreadsheets/d/18ERlfLcL5JITmWumxFxwBFcgMCYcRXDR84T6BTgrMP4/edit#gid=0"
},
"columns": {
"mappingMode": "defineBelow",
"value": {
"url": "={{ $json.url }}",
"last_updated": "={{ $json.last_updated }}",
"error_notes": "={{ $json.error_notes }}",
"instructions": "={{ $json.instructions }}",
"status": "={{ $json.status }}"
},
"matchingColumns": [
"url"
],
"schema": [
{
"id": "url",
"displayName": "url",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true,
"removed": false
},
{
"id": "instructions",
"displayName": "instructions",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true
},
{
"id": "status",
"displayName": "status",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true,
"removed": false
},
{
"id": "last_updated",
"displayName": "last_updated",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true
},
{
"id": "error_notes",
"displayName": "error_notes",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true
},
{
"id": "row_number",
"displayName": "row_number",
"required": false,
"defaultMatch": false,
"display": true,
"type": "number",
"canBeUsedToMatch": true,
"readOnly": true,
"removed": true
}
],
"attemptToConvertTypes": false,
"convertFieldsToString": false
},
"options": {}
},
"type": "n8n-nodes-base.googleSheets",
"typeVersion": 4.7,
"position": [
5120,
2192
],
"id": "4d1067c0-6872-4f74-aac5-45185867d78b",
"name": "Update row in sheet",
"credentials": {
"googleSheetsOAuth2Api": {
"name": "<your credential>"
}
}
},
{
"parameters": {
"mode": "runOnceForEachItem",
"jsCode": "// Assume que 'data' vem do input do n8n (ou de outra fonte)\nconst data = $json.data\n\n/**\n * Extrai texto limpo e leg\u00edvel de uma string HTML.\n * @param {string} html - A string HTML bruta.\n * @returns {string} O texto extra\u00eddo e limpo.\n */\nfunction extractTextFromHTML(html) {\n if (typeof html !== 'string') return '';\n\n const cleanedHTML = html\n // Remove blocos de conte\u00fado n\u00e3o vis\u00edveis ou indesejados\n .replace(/<script[\\s\\S]*?>[\\s\\S]*?<\\/script>/gi, ' ')\n .replace(/<style[\\s\\S]*?>[\\s\\S]*?<\\/style>/gi, ' ')\n .replace(/<head[\\s\\S]*?>[\\s\\S]*?<\\/head>/gi, ' ')\n .replace(/<nav[\\s\\S]*?>[\\s\\S]*?<\\/nav>/gi, ' ')\n .replace(/<footer[\\s\\S]*?>[\\s\\S]*?<\\/footer>/gi, ' ')\n \n // Substitui todas as tags HTML restantes por um espa\u00e7o\n // Isso evita que palavras se juntem (ex: <p>Ol\u00e1</p><p>Mundo</p> -> \"Ol\u00e1 Mundo\")\n .replace(/<[^>]+>/g, ' ')\n \n // Substitui m\u00faltiplos espa\u00e7os, tabs ou quebras de linha por um \u00fanico espa\u00e7o\n .replace(/\\s+/g, ' ')\n .trim();\n \n return cleanedHTML;\n}\n\n/**\n * Extrai todos os URLs (atributos href) de uma string HTML.\n * @param {string} html - A string HTML bruta.\n * @returns {string[]} Um array de URLs.\n */\nfunction extractLinksFromHTML(html) {\n if (typeof html !== 'string') return [];\n \n const links = [];\n // Regex para encontrar tags <a> e capturar o atributo href\n // Lida com aspas simples ou duplas\n const linkRegex = /<a\\s+[^>]*?href\\s*=\\s*[\"']([^\"']*)[\"']/gi;\n \n // Usamos matchAll para encontrar todas as ocorr\u00eancias\n const matches = html.matchAll(linkRegex);\n\n for (const match of matches) {\n const url = match[1]; // O grupo capturado (a URL)\n \n // Filtra links vazios, \u00e2ncoras de p\u00e1gina (#) ou a\u00e7\u00f5es javascript\n if (url && !url.startsWith('#') && !url.toLowerCase().startsWith('javascript:')) {\n links.push(url);\n }\n // Para o loop assim que atingir 10 links\n if (links.length >= 10) {\n break;\n }\n }\n \n // Retorna apenas links \u00fanicos\n return [...new Set(links)];\n}\n\n// --- Execu\u00e7\u00e3o ---\n\n// Extrai as duas informa\u00e7\u00f5es do HTML\nconst extractedText = extractTextFromHTML(data);\nconst extractedLinks = extractLinksFromHTML(data);\n\n// Retorna no formato JSON desejado\nreturn {\n success: $json.data ? true : false,\n content: extractedText,\n links: extractedLinks\n};"
},
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
3248,
2304
],
"id": "5e8c705e-db1e-44c1-8026-4d5e57fc4fcf",
"name": "ExtractHtmlSubpage"
},
{
"parameters": {
"jsCode": "/*\n * SCRIPT 1: FORMATADOR DA P\u00c1GINA PRINCIPAL\n * Coloque este n\u00f3 logo ap\u00f3s o Main_Page_AI_Agent\n */\n\n// --- Fun\u00e7\u00e3o Helper para parsear o output da IA ---\nfunction parseAiOutput(aiItemJson) {\n try {\n // aiItemJson \u00e9 o { output: \"```json...\" }\n const jsonString = aiItemJson.output.replace(/```json\\n|```/g, '').trim();\n const parsed = JSON.parse(jsonString);\n // Retorna o primeiro item (ou vazio) pois a IA foi instru\u00edda a agregar por URL\n return (parsed.extracted_data && parsed.extracted_data[0]) ? parsed.extracted_data[0] : {};\n } catch (e) {\n console.error(\"Falha ao parsear JSON da IA (Main):\", e.message, aiItemJson.output);\n return {};\n }\n}\n\n// --- 1. Obter todas as fontes de dados ---\n\n// Fonte A: Output da IA (Input deste n\u00f3)\nconst aiOutput = $input.first().json;\nconst aiData = parseAiOutput(aiOutput);\n\n// Fonte B: Dados Iniciais (do in\u00edcio)\n// !!! MUDE \"GetPendingURL\" para o nome do seu n\u00f3 inicial !!!\nconst startData = $node[\"GetPendingURL\"].json;\nconst instructions = startData.instructions;\n\n// Fonte C: Dados do Scrape da P\u00c1GINA PRINCIPAL (do Merge ANTES da IA)\n// !!! MUDE \"Merge_Main_Scrape\" para o nome do n\u00f3 Merge que uniu Scraper/Fallback !!!\nconst mainPageScrapeData = $node[\"Merge\"].json;\nif (!mainPageScrapeData) {\n throw new Error(\"N\u00e3o foi poss\u00edvel encontrar os dados de scrape da P\u00e1gina Principal no n\u00f3 Merge_Main_Scrape.\");\n}\nconst page = mainPageScrapeData;\n\n// --- 2. Preparar e retornar o item ---\nconst now = new Date().toISOString();\n\nreturn {\n json: {\n url: page.url,\n instructions: instructions,\n status: page.success ? 'completed' : 'failed',\n last_updated: now,\n error_notes: page.error_notes || '',\n extracted_content: aiData.extracted_content || '' // Conte\u00fado da IA principal\n }\n};"
},
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
4144,
2464
],
"id": "b0312eef-bf39-430d-8c2a-53fc05cbac34",
"name": "CodeFormatMainPage"
},
{
"parameters": {},
"type": "n8n-nodes-base.merge",
"typeVersion": 3.2,
"position": [
3424,
2176
],
"id": "c374f589-0969-434f-b6e2-7a38eb1ece63",
"name": "Merge"
},
{
"parameters": {
"promptType": "define",
"text": "=INSTRUCTIONS:\n{{ $('GetPendingURL').item.json.instructions }}\nEXECUTION_TIMESTAMP:\n{{ $now.toISO() }}\n\nCONTENT_TO_ANALYZE:\n\n[Subpage]\nURL: {{ $json.url }}\nContent: {{ $json.content }}\n[End of Subpage]",
"needsFallback": true,
"options": {
"systemMessage": "# AI Agent Configuration: Content Extractor\n\n## 1. Role\n\nContent Extractor\n\n## 2. Context\n\nYou are an AI data extraction agent. Your purpose is to analyze the provided `CONTENT_TO_ANALYZE`, which is split into `[Main Page]` and `[Subpages]`.\n\nYou must strictly follow the `INSTRUCTIONS` to identify, extract \"as-is\" (verbatim), and correlate *only* the relevant text segments.\n\nThe content you extract will be used to feed a vector database (via the `scrape_vector_store` tool). Your final answer *must* be a formatted JSON object.\n\n## 3. Tools\n\n(This defines the available tool for the agent.)\n\n```json\n[\n {\n \"name\": \"scrape_vector_store\",\n \"description\": \"Stores the extracted content and its source URL in a Supabase vector database. Use this tool to save the final data.\",\n \"parameters\": {\n \"type\": \"object\",\n \"properties\": {\n \"data_to_store\": {\n \"type\": \"array\",\n \"items\": {\n \"type\": \"object\",\n \"properties\": {\n \"source_url\": {\n \"type\": \"string\",\n \"description\": \"The URL where the content was extracted from.\"\n },\n \"content_chunk\": {\n \"type\": \"string\",\n \"description\": \"The extracted text chunk.\"\n },\n \"timestamp\": {\n \"type\": \"string\",\n \"description\": \"ISO timestamp of the extraction.\"\n }\n },\n \"required\": [\"source_url\", \"content_chunk\", \"timestamp\"]\n }\n }\n },\n \"required\": [\"data_to_store\"]\n }\n }\n]```\n\n## 4. Reasoning (Chain of Thought)\n(This is the internal reasoning the agent will follow.)\n\n1. First, I will read and understand the `INSTRUCTIONS`.\n2. I will identify the `EXECUTION_TIMESTAMP`.\n3. I will scan the `Content` (which is HTML) for relevant information based on the `INSTRUCTIONS` (e.g., product titles and prices).\n4. I will temporarily store *all* extracted segments as `{ url: \"source_url\", content: \"extracted_segment\" }`.\n5. After analyzing all `[Main Page]` and `[Subpages]` blocks, I will **group all temporarily stored segments by their unique `url`**.\n6. For each unique `url`, I will **aggregate all its corresponding `content` segments into a single string**. I will use a newline character (`\\n`) as a separator.\n7. I will format my final answer as a JSON object, following the `Output Rules`, creating *only one object* per unique `source_url`.\n## 5. Output Rules (Structured Output Parser)\n(Defines the mandatory JSON output format.)\n## Output rules\n- The output format must be *exclusively* a JSON object.\n- The `extracted_data` array must contain *only one object per unique source_url*.\n- The `extracted_content` field MUST contain all segments found for that URL, joined together into a single string (separated by newlines `\\n`).\n\n- Example of a *good* `extracted_content` (all segments joined): \n `\"Title: A Light in the ..., Price: \u00a351.77\\nTitle: Tipping the Velvet, Price: \u00a353.74\"`\n\n- Example of a *bad* structure (which is now forbidden):\n `...{ \"source_url\": \"url1\", \"extracted_content\": \"chunk A\" }, { \"source_url\": \"url1\", \"extracted_content\": \"chunk B\" }...`\n\n- The final JSON structure must be:\n{\n \"extracted_data\": [\n {\n \"source_url\": \"https://books.toscrape.com/index.html\",\n \"extracted_content\": \"Title: A Light in the ..., Price: \u00a351.77\\nTitle: Tipping the Velvet, Price: \u00a353.74\\nTitle: Soumission, Price: \u00a350.10\",\n \"timestamp\": \"the_value_of_the_EXECUTION_TIMESTAMP\"\n },\n {\n \"source_url\": \"https://books.toscrape.com/catalogue/category/books/travel_2/index.html\",\n \"extracted_content\": \"Title: It's Only the Himalayas, Price: \u00a345.17\\nTitle: Full Moon over ..., Price: \u00a349.43\",\n \"timestamp\": \"the_value_of_the_EXECUTION_TIMESTAMP\"\n }\n ]\n}\n\n- If no relevant content is found, return an empty array:\n{\n \"extracted_data\": []\n}"
}
},
"type": "@n8n/n8n-nodes-langchain.agent",
"typeVersion": 2.2,
"position": [
3808,
2080
],
"id": "95b2ec2b-e217-4071-8716-952a68c7e010",
"name": "AIAgentSubpage",
"retryOnFail": true,
"waitBetweenTries": 5000,
"onError": "continueErrorOutput"
},
{
"parameters": {
"promptType": "define",
"text": "=INSTRU\u00c7\u00d5ES:\n{{ $('GetPendingURL').item.json.instructions }}\n\nCONTE\u00daDO:\n[P\u00e1gina principal]{{ $json.content }}\n[FIM P\u00e1gina principal]\n",
"options": {
"systemMessage": "# AI Agent Configuration: Content Extractor\n\n## 1. Role\n\nYou are an AI data extraction agent. Your purpose is to analyze the provided `CONTENT_TO_ANALYZE`, which is split into `[Main Page]` and `[Subpages]`.\nYou must strictly follow the `INSTRUCTIONS` to identify, extract \"as-is\" (verbatim), and correlate *only* the relevant text segments.\n\nYou are an AI agent that parses raw **HTML content**, not plain text. You must navigate the HTML structure to find data, connecting related pieces of information (like a product title and its price).\n\n**Reasoning:**\n1. First, I will read and understand the `INSTRUCTIONS`.\n2. I will identify the `EXECUTION_TIMESTAMP`.\n3. I will scan the `Content` (which is HTML) for relevant information based on the `INSTRUCTIONS` (e.g., product titles and prices).\n4. I will temporarily store *all* extracted segments as `{ url: \"source_url\", content: \"extracted_segment\" }`.\n5. After analyzing all `[Main Page]` and `[Subpages]` blocks, I will **group all temporarily stored segments by their unique `url`**.\n6. For each unique `url`, I will **aggregate all its corresponding `content` segments into a single string**. I will use a newline character (`\\n`) as a separator.\n7. I will format my final answer as a JSON object, following the `Output Rules`, creating *only one object* per unique `source_url`.\n\n**Output Rules:**\n- The output format must be *exclusively* a JSON object.\n- The `extracted_data` array must contain *only one object per unique source_url*.\n- The `extracted_content` field MUST contain all segments found for that URL, joined together into a single string (separated by newlines `\\n`).\n- Example `extracted_content`: \"Title: A Light in the ..., Price: \u00a351.77\\nTitle: Tipping the Velvet, Price: \u00a353.74\"\n\n- The final JSON structure must be:\n{\n \"extracted_data\": [\n {\n \"source_url\": \"url_of_the_source_page\",\n \"extracted_content\": \"all_relevant_segments_joined_by_newline\",\n \"timestamp\": \"the_value_of_the_EXECUTION_TIMESTAMP\"\n }\n ]\n}\n\n- If no relevant content is found, return an empty array:\n{\n \"extracted_data\": []\n}"
}
},
"type": "@n8n/n8n-nodes-langchain.agent",
"typeVersion": 2.2,
"position": [
3808,
2464
],
"id": "9eab0b9d-5878-447a-8e09-e3854951aa73",
"name": "MainPageAIAgent"
},
{
"parameters": {
"mode": "runOnceForEachItem",
"jsCode": "/*\n * SCRIPT 2: FORMATADOR DAS SUBP\u00c1GINAS (DENTRO DO LOOP)\n * Coloque este n\u00f3 logo ap\u00f3s o AI_Agent_Subpage (dentro do loop)\n */\n\n// --- Fun\u00e7\u00e3o Helper para parsear o output da IA ---\nfunction parseAiOutput(aiItemJson) {\n try {\n // aiItemJson \u00e9 o { output: \"```json...\" }\n const jsonString = aiItemJson.output.replace(/```json\\n|```/g, '').trim();\n const parsed = JSON.parse(jsonString);\n // Retorna o primeiro item (ou vazio)\n return (parsed.extracted_data && parsed.extracted_data[0]) ? parsed.extracted_data[0] : {};\n } catch (e) {\n console.error(\"Falha ao parsear JSON da IA (Subpage):\", e.message, aiItemJson.output);\n return {};\n }\n}\n\n// --- 1. Obter todas as fontes de dados ---\n\n// Fonte A: Output da IA (Input deste n\u00f3)\n// O n\u00f3 da IA passa adiante os dados que ele recebeu (scrape data)\nconst aiOutput = $json\nconst aiData = parseAiOutput(aiOutput);\n\n// Fonte B: Dados Iniciais (do in\u00edcio)\n// !!! MUDE \"GetPendingURL\" para o nome do seu n\u00f3 inicial !!!\nconst startData = $('GetPendingURL').item.json\nconst instructions = startData.instructions;\n\n// Fonte C: Dados do Scrape da SUBP\u00c1GINA (est\u00e3o no input, passados pela IA)\n// O n\u00f3 da IA deveria ter recebido { success, url, content, error_notes } do Merge_Subpage_Result\nconst scrapeData = $('Subpage Scrapper').item.json\n//if (!aiData || !aiData.source_url) {\n// throw new Error(\"Input para Code_Format_Subpage n\u00e3o cont\u00e9m dados de scrape (url, success). Verifique o n\u00f3 Merge anterior \u00e0 IA.\");\n\n\n// --- 2. Preparar e retornar o item ---\nconst now = new Date().toISOString();\n\nreturn {\n json: {\n url: scrapeData.url, // Veio do input (scrape)\n instructions: instructions, // Veio do n\u00f3 inicial\n status: scrapeData.success ? 'completed' : 'failed', // Veio do input (scrape)\n last_updated: now,\n error_notes: scrapeData.error_notes || '', // Veio do input (scrape)\n content: aiData.extracted_content || '' // Veio do output da IA\n }\n};"
},
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
4160,
2080
],
"id": "bdb740b6-6fa4-44fb-a247-954fec10928a",
"name": "SubPageFormatter"
},
{
"parameters": {},
"type": "n8n-nodes-base.merge",
"typeVersion": 3.2,
"position": [
4800,
1712
],
"id": "4ba308bb-d799-4c5f-a557-cf2ae47497db",
"name": "Merge1"
},
{
"parameters": {},
"type": "n8n-nodes-base.merge",
"typeVersion": 3.2,
"position": [
4480,
2336
],
"id": "b1380c13-ba4c-4387-b21f-44be9aa40bfc",
"name": "Merge2"
},
{
"parameters": {
"jsCode": "const dataArray = $input.first().json.data\nif (!Array.isArray(dataArray)) return [];\n\nreturn dataArray.map(item => ({ json: item }));\n"
},
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
5136,
2416
],
"id": "5fc249dc-321d-4f80-b8bb-e15fc8808b84",
"name": "FlattenResult"
},
{
"parameters": {
"mode": "runOnceForEachItem",
"jsCode": "// Este c\u00f3digo roda para CADA item (URL) do n\u00f3 anterior\nconst item = $json;\nconst outputItems = [];\n\n// 1. \"Explode\" o campo 'content' em linhas\nconst chunks = item.content.split('\\n')\n .map(line => line.trim())\n .filter(line => line.length > 0);\n\nconst totalChunks = chunks.length;\n\n// 2. Cria um novo item de sa\u00edda para CADA chunk (t\u00edtulo)\nfor (let i = 0; i < chunks.length; i++) {\n const chunkText = chunks[i];\n \n // Formato obrigat\u00f3rio para o n\u00f3 'Supabase Vector Store'\n const newItem = {\n // pageContent \u00e9 a chave padr\u00e3o que o n\u00f3 procura para embedar\n chunk_text: chunkText,\n source_url: item.url,\n chunk_index: i,\n total_chunks: totalChunks,\n last_updated: item.last_updated,\n // metadata \u00e9 onde colocamos todos os dados extras\n metadata: {\n instructions: item.instructions,\n status: item.status \n }\n };\n\n outputItems.push({ json: newItem });\n}\n\n// 4. Retorna um array de itens. \n// O n8n vai criar m\u00faltiplos itens (um para cada t\u00edtulo de livro)\nreturn {json: {outputItems}};"
},
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
5328,
2416
],
"id": "96accfac-c8a9-49e9-844f-70417b1b547e",
"name": "ChunkContent"
},
{
"parameters": {
"jsCode": "/*\n * SCRIPT 3: MESCLAGEM FINAL (ap\u00f3s Page + Subpages)\n * Entrada 0 \u2192 P\u00e1gina principal formatada (CodeFormatMainPage)\n * Entrada 1 \u2192 Subp\u00e1ginas agregadas (AggregateSubpages)\n */\n\nconst finalItemsToUpdate = [];\n\n// Entrada 0: P\u00e1gina principal\nconst mainInput = $input.first().json\nif (mainInput) {\n finalItemsToUpdate.push({ json: mainInput });\n} else {\n console.warn(\"\u26a0\ufe0f Nenhum dado da p\u00e1gina principal encontrado.\");\n}\n\n// Entrada 1: Subp\u00e1ginas agregadas\nconst subInput = $input.last().json.data \nif (subInput && Array.isArray(subInput.data)) {\n for (const subpage of subInput.data) {\n finalItemsToUpdate.push({ json: subpage });\n }\n} else if (Array.isArray(subInput)) {\n // Caso o input j\u00e1 seja um array direto (sem wrapper data)\n for (const subpage of subInput) {\n finalItemsToUpdate.push({ json: subpage });\n }\n} else if (subInput && typeof subInput === 'object') {\n // Caso seja apenas um objeto \u00fanico\n finalItemsToUpdate.push({ json: subInput });\n} else {\n console.warn(\"\u26a0\ufe0f Nenhum dado de subp\u00e1ginas encontrado na entrada 1.\");\n}\n\nreturn finalItemsToUpdate;\n"
},
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
4736,
2336
],
"id": "684651b6-b00b-416e-a717-e683ca6972a1",
"name": "FinalFormatter"
},
{
"parameters": {
"fieldToSplitOut": "data",
"options": {}
},
"type": "n8n-nodes-base.splitOut",
"typeVersion": 1,
"position": [
4928,
2192
],
"id": "d5462f18-5fea-4ead-b54f-e8edb3430c62",
"name": "Split Out1"
},
{
"parameters": {
"operation": "appendOrUpdate",
"documentId": {
"__rl": true,
"value": "18ERlfLcL5JITmWumxFxwBFcgMCYcRXDR84T6BTgrMP4",
"mode": "list",
"cachedResultName": "sheets-tegra-n8n",
"cachedResultUrl": "https://docs.google.com/spreadsheets/d/18ERlfLcL5JITmWumxFxwBFcgMCYcRXDR84T6BTgrMP4/edit?usp=drivesdk"
},
"sheetName": {
"__rl": true,
"value": "gid=0",
"mode": "list",
"cachedResultName": "webpages_links",
"cachedResultUrl": "https://docs.google.com/spreadsheets/d/18ERlfLcL5JITmWumxFxwBFcgMCYcRXDR84T6BTgrMP4/edit#gid=0"
},
"columns": {
"mappingMode": "defineBelow",
"value": {
"url": "={{ $json.url }}",
"instructions": "={{ $('GetPendingURL').item.json.instructions }}",
"status": "={{ $('PuppeteerScraper').item.json.success ? 'completed' : 'failed' }}",
"last_updated": "={{ $('PuppeteerScraper').item.json.timestamp }}",
"error_notes": "={{ $json.error ? $json.error:''}}"
},
"matchingColumns": [
"url"
],
"schema": [
{
"id": "url",
"displayName": "url",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true,
"removed": false
},
{
"id": "instructions",
"displayName": "instructions",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true
},
{
"id": "status",
"displayName": "status",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true
},
{
"id": "last_updated",
"displayName": "last_updated",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true
},
{
"id": "error_notes",
"displayName": "error_notes",
"required": false,
"defaultMatch": false,
"display": true,
"type": "string",
"canBeUsedToMatch": true
}
],
"attemptToConvertTypes": false,
"convertFieldsToString": false
},
"options": {}
},
"type": "n8n-nodes-base.googleSheets",
"typeVersion": 4.7,
"position": [
3168,
2000
],
"id": "63e7086a-b58a-48c0-994a-dee9f3a1ed54",
"name": "Append or update row in sheet",
"credentials": {
"googleSheetsOAuth2Api": {
"name": "<your credential>"
}
}
},
{
"parameters": {
"fieldToSplitOut": "outputItems",
"options": {}
},
"type": "n8n-nodes-base.splitOut",
"typeVersion": 1,
"position": [
5536,
2416
],
"id": "2017dd43-5378-4b9d-a082-4717d5e22e38",
"name": "Split Out2"
},
{
"parameters": {
"tableId": "scraped_content",
"dataToSend": "autoMapInputData"
},
"type": "n8n-nodes-base.supabase",
"typeVersion": 1,
"position": [
6160,
2416
],
"id": "192bc6d2-44e6-4816-8b2e-bb8df918ae82",
"name": "Create a row",
"credentials": {
"supabaseApi": {
"name": "<your credential>"
}
}
},
{
"parameters": {
"method": "POST",
"url": "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-001:embedContent",
"authentication": "genericCredentialType",
"genericAuthType": "httpHeaderAuth",
"sendBody": true,
"specifyBody": "json",
"jsonBody": "={\n \"model\": \"models/gemini-embedding-001\",\n \"content\": { \n \"parts\": [{ \"text\": \"{{ $json.json.chunk_text }}\" }]\n },\n \"outputDimensionality\": 1536\n}",
"options": {
"batching": {
"batch": {
"batchSize": 2,
"batchInterval": 3000
}
}
}
},
"type": "n8n-nodes-base.httpRequest",
"typeVersion": 4.2,
"position": [
5744,
2416
],
"id": "c95dddbc-1f11-46a6-a8da-95eaf5a95f93",
"name": "HTTP Request",
"credentials": {
"googlePalmApi": {
"name": "<your credential>"
},
"httpHeaderAuth": {
"name": "<your credential>"
}
}
},
{
"parameters": {
"mode": "runOnceForEachItem",
"jsCode": "// O 'item' cont\u00e9m todos os dados do n\u00f3 anterior\n// (chunk_text, source_url, etc. + a resposta da API 'embedding')\nconst item = $('Split Out2').item.json.json\n\n// 1. O HTTP Request retorna o embedding dentro de { \"embedding\": { \"values\": [...] } }\n// N\u00f3s precisamos extrair apenas o array 'values'.\nconst embeddingVector =$json.embedding.values;\n\n// 2. Opcional: Agrupar o metadata em um \u00fanico objeto JSONB\n// (Assumindo que sua tabela tem uma coluna 'metadata' como no exemplo anterior)\nconst metadata = {\n instructions: item.metadata.instructions,\n status: item.metadata.status,\n last_updated: item.last_updated\n // Adicione quaisquer outros campos que voc\u00ea queira guardar no JSONB\n};\n\n// 3. Montar o objeto final limpo.\n// As chaves (ex: 'source_url') devem ser ID\u00caNTICAS\n// aos nomes das colunas da sua tabela Supabase.\nconst rowData = {\n source_url: item.source_url,\n chunk_text: item.chunk_text,\n chunk_index: item.chunk_index,\n total_chunks: item.total_chunks,\n embedding: embeddingVector, // <-- O vetor limpo\n metadata: metadata // <-- O objeto JSONB\n};\n\n// 4. Retornar o objeto formatado\nreturn { json: rowData };"
},
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
5952,
2416
],
"id": "79434a31-5376-4e10-812b-62a5f40128c2",
"name": "Code in JavaScript"
},
{
"parameters": {
"modelName": "models/gemini-2.0-flash-001",
"options": {}
},
"type": "@n8n/n8n-nodes-langchain.lmChatGoogleGemini",
"typeVersion": 1,
"position": [
3968,
2288
],
"id": "58116084-e52b-4df5-b41d-bdde8f2864cd",
"name": "Fallback Model",
"credentials": {
"googlePalmApi": {
"name": "<your credential>"
}
}
},
{
"parameters": {
"modelName": "models/gemini-2.0-flash",
"options": {}
},
"type": "@n8n/n8n-nodes-langchain.lmChatGoogleGemini",
"typeVersion": 1,
"position": [
3808,
2288
],
"id": "4b8751b8-e6f1-4178-a3d0-133cbcc226cc",
"name": "Embedding Model",
"credentials": {
"googlePalmApi": {
"name": "<your credential>"
}
}
}
],
"connections": {
"Switch": {
"main": [
[
{
"node": "Append or update row in sheet",
"type": "main",
"index": 0
},
{
"node": "Merge",
"type": "main",
"index": 0
}
],
[
{
"node": "Scrape.do1",
"type": "main",
"index": 0
}
],
[
{
"node": "Scrape.do1",
"type": "main",
"index": 0
}
]
]
},
"Subpage Scrapper": {
"main": [
[
{
"node": "Merge1",
"type": "main",
"index": 0
}
],
[
{
"node": "Scrape.do2",
"type": "main",
"index": 0
}
]
]
},
"PuppeteerScraper": {
"main": [
[
{
"node": "Switch",
"type": "main",
"index": 0
}
]
]
},
"AggregateSubpages": {
"main": [
[
{
"node": "Merge2",
"type": "main",
"index": 0
}
]
]
},
"Scrape.do1": {
"main": [
[
{
"node": "ExtractHtmlSubpage",
"type": "main",
"index": 0
}
]
]
},
"Scrape.do2": {
"main": [
[
{
"node": "HTML Extractor",
"type": "main",
"index": 0
}
]
]
},
"Split Out": {
"main": [
[
{
"node": "Set hyperlink fields",
"type": "main",
"index": 0
}
]
]
},
"When clicking \u2018Execute workflow\u2019": {
"main": [
[
{
"node": "GetPendingURL",
"type": "main",
"index": 0
}
]
]
},
"GetPendingURL": {
"main": [
[
{
"node": "PuppeteerScraper",
"type": "main",
"index": 0
}
]
]
},
"Append row in sheet": {
"main": [
[
{
"node": "Subpage Scrapper",
"type": "main",
"index": 0
}
]
]
},
"HTML Extractor": {
"main": [
[
{
"node": "Merge1",
"type": "main",
"index": 1
}
]
]
},
"Set hyperlink fields": {
"main": [
[
{
"node": "Append row in sheet",
"type": "main",
"index": 0
}
]
]
},
"ExtractHtmlSubpage": {
"main": [
[
{
"node": "Merge",
"type": "main",
"index": 1
}
]
]
},
"CodeFormatMainPage": {
"main": [
[
{
"node": "Merge2",
"type": "main",
"index": 1
}
]
]
},
"Merge": {
"main": [
[
{
"node": "MainPageAIAgent",
"type": "main",
"index": 0
},
{
"node": "Split Out",
"type": "main",
"index": 0
}
]
]
},
"AIAgentSubpage": {
"main": [
[
{
"node": "SubPageFormatter",
"type": "main",
"index": 0
}
],
[]
]
},
"MainPageAIAgent": {
"main": [
[
{
"node": "CodeFormatMainPage",
"type": "main",
"index": 0
}
]
]
},
"SubPageFormatter": {
"main": [
[
{
"node": "AggregateSubpages",
"type": "main",
"index": 0
}
]
]
},
"Merge1": {
"main": [
[
{
"node": "AIAgentSubpage",
"type": "main",
"index": 0
}
]
]
},
"Merge2": {
"main": [
[
{
"node": "FinalFormatter",
"type": "main",
"index": 0
}
]
]
},
"FlattenResult": {
"main": [
[
{
"node": "ChunkContent",
"type": "main",
"index": 0
}
]
]
},
"ChunkContent": {
"main": [
[
{
"node": "Split Out2",
"type": "main",
"index": 0
}
]
]
},
"FinalFormatter": {
"main": [
[
{
"node": "FlattenResult",
"type": "main",
"index": 0
},
{
"node": "Split Out1",
"type": "main",
"index": 0
}
]
]
},
"Split Out1": {
"main": [
[
{
"node": "Update row in sheet",
"type": "main",
"index": 0
}
]
]
},
"Append or update row in sheet": {
"main": [
[]
]
},
"Split Out2": {
"main": [
[
{
"node": "HTTP Request",
"type": "main",
"index": 0
}
]
]
},
"Create a row": {
"main": [
[]
]
},
"HTTP Request": {
"main": [
[
{
"node": "Code in JavaScript",
"type": "main",
"index": 0
}
]
]
},
"Code in JavaScript": {
"main": [
[
{
"node": "Create a row",
"type": "main",
"index": 0
}
]
]
},
"Fallback Model": {
"ai_languageModel": [
[
{
"node": "AIAgentSubpage",
"type": "ai_languageModel",
"index": 1
}
]
]
},
"Embedding Model": {
"ai_languageModel": [
[
{
"node": "AIAgentSubpage",
"type": "ai_languageModel",
"index": 0
},
{
"node": "MainPageAIAgent",
"type": "ai_languageModel",
"index": 0
}
]
]
}
},
"meta": {
"templateCredsSetupCompleted": true
}
}
Credentials you'll need
Each integration node will prompt for credentials when you import. We strip credential IDs before publishing — you'll add your own.
googlePalmApigoogleSheetsOAuth2ApihttpHeaderAuthhttpQueryAuthsupabaseApi
For the full experience including quality scoring and batch install features for each workflow upgrade to Pro
About this workflow
Tegra Scraper. Uses httpRequest, googleSheets, agent, supabase. Event-driven trigger; 32 nodes.
Source: https://github.com/jpmartins201/tegra-scraper/blob/f171f08ac7f787b5b773ba4098028b0464d636b8/workflow.json — original creator credit. Request a take-down →
Related workflows
Workflows that share integrations, category, or trigger type with this one. All free to copy and import.
This workflow creates a multi-talented AI assistant named Simran that interacts with users via Telegram. It can handle text and voice messages, understand the user's intent, and perform various tasks.
⏺ 🚀 How it works
This project is a template for building a complete academic virtual assistant using n8n. It connects to Telegram, answers frequently asked questions by querying MongoDB, keeps the community informed a
> Note: This workflow uses sticky notes extensively to document each logical section of the automation. Sticky notes are mandatory and already included to explain OCR, AI parsing, folder logic, dup
leads. Uses supabase, gmail, formTrigger, httpRequest. Webhook trigger; 62 nodes.