This workflow corresponds to n8n.io template #8707 — we link there as the canonical source.
This workflow follows the Document Default Data Loader → OpenAI Embeddings recipe pattern — see all workflows that pair these two integrations.
The workflow JSON
Copy or download the full n8n JSON below. Paste it into a new n8n workflow, add your credentials, activate. Full import guide →
{
"meta": {
"templateCredsSetupCompleted": true
},
"nodes": [
{
"id": "ab180eb3-c086-4f9f-b9d0-f3f56056a416",
"name": "When clicking \u2018Test workflow\u2019",
"type": "n8n-nodes-base.manualTrigger",
"position": [
-6816,
-304
],
"parameters": {},
"typeVersion": 1
},
{
"id": "20e77374-c3ce-457f-945c-d6f6dc928de1",
"name": "HTTP Request",
"type": "n8n-nodes-base.httpRequest",
"position": [
-6624,
-304
],
"parameters": {
"url": "https://www.kiekens.com/sitemap.xml",
"options": {}
},
"typeVersion": 4.2
},
{
"id": "b23dd724-1bd7-4eef-9e22-8bef987b2128",
"name": "XML",
"type": "n8n-nodes-base.xml",
"position": [
-6432,
-304
],
"parameters": {
"options": {}
},
"typeVersion": 1
},
{
"id": "4715b380-f386-4926-892e-2c133a1155c1",
"name": "Split Out",
"type": "n8n-nodes-base.splitOut",
"position": [
-6224,
-304
],
"parameters": {
"options": {},
"fieldToSplitOut": "urlset.url"
},
"typeVersion": 1
},
{
"id": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"name": "Loop Over Items",
"type": "n8n-nodes-base.splitInBatches",
"position": [
-5152,
-592
],
"parameters": {
"options": {}
},
"typeVersion": 3
},
{
"id": "f52b3e19-7d64-4f3d-848d-81cf2b65bb15",
"name": "Wait",
"type": "n8n-nodes-base.wait",
"position": [
-4192,
-608
],
"parameters": {
"amount": 30
},
"typeVersion": 1.1
},
{
"id": "961143cf-c387-4e2d-a477-0988c0b0f512",
"name": "If",
"type": "n8n-nodes-base.if",
"position": [
-3728,
-608
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "9d90c1ce-590e-40a5-ae8c-d92326032975",
"operator": {
"type": "string",
"operation": "equals"
},
"leftValue": "={{ $json.status }}",
"rightValue": "completed"
}
]
}
},
"typeVersion": 2.2
},
{
"id": "991580c5-10ed-4bab-811e-2ec50d4050fd",
"name": "Default Data Loader",
"type": "@n8n/n8n-nodes-langchain.documentDefaultDataLoader",
"position": [
-2384,
-496
],
"parameters": {
"options": {
"metadata": {
"metadataValues": [
{
"name": "page",
"value": "={{ $json.result.url }}"
}
]
}
},
"jsonData": "={{ $json.cleanedText }}",
"jsonMode": "expressionData"
},
"typeVersion": 1
},
{
"id": "0fc79f0d-8ebd-4d61-ac29-7ba65284af52",
"name": "Character Text Splitter",
"type": "@n8n/n8n-nodes-langchain.textSplitterCharacterTextSplitter",
"position": [
-2368,
-352
],
"parameters": {
"chunkSize": 5000
},
"typeVersion": 1
},
{
"id": "bc5aac68-bb66-4c9c-abd7-9a913b0a56fa",
"name": "Embeddings OpenAI",
"type": "@n8n/n8n-nodes-langchain.embeddingsOpenAi",
"position": [
-2528,
-464
],
"parameters": {
"model": "text-embedding-ada-002",
"options": {}
},
"credentials": {
"openAiApi": {
"name": "<your credential>"
}
},
"typeVersion": 1.1
},
{
"id": "e3b525eb-7a3f-456d-a476-b013293c85e0",
"name": "Edit Fields",
"type": "n8n-nodes-base.set",
"position": [
-4064,
-288
],
"parameters": {
"options": {},
"assignments": {
"assignments": [
{
"id": "f2bcdb54-e1fe-4670-99aa-6eec973bf5f1",
"name": "task_id",
"type": "string",
"value": "={{ $('Crawl4ai Web Page Scrape').item.json.task_id }}"
}
]
}
},
"typeVersion": 3.4
},
{
"id": "bdbed5ea-d1a1-4922-a7b7-759466709fcb",
"name": "Crawl4AI_Task Status",
"type": "n8n-nodes-base.httpRequest",
"onError": "continueErrorOutput",
"position": [
-3968,
-608
],
"parameters": {
"url": "=https://crawl4ai-app-nrcsv.ondigitalocean.app/task/{{ $json.task_id }}",
"options": {
"timeout": 5000
},
"authentication": "genericCredentialType",
"genericAuthType": "httpHeaderAuth"
},
"credentials": {
"httpHeaderAuth": {
"name": "<your credential>"
}
},
"retryOnFail": true,
"typeVersion": 4.2,
"waitBetweenTries": 5000
},
{
"id": "f0da6b36-885a-4e86-b044-f3b490bf3829",
"name": "Loop Over Items1",
"type": "n8n-nodes-base.splitInBatches",
"position": [
-5824,
144
],
"parameters": {
"options": {}
},
"typeVersion": 3
},
{
"id": "f78a39bd-183c-4985-b1b1-f3142dfe31f3",
"name": "If2",
"type": "n8n-nodes-base.if",
"position": [
-4736,
-592
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "or",
"conditions": [
{
"id": "fbc89427-990b-45d0-8538-e403c1b18ddd",
"operator": {
"type": "string",
"operation": "contains"
},
"leftValue": "={{ $json.status }}",
"rightValue": "pending"
}
]
}
},
"typeVersion": 2.2
},
{
"id": "b6dfe888-4e2e-4c74-8a66-c3db28604514",
"name": "Split Out1",
"type": "n8n-nodes-base.splitOut",
"position": [
-5392,
-384
],
"parameters": {
"include": "selectedOtherFields",
"options": {},
"fieldToSplitOut": "url",
"fieldsToInclude": "status"
},
"typeVersion": 1
},
{
"id": "78f05cb5-8b9c-4f51-b252-4ca2195b52ad",
"name": "Format the URL",
"type": "n8n-nodes-base.set",
"position": [
-5648,
160
],
"parameters": {
"options": {},
"assignments": {
"assignments": [
{
"id": "9038a5b3-6985-4edc-bdd1-8dc5a3e8877c",
"name": "loc",
"type": "string",
"value": "={{ $json.loc.trim().toLowerCase() }}"
}
]
}
},
"typeVersion": 3.4
},
{
"id": "805f1fea-841b-40aa-a055-de7ddbbb306f",
"name": "Check if the URL is in the Supabase Table",
"type": "n8n-nodes-base.supabase",
"onError": "continueErrorOutput",
"position": [
-5456,
160
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $json.loc }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"operation": "getAll",
"returnAll": true
},
"credentials": {
"supabaseApi": {
"name": "<your credential>"
}
},
"retryOnFail": true,
"typeVersion": 1,
"alwaysOutputData": true,
"waitBetweenTries": 5000
},
{
"id": "4f6e6ccb-7757-4e9f-b50c-9acb2fe99009",
"name": "Format the Output from the Supabase node",
"type": "n8n-nodes-base.code",
"position": [
-5184,
160
],
"parameters": {
"jsCode": "const supabaseResult = $json;\n\n// Get the clean URL from the Set node (Format the URL)\nconst originalLoc = $('Format the URL').item.json.loc;\nconst cleanUrl = typeof originalLoc === 'string' ? originalLoc.trim().toLowerCase() : '';\n\n// Check if URL already exists\n// Empty object {} means URL doesn't exist, so we should insert\nconst shouldInsert = Object.keys(supabaseResult).length === 0;\n\nreturn [\n {\n json: {\n url: cleanUrl,\n shouldInsert,\n }\n }\n];"
},
"typeVersion": 2
},
{
"id": "54ed36e4-e675-4bd2-a74e-aeadbe7f486c",
"name": "If \"shouldInsert\" is true",
"type": "n8n-nodes-base.if",
"position": [
-4992,
160
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "f3a00d98-73af-4d35-b4e5-5158c120753f",
"operator": {
"type": "boolean",
"operation": "true",
"singleValue": true
},
"leftValue": "={{ $json.shouldInsert }}",
"rightValue": "true"
}
]
}
},
"typeVersion": 2.2
},
{
"id": "483dc0c7-da52-423a-a3bb-cc9ef6d6f1df",
"name": "URL in a new row",
"type": "n8n-nodes-base.supabase",
"position": [
-4752,
272
],
"parameters": {
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "url",
"fieldValue": "={{ $json.url }}"
}
]
}
},
"credentials": {
"supabaseApi": {
"name": "<your credential>"
}
},
"typeVersion": 1
},
{
"id": "632752e1-138e-481f-92ad-2ac14c245c45",
"name": "Sticky Note1",
"type": "n8n-nodes-base.stickyNote",
"position": [
-5888,
64
],
"parameters": {
"width": 1280,
"height": 500,
"content": "## Put all the website's URLs in the Supabase table - scrape_queue"
},
"typeVersion": 1
},
{
"id": "5fc57e6f-771c-4eaa-ba8e-8e233dc2a343",
"name": "CREATE TABLE scrape_queue in Supabase",
"type": "n8n-nodes-base.postgres",
"position": [
-6816,
-688
],
"parameters": {
"query": "CREATE TABLE scrape_queue (\n id uuid DEFAULT gen_random_uuid() PRIMARY KEY,\n url text NOT NULL UNIQUE,\n status text NOT NULL DEFAULT 'pending', -- 'pending', 'completed', 'error'\n task_id text,\n result text,\n created_at timestamp with time zone DEFAULT now(),\n updated_at timestamp with time zone DEFAULT now()\n);\n\n-- Optional: Auto-update updated_at on row change\nCREATE OR REPLACE FUNCTION update_updated_at_column()\nRETURNS TRIGGER AS $$\nBEGIN\n NEW.updated_at = now();\n RETURN NEW;\nEND;\n$$ language 'plpgsql';\n\nCREATE TRIGGER update_scrape_queue_updated_at\nBEFORE UPDATE ON scrape_queue\nFOR EACH ROW\nEXECUTE PROCEDURE update_updated_at_column();",
"options": {},
"operation": "executeQuery"
},
"credentials": {
"postgres": {
"name": "<your credential>"
}
},
"typeVersion": 2.6
},
{
"id": "8b2666b7-0eb6-42df-9ae2-e204516dd3d1",
"name": "CREATE TABLE documents in Supabase",
"type": "n8n-nodes-base.postgres",
"position": [
-6608,
-688
],
"parameters": {
"query": "CREATE TABLE documents (\n id SERIAL PRIMARY KEY,\n content TEXT,\n metadata JSONB,\n embedding VECTOR(1536) -- Adjust the dimension size based on your OpenAI model (e.g. ada-002 returns 1536)\n);",
"options": {},
"operation": "executeQuery"
},
"credentials": {
"postgres": {
"name": "<your credential>"
}
},
"typeVersion": 2.6
},
{
"id": "7c7b8f66-00f6-48db-af03-fba30dc5e6b1",
"name": "Sticky Note2",
"type": "n8n-nodes-base.stickyNote",
"position": [
-6848,
-768
],
"parameters": {
"color": 3,
"width": 500,
"height": 280,
"content": "## Execute Once"
},
"typeVersion": 1
},
{
"id": "82279582-c71b-43aa-8e60-6b8af7ce866c",
"name": "Sticky Note",
"type": "n8n-nodes-base.stickyNote",
"position": [
-4992,
-736
],
"parameters": {
"color": 4,
"width": 460,
"height": 360,
"content": "## Get the URL from Supabase and check if it is completed or not\n\n**Only the NOT completed URLs will be passed**"
},
"typeVersion": 1
},
{
"id": "8b2245b2-cdc2-408a-879b-260335a10bcb",
"name": "Sticky Note3",
"type": "n8n-nodes-base.stickyNote",
"position": [
-4448,
-736
],
"parameters": {
"color": 5,
"width": 640,
"height": 360,
"content": "## Crawl4AI URL Scraping"
},
"typeVersion": 1
},
{
"id": "b42143d2-1e13-4031-996a-26af2dc26632",
"name": "Crawl4ai Web Page Scrape",
"type": "n8n-nodes-base.httpRequest",
"onError": "continueErrorOutput",
"position": [
-4384,
-608
],
"parameters": {
"url": "https://crawl4ai-app-nrcsv.ondigitalocean.app/crawl",
"method": "POST",
"options": {},
"sendBody": true,
"authentication": "genericCredentialType",
"bodyParameters": {
"parameters": [
{
"name": "urls",
"value": "={{ $json.url }}"
},
{
"name": "priority",
"value": "10"
}
]
},
"genericAuthType": "httpHeaderAuth"
},
"credentials": {
"httpHeaderAuth": {
"name": "<your credential>"
}
},
"retryOnFail": true,
"typeVersion": 4.2,
"waitBetweenTries": 5000
},
{
"id": "6ac1fda6-8363-4cff-8810-7cb2ffa63b67",
"name": "Remove redundant data from the scraping",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-3488,
-768
],
"parameters": {
"jsCode": "return items.map(item => {\n // Handle both data structures\n const raw = item.json.result?.markdown || item.json.cleanedText || item.json.html || '';\n \n // Add a safety check for null/undefined\n if (!raw) {\n return {\n json: {\n url: item.json.result?.url || item.json.url || '',\n cleanedText: '',\n error: 'No content found to process'\n }\n };\n }\n \n let cleaned = raw\n // Remove headers but keep the content structure\n .replace(/^#{1,6}\\s+(.+)$/gm, '$1') // Convert headers to plain text\n \n // Remove markdown links but keep the text\n .replace(/\\[([^\\]]+)\\]\\([^)]+\\)/g, '$1') // Keep link text, remove URL\n \n // Remove code blocks completely\n .replace(/```[\\s\\S]*?```/g, '') \n .replace(/`([^`]+)`/g, '$1') // Remove inline code backticks but keep content\n \n // Remove markdown formatting\n .replace(/\\*\\*([^*]+)\\*\\*/g, '$1') // Remove bold formatting\n .replace(/\\*([^*]+)\\*/g, '$1') // Remove italic formatting\n .replace(/_{2,}([^_]+)_{2,}/g, '$1') // Remove underline formatting\n .replace(/~~([^~]+)~~/g, '$1') // Remove strikethrough\n \n // Remove lists formatting but keep content\n .replace(/^\\s*[-*+]\\s+/gm, '') // Remove bullet points\n .replace(/^\\s*\\d+\\.\\s+/gm, '') // Remove numbered lists\n \n // Remove HTML remnants\n .replace(/<[^>]*>/g, '') // Remove any remaining HTML tags\n .replace(/&[a-zA-Z0-9#]+;/g, '') // Remove HTML entities\n \n // Remove navigation and common web elements\n .replace(/\\b(Home|About|Contact|Privacy|Terms|Login|Register|Menu|Navigation|Footer|Header|Sidebar)\\b/gi, '')\n .replace(/\\b(Click here|Read more|Learn more|Show more|View all|See all)\\b/gi, '')\n .replace(/\\b(Previous|Next|Page \\d+|Back to top)\\b/gi, '')\n \n // Remove social media and sharing text\n .replace(/\\b(Share|Tweet|Facebook|LinkedIn|Instagram|Follow us|Subscribe)\\b/gi, '')\n \n // Remove common website noise\n .replace(/\\b(Cookie|Cookies|GDPR|Accept|Decline|Consent)\\b/gi, '')\n 
.replace(/\\b(Advertisement|Ad|Sponsored|Promotion)\\b/gi, '')\n \n // Remove excessive punctuation and symbols\n .replace(/[^\\w\\s.,!?;:()\\-\"']/g, '') // Keep only essential punctuation\n .replace(/\\.{2,}/g, '.') // Replace multiple dots with single dot\n .replace(/\\?{2,}/g, '?') // Replace multiple question marks\n .replace(/!{2,}/g, '!') // Replace multiple exclamation marks\n \n // Clean up whitespace and line breaks\n .replace(/\\n{3,}/g, '\\n\\n') // Replace multiple line breaks with double\n .replace(/\\s+/g, ' ') // Normalize whitespace\n .replace(/\\s*\\n\\s*/g, '\\n') // Clean line breaks\n \n // Remove lines that are too short (likely noise)\n .split('\\n')\n .filter(line => line.trim().length > 10) // Remove very short lines\n .join('\\n')\n \n .trim();\n \n // Additional quality checks\n const wordCount = cleaned.split(/\\s+/).length;\n const hasMinimumContent = wordCount >= 50; // Minimum 50 words\n \n // Check if content is mostly meaningful (not just numbers/symbols)\n const meaningfulContent = cleaned.replace(/[^\\w\\s]/g, '').length > cleaned.length * 0.7;\n \n // Extract additional metadata for better context\n const extractedTitle = raw.match(/^#{1,3}\\s+(.+)$/m)?.[1] || '';\n const domain = (item.json.result?.url || item.json.url || '').replace(/^https?:\\/\\//, '').split('/')[0];\n \n return {\n json: {\n url: item.json.result?.url || item.json.url || '',\n cleanedText: cleaned,\n wordCount: wordCount,\n hasMinimumContent: hasMinimumContent,\n meaningfulContent: meaningfulContent,\n extractedTitle: extractedTitle,\n domain: domain,\n contentLength: cleaned.length,\n // Quality score for filtering\n qualityScore: (hasMinimumContent ? 0.5 : 0) + (meaningfulContent ? 0.5 : 0)\n }\n };\n});"
},
"typeVersion": 2
},
{
"id": "520a512f-2da8-4cb7-b834-fe6fbfa2ad02",
"name": "Supabase Vector Store_documents",
"type": "@n8n/n8n-nodes-langchain.vectorStoreSupabase",
"position": [
-2544,
-672
],
"parameters": {
"mode": "insert",
"options": {
"queryName": "match_documents"
},
"tableName": {
"__rl": true,
"mode": "list",
"value": "documents",
"cachedResultName": "documents"
}
},
"credentials": {
"supabaseApi": {
"name": "<your credential>"
}
},
"typeVersion": 1
},
{
"id": "edb03374-1674-4070-b8a6-7afff6118f9a",
"name": "Get a row - scrape_queue Table",
"type": "n8n-nodes-base.supabase",
"position": [
-4912,
-592
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $json.url }}"
}
]
},
"tableId": "scrape_queue",
"operation": "get"
},
"credentials": {
"supabaseApi": {
"name": "<your credential>"
}
},
"typeVersion": 1
},
{
"id": "57358b66-0d48-4d53-a188-c5c550e46a9e",
"name": "Update a row in scrape_queue Table",
"type": "n8n-nodes-base.supabase",
"position": [
-2224,
-992
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $('Get a row - scrape_queue Table').item.json.url }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "status",
"fieldValue": "={{ $('Crawl4AI_Task Status').item.json.status }}"
},
{
"fieldId": "task_id",
"fieldValue": "={{ $('Crawl4ai Web Page Scrape').item.json.task_id }}"
}
]
},
"operation": "update"
},
"credentials": {
"supabaseApi": {
"name": "<your credential>"
}
},
"typeVersion": 1
},
{
"id": "3291a358-282c-4cc2-a869-c9b4651e157e",
"name": "Update a row in scrape_queue Table1",
"type": "n8n-nodes-base.supabase",
"position": [
-3984,
-1072
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $('Get a row - scrape_queue Table').first().json.url }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "task_id",
"fieldValue": "={{ $json.task_id }}"
},
{
"fieldId": "status",
"fieldValue": "={{ $json.error.status }}"
}
]
},
"operation": "update"
},
"credentials": {
"supabaseApi": {
"name": "<your credential>"
}
},
"typeVersion": 1
},
{
"id": "f801de82-dbe9-44c1-a6c3-ac2847e93060",
"name": "Wait1",
"type": "n8n-nodes-base.wait",
"position": [
-4352,
-208
],
"parameters": {
"unit": "minutes"
},
"typeVersion": 1.1
},
{
"id": "10aecbd3-6fd8-420f-b997-34d68eecde54",
"name": "Quality Filter Node",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-3264,
-768
],
"parameters": {
"jsCode": "// Filter out low-quality content\nreturn items.filter(item => {\n const quality = item.json.qualityScore || 0;\n const minWords = item.json.wordCount >= 50;\n const hasContent = item.json.cleanedText.length > 200;\n \n return quality >= 0.5 && minWords && hasContent;\n});"
},
"typeVersion": 2
},
{
"id": "9473c86c-7525-41f6-a2be-f7750d930317",
"name": "Content Type Detection",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-3008,
-768
],
"parameters": {
"jsCode": "// Content Type Detection - Fixed Version\nreturn items.map(item => {\n const text = item.json.cleanedText || '';\n \n // Content type detection function\n const detectContentType = (text) => {\n if (!text || text.length < 10) {\n return 'unknown';\n }\n \n const lowerText = text.toLowerCase();\n \n // Check for code content\n if (lowerText.includes('function') || lowerText.includes('class') || \n lowerText.includes('import') || lowerText.includes('def ') ||\n lowerText.includes('var ') || lowerText.includes('const ')) {\n return 'code';\n }\n \n // Check for tutorial content\n if (lowerText.includes('step 1') || lowerText.includes('tutorial') || \n lowerText.includes('how to') || lowerText.includes('guide') ||\n lowerText.includes('walkthrough')) {\n return 'tutorial';\n }\n \n // Check for FAQ content\n if (lowerText.includes('faq') || lowerText.includes('q:') || \n lowerText.includes('a:') || lowerText.includes('question') ||\n lowerText.includes('frequently asked')) {\n return 'faq';\n }\n \n // Check for documentation\n if (lowerText.includes('api') || lowerText.includes('documentation') ||\n lowerText.includes('reference') || lowerText.includes('manual')) {\n return 'documentation';\n }\n \n // Check for news/blog content\n if (lowerText.includes('published') || lowerText.includes('author') ||\n lowerText.includes('posted') || lowerText.includes('blog')) {\n return 'blog';\n }\n \n // Check for product/service pages\n if (lowerText.includes('price') || lowerText.includes('buy') ||\n lowerText.includes('purchase') || lowerText.includes('product')) {\n return 'product';\n }\n \n // Default to article\n return 'article';\n };\n \n // Detect content type\n const contentType = detectContentType(text);\n \n // Return the item with added content type\n return {\n json: {\n ...item.json, // Keep all existing data\n contentType: contentType\n }\n };\n});"
},
"typeVersion": 2
},
{
"id": "54873bf5-ecb2-44e3-9dfb-e0e6ace02917",
"name": "Better Metadata Extraction",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-2784,
-768
],
"parameters": {
"jsCode": "// Enhanced metadata extraction - Fixed Version\nreturn items.map(item => {\n const cleaned = item.json.cleanedText || '';\n const url = item.json.url || '';\n const contentType = item.json.contentType || 'article';\n \n // Extract title from the cleaned text (look for first meaningful line)\n const extractTitle = (text) => {\n if (!text) return '';\n \n const lines = text.split('\\n').filter(line => line.trim().length > 0);\n if (lines.length === 0) return '';\n \n // Find the first substantial line (likely the title)\n const titleLine = lines.find(line => \n line.trim().length > 10 && \n line.trim().length < 200 &&\n !line.includes('http') &&\n !line.includes('www.')\n );\n \n return titleLine ? titleLine.trim() : lines[0].trim();\n };\n \n // Extract domain from URL\n const extractDomain = (url) => {\n if (!url) return '';\n try {\n return url.replace(/^https?:\\/\\//, '').split('/')[0];\n } catch (e) {\n return '';\n }\n };\n \n // Count words in the text\n const countWords = (text) => {\n if (!text) return 0;\n return text.trim().split(/\\s+/).filter(word => word.length > 0).length;\n };\n \n // Calculate quality score\n const calculateQualityScore = (text, wordCount) => {\n if (!text || wordCount < 50) return 0;\n \n const meaningfulContent = text.replace(/[^\\w\\s]/g, '').length > text.length * 0.7;\n const hasMinimumContent = wordCount >= 50;\n \n return (hasMinimumContent ? 0.5 : 0) + (meaningfulContent ? 
0.5 : 0);\n };\n \n // Simple language detection (basic version)\n const detectLanguage = (text) => {\n if (!text) return 'unknown';\n \n // Simple heuristic - could be improved with a proper language detection library\n const commonEnglishWords = ['the', 'and', 'is', 'in', 'to', 'of', 'a', 'for', 'on', 'with'];\n const commonDutchWords = ['de', 'het', 'en', 'van', 'een', 'in', 'op', 'te', 'aan', 'met'];\n \n const lowerText = text.toLowerCase();\n const englishCount = commonEnglishWords.filter(word => lowerText.includes(` ${word} `)).length;\n const dutchCount = commonDutchWords.filter(word => lowerText.includes(` ${word} `)).length;\n \n if (englishCount > dutchCount) return 'en';\n if (dutchCount > englishCount) return 'nl';\n return 'unknown';\n };\n \n // Extract all metadata\n const extractedTitle = extractTitle(cleaned);\n const domain = extractDomain(url);\n const wordCount = countWords(cleaned);\n const qualityScore = calculateQualityScore(cleaned, wordCount);\n const detectedLanguage = detectLanguage(cleaned);\n \n // Enhanced metadata object\n const metadata = {\n page: url,\n title: extractedTitle,\n domain: domain,\n contentType: contentType,\n wordCount: wordCount,\n scrapedDate: new Date().toISOString(),\n language: detectedLanguage,\n qualityScore: qualityScore,\n contentLength: cleaned.length\n };\n \n return {\n json: {\n ...item.json, // Keep all existing data\n metadata: metadata,\n // Also keep individual fields for easier access\n extractedTitle: extractedTitle,\n domain: domain,\n wordCount: wordCount,\n qualityScore: qualityScore,\n detectedLanguage: detectedLanguage\n }\n };\n});"
},
"typeVersion": 2
},
{
"id": "f2d3d6a3-b48e-4b08-bf8e-f8fff06d3494",
"name": "Sticky Note4",
"type": "n8n-nodes-base.stickyNote",
"position": [
-3536,
-912
],
"parameters": {
"color": 6,
"width": 900,
"height": 340,
"content": "## Clean the HTML code"
},
"typeVersion": 1
},
{
"id": "6ddcf33d-84cb-4ee7-bf62-cb2747aff406",
"name": "If1",
"type": "n8n-nodes-base.if",
"position": [
-3632,
-288
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "3e84e5d8-e49c-4a7b-98c3-9e115f592c10",
"operator": {
"type": "string",
"operation": "exists",
"singleValue": true
},
"leftValue": "={{ $json.task_id }}",
"rightValue": ""
},
{
"id": "c6a0525f-3224-4ad5-8d0a-e0a7a27fb5d1",
"operator": {
"type": "number",
"operation": "gte"
},
"leftValue": "={{ $json.attempt_count }}",
"rightValue": 10
}
]
}
},
"typeVersion": 2.2
},
{
"id": "ffb7b9cb-a4fb-4db2-833c-331672de42bd",
"name": "Update a row in scrape_queue Table2",
"type": "n8n-nodes-base.supabase",
"position": [
-3376,
-176
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $('Get a row - scrape_queue Table').first().json.url }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "task_id",
"fieldValue": "={{ $json.task_id }}"
},
{
"fieldId": "status",
"fieldValue": "=error"
}
]
},
"operation": "update"
},
"credentials": {
"supabaseApi": {
"name": "<your credential>"
}
},
"typeVersion": 1
},
{
"id": "44c7fe75-0e88-4114-b506-6e7850c2a038",
"name": "Task_id Counter",
"type": "n8n-nodes-base.code",
"position": [
-3856,
-288
],
"parameters": {
"jsCode": "// Simple counter that resets for each new task ID\nif (typeof globalThis.currentTaskId === 'undefined') {\n globalThis.currentTaskId = null;\n globalThis.currentCounter = 0;\n}\n\nreturn items.map(item => {\n const taskId = item.json.task_id;\n \n // Check if this is a new task ID\n if (globalThis.currentTaskId !== taskId) {\n // New task ID detected - reset counter\n globalThis.currentTaskId = taskId;\n globalThis.currentCounter = 1;\n } else {\n // Same task ID - increment counter\n globalThis.currentCounter++;\n }\n \n return {\n json: {\n ...item.json,\n attempt_count: globalThis.currentCounter\n }\n };\n});"
},
"typeVersion": 2
}
],
"connections": {
"If": {
"main": [
[
{
"node": "Remove redundant data from the scraping",
"type": "main",
"index": 0
}
],
[
{
"node": "Edit Fields",
"type": "main",
"index": 0
}
]
]
},
"If1": {
"main": [
[
{
"node": "Update a row in scrape_queue Table2",
"type": "main",
"index": 0
}
],
[
{
"node": "Wait",
"type": "main",
"index": 0
}
]
]
},
"If2": {
"main": [
[
{
"node": "Crawl4ai Web Page Scrape",
"type": "main",
"index": 0
}
],
[
{
"node": "Loop Over Items",
"type": "main",
"index": 0
}
]
]
},
"XML": {
"main": [
[
{
"node": "Split Out",
"type": "main",
"index": 0
}
]
]
},
"Wait": {
"main": [
[
{
"node": "Crawl4AI_Task Status",
"type": "main",
"index": 0
}
]
]
},
"Wait1": {
"main": [
[
{
"node": "Crawl4ai Web Page Scrape",
"type": "main",
"index": 0
}
]
]
},
"Split Out": {
"main": [
[
{
"node": "Loop Over Items1",
"type": "main",
"index": 0
}
]
]
},
"Split Out1": {
"main": [
[
{
"node": "Loop Over Items",
"type": "main",
"index": 0
}
]
]
},
"Edit Fields": {
"main": [
[
{
"node": "Task_id Counter",
"type": "main",
"index": 0
}
]
]
},
"HTTP Request": {
"main": [
[
{
"node": "XML",
"type": "main",
"index": 0
}
]
]
},
"Format the URL": {
"main": [
[
{
"node": "Check if the URL is in the Supabase Table",
"type": "main",
"index": 0
}
]
]
},
"Loop Over Items": {
"main": [
[],
[
{
"node": "Get a row - scrape_queue Table",
"type": "main",
"index": 0
}
]
]
},
"Task_id Counter": {
"main": [
[
{
"node": "If1",
"type": "main",
"index": 0
}
]
]
},
"Loop Over Items1": {
"main": [
[
{
"node": "Split Out1",
"type": "main",
"index": 0
}
],
[
{
"node": "Format the URL",
"type": "main",
"index": 0
}
]
]
},
"URL in a new row": {
"main": [
[
{
"node": "Loop Over Items1",
"type": "main",
"index": 0
}
]
]
},
"Embeddings OpenAI": {
"ai_embedding": [
[
{
"node": "Supabase Vector Store_documents",
"type": "ai_embedding",
"index": 0
}
]
]
},
"Default Data Loader": {
"ai_document": [
[
{
"node": "Supabase Vector Store_documents",
"type": "ai_document",
"index": 0
}
]
]
},
"Quality Filter Node": {
"main": [
[
{
"node": "Content Type Detection",
"type": "main",
"index": 0
}
]
]
},
"Crawl4AI_Task Status": {
"main": [
[
{
"node": "If",
"type": "main",
"index": 0
}
],
[
{
"node": "Update a row in scrape_queue Table1",
"type": "main",
"index": 0
}
]
]
},
"Content Type Detection": {
"main": [
[
{
"node": "Better Metadata Extraction",
"type": "main",
"index": 0
}
]
]
},
"Character Text Splitter": {
"ai_textSplitter": [
[
{
"node": "Default Data Loader",
"type": "ai_textSplitter",
"index": 0
}
]
]
},
"Crawl4ai Web Page Scrape": {
"main": [
[
{
"node": "Wait",
"type": "main",
"index": 0
}
],
[
{
"node": "Wait1",
"type": "main",
"index": 0
}
]
]
},
"If \"shouldInsert\" is true": {
"main": [
[
{
"node": "URL in a new row",
"type": "main",
"index": 0
}
],
[
{
"node": "Loop Over Items1",
"type": "main",
"index": 0
}
]
]
},
"Better Metadata Extraction": {
"main": [
[
{
"node": "Supabase Vector Store_documents",
"type": "main",
"index": 0
}
]
]
},
"Get a row - scrape_queue Table": {
"main": [
[
{
"node": "If2",
"type": "main",
"index": 0
}
]
]
},
"Supabase Vector Store_documents": {
"main": [
[
{
"node": "Update a row in scrape_queue Table",
"type": "main",
"index": 0
}
]
]
},
"When clicking \u2018Test workflow\u2019": {
"main": [
[
{
"node": "HTTP Request",
"type": "main",
"index": 0
}
]
]
},
"Update a row in scrape_queue Table": {
"main": [
[
{
"node": "Loop Over Items",
"type": "main",
"index": 0
}
]
]
},
"Update a row in scrape_queue Table1": {
"main": [
[
{
"node": "Loop Over Items",
"type": "main",
"index": 0
}
]
]
},
"Update a row in scrape_queue Table2": {
"main": [
[
{
"node": "Loop Over Items",
"type": "main",
"index": 0
}
]
]
},
"Remove redundant data from the scraping": {
"main": [
[
{
"node": "Quality Filter Node",
"type": "main",
"index": 0
}
]
]
},
"Format the Output from the Supabase node": {
"main": [
[
{
"node": "If \"shouldInsert\" is true",
"type": "main",
"index": 0
}
]
]
},
"Check if the URL is in the Supabase Table": {
"main": [
[
{
"node": "Format the Output from the Supabase node",
"type": "main",
"index": 0
}
]
]
}
}
}
Credentials you'll need
Each integration node will prompt for credentials when you import. We strip credential IDs before publishing — you'll add your own.
httpHeaderAuth, openAiApi, postgres, supabaseApi
For the full experience, including quality scoring and batch-install features for each workflow, upgrade to Pro.
About this workflow
This template crawls a website from its sitemap, deduplicates URLs in Supabase, scrapes pages with Crawl4AI, cleans and validates the text, then stores content + metadata in a Supabase vector store using OpenAI embeddings. It’s a reliable, repeatable pipeline for building…
Source: https://n8n.io/workflows/8707/ — original creator credit. Request a take-down →
Related workflows
Workflows that share integrations, category, or trigger type with this one. All free to copy and import.
Wordpress Ai Chatbot To Enhance User Experience With Supabase And Openai. Uses manualTrigger, embeddingsOpenAi, documentDefaultDataLoader, textSplitterTokenSplitter. Event-driven trigger; 53 nodes.
RAG & GenAI App With WordPress Content. Uses manualTrigger, embeddingsOpenAi, documentDefaultDataLoader, textSplitterTokenSplitter. Event-driven trigger; 53 nodes.
RAG & GenAI App With WordPress Content. Uses manualTrigger, embeddingsOpenAi, documentDefaultDataLoader, textSplitterTokenSplitter. Event-driven trigger; 53 nodes.
OIL Rag. Uses lmChatOpenAi, embeddingsOpenAi, agent, telegramTrigger. Event-driven trigger; 53 nodes.
This n8n workflow automates the process of ingesting files from Google Drive into a Supabase database, preparing them for a knowledge base system. It supports text-based files (PDF, DOCX, TXT, etc.) a