{
  "id": "VPtrkEUaljkq5VtD",
  "meta": {
    "templateCredsSetupCompleted": true
  },
  "name": "Crawl Website Blog Content and Save to Google Sheets with Dumpling AI",
  "tags": [],
  "nodes": [
    {
      "id": "b9be7abb-d02c-4383-b18d-e40f77f833bf",
      "name": "Form Submission ",
      "type": "n8n-nodes-base.formTrigger",
      "position": [
        -112,
        0
      ],
      "parameters": {
        "options": {},
        "formTitle": "blog content strategy",
        "formFields": {
          "values": [
            {
              "fieldLabel": "Client URL",
              "requiredField": true
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "e4c0b49d-ec38-4357-ad7c-bb47145b0cd5",
      "name": "Create Blog Audit Sheet",
      "type": "n8n-nodes-base.googleSheets",
      "notes": "Creates the audit spreadsheet. The title is the first label of the submitted host name: anything after a breadcrumb separator is dropped, then the http(s):// scheme and a leading www. are removed before the host is split on dots (https://www.acme.com becomes acme).",
      "position": [
        112,
        0
      ],
      "parameters": {
        "title": "={{ $json[\"Client URL\"].trim().split(/\u203a|>|\u00bb/)[0].trim().replace(/^https?:\\/\\//i, \"\").replace(/^www\\./i, \"\").split(\".\")[0] }}",
        "options": {},
        "resource": "spreadsheet",
        "sheetsUi": {
          "sheetValues": [
            {
              "title": "Blog content audit"
            }
          ]
        }
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "name": "<your credential>"
        }
      },
      "typeVersion": 4.6
    },
    {
      "id": "1138d43b-14bb-4f9c-a6dc-f82cc965e5c4",
      "name": "Set Sheet Headers",
      "type": "n8n-nodes-base.set",
      "position": [
        336,
        0
      ],
      "parameters": {
        "values": {
          "string": [
            {
              "name": "rows",
              "value": "Url,Crawled_pages,website_content"
            }
          ]
        },
        "options": {}
      },
      "typeVersion": 1
    },
    {
      "id": "3630f64c-a9df-4c18-a668-822b5d0aed00",
      "name": "Format Header Row",
      "type": "n8n-nodes-base.code",
      "position": [
        560,
        0
      ],
      "parameters": {
        "jsCode": "return [\n  {\n    json: {\n      data: [ $json.rows.split(',') ]\n    }\n  }\n];\n\n"
      },
      "typeVersion": 2
    },
    {
      "id": "171ecc40-447d-4a64-adc8-f0d38d1d9cdf",
      "name": "Insert Headers into Sheet",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        784,
        0
      ],
      "parameters": {
        "url": "=https://sheets.googleapis.com/v4/spreadsheets/{{ $('Create Blog Audit Sheet').first().json.spreadsheetId }}/values/{{ $('Create Blog Audit Sheet').first().json.sheets[0].properties.title }}!A:Z",
        "method": "PUT",
        "options": {},
        "sendBody": true,
        "sendQuery": true,
        "authentication": "predefinedCredentialType",
        "bodyParameters": {
          "parameters": [
            {
              "name": "range",
              "value": "={{ $('Create Blog Audit Sheet').first().json.sheets[0].properties.title }}!A:Z"
            },
            {
              "name": "values",
              "value": "={{ $json.data }}"
            }
          ]
        },
        "queryParameters": {
          "parameters": [
            {
              "name": "valueInputOption",
              "value": "RAW"
            }
          ]
        },
        "nodeCredentialType": "googleSheetsOAuth2Api"
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "name": "<your credential>"
        }
      },
      "typeVersion": 4.1
    },
    {
      "id": "a1b24d76-a502-409a-b310-d4d3cc5e7c9a",
      "name": "Dumpling AI: Crawl Website",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        1008,
        0
      ],
      "parameters": {
        "url": "https://app.dumplingai.com/api/v1/crawl",
        "method": "POST",
        "options": {},
        "sendBody": true,
        "authentication": "genericCredentialType",
        "bodyParameters": {
          "parameters": [
            {
              "name": "url",
              "value": "={{ $('Form Submission ').item.json[\"Client URL\"] }}"
            },
            {
              "name": "limit",
              "value": "=10"
            }
          ]
        },
        "genericAuthType": "httpHeaderAuth"
      },
      "credentials": {
        "httpHeaderAuth": {
          "name": "<your credential>"
        }
      },
      "typeVersion": 4.2
    },
    {
      "id": "bf50d679-1f16-4811-9e20-99eaccd9fc44",
      "name": "Extract Blog URLs",
      "type": "n8n-nodes-base.code",
      "position": [
        1232,
        0
      ],
      "parameters": {
        "jsCode": "// Get all input items\nconst items = $input.all();\n\n// Serialise everything so URLs are found regardless of which field holds them\nconst fullDataString = JSON.stringify(items);\n\n// Find ALL absolute http(s) URLs in the content.\n// The backslash is excluded from the match on purpose: JSON.stringify writes\n// line breaks and tabs as backslash escapes, so without the exclusion a URL\n// followed by a line break would swallow the escape and the next word.\nconst urlPattern = /https?:\\/\\/[^\\s\\\"\\'<>\\(\\)\\[\\]\\\\]+/gi;\nconst allUrls = fullDataString.match(urlPattern) || [];\n\n// Clean and deduplicate URLs (trailing sentence punctuation is dropped too)\nconst uniqueUrls = [...new Set(allUrls.map(url =>\n  url.replace(/[\\(\\)\\[\\]\\\"\\'<>\\\\]/g, '').replace(/[.,;:!?]+$/, '').trim()\n))];\n\n// Function to determine if a URL might be a blog post\nfunction isPotentialBlogPost(url) {\n  // Common blog URL patterns\n  const blogPatterns = [\n    /\\/blog\\//i,\n    /\\/post\\//i,\n    /\\/posts\\//i,\n    /\\/article\\//i,\n    /\\/articles\\//i,\n    /\\/news\\//i,\n    /\\/insights\\//i,\n    /\\/stories\\//i,\n    /\\/resources\\/blog/i,\n    /\\/\\d{4}\\/\\d{2}\\//,\n    /\\/how-to-/i,\n    /\\/guide-to-/i,\n    /\\/tips/i,\n    /\\/free-printable-art/i\n  ];\n  \n  // Exclude patterns\n  const excludePatterns = [\n    /\\.(jpg|jpeg|png|gif|svg|css|js|pdf|zip)$/i,\n    /\\/wp-content\\//i,\n    /\\/cart\\//i,\n    /\\/checkout/i,\n    /\\/product\\//i,\n    /\\/shop\\//i,\n    /\\?add-to-cart=/i,\n    /\\/my-account/i,\n    /\\/collections?\\//i\n  ];\n  \n  // Check excludes first\n  for (const pattern of excludePatterns) {\n    if (pattern.test(url)) {\n      return false;\n    }\n  }\n  \n  // Check blog patterns\n  for (const pattern of blogPatterns) {\n    if (pattern.test(url)) {\n      return true;\n    }\n  }\n  \n  return false;\n}\n\n// Filter URLs\nconst blogUrls = uniqueUrls.filter(url => url.includes('/blog'));\nconst potentialBlogPosts = uniqueUrls.filter(isPotentialBlogPost);\n\n// Combine and deduplicate\nconst allBlogRelatedUrls = [...new Set([...blogUrls, ...potentialBlogPosts])];\n\n// Sort URLs\nallBlogRelatedUrls.sort();\n\n// Return results\nif (allBlogRelatedUrls.length > 0) {\n  return allBlogRelatedUrls.map(url => ({\n    json: {\n      blogUrl: url\n    }\n  }));\n} else {\n  return [{\n    json: {\n      message: \"No blog URLs found\",\n      totalUrlsChecked: uniqueUrls.length\n    }\n  }];\n}"
      },
      "typeVersion": 2
    },
    {
      "id": "cf4d1e28-6dd4-46f5-bdde-6882b9bc59d9",
      "name": "Dumpling AI: Scrape Blog Pages",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        1456,
        0
      ],
      "parameters": {
        "url": "https://app.dumplingai.com/api/v1/scrape",
        "method": "POST",
        "options": {},
        "sendBody": true,
        "authentication": "genericCredentialType",
        "bodyParameters": {
          "parameters": [
            {
              "name": "url",
              "value": "={{ $json.blogUrl }}"
            }
          ]
        },
        "genericAuthType": "httpHeaderAuth"
      },
      "credentials": {
        "httpHeaderAuth": {
          "name": "<your credential>"
        }
      },
      "typeVersion": 4.2
    },
    {
      "id": "d5135eaa-34f2-4d9e-afa9-5dd4a98b658b",
      "name": "Prepare Row Data",
      "type": "n8n-nodes-base.set",
      "position": [
        1680,
        0
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "ce53a00e-e3ff-49b8-a867-2b2005655946",
              "name": "Url",
              "type": "string",
              "value": "={{ $('Form Submission ').item.json[\"Client URL\"] }}"
            },
            {
              "id": "8d2cc248-a6da-44a2-a22a-865e13f9d15c",
              "name": "Crawled_pages",
              "type": "string",
              "value": "={{ $('Extract Blog URLs').item.json.blogUrl }}"
            },
            {
              "id": "4a3c75d6-fa9e-4cb5-84b0-3fb33f38bd45",
              "name": "website_content",
              "type": "string",
              "value": "={{ $json.content }}"
            }
          ]
        }
      },
      "typeVersion": 3.4
    },
    {
      "id": "7fb2c810-f438-48af-a075-29e514d6855c",
      "name": "Save Blog Data to Google Sheets",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        1904,
        0
      ],
      "parameters": {
        "columns": {
          "value": {},
          "schema": [
            {
              "id": "Url",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "Url",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Crawled_pages",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "Crawled_pages",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "website_content",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "website_content",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            }
          ],
          "mappingMode": "autoMapInputData",
          "matchingColumns": [],
          "attemptToConvertTypes": false,
          "convertFieldsToString": false
        },
        "options": {},
        "operation": "append",
        "sheetName": {
          "__rl": true,
          "mode": "id",
          "value": "={{ $('Create Blog Audit Sheet').item.json.sheets[0].properties.sheetId }}"
        },
        "documentId": {
          "__rl": true,
          "mode": "url",
          "value": "={{ $('Create Blog Audit Sheet').item.json.spreadsheetUrl }}"
        }
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "name": "<your credential>"
        }
      },
      "typeVersion": 4.7
    },
    {
      "id": "bbd3cb7c-b1a5-4919-8617-9050dabdde20",
      "name": "Sticky Note",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -128,
        -304
      ],
      "parameters": {
        "width": 880,
        "height": 448,
        "content": "## Workflow Overview\n\n1. **Trigger: Form Submission (Client URL)** \u2014 Starts the workflow when a client URL is entered.  \n2. **Create Blog Audit Sheet** \u2014 Creates a new Google Sheets spreadsheet for the audit, with a `Blog content audit` tab.  \n3. **Set Sheet Headers** \u2014 Defines the column headers (`Url`, `Crawled_pages`, `website_content`).  \n4. **Format Header Row** \u2014 Prepares the headers into the right format for Google Sheets.  \n5. **Insert Headers into Sheet** \u2014 Updates the sheet with the headers.  \n6. **Dumpling AI: Crawl Website** \u2014 Crawls the submitted URL to discover pages (the request sets `limit` to 10).  \n7. **Extract Blog URLs** \u2014 Filters the crawl results to keep only blog-related links.  \n8. **Dumpling AI: Scrape Blog Pages** \u2014 Scrapes the content from each blog page.  \n9. **Prepare Row Data** \u2014 Maps the submitted URL, the crawled page URL, and the scraped content into the `Url`, `Crawled_pages`, and `website_content` fields.  \n10. **Save Blog Data to Google Sheets** \u2014 Appends the results into the audit sheet for review.\n\n"
      },
      "typeVersion": 1
    }
  ],
  "active": false,
  "settings": {
    "executionOrder": "v1"
  },
  "versionId": "319cd2cf-cb58-48ba-80dd-88c67a42fa8f",
  "connections": {
    "Form Submission ": {
      "main": [
        [
          {
            "node": "Create Blog Audit Sheet",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Prepare Row Data": {
      "main": [
        [
          {
            "node": "Save Blog Data to Google Sheets",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract Blog URLs": {
      "main": [
        [
          {
            "node": "Dumpling AI: Scrape Blog Pages",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Format Header Row": {
      "main": [
        [
          {
            "node": "Insert Headers into Sheet",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Set Sheet Headers": {
      "main": [
        [
          {
            "node": "Format Header Row",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Create Blog Audit Sheet": {
      "main": [
        [
          {
            "node": "Set Sheet Headers",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Insert Headers into Sheet": {
      "main": [
        [
          {
            "node": "Dumpling AI: Crawl Website",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Dumpling AI: Crawl Website": {
      "main": [
        [
          {
            "node": "Extract Blog URLs",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Dumpling AI: Scrape Blog Pages": {
      "main": [
        [
          {
            "node": "Prepare Row Data",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}