{
  "id": "FLn2skSh92HNO2SS",
  "meta": {
    "templateCredsSetupCompleted": true
  },
  "name": "Airline Web Check-in Scraper with AI & Vector DB Storage using n8n",
  "tags": [],
  "nodes": [
    {
      "id": "ee7a49bf-a2dc-4d12-aef0-9add291a398c",
      "name": "Loop Over Items",
      "type": "n8n-nodes-base.splitInBatches",
      "position": [
        -220,
        175
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 3
    },
    {
      "id": "049cfbd5-bbc7-483c-964e-a32cdab1e6b8",
      "name": "Fetch Airline URLs\t",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        -440,
        175
      ],
      "parameters": {
        "options": {},
        "sheetName": {
          "__rl": true,
          "mode": "list",
          "value": 2125635496,
          "cachedResultUrl": "https://docs.google.com/spreadsheets/d/1ws8YonQyc32SveWQdfihYOW_OzOS-2REIrwSYS37oQ8/edit#gid=2125635496",
          "cachedResultName": "Sheet1"
        },
        "documentId": {
          "__rl": true,
          "mode": "list",
          "value": "1ws8YonQyc32SveWQdfihYOW_OzOS-2REIrwSYS37oQ8",
          "cachedResultUrl": "https://docs.google.com/spreadsheets/d/1ws8YonQyc32SveWQdfihYOW_OzOS-2REIrwSYS37oQ8/edit?usp=drivesdk",
          "cachedResultName": "airline_faq_urls"
        },
        "authentication": "serviceAccount"
      },
      "credentials": {
        "googleApi": {
          "name": "<your credential>"
        }
      },
      "typeVersion": 4.5
    },
    {
      "id": "7e2ca713-229f-490c-bd2e-481cf8f18184",
      "name": "Chat Trigger - Start\t",
      "type": "@n8n/n8n-nodes-langchain.chatTrigger",
      "position": [
        -660,
        175
      ],
      "parameters": {
        "public": true,
        "options": {}
      },
      "typeVersion": 1.1
    },
    {
      "id": "c11c66ea-3e36-4c12-a263-109d03d8be1a",
      "name": "Scrape Airline Webpage\t",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        0,
        0
      ],
      "parameters": {
        "url": "=https://r.jina.ai/{{ $json['WEB CHECK IN URL'] }}",
        "method": "POST",
        "options": {},
        "jsonHeaders": "{\n  \"Cookie\": \"cookie-keyname1=cookie-value1; cookie-keyname2=cookie-value2; cookie-keyname3=cookie-value3\"\n}\n",
        "sendHeaders": true,
        "authentication": "genericCredentialType",
        "specifyHeaders": "json",
        "genericAuthType": "httpHeaderAuth"
      },
      "credentials": {
        "httpHeaderAuth": {
          "name": "<your credential>"
        }
      },
      "typeVersion": 4.2
    },
    {
      "id": "27072e20-58dc-49e2-ae6b-1053750607f9",
      "name": "Extract Info with LLM\t",
      "type": "@n8n/n8n-nodes-langchain.chainLlm",
      "position": [
        220,
        0
      ],
      "parameters": {
        "text": "={{ $json.data }}",
        "messages": {
          "messageValues": [
            {
              "message": "=You are an intelligent parser trained to extract structured data from messy airline webpages.\n\nYour task is to extract and return well-structured airline check-in and policy details from raw text. Always return the result as a clean, valid JSON object using the exact schema described below.\n\n---\n\nExtraction Guidelines:\n\n* Ensure consistent JSON structure for every airline.\n* If a key has no value or content, **remove it** (do not return null, empty arrays, or empty objects).\n* Include any other useful data under `\"additional_info\"` if it doesn\u2019t fit existing keys.\n* Always extract **direct URLs** wherever available.\n* Your output should be compact, valid, and readable JSON.\n\n---\n\nJSON Structure Format:\n\n1. Web Check-in Details\n\n* \"web_checkin_available\": true/false\n* \"checkin_url\": \"<URL>\"\n* \"checkin_methods\": [\"Online\", \"Mobile App\", \"Kiosk\", \"Airport Counter\"]\n* \"checkin_start\": \"<Start Time>\"\n* \"checkin_deadline\": \"<Deadline Time>\"\n* \"boarding_pass_options\":\n\n  * \"mobile_boarding_pass_available\": true/false\n  * \"printed_boarding_pass_required\": true/false\n  * \"additional_checkin_info\": \"<Extra instructions>\"\n\n2. Customer Support\n\n* \"customer_support\":\n\n  * \"phone\": \"<Phone Number>\"\n  * \"email\": \"<Email>\"\n  * \"support_url\": \"<Support URL>\"\n  * \"chat_url\": \"<Chat URL>\"\n  * \"operating_hours\": \"<Hours>\"\n  * \"additional_help_channels\": [\"WhatsApp\", \"Twitter Support\", \"Chatbot\"]\n\n3. Baggage Allowance\n\n* \"baggage_allowance\":\n\n  * \"hand_baggage\":\n\n    * \"weight_limit\": \"<Weight Limit>\"\n    * \"size_limit\": \"<Size Limit>\"\n    * \"additional_items_allowed\": [\"Handbag\", \"Laptop\", \"Baby Items\", \"Medical Equipment\"]\n    * \"special_conditions\": \"<Any special baggage conditions>\"\n  * \"checked_baggage\":\n\n    * \"general_rules\": \"<Baggage Rules>\"\n    * \"class_specific_limits\": \"<Limits for different travel classes>\"\n    * \"baggage_calculator_url\": \"<URL>\"\n    * \"oversized_special_baggage\": \"<Details on sports/music equipment>\"\n\n4. Refund & Cancellation Policy\n\n* \"refund_policy\":\n\n  * \"conditions\": \"<Refund conditions>\"\n  * \"processing_time\": \"<Processing Time>\"\n  * \"refund_policy_url\": \"<URL>\"\n* \"cancellation_policy\":\n\n  * \"conditions\": \"<Cancellation conditions>\"\n  * \"fees_or_penalties\": \"<Cancellation Fees>\"\n  * \"cancellation_policy_url\": \"<URL>\"\n\n5. Airport & Travel Guidelines\n\n* \"airport_travel_guidelines\":\n\n  * \"security_immigration_rules\": \"<Security & Immigration Rules>\"\n  * \"airport_checkin_requirements\": \"<Check-in document requirements>\"\n  * \"special_services\": [\"Priority Boarding\", \"Lounge Access\", \"Wheelchair Assistance\"]\n\n6. Frequently Asked Questions (FAQ)\n\n* \"faq\":\n\n  * \"faq_url\": \"<FAQ Page URL>\"\n  * \"questions_answers\": [\n    {\"question\": \"<Q1>\", \"answer\": \"<A1>\"},\n    {\"question\": \"<Q2>\", \"answer\": \"<A2>\"}\n    ]\n\n7. Additional Details\n\n* \"additional_info\": \"<Any extra info from the page not captured above>\"\n\n---\n\nOutput Rules Summary:\n\n* Always return valid JSON.\n* Do **not** include empty fields or structures (null, {}, or []).\n* Place unrelated or extra info under \"additional_info\".\n\nUse this structure for every page, even if some values are missing. Just remove the missing fields completely in the output.\n\n\n"
            }
          ]
        },
        "promptType": "define"
      },
      "typeVersion": 1.5
    },
    {
      "id": "ba090b45-e6e8-434a-9577-51d281dd4a5b",
      "name": "Chat Model",
      "type": "@n8n/n8n-nodes-langchain.lmChatOllama",
      "position": [
        308,
        220
      ],
      "parameters": {
        "options": {}
      },
      "credentials": {
        "ollamaApi": {
          "name": "<your credential>"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "d557adab-856e-460e-aa81-f929a66ca465",
      "name": "Wait for Response",
      "type": "n8n-nodes-base.wait",
      "position": [
        580,
        0
      ],
      "parameters": {},
      "typeVersion": 1.1
    },
    {
      "id": "279b24fc-e1f3-4a1c-9c70-0177b13f32d8",
      "name": "Store Extracted Info\t",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        816,
        0
      ],
      "parameters": {
        "columns": {
          "value": {
            "row_number": "={{ $('Loop Over Items').item.json.row_number }}",
            "web check in details": "={{ $json.text.removeTags().trim().replace(/^```(?:json)?\\s*|\\s*```$/gi, '').trim() }}"
          },
          "schema": [
            {
              "id": "Airline",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "Airline",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "WEB CHECK IN URL",
              "type": "string",
              "display": true,
              "removed": true,
              "required": false,
              "displayName": "WEB CHECK IN URL",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "web check in details",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "web check in details",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "output",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "output",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "row_number",
              "type": "string",
              "display": true,
              "removed": false,
              "readOnly": true,
              "required": false,
              "displayName": "row_number",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            }
          ],
          "mappingMode": "defineBelow",
          "matchingColumns": [
            "row_number"
          ],
          "attemptToConvertTypes": false,
          "convertFieldsToString": false
        },
        "options": {},
        "operation": "update",
        "sheetName": {
          "__rl": true,
          "mode": "list",
          "value": 2125635496,
          "cachedResultUrl": "https://docs.google.com/spreadsheets/d/1ws8YonQyc32SveWQdfihYOW_OzOS-2REIrwSYS37oQ8/edit#gid=2125635496",
          "cachedResultName": "Sheet1"
        },
        "documentId": {
          "__rl": true,
          "mode": "list",
          "value": "1ws8YonQyc32SveWQdfihYOW_OzOS-2REIrwSYS37oQ8",
          "cachedResultUrl": "https://docs.google.com/spreadsheets/d/1ws8YonQyc32SveWQdfihYOW_OzOS-2REIrwSYS37oQ8/edit?usp=drivesdk",
          "cachedResultName": "airline_faq_urls"
        },
        "authentication": "serviceAccount"
      },
      "credentials": {
        "googleApi": {
          "name": "<your credential>"
        }
      },
      "typeVersion": 4.5
    },
    {
      "id": "866e9eca-68ad-419e-acf0-c28141bf7727",
      "name": "Generate Embeddings\t",
      "type": "@n8n/n8n-nodes-langchain.embeddingsOllama",
      "position": [
        1036,
        220
      ],
      "parameters": {},
      "credentials": {
        "ollamaApi": {
          "name": "<your credential>"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "56553cae-a61f-4b64-8709-06dbab314bce",
      "name": "Prepare Text for Vector DB\t",
      "type": "@n8n/n8n-nodes-langchain.documentDefaultDataLoader",
      "position": [
        1156,
        222.5
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 1
    },
    {
      "id": "82da65d6-9ecd-451a-b2f8-466795cd07a0",
      "name": "Split Long Text\t",
      "type": "@n8n/n8n-nodes-langchain.textSplitterTokenSplitter",
      "position": [
        1244,
        420
      ],
      "parameters": {
        "chunkSize": 10000
      },
      "typeVersion": 1
    },
    {
      "id": "a670a7c1-af95-452d-92e5-a82d5be2d0a5",
      "name": "Save to Vector DB\t",
      "type": "@n8n/n8n-nodes-langchain.vectorStorePGVector",
      "position": [
        1052,
        0
      ],
      "parameters": {
        "mode": "insert",
        "options": {
          "collection": {
            "values": {
              "useCollection": true
            }
          }
        }
      },
      "credentials": {
        "postgres": {
          "name": "<your credential>"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "7c4941f0-4dff-49d0-ac9b-901a23987686",
      "name": "Wait Before Next Batch\t",
      "type": "n8n-nodes-base.wait",
      "position": [
        1532,
        175
      ],
      "parameters": {
        "amount": 15
      },
      "typeVersion": 1.1
    },
    {
      "id": "0fef87ce-bfc3-4edd-aff8-8e10a0e7489a",
      "name": "Sticky Note",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -740,
        -860
      ],
      "parameters": {
        "width": 660,
        "height": 860,
        "content": "\n\n### \ud83d\udcdd Web Check-in Details Extractor (LLM Prompt Guide)\n\n#### \u2705 What is this?\n\nThis is a powerful AI prompt used inside the **\"Extract Info with LLM\"** node (a Basic LLM Chain). It tells the AI how to **extract structured airline web check-in data** (like check-in time, baggage policy, cancellation rules) from messy airline webpages.\n\n#### \ud83c\udfaf Why is it used?\n\nAirline websites often present data in unstructured formats. This LLM-based step:\n\n* Cleans the content scraped from airline URLs.\n* Extracts important travel-related info in a consistent JSON format.\n* Helps automate the enrichment of airline data stored in your Google Sheets and Vector DB.\n\n#### \ud83d\udee0\ufe0f How to use it?\n\n1. **Input**: This node receives raw webpage content from the airline\u2019s \"Web Check-in URL\".\n2. **Prompt**: It applies a fixed set of rules (in natural language) to guide the AI to convert the unstructured data into clean JSON.\n3. **Output**: The AI returns a **structured JSON** object with fields like:\n\n   * `checkin_url`\n   * `baggage_allowance`\n   * `refund_policy`\n   * `faq`\n   * `additional_info`\n4. The next nodes save this output to:\n\n   * Google Sheet (for visibility)\n   * PGVector (for semantic search)\n\n\ud83d\udca1 **Pro Tip:** This works best when the HTML content is readable and includes useful labels like \u201cCheck-in\u201d, \u201cCancellation\u201d, \u201cSupport\u201d, etc.\n\n\n"
      },
      "typeVersion": 1
    }
  ],
  "active": false,
  "settings": {
    "executionOrder": "v1"
  },
  "versionId": "b785e5c9-19bf-42c0-8c99-1659b1c2509b",
  "connections": {
    "Chat Model": {
      "ai_languageModel": [
        [
          {
            "node": "Extract Info with LLM\t",
            "type": "ai_languageModel",
            "index": 0
          }
        ]
      ]
    },
    "Loop Over Items": {
      "main": [
        [],
        [
          {
            "node": "Scrape Airline Webpage\t",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Split Long Text\t": {
      "ai_textSplitter": [
        [
          {
            "node": "Prepare Text for Vector DB\t",
            "type": "ai_textSplitter",
            "index": 0
          }
        ]
      ]
    },
    "Wait for Response": {
      "main": [
        [
          {
            "node": "Store Extracted Info\t",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Save to Vector DB\t": {
      "main": [
        [
          {
            "node": "Wait Before Next Batch\t",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Fetch Airline URLs\t": {
      "main": [
        [
          {
            "node": "Loop Over Items",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Generate Embeddings\t": {
      "ai_embedding": [
        [
          {
            "node": "Save to Vector DB\t",
            "type": "ai_embedding",
            "index": 0
          }
        ]
      ]
    },
    "Chat Trigger - Start\t": {
      "main": [
        [
          {
            "node": "Fetch Airline URLs\t",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Store Extracted Info\t": {
      "main": [
        [
          {
            "node": "Save to Vector DB\t",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract Info with LLM\t": {
      "main": [
        [
          {
            "node": "Wait for Response",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Scrape Airline Webpage\t": {
      "main": [
        [
          {
            "node": "Extract Info with LLM\t",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Wait Before Next Batch\t": {
      "main": [
        [
          {
            "node": "Loop Over Items",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Prepare Text for Vector DB\t": {
      "ai_document": [
        [
          {
            "node": "Save to Vector DB\t",
            "type": "ai_document",
            "index": 0
          }
        ]
      ]
    }
  }
}