{ "id": "kZ3aL4r7xc96Q7lp", "meta": { "instanceId": "b8b2c0d20b02864cf66adc9cbefc86e9e56de0252b653d37ba6613341b5e0bef", "templateCredsSetupCompleted": true }, "name": "Selenium Ultimate Scraper Workflow", "tags": [], "nodes": [ { "id": "20d35d68-db49-4183-a913-85ad06c13912", "name": "Extract First Url Match", "type": "n8n-nodes-base.html", "position": [ 1820, 540 ], "parameters": { "options": {}, "operation": "extractHtmlContent", "extractionValues": { "values": [ { "key": "Url Find ", "attribute": "href", "cssSelector": "=a[href*=\"https://\"][href*=\"{{ $('Edit Fields (For testing prupose )').item.json['Website Domaine'] }}\"]\n", "returnArray": true, "returnValue": "attribute" } ] } }, "typeVersion": 1.2 }, { "id": "9167ea20-fc9c-4d75-bf4d-bb2016079dd0", "name": "OpenAI Chat Model", "type": "@n8n/n8n-nodes-langchain.lmChatOpenAi", "position": [ 2060, 700 ], "parameters": { "model": "gpt-4o", "options": {} }, "credentials": { "openAiApi": { "id": "FmszNHDDVS32ud21", "name": "OpenAi account" } }, "typeVersion": 1 }, { "id": "42a8646d-1b0b-4309-a87d-9c8aeb355a28", "name": "Clean Webdriver ", "type": "n8n-nodes-base.httpRequest", "notes": "Script to delete traces of selenium in the browser ", "position": [ 3120, 560 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}/execute/sync", "method": "POST", "options": {}, "jsonBody": "{\n \"script\": \"Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); window.navigator.chrome = { runtime: {} }; Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });\",\n \"args\": []\n}\n", "sendBody": true, "specifyBody": "json" }, "notesInFlow": false, "typeVersion": 4.2 }, { "id": "107dd8de-e341-4819-a493-94ed57fd0f33", "name": "Delete Session", "type": "n8n-nodes-base.httpRequest", "position": [ 5180, 920 ], "parameters": { "url": 
"=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}", "method": "DELETE", "options": {} }, "typeVersion": 4.2 }, { "id": "8c7ec6bc-d417-48c2-a6f2-ecce27803671", "name": "Delete Session2", "type": "n8n-nodes-base.httpRequest", "position": [ 6740, -160 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}", "method": "DELETE", "options": {} }, "typeVersion": 4.2 }, { "id": "e43ecd94-b7f2-4f73-a9fa-b829de9e0296", "name": "If Block1", "type": "n8n-nodes-base.if", "position": [ 6520, -20 ], "parameters": { "options": {}, "conditions": { "options": { "version": 2, "leftValue": "", "caseSensitive": true, "typeValidation": "strict" }, "combinator": "and", "conditions": [ { "id": "e6e6e15d-1cfe-48be-8ea0-f112e9781c9d", "operator": { "name": "filter.operator.equals", "type": "string", "operation": "equals" }, "leftValue": "={{ $json.content }}", "rightValue": "BLOCK" } ] } }, "typeVersion": 2.2 }, { "id": "08e46f63-41b5-4606-8f2c-df9e96c9c34e", "name": "Delete Session3", "type": "n8n-nodes-base.httpRequest", "position": [ 6740, 60 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}", "method": "DELETE", "options": {} }, "typeVersion": 4.2 }, { "id": "b47d9b22-9a59-4c7a-8cba-9487f18207ee", "name": "Limit", "type": "n8n-nodes-base.limit", "position": [ 5120, -100 ], "parameters": {}, "typeVersion": 1 }, { "id": "541622f7-562b-4e8a-93e5-61e6e918ff52", "name": "Delete Session1", "type": "n8n-nodes-base.httpRequest", "position": [ 5180, 720 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}", "method": "DELETE", "options": {} }, "typeVersion": 4.2 }, { "id": "825be0d7-9dd3-4a2f-8c3d-fd405f59a5d6", "name": "Delete Session4", "type": "n8n-nodes-base.httpRequest", "onError": 
"continueRegularOutput", "position": [ 5780, 260 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}", "method": "DELETE", "options": {} }, "retryOnFail": false, "typeVersion": 4.2 }, { "id": "56f6f4f6-f737-4de8-bdfe-029546909677", "name": "Success with cookie", "type": "n8n-nodes-base.respondToWebhook", "position": [ 7260, 60 ], "parameters": { "options": { "responseCode": 200 } }, "typeVersion": 1.1 }, { "id": "c6939773-e230-45e1-bf76-d0299c2c7066", "name": "Respond to Webhook2", "type": "n8n-nodes-base.respondToWebhook", "position": [ 6920, -160 ], "parameters": { "options": { "responseCode": 200 }, "respondWith": "json", "responseBody": "{\n \"Success \": \"Request has been block by the targeted website\"\n}" }, "typeVersion": 1.1 }, { "id": "ea921f11-323f-4c79-8cc6-779b39498b05", "name": "Code", "type": "n8n-nodes-base.code", "position": [ 4700, -100 ], "parameters": { "jsCode": "// Récupère les données du nœud Webhook (en remplaçant \"Webhook\" par le nom du nœud Webhook dans votre workflow)\nconst webhookData = $node[\"Webhook\"].json;\n\n// Fonction pour convertir la valeur de sameSite\nfunction convertSameSite(value) {\n // Conversion spécifique des valeurs de sameSite\n const conversionMap = {\n \"unspecified\": \"None\",\n \"lax\": \"Lax\",\n \"strict\": \"Strict\"\n };\n \n // Si la valeur existe dans le tableau de conversion, on la convertit\n if (value in conversionMap) {\n return conversionMap[value];\n }\n \n // Si la valeur est déjà une des valeurs acceptées par Selenium\n const allowedValues = [\"Strict\", \"Lax\", \"None\"];\n if (allowedValues.includes(value)) {\n return value;\n } else {\n // Si la valeur n'est pas reconnue, on la remplace par \"Lax\" (par défaut)\n return \"Lax\";\n }\n}\n\n// Vérifiez et traitez les données des cookies\nif (webhookData.body && webhookData.body.cookies) {\n let items = [];\n for (const cookieObject of webhookData.body.cookies) {\n 
if (cookieObject.cookie) {\n // Convertir la valeur de sameSite\n cookieObject.cookie.sameSite = convertSameSite(cookieObject.cookie.sameSite);\n \n // Ajouter le cookie à la liste des items\n items.push({\n json: cookieObject.cookie\n });\n }\n }\n return items;\n}\n\n// Si les cookies ne sont pas trouvés, renvoyer un tableau vide\nreturn [];\n" }, "typeVersion": 2 }, { "id": "c3d77928-eefc-4903-9b4f-b14bd6f34e3c", "name": "Delete Session5", "type": "n8n-nodes-base.httpRequest", "onError": "continueRegularOutput", "position": [ 3940, 360 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}", "method": "DELETE", "options": {} }, "retryOnFail": false, "typeVersion": 4.2 }, { "id": "036cfce6-8082-4539-bb0e-980368679fe5", "name": "Error", "type": "n8n-nodes-base.respondToWebhook", "position": [ 4120, 360 ], "parameters": { "options": { "responseCode": 404 }, "respondWith": "json", "responseBody": "{\n \"Error\": \"Cookies are note for the targeted url\"\n}" }, "typeVersion": 1.1 }, { "id": "09d6a99b-d8b3-40c9-b74a-14014e3647e2", "name": "Error1", "type": "n8n-nodes-base.respondToWebhook", "position": [ 6000, 260 ], "parameters": { "options": { "responseCode": 500 } }, "typeVersion": 1.1 }, { "id": "0b1f3442-6b70-405f-b597-642e9c982b82", "name": "Error2", "type": "n8n-nodes-base.respondToWebhook", "position": [ 3060, 780 ], "parameters": { "options": { "responseCode": 500 } }, "typeVersion": 1.1 }, { "id": "4d0112bb-cbfd-45c6-961a-964bd8f59cac", "name": "If", "type": "n8n-nodes-base.if", "position": [ 3760, 200 ], "parameters": { "options": {}, "conditions": { "options": { "version": 2, "leftValue": "", "caseSensitive": true, "typeValidation": "strict" }, "combinator": "and", "conditions": [ { "id": "1bffbc80-9913-46e7-a594-ebc26948c83b", "operator": { "type": "string", "operation": "contains" }, "leftValue": "={{ $('Webhook').item.json.body.cookies[0].cookie.domain }}", "rightValue": "={{ 
$('Webhook').item.json.body.Url }}" } ] } }, "typeVersion": 2.2 }, { "id": "58a50b80-df4c-4b6f-a682-72237f4dbdef", "name": "Inject Cookie", "type": "n8n-nodes-base.httpRequest", "onError": "continueRegularOutput", "position": [ 4900, -100 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}/cookie", "method": "POST", "options": {}, "jsonBody": "={\n \"cookie\": {\n \"name\": \"{{ $json.name }}\",\n \"value\": \"{{ $json.value }}\",\n \"domain\": \"{{ $json.domain }}\",\n \"path\": \"{{ $json.path }}\",\n \"secure\": {{ $json.secure }},\n \"httpOnly\": {{ $json.httpOnly }},\n \"sameSite\": \"{{ $json.sameSite }}\",\n \"expirationDate\": {{ $json.expirationDate }}\n }\n}", "sendBody": true, "specifyBody": "json" }, "typeVersion": 4.2 }, { "id": "39f7401b-b6b7-4f0c-9afc-8f144d394350", "name": "Respond to Webhook3", "type": "n8n-nodes-base.respondToWebhook", "position": [ 5400, 720 ], "parameters": { "options": { "responseCode": 200 }, "respondWith": "json", "responseBody": "{\n \"Success \": \"Request has been block by the targeted website\"\n}" }, "typeVersion": 1.1 }, { "id": "80b107cc-2f6c-46f0-a597-e85594634492", "name": "Success", "type": "n8n-nodes-base.respondToWebhook", "position": [ 5740, 920 ], "parameters": { "options": { "responseKey": "={{ $json.output }}", "responseCode": 200 } }, "typeVersion": 1.1 }, { "id": "94a97354-07d9-428e-989c-ef066f9b4d8a", "name": "Go on url", "type": "n8n-nodes-base.httpRequest", "onError": "continueErrorOutput", "position": [ 3900, 780 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}/url", "method": "POST", "options": {}, "jsonBody": "={\n \"url\": \"{{ $('Webhook').item.json.body['Target Url'] }}\"\n}\n", "sendBody": true, "specifyBody": "json" }, "retryOnFail": true, "typeVersion": 4.2 }, { "id": "fd044cf3-594d-48af-bbd1-f2d9adedcbc1", "name": "Delete Session6", 
"type": "n8n-nodes-base.httpRequest", "onError": "continueRegularOutput", "position": [ 4360, 1200 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}", "method": "DELETE", "options": {} }, "retryOnFail": false, "typeVersion": 4.2 }, { "id": "7c28c3b6-1141-4609-8774-cb6b4d842b97", "name": "Error3", "type": "n8n-nodes-base.respondToWebhook", "position": [ 4520, 1200 ], "parameters": { "options": { "responseCode": 500 }, "respondWith": "json", "responseBody": "{\n \"Error\": \"Page crash on the extracted url\"\n}" }, "typeVersion": 1.1 }, { "id": "52f78923-156f-4861-88ba-f0253c483bd9", "name": "Information Extractor", "type": "@n8n/n8n-nodes-langchain.informationExtractor", "position": [ 2040, 540 ], "parameters": { "text": "={{ $json['Url Find '][1] }}{{ $json['Url Find '][2] }}{{ $json['Url Find '][3] }}", "options": { "systemPromptTemplate": "=You are an expert extraction algorithm.\nOnly extract relevant url from the unstructured urls array.\nA relevant url is a url where you can find relevant information about this subject : {{ $('Edit Fields (For testing prupose )').item.json.Subject }}, on this domain name : {{ $('Edit Fields (For testing prupose )').item.json['Website Domaine'] }}.\nIf you do not know the value of an attribute asked to extract, set the attribute's value to NA."
}, "attributes": { "attributes": [ { "name": "Good_url_for_etract_information", "required": true, "description": "=The url where I can extract relevant infroamtion on this subject : {{ $('Edit Fields (For testing prupose )').item.json.Subject }} on this domaine name : {{ $('Edit Fields (For testing prupose )').item.json['Website Domaine'] }}" } ] } }, "typeVersion": 1 }, { "id": "6ac249e2-a9d8-4590-b050-3a0a2472fa3c", "name": "Check if empty of NA", "type": "n8n-nodes-base.if", "position": [ 2440, 540 ], "parameters": { "options": {}, "conditions": { "options": { "version": 2, "leftValue": "", "caseSensitive": true, "typeValidation": "strict" }, "combinator": "or", "conditions": [ { "id": "9470fb6c-e367-4af7-a697-275e724fe771", "operator": { "type": "string", "operation": "empty", "singleValue": true }, "leftValue": "={{ $json.output.Good_url_for_etract_information }}", "rightValue": "" }, { "id": "8518e9a9-5b0c-4699-97c5-d9b7b1943918", "operator": { "name": "filter.operator.equals", "type": "string", "operation": "equals" }, "leftValue": "={{ $json.output.Good_url_for_etract_information }}", "rightValue": "NA" } ] } }, "typeVersion": 2.2 }, { "id": "f380eff7-3d18-4791-9dac-8a88d3fdcc4f", "name": "If Block", "type": "n8n-nodes-base.if", "position": [ 4960, 840 ], "parameters": { "options": {}, "conditions": { "options": { "version": 2, "leftValue": "", "caseSensitive": true, "typeValidation": "strict" }, "combinator": "and", "conditions": [ { "id": "e6e6e15d-1cfe-48be-8ea0-f112e9781c9d", "operator": { "type": "string", "operation": "contains" }, "leftValue": "={{ $json.content }}", "rightValue": "BLOCK" } ] } }, "typeVersion": 2.2 }, { "id": "43382397-89b5-4b90-9016-49109ec04baf", "name": "Google search Query ", "type": "n8n-nodes-base.httpRequest", "position": [ 1600, 540 ], "parameters": { "url": "=https://www.google.com/search?q=site:{{ $json['Website Domaine'] 
}}+{{$json.Subject}}&oq=site&gs_lcrp=EgZjaHJvbWUqCAgAEEUYJxg7MggIABBFGCcYOzIICAEQRRgnGDsyBggCEEUYOzIRCAMQRRg5GEMYyQMYgAQYigUyBggEEEUYQDIGCAUQRRg9MgYIBhBFGD0yBggHEEUYPdIBCDEwNTRqMGo3qAIAsAIA&sourceid=chrome&ie=UTF-8", "options": {} }, "typeVersion": 4.2 }, { "id": "d34256af-1b43-4f64-853c-cf063b8c6b68", "name": "Create Selenium Session", "type": "n8n-nodes-base.httpRequest", "onError": "continueErrorOutput", "position": [ 2680, 640 ], "parameters": { "url": "http://selenium_chrome:4444/wd/hub/session", "method": "POST", "options": { "timeout": 5000 }, "jsonBody": "{\n \"capabilities\": {\n \"alwaysMatch\": {\n \"browserName\": \"chrome\",\n \"goog:chromeOptions\": {\n \"args\": [ \n \"--disable-blink-features=AutomationControlled\",\n \"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3\"\n ]\n }\n }\n }\n}\n", "sendBody": true, "specifyBody": "json" }, "retryOnFail": true, "typeVersion": 4.2 }, { "id": "4f0f696c-9637-4c7d-82ae-1f5c36bb9cd1", "name": "Get ScreenShot 1", "type": "n8n-nodes-base.httpRequest", "onError": "continueErrorOutput", "position": [ 4420, 840 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}/screenshot", "options": {} }, "typeVersion": 4.2 }, { "id": "ba72c0cf-217a-4411-80f6-ca28ccdb0151", "name": "Refresh browser", "type": "n8n-nodes-base.httpRequest", "onError": "continueErrorOutput", "position": [ 5320, -100 ], "parameters": { "url": "=http:///selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}/refresh", "method": "POST", "options": {}, "jsonBody": "{}", "sendBody": true, "specifyBody": "json" }, "typeVersion": 4.2 }, { "id": "b6ba7068-399a-467d-ba58-7f47d650e2f1", "name": "Get ScreenShot ", "type": "n8n-nodes-base.httpRequest", "onError": "continueErrorOutput", "position": [ 5880, -20 ], "parameters": { "url": 
"=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}/screenshot", "options": {} }, "typeVersion": 4.2 }, { "id": "792649be-0ee2-442f-bc21-d0c297cea227", "name": "Convert to File", "type": "n8n-nodes-base.convertToFile", "onError": "continueErrorOutput", "position": [ 6160, -20 ], "parameters": { "options": {}, "operation": "toBinary", "sourceProperty": "value" }, "typeVersion": 1.1 }, { "id": "49e58759-bedf-4f38-a96c-bd18e67b8aaf", "name": "Convert to File1", "type": "n8n-nodes-base.convertToFile", "onError": "continueErrorOutput", "position": [ 4600, 840 ], "parameters": { "options": {}, "operation": "toBinary", "sourceProperty": "value" }, "typeVersion": 1.1 }, { "id": "3735f5f5-665e-4649-b1c2-84a4a8699f70", "name": "Delete Session7", "type": "n8n-nodes-base.httpRequest", "onError": "continueRegularOutput", "position": [ 2920, 780 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}", "method": "DELETE", "options": {} }, "retryOnFail": false, "typeVersion": 4.2 }, { "id": "1b8b1e0c-f465-4963-869c-0e7086922151", "name": "Sticky Note", "type": "n8n-nodes-base.stickyNote", "position": [ 920, -1023.3944834469928 ], "parameters": { "color": 4, "width": 851.2111300888805, "height": 1333.3079943516484, "content": "## N8N Ultimate Scraper - Workflow\n\nThis workflow's objective is to collect data from any website page, whether it requires login or not.\n\nFor example, you can collect the number of stars of the n8n-ultimate-scraper project on GitHub.\n\n## Requirements\n**Selenium Container**: Selenium is an open-source automation framework for web applications, enabling browser control and interaction through scripts in various programming languages.\nYou can deploy the Docker Compose file from the associated GitHub project to set up your Selenium container and configuration: https://github.com/Touxan/n8n-ultimate-scraper\n\n**Residential 
Proxy Server**: To scrape data at scale without being blocked, I personally recommend GeoNode. They offer affordable, high-quality residential proxies: https://geonode.com/invite/98895\n\n**OpenAI API Key**: For using GPT-4.\n\n## Optional\nSession Cookies Collection: To use login functionality with the n8n Ultimate Scraper, you need to collect session cookies from the target website. You can do this using the extension created for this application in the GitHub project: https://github.com/Touxan/n8n-ultimate-scraper. Follow the installation procedure to use it.\n\n## How to use \nDeploy the project with all the requirements and request your webhook.\n\n**Example of request**:\ncurl -X POST http://localhost:5678/webhook-test/yourwebhookid \\\n-H \"Content-Type: application/json\" \\\n-d '{\n \"subject\": \"Hugging Face\",\n \"Url\": \"github.com\",\n \"Target data\": [\n {\n \"DataName\": \"Followers\",\n \"description\": \"The number of followers of the GitHub page\"\n },\n {\n \"DataName\": \"Total Stars\",\n \"description\": \"The total numbers of stars on the different repos\"\n }\n ],\n \"cookies\": []\n}'\n\nYou can also scrape a link like this: \ncurl -X POST http://localhost:5678/webhook-test/67d77918-2d5b-48c1-ae73-2004b32125f0 \\\n-H \"Content-Type: application/json\" \\\n-d '{\n \"Target Url\": \"https://github.com\",\n \"Target data\": [\n {\n \"DataName\": \"Followers\",\n \"description\": \"The number of followers of the GitHub page\"\n },\n {\n \"DataName\": \"Total Stars\",\n \"description\": \"The total numbers of stars on the different repo\"\n }\n]\n}'\n\n**Note**\nThe maximum number of Target data is 5."
}, "typeVersion": 1 }, { "id": "4d743518-4fcb-4e9f-aff7-a8959a78ccaf", "name": "Edit Fields (For testing prupose )", "type": "n8n-nodes-base.set", "position": [ 1160, 540 ], "parameters": { "options": {}, "assignments": { "assignments": [ { "id": "3895040f-0a21-47ee-a73f-d3c7fd6edf36", "name": "Subject", "type": "string", "value": "={{ $json.body.subject }}" }, { "id": "304e4240-513f-4c87-ae9d-4efda7d0c4ab", "name": "Website Domaine", "type": "string", "value": "={{ $json.body.Url }}" } ] } }, "typeVersion": 3.4 }, { "id": "62b0a416-71a2-4d2b-83f9-8c5465c72006", "name": "Get ScreenShot 2", "type": "n8n-nodes-base.httpRequest", "onError": "continueErrorOutput", "position": [ 6200, 851 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}/screenshot", "options": {} }, "typeVersion": 4.2 }, { "id": "6a5b1a08-c47a-435e-8e0b-648cb8282a90", "name": "Convert to File2", "type": "n8n-nodes-base.convertToFile", "onError": "continueErrorOutput", "position": [ 6440, 851 ], "parameters": { "options": {}, "operation": "toBinary", "sourceProperty": "value" }, "typeVersion": 1.1 }, { "id": "a2aa5d45-5f41-41f7-a8ee-07c145b73d89", "name": "Go on ip-api.com", "type": "n8n-nodes-base.httpRequest", "onError": "continueErrorOutput", "position": [ 5960, 851 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}/url", "method": "POST", "options": {}, "jsonBody": "={\n \"url\": \"https://ip-api.com/\"\n}\n", "sendBody": true, "specifyBody": "json" }, "retryOnFail": true, "typeVersion": 4.2 }, { "id": "8ddde1d2-0b09-45ca-88ef-db24352b095e", "name": "Delete Session8", "type": "n8n-nodes-base.httpRequest", "onError": "continueRegularOutput", "position": [ 6440, 1071 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}", "method": "DELETE", "options": {} }, 
"retryOnFail": false, "typeVersion": 4.2 }, { "id": "78ffd8e1-b4b8-444c-8a7d-410172d3a7f8", "name": "Sticky Note1", "type": "n8n-nodes-base.stickyNote", "position": [ 5920, 727 ], "parameters": { "color": 6, "width": 784.9798841202522, "height": 520.0741248156677, "content": "## Debug IP\n\nThis small debug flow aims to check the IP you're requesting with, in case you're using a proxy" }, "typeVersion": 1 }, { "id": "be5de434-5f07-40bc-a1e6-aece9ad211b4", "name": "Sticky Note2", "type": "n8n-nodes-base.stickyNote", "position": [ 1580, 420 ], "parameters": { "width": 751.8596006980003, "height": 430.433007240277, "content": "## Search\n\n**Description** :\nThis part aims to search on Google for the subject and find the URL of the subject page based on the input URL." }, "typeVersion": 1 }, { "id": "ffbb3c92-245b-4635-9adf-17d24f236bff", "name": "Error can't find url", "type": "n8n-nodes-base.respondToWebhook", "position": [ 2800, 280 ], "parameters": { "options": { "responseCode": 404 }, "respondWith": "json", "responseBody": "{\n \"Error\": \"Can't find url\"\n}" }, "typeVersion": 1.1 }, { "id": "088ad72c-907a-409a-9fa4-00a16d396e1b", "name": "Sticky Note3", "type": "n8n-nodes-base.stickyNote", "position": [ 2420, 420 ], "parameters": { "width": 827.9448220213314, "height": 502.0185388323068, "content": "## Selenium Session\n\n**Description**:\nCreation and configuration of the Selenium session." 
}, "typeVersion": 1 }, { "id": "00b8bf19-b34e-42ed-bb2a-3fbfa5f02a25", "name": "Resize browser window", "type": "n8n-nodes-base.httpRequest", "position": [ 2920, 560 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $json.value.sessionId }}/window/rect", "method": "POST", "options": {}, "jsonBody": "{\n \"width\": 1920,\n \"height\": 1080,\n \"x\": 0,\n \"y\": 0\n}\n", "sendBody": true, "specifyBody": "json" }, "typeVersion": 4.2 }, { "id": "007354a1-3f00-4ae9-ab53-54ded5eed563", "name": "Sticky Note4", "type": "n8n-nodes-base.stickyNote", "position": [ 3500, -300 ], "parameters": { "width": 3939.555135735299, "height": 821.0847869745435, "content": "## Scrape with cookies session\n\n**Description**\nThis part goes to the extracted URL, injects the cookies passed into the webhook, takes a screenshot of the webpage, and analyzes the image with GPT to extract the targeted data." }, "typeVersion": 1 }, { "id": "5ab44e1b-6878-4af5-bfd8-1f1e5cbee3a7", "name": "Sticky Note5", "type": "n8n-nodes-base.stickyNote", "position": [ 3500, 580 ], "parameters": { "width": 3336.952424000919, "height": 821.0847869745435, "content": "## Scrape without cookies session\n\n**Description**\nSame as the 'Scrape with cookies session' flow, but without the cookie injection" }, "typeVersion": 1 }, { "id": "4fc7e290-0c60-4efe-ac3f-eb71ce5e457b", "name": "OpenAI", "type": "@n8n/n8n-nodes-langchain.openAi", "position": [ 6340, -20 ], "parameters": { "text": "=Analyse this image and extract revlant infromation about this subject : {{ $('Webhook').item.json.body.subject }}. 
\n\nIf the webpage seems to be blocked by a WAF, or doesn't have any relevant information about the subject, return BLOCK without any additional information.", "modelId": { "__rl": true, "mode": "list", "value": "gpt-4o", "cachedResultName": "GPT-4O" }, "options": { "detail": "auto", "maxTokens": 300 }, "resource": "image", "inputType": "base64", "operation": "analyze" }, "credentials": { "openAiApi": { "id": "FmszNHDDVS32ud21", "name": "OpenAi account" } }, "typeVersion": 1.5 }, { "id": "b039ed2a-94da-4a37-b794-7fb1721a8ab3", "name": "OpenAI1", "type": "@n8n/n8n-nodes-langchain.openAi", "onError": "continueErrorOutput", "position": [ 4780, 840 ], "parameters": { "text": "=Analyse this image and extract relevant information about this subject : {{ $('Webhook').item.json.body.subject }}. \n\nIf the webpage seems to be blocked by a WAF, or doesn't have any relevant information about the subject, return BLOCK without any additional information.", "modelId": { "__rl": true, "mode": "list", "value": "gpt-4o", "cachedResultName": "GPT-4O" }, "options": { "detail": "auto", "maxTokens": 300 }, "resource": "image", "inputType": "base64", "operation": "analyze" }, "credentials": { "openAiApi": { "id": "FmszNHDDVS32ud21", "name": "OpenAi account" } }, "typeVersion": 1.5 }, { "id": "c69364ce-c7e3-4f7a-ae0c-bad97643da30", "name": "Information Extractor1", "type": "@n8n/n8n-nodes-langchain.informationExtractor", "position": [ 5400, 920 ], "parameters": { "text": "={{ $('OpenAI1').item.json.content }}", "options": { "systemPromptTemplate": "You are an expert extraction algorithm.\nOnly extract relevant information from the text.\nIf you do not know the value of an attribute asked to extract, set the attribute's value to NA."
}, "attributes": { "attributes": [ { "name": "={{ $('Webhook').item.json.body['Target data'][0].DataName }}", "description": "={{ $('Webhook').item.json.body['Target data'][0].description }}" }, { "name": "={{ $('Webhook').item.json.body['Target data'][1].DataName }}", "description": "=The total number of stars on all project" }, { "name": "={{ $('Webhook').item.json.body['Target data'][2].DataName }}", "description": "={{ $('Webhook').item.json.body['Target data'][2].description }}" }, { "name": "={{ $('Webhook').item.json.body['Target data'][3].DataName }}", "description": "={{ $('Webhook').item.json.body['Target data'][3].description }}" }, { "name": "={{ $('Webhook').item.json.body['Target data'][4].DataName }}", "description": "={{ $('Webhook').item.json.body['Target data'][4].description }}" } ] } }, "typeVersion": 1 }, { "id": "0e756adb-a6ba-421f-9d21-374e7fa74781", "name": "OpenAI Chat Model1", "type": "@n8n/n8n-nodes-langchain.lmChatOpenAi", "position": [ 5400, 1140 ], "parameters": { "model": "gpt-4o-mini", "options": {} }, "credentials": { "openAiApi": { "id": "FmszNHDDVS32ud21", "name": "OpenAi account" } }, "typeVersion": 1 }, { "id": "920e9315-7de4-4a23-adbe-36338ea18097", "name": "Information Extractor2", "type": "@n8n/n8n-nodes-langchain.informationExtractor", "position": [ 6920, 60 ], "parameters": { "text": "={{ $('OpenAI').item.json.content }}", "options": { "systemPromptTemplate": "You are an expert extraction algorithm.\nOnly extract relevant information from the text.\nIf you do not know the value of an attribute asked to extract, set the attribute's value to NA. If the attribute is empty you can omit it." 
}, "attributes": { "attributes": [ { "name": "={{ $('Webhook').item.json.body['Target data'][0].DataName }}", "description": "={{ $('Webhook').item.json.body['Target data'][0].description }}" }, { "name": "={{ $('Webhook').item.json.body['Target data'][1].DataName }}", "description": "=The total number of stars on all project" }, { "name": "={{ $('Webhook').item.json.body['Target data'][2].DataName }}", "description": "={{ $('Webhook').item.json.body['Target data'][2].description }}" }, { "name": "={{ $('Webhook').item.json.body['Target data'][3].DataName }}", "description": "={{ $('Webhook').item.json.body['Target data'][3].description }}" }, { "name": "={{ $('Webhook').item.json.body['Target data'][4].DataName }}", "description": "={{ $('Webhook').item.json.body['Target data'][4].description }}" } ] } }, "typeVersion": 1 }, { "id": "aa98d16e-d20c-4a8f-8eaf-1f64751dd8ea", "name": "OpenAI Chat Model2", "type": "@n8n/n8n-nodes-langchain.lmChatOpenAi", "position": [ 6940, 220 ], "parameters": { "model": "gpt-4o-mini", "options": {} }, "credentials": { "openAiApi": { "id": "FmszNHDDVS32ud21", "name": "OpenAi account" } }, "typeVersion": 1 }, { "id": "ba41b87e-feb7-4753-95b3-d569d54d8756", "name": "Sticky Note6", "type": "n8n-nodes-base.stickyNote", "position": [ 1820, -680 ], "parameters": { "color": 3, "width": 813.0685668942513, "height": 507.4126722815008, "content": "## Proxy\n\n**Configuration**\n\nTo configure your proxy with the project, follow the instructions on the GitHub project: https://github.com/Touxan/n8n-ultimate-scraper. To configure the docker-compose, you also need to add this argument to the 'Create Selenium Session' node : --proxy-server=address:port.\n\n### ⚠️Warning⚠️\n Selenium does not support proxy authentication, so you need to add your server IP to the proxy whitelist. On GeoNode, it's here: https://app.geonode.com/whitelist-ip!" 
}, "typeVersion": 1 }, { "id": "194bbecc-a5b3-4c5f-a17f-94703a44f196", "name": "Webhook", "type": "n8n-nodes-base.webhook", "position": [ 940, 540 ], "webhookId": "67d77918-2d5b-48c1-ae73-2004b32125f0", "parameters": { "path": "67d77918-2d5b-48c1-ae73-2004b32125f0", "options": {}, "httpMethod": "POST", "responseMode": "responseNode" }, "typeVersion": 2 }, { "id": "513389b0-0930-48d8-8cbb-e3575a0276ae", "name": "If Target Url", "type": "n8n-nodes-base.if", "position": [ 1380, 620 ], "parameters": { "options": {}, "conditions": { "options": { "version": 2, "leftValue": "", "caseSensitive": true, "typeValidation": "strict" }, "combinator": "and", "conditions": [ { "id": "4b608dcd-a175-4019-82c2-560320a2abce", "operator": { "type": "string", "operation": "empty", "singleValue": true }, "leftValue": "={{ $('Webhook').item.json.body['Target Url'] }}", "rightValue": "" } ] } }, "typeVersion": 2.2 }, { "id": "4ca0aee7-0dd2-4c78-b99b-8c188a3917f4", "name": "If1", "type": "n8n-nodes-base.if", "position": [ 3700, 900 ], "parameters": { "options": {}, "conditions": { "options": { "version": 2, "leftValue": "", "caseSensitive": true, "typeValidation": "strict" }, "combinator": "and", "conditions": [ { "id": "ff919945-b8c2-492a-b496-8617e9147389", "operator": { "type": "string", "operation": "notEmpty", "singleValue": true }, "leftValue": "={{ $('Webhook').item.json.body['Target Url'] }}", "rightValue": "" } ] } }, "typeVersion": 2.2 }, { "id": "baa4dc94-67f3-4683-b8c7-6b6e856e7c64", "name": "Go on url1", "type": "n8n-nodes-base.httpRequest", "onError": "continueErrorOutput", "position": [ 3900, 960 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}/url", "method": "POST", "options": {}, "jsonBody": "={\n \"url\": \"{{ $('Information Extractor').item.json.output.Good_url_for_etract_information }}\"\n}\n", "sendBody": true, "specifyBody": "json" }, "retryOnFail": true, "typeVersion": 4.2 }, { "id": 
"2c439b0e-7c78-4ae8-b653-3f02b3834aa8", "name": "If2", "type": "n8n-nodes-base.if", "position": [ 3340, 560 ], "parameters": { "options": {}, "conditions": { "options": { "version": 2, "leftValue": "", "caseSensitive": true, "typeValidation": "loose" }, "combinator": "and", "conditions": [ { "id": "2a1bfc1e-28a6-45d1-9581-53b632af90e0", "operator": { "type": "string", "operation": "notEmpty", "singleValue": true }, "leftValue": "={{ $('Webhook').item.json.body.cookies }}", "rightValue": "" } ] }, "looseTypeValidation": true }, "typeVersion": 2.2 }, { "id": "fc3260da-9131-4850-a581-55a27ce4428d", "name": "Go on url2", "type": "n8n-nodes-base.httpRequest", "onError": "continueErrorOutput", "position": [ 4260, -20 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}/url", "method": "POST", "options": {}, "jsonBody": "={\n \"url\": \"{{ $('Webhook').item.json.body['Target Url'] }}\"\n}\n", "sendBody": true, "specifyBody": "json" }, "retryOnFail": true, "typeVersion": 4.2 }, { "id": "fe345010-1fa3-4d2c-8bc2-e87f6aeeb0d9", "name": "If3", "type": "n8n-nodes-base.if", "position": [ 4060, 100 ], "parameters": { "options": {}, "conditions": { "options": { "version": 2, "leftValue": "", "caseSensitive": true, "typeValidation": "strict" }, "combinator": "and", "conditions": [ { "id": "ff919945-b8c2-492a-b496-8617e9147389", "operator": { "type": "string", "operation": "notEmpty", "singleValue": true }, "leftValue": "={{ $('Webhook').item.json.body['Target Url'] }}", "rightValue": "" } ] } }, "typeVersion": 2.2 }, { "id": "1aae02ec-3a22-4dd5-aea4-819758f130c1", "name": "Go on url3", "type": "n8n-nodes-base.httpRequest", "onError": "continueErrorOutput", "position": [ 4260, 160 ], "parameters": { "url": "=http://selenium_chrome:4444/wd/hub/session/{{ $('Create Selenium Session').item.json.value.sessionId }}/url", "method": "POST", "options": {}, "jsonBody": "={\n \"url\": \"{{ $('Information 
Extractor').item.json.output.Good_url_for_etract_information }}\"\n}\n", "sendBody": true, "specifyBody": "json" }, "retryOnFail": true, "typeVersion": 4.2 } ], "active": true, "pinData": {}, "settings": { "executionOrder": "v1" }, "versionId": "e0ae7ac4-4be7-4b9c-9247-1475ffd297b1", "connections": { "If": { "main": [ [ { "node": "If3", "type": "main", "index": 0 } ], [ { "node": "Delete Session5", "type": "main", "index": 0 } ] ] }, "If1": { "main": [ [ { "node": "Go on url", "type": "main", "index": 0 } ], [ { "node": "Go on url1", "type": "main", "index": 0 } ] ] }, "If2": { "main": [ [ { "node": "If", "type": "main", "index": 0 } ], [ { "node": "If1", "type": "main", "index": 0 } ] ] }, "If3": { "main": [ [ { "node": "Go on url2", "type": "main", "index": 0 } ], [ { "node": "Go on url3", "type": "main", "index": 0 } ] ] }, "Code": { "main": [ [ { "node": "Inject Cookie", "type": "main", "index": 0 } ] ] }, "Limit": { "main": [ [ { "node": "Refresh browser", "type": "main", "index": 0 } ] ] }, "OpenAI": { "main": [ [ { "node": "If Block1", "type": "main", "index": 0 } ] ] }, "OpenAI1": { "main": [ [ { "node": "If Block", "type": "main", "index": 0 } ], [ { "node": "Delete Session6", "type": "main", "index": 0 } ] ] }, "Webhook": { "main": [ [ { "node": "Edit Fields (For testing prupose )", "type": "main", "index": 0 } ] ] }, "If Block": { "main": [ [ { "node": "Delete Session1", "type": "main", "index": 0 } ], [ { "node": "Delete Session", "type": "main", "index": 0 } ] ] }, "Go on url": { "main": [ [ { "node": "Get ScreenShot 1", "type": "main", "index": 0 } ], [ { "node": "Delete Session6", "type": "main", "index": 0 } ] ] }, "If Block1": { "main": [ [ { "node": "Delete Session2", "type": "main", "index": 0 } ], [ { "node": "Delete Session3", "type": "main", "index": 0 } ] ] }, "Go on url1": { "main": [ [ { "node": "Get ScreenShot 1", "type": "main", "index": 0 } ], [ { "node": "Delete Session6", "type": "main", "index": 0 } ] ] }, "Go on url2": { "main": [ [ 
{ "node": "Code", "type": "main", "index": 0 } ], [ { "node": "Delete Session4", "type": "main", "index": 0 } ] ] }, "Go on url3": { "main": [ [ { "node": "Code", "type": "main", "index": 0 } ], [ { "node": "Delete Session4", "type": "main", "index": 0 } ] ] }, "If Target Url": { "main": [ [ { "node": "Google search Query ", "type": "main", "index": 0 } ], [ { "node": "Create Selenium Session", "type": "main", "index": 0 } ] ] }, "Inject Cookie": { "main": [ [ { "node": "Limit", "type": "main", "index": 0 } ] ] }, "Delete Session": { "main": [ [ { "node": "Information Extractor1", "type": "main", "index": 0 } ] ] }, "Convert to File": { "main": [ [ { "node": "OpenAI", "type": "main", "index": 0 } ], [ { "node": "Delete Session4", "type": "main", "index": 0 } ] ] }, "Delete Session1": { "main": [ [ { "node": "Respond to Webhook3", "type": "main", "index": 0 } ] ] }, "Delete Session2": { "main": [ [ { "node": "Respond to Webhook2", "type": "main", "index": 0 } ] ] }, "Delete Session3": { "main": [ [ { "node": "Information Extractor2", "type": "main", "index": 0 } ] ] }, "Delete Session4": { "main": [ [ { "node": "Error1", "type": "main", "index": 0 } ] ] }, "Delete Session5": { "main": [ [ { "node": "Error", "type": "main", "index": 0 } ] ] }, "Delete Session6": { "main": [ [ { "node": "Error3", "type": "main", "index": 0 } ] ] }, "Delete Session7": { "main": [ [ { "node": "Error2", "type": "main", "index": 0 } ] ] }, "Get ScreenShot ": { "main": [ [ { "node": "Convert to File", "type": "main", "index": 0 } ], [ { "node": "Delete Session4", "type": "main", "index": 0 } ] ] }, "Refresh browser": { "main": [ [ { "node": "Get ScreenShot ", "type": "main", "index": 0 } ], [ { "node": "Delete Session4", "type": "main", "index": 0 } ] ] }, "Clean Webdriver ": { "main": [ [ { "node": "If2", "type": "main", "index": 0 } ] ] }, "Convert to File1": { "main": [ [ { "node": "OpenAI1", "type": "main", "index": 0 } ], [ { "node": "Delete Session6", "type": "main", "index": 0 } ] ] 
}, "Get ScreenShot 1": { "main": [ [ { "node": "Convert to File1", "type": "main", "index": 0 } ], [ { "node": "Delete Session6", "type": "main", "index": 0 } ] ] }, "Get ScreenShot 2": { "main": [ [ { "node": "Convert to File2", "type": "main", "index": 0 } ], [ { "node": "Delete Session8", "type": "main", "index": 0 } ] ] }, "Go on ip-api.com": { "main": [ [ { "node": "Get ScreenShot 2", "type": "main", "index": 0 } ], [ { "node": "Delete Session8", "type": "main", "index": 0 } ] ] }, "OpenAI Chat Model": { "ai_languageModel": [ [ { "node": "Information Extractor", "type": "ai_languageModel", "index": 0 } ] ] }, "OpenAI Chat Model1": { "ai_languageModel": [ [ { "node": "Information Extractor1", "type": "ai_languageModel", "index": 0 } ] ] }, "OpenAI Chat Model2": { "ai_languageModel": [ [ { "node": "Information Extractor2", "type": "ai_languageModel", "index": 0 } ] ] }, "Check if empty of NA": { "main": [ [ { "node": "Error can't find url", "type": "main", "index": 0 } ], [ { "node": "Create Selenium Session", "type": "main", "index": 0 } ] ] }, "Google search Query ": { "main": [ [ { "node": "Extract First Url Match", "type": "main", "index": 0 } ] ] }, "Information Extractor": { "main": [ [ { "node": "Check if empty of NA", "type": "main", "index": 0 } ] ] }, "Resize browser window": { "main": [ [ { "node": "Clean Webdriver ", "type": "main", "index": 0 } ] ] }, "Information Extractor1": { "main": [ [ { "node": "Success", "type": "main", "index": 0 } ] ] }, "Information Extractor2": { "main": [ [ { "node": "Success with cookie", "type": "main", "index": 0 } ] ] }, "Create Selenium Session": { "main": [ [ { "node": "Resize browser window", "type": "main", "index": 0 } ], [ { "node": "Delete Session7", "type": "main", "index": 0 } ] ] }, "Extract First Url Match": { "main": [ [ { "node": "Information Extractor", "type": "main", "index": 0 } ] ] }, "Edit Fields (For testing prupose )": { "main": [ [ { "node": "If Target Url", "type": "main", "index": 0 } ] ] } } }