From c9badeb345b78c6d577d6cd08e24780bf8ee0308 Mon Sep 17 00:00:00 2001 From: Faisal Salman Date: Thu, 21 Aug 2025 21:40:50 +0700 Subject: [PATCH] [extensions] Add new crawlers: Algolia, Baidu, BLEXBot, Botify, Freespoke, Marginalia, MSNBot, OnCrawl, SeekportBot, Siteimprove, TwinAgent, YepBot, ZumBot --- src/extensions/ua-parser-extensions.js | 21 ++-- test/data/ua/extension/crawler.json | 142 ++++++++++++++++++++++++- 2 files changed, 155 insertions(+), 8 deletions(-) diff --git a/src/extensions/ua-parser-extensions.js b/src/extensions/ua-parser-extensions.js index 5a29b50..6346b9f 100644 --- a/src/extensions/ua-parser-extensions.js +++ b/src/extensions/ua-parser-extensions.js @@ -62,7 +62,11 @@ const Crawlers = Object.freeze({ // OpenAI's SearchGPT - https://platform.openai.com/docs/bots // PerplexityBot - https://perplexity.ai/perplexitybot // SeznamBot - http://napoveda.seznam.cz/seznambot-intro - /((?:adidx|ahrefs|amazon|bing|brave|cc|contx|coveo|criteo|dot|duckduck(?:go-favicons-)?|exa|facebook|gpt|iask|kagi|kangaroo |linkedin|mj12|mojeek|oai-search|onespot-scraper|perplexity|semrush|seznam)bot)\/([\w\.-]+)/i, + // YepBot - https://yep.com/yepbot/ + /((?:adidx|ahrefs|amazon|bing|brave|cc|contx|coveo|criteo|dot|duckduck(?:go-favicons-)?|exa|facebook|gpt|iask|kagi|kangaroo |linkedin|mj12|mojeek|oai-search|onespot-scraper|perplexity|semrush|seznam|yep)bot)\/([\w\.-]+)/i, + + // Algolia Crawler + /(algolia crawler(?: renderscript)?)\/?([\w\.]*)/i, // Applebot - http://apple.com/go/applebot /(applebot(?:-extended)?)\/?([\w\.]*)/i, @@ -89,6 +93,9 @@ const Crawlers = Object.freeze({ // Internet Archive (archive.org) /(ia_archiver|archive\.org_bot)\/?([\w\.]*)/i, + // OnCrawl + /(oncrawl) mobile\/([\w\.]+)/i, + // Qwantbot - https://help.qwant.com/bot /(qwantbot)[-\w]*\/?([\w\.]*)/i, @@ -107,9 +114,10 @@ const Crawlers = Object.freeze({ // Yeti (Naver) /(yeti)\/([\w\.]+)/i, - // aiHitBot / Algolia Crawler / Diffbot / FirecrawlAgent / HuggingFace-Bot / Linespider / 
Magpie-Crawler / Omgilibot / OpenAI Image Downloader / PanguBot / Replicate-Bot / RunPod-Bot / Webzio-Extended / Screaming Frog SEO Spider / Startpage / Timpibot / Together-Bot / VelenPublicWebCrawler / xAI-Bot / YisouSpider / YouBot + // aiHitBot / BLEXBot / Diffbot / FirecrawlAgent / HuggingFace-Bot / Linespider / MSNBot / Magpie-Crawler / Omgilibot / OpenAI Image Downloader / PanguBot / Replicate-Bot / RunPod-Bot / Webzio-Extended / Screaming Frog SEO Spider / Startpage / Timpibot / Together-Bot / VelenPublicWebCrawler / xAI-Bot / YisouSpider / YouBot / ZumBot // Cotoyogi - https://ds.rois.ac.jp/en_center8/en_crawler/ - /((?:aihit|diff|huggingface-|pangu|replicate-|runpod-|timpi|together-|xai-|you)bot|omgili(?:bot)?|cotoyogi|firecrawlagent|openai image downloader|(?:algolia |magpie-|velenpublicweb)crawler|startpageprivateimageproxy|webzio-extended|(?:chatglm-|line|screaming frog seo |yisou)spider)\/?([\w\.]*)/i + // Freespoke - https://docs.freespoke.com/search/bot/ + /((?:aihit|blex|diff|huggingface-|msn|pangu|replicate-|runpod-|timpi|together-|xai-|you|zum)bot|(?:magpie-|velenpublicweb)crawler|(?:chatglm-|line|screaming frog seo |yisou)spider|cotoyogi|firecrawlagent|freespoke|omgili(?:bot)?|openai image downloader|startpageprivateimageproxy|twinagent|webzio-extended)\/?([\w\.]*)/i ], [NAME, VERSION, [TYPE, CRAWLER]], @@ -119,16 +127,15 @@ const Crawlers = Object.freeze({ /((?:adsbot|apis|mediapartners)-google(?:-mobile)?|google-?(?:other|cloudvertexbot|extended|safety))/i, // AI2Bot - https://allenai.org/crawler - // Bytespider // DataForSeoBot - https://dataforseo.com/dataforseo-bot - // DeepSeekBot // Huawei AspiegelBot / PetalBot https://aspiegel.com/petalbot // ImagesiftBot - https://imagesift.com/about - // Qihoo 360Spider + // Siteimprove - https://help.siteimprove.com/support/solutions/articles/80000448553 // TurnitinBot - https://www.turnitin.com/robot/crawlerinfo.html // v0bot - https://vercel.com/docs/bot-management // Yahoo! 
Slurp - http://help.yahoo.com/help/us/ysearch/slurp - /\b(360spider-?(?:image|video)?|bytespider|cohere-training-data-crawler|elastic(?=\/s)|(?:ai2|aspiegel|dataforseo|deepseek|imagesift|petal|turnitin|v0)bot|teoma|yahoo! slurp)/i + // Botify / Bytespider / DeepSeekBot / Qihoo 360Spider / SeekportBot + /\b((?:ai2|aspiegel|dataforseo|deepseek|imagesift|petal|seekport|turnitin|v0)bot|360spider-?(?:image|video)?|baidu-ads|botify|bytespider|cohere-training-data-crawler|elastic(?=\/s)|marginalia|siteimprove(?=bot|\.com)|teoma|yahoo! slurp)/i ], [NAME, [TYPE, CRAWLER]] ] diff --git a/test/data/ua/extension/crawler.json b/test/data/ua/extension/crawler.json index f54aac7..3506077 100644 --- a/test/data/ua/extension/crawler.json +++ b/test/data/ua/extension/crawler.json @@ -89,6 +89,16 @@ "type" : "crawler" } }, + { + "desc" : "Algolia Crawler Renderscript", + "ua" : "Algolia Crawler Renderscript", + "expect" : + { + "name" : "Algolia Crawler Renderscript", + "version" : "undefined", + "type" : "crawler" + } + }, { "desc" : "Applebot", "ua" : "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B410 Safari/600.1.4 (Applebot/0.1;+http://www.apple.com/go/applebot)", @@ -149,6 +159,16 @@ "type" : "crawler" } }, + { + "desc" : "Baidu ADS", + "ua" : "Baidu-ADS", + "expect" : + { + "name" : "Baidu-ADS", + "version" : "undefined", + "type" : "crawler" + } + }, { "desc" : "Baiduspider", "ua" : "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)", @@ -239,6 +259,26 @@ "type" : "crawler" } }, + { + "desc" : "BLEXBot", + "ua" : "Mozilla/5.0 (compatible; BLEXBot/1.0; +http://webmeup-crawler.com/)", + "expect" : + { + "name" : "BLEXBot", + "version" : "1.0", + "type" : "crawler" + } + }, + { + "desc" : "botify", + "ua" : "Desktop: Mozilla/5.0 (compatible; botify; http://botify.com)", + "expect" : + { + "name" : "botify", + "version" : "undefined", + "type" : "crawler" + } + }, { "desc" : 
"Bravebot", "ua" : "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Bravebot/1.0; +https://search.brave.com/help/brave-search-crawler) Chrome/W.X.Y.Z Safari/537.36", @@ -519,6 +559,16 @@ "type" : "crawler" } }, + { + "desc" : "Freespoke", + "ua" : "Mozilla/5.0 (compatible; Freespoke/2.0; +https://docs.freespoke.com/search/bot)", + "expect" : + { + "name" : "Freespoke", + "version" : "2.0", + "type" : "crawler" + } + }, { "desc" : "Googlebot-Video", "ua" : "Googlebot-Video/1.0", @@ -719,6 +769,16 @@ "type" : "crawler" } }, + { + "desc" : "Marginalia Search", + "ua" : "search.marginalia.nu", + "expect" : + { + "name" : "marginalia", + "version" : "undefined", + "type" : "crawler" + } + }, { "desc" : "Meta-ExternalAgent", "ua" : "meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)", @@ -750,6 +810,16 @@ "type" : "crawler" } }, + { + "desc" : "msnbot", + "ua" : "msnbot/2.0b (+http://search.msn.com/msnbot.htm)", + "expect" : + { + "name" : "msnbot", + "version" : "2.0b", + "type" : "crawler" + } + }, { "desc" : "Omgili", "ua" : "omgili/0.5 +https://omgili.com", @@ -770,6 +840,16 @@ "type" : "crawler" } }, + { + "desc" : "OnCrawl", + "ua" : "Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; OnCrawl Mobile/1.0; +http://www.oncrawl.com/)", + "expect" : + { + "name" : "OnCrawl", + "version" : "1.0", + "type" : "crawler" + } + }, { "desc" : "Onespot", "ua" : "Mozilla/5.0 (compatible; Onespot-ScraperBot/1.0; +https://www.onespot.com/identifying-traffic.html)", @@ -880,6 +960,16 @@ "type" : "crawler" } }, + { + "desc" : "SeekportBot", + "ua" : "Mozilla/5.0 (compatible; SeekportBot; +https://bot.seekport.com)", + "expect" : + { + "name" : "SeekportBot", + "version" : "undefined", + "type" : "crawler" + } + }, { "desc" : "SemrushBot", "ua" : "Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot.html)", @@ -931,7 
+1021,27 @@ } }, { - "desc" : "Sogou", + "desc" : "Siteimprove", + "ua" : "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; SiteCheck-sitecrawl by Siteimprove.com; +https://siteimprove.com/bots) Chrome/[VERSION] Safari/537.36", + "expect" : + { + "name" : "Siteimprove", + "version" : "undefined", + "type" : "crawler" + } + }, + { + "desc" : "Sogou Pic Spider", + "ua" : "Sogou Pic Spider/3.0( http://www.sogou.com/docs/help/webmasters.htm#07)", + "expect" : + { + "name" : "Sogou Pic Spider", + "version" : "3.0", + "type" : "crawler" + } + }, + { + "desc" : "Sogou web spider", "ua" : "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)", "expect" : { @@ -990,6 +1100,16 @@ "type" : "crawler" } }, + { + "desc" : "TwinAgent", + "ua" : "TwinAgent/1.0", + "expect" : + { + "name" : "TwinAgent", + "version" : "1.0", + "type" : "crawler" + } + }, { "desc" : "xAI-Bot", "ua" : "Mozilla/5.0 (compatible; xAI-Bot/1.0; +https://x.ai/)", @@ -1050,6 +1170,16 @@ "type" : "crawler" } }, + { + "desc" : "YepBot", + "ua" : "Mozilla/5.0 (compatible; YepBot/1.0; +http://yep.com/yepbot/)", + "expect" : + { + "name" : "YepBot", + "version" : "1.0", + "type" : "crawler" + } + }, { "desc" : "Yeti", "ua" : "Mozilla/5.0 (compatible; Yeti/1.1; +http://naver.me/spd)", @@ -1089,5 +1219,15 @@ "version" : "undefined", "type" : "crawler" } + }, + { + "desc" : "ZumBot", + "ua" : "Mozilla/5.0 (compatible; ZumBot/1.0; http://help.zum.com/inquiry)", + "expect" : + { + "name" : "ZumBot", + "version" : "1.0", + "type" : "crawler" + } } ]