From 72d0c2acb32917a9287b8cb838be015d8e5dcac9 Mon Sep 17 00:00:00 2001 From: Faisal Salman Date: Tue, 3 Jun 2025 11:03:45 +0700 Subject: [PATCH] [extensions] Add new crawler bots: ChatGLM, Onespot, Startpage --- src/extensions/ua-parser-extensions.js | 7 +++--- src/helpers/ua-parser-helpers.js | 3 +++ test/data/ua/extension/crawler.json | 30 ++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/src/extensions/ua-parser-extensions.js b/src/extensions/ua-parser-extensions.js index a91bdd9..b2e31fb 100644 --- a/src/extensions/ua-parser-extensions.js +++ b/src/extensions/ua-parser-extensions.js @@ -52,10 +52,11 @@ const Crawlers = Object.freeze({ // LinkedInBot - http://www.linkedin.com // MJ12bot - https://mj12bot.com/ // MojeekBot - https://www.mojeek.com/bot.html + // Onespot - https://www.onespot.com/identifying-traffic.html // OpenAI's SearchGPT - https://platform.openai.com/docs/bots // PerplexityBot - https://perplexity.ai/perplexitybot // SeznamBot - http://napoveda.seznam.cz/seznambot-intro - /((?:adidx|ahrefs|amazon|bing|cc|dot|duckduck|exa|facebook|gpt|iask|linkedin|mj12|mojeek|oai-search|perplexity|semrush|seznam)bot)\/([\w\.-]+)/i, + /((?:adidx|ahrefs|amazon|bing|cc|dot|duckduck|exa|facebook|gpt|iask|linkedin|mj12|mojeek|oai-search|onespot-scraper|perplexity|semrush|seznam)bot)\/([\w\.-]+)/i, // Applebot - http://apple.com/go/applebot /(applebot(?:-extended)?)\/?([\w\.]*)/i, @@ -100,8 +101,8 @@ const Crawlers = Object.freeze({ // Yeti (Naver) /(yeti)\/([\w\.]+)/i, - // aiHitBot / Diffbot / Linespider / Magpie-Crawler / Omgilibot / OpenAI Image Downloader / Webzio-Extended / Screaming Frog SEO Spider / Timpibot / VelenPublicWebCrawler / YisouSpider / YouBot - /((?:aihit|diff|timpi|you)bot|omgili(?:bot)?|openai image downloader|(?:magpie-|velenpublicweb)crawler|webzio-extended|(?:screaming frog seo |line|yisou)spider)\/?([\w\.]*)/i + // aiHitBot / ChatGLM-Spider / Diffbot / Linespider / Magpie-Crawler / Omgilibot / OpenAI Image Downloader 
/ Webzio-Extended / Screaming Frog SEO Spider / Startpage / Timpibot / VelenPublicWebCrawler / YisouSpider / YouBot + /((?:aihit|diff|timpi|you)bot|omgili(?:bot)?|openai image downloader|(?:magpie-|velenpublicweb)crawler|startpageprivateimageproxy|webzio-extended|(?:chatglm-|line|screaming frog seo |yisou)spider)\/?([\w\.]*)/i ], [NAME, VERSION, [TYPE, CRAWLER]], diff --git a/src/helpers/ua-parser-helpers.js b/src/helpers/ua-parser-helpers.js index 64d7e90..d14667b 100644 --- a/src/helpers/ua-parser-helpers.js +++ b/src/helpers/ua-parser-helpers.js @@ -110,6 +110,9 @@ const isAIBot = (resultOrUA) => [ // You.com 'youbot', + // Zhipu AI + 'chatglm-spider', + // Zyte 'scrapy' diff --git a/test/data/ua/extension/crawler.json b/test/data/ua/extension/crawler.json index bf29428..918ee98 100644 --- a/test/data/ua/extension/crawler.json +++ b/test/data/ua/extension/crawler.json @@ -259,6 +259,16 @@ "type" : "crawler" } }, + { + "desc" : "ChatGLM-Spider", + "ua" : "Mozilla/5.0 (compatible; ChatGLM-Spider/1.0; +https://chatglm.cn/)", + "expect" : + { + "name" : "ChatGLM-Spider", + "version" : "1.0", + "type" : "crawler" + } + }, { "desc" : "Coc Coc Bot (web)", "ua" : "Mozilla/5.0 (compatible; coccocbot-web/1.0; +http://help.coccoc.com/searchengine)", @@ -620,6 +630,16 @@ "type" : "crawler" } }, + { + "desc" : "Onespot", + "ua" : "Mozilla/5.0 (compatible; Onespot-ScraperBot/1.0; +https://www.onespot.com/identifying-traffic.html)", + "expect" : + { + "name" : "Onespot-ScraperBot", + "version" : "1.0", + "type" : "crawler" + } + }, { "desc" : "OpenAI Search", "ua" : "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot", @@ -750,6 +770,16 @@ "type" : "crawler" } }, + { + "desc" : "Startpage", + "ua" : "StartpagePrivateImageProxy/3.0 (https://www.startpage.com/; support@startpage.com) aiohttp.client/3.11.11", + "expect" : + { + "name" : "StartpagePrivateImageProxy", + "version" : "3.0", + "type" : "crawler" + } + }, { 
"desc" : "Teoma", "ua" : "Mozilla/2.0 (compatible; Ask Jeeves/Teoma; +http://sp.ask.com/docs/about/tech_crawling.html)",