[extensions] Improve bot detection for ByteDance, Google, SB Intuitions, Webzio

This commit is contained in:
Faisal Salman 2025-08-31 20:04:49 +07:00
parent ce242a362f
commit 146f182533
4 changed files with 81 additions and 10 deletions

View File

@ -441,7 +441,6 @@ const Extension = Object.freeze({
WGET: 'wget'
},
Crawlers: {
'360_SPIDER': '360Spider',
AHREFS_BOT: 'AhrefsBot',
AI2_BOT: 'AI2Bot',
AIHIT_BOT: 'aiHitBot',
@ -468,9 +467,9 @@ const Extension = Object.freeze({
BLEX_BOT: 'BLEXBot',
BOTIFY: 'botify',
BRAVE_BOT: 'Bravebot',
BYTEDANCE_SPIDER: 'Bytespider',
BYTEDANCE_BYTESPIDER: 'Bytespider',
BYTEDANCE_TIKTOKSPIDER: 'TikTokSpider',
CC_BOT: 'CCBot',
CHATGLM_SPIDER: 'ChatGLM-Spider',
COCCOC_BOT_WEB: 'coccocbot-web',
COCCOC_BOT_IMAGE: 'coccocbot-image',
COHERE_TRAINING_DATA_CRAWLER: 'cohere-training-data-crawler',
@ -492,10 +491,12 @@ const Extension = Object.freeze({
GOOGLE_ADSBOT: 'AdsBot-Google',
GOOGLE_ADSBOT_MOBILE: 'Adsbot-Google-Mobile',
GOOGLE_ADSENSE: 'AdSense',
GOOGLE_APIS: 'APIs-Google',
GOOGLE_BOT: 'Googlebot',
GOOGLE_BOT_IMAGE: 'Googlebot-Image',
GOOGLE_BOT_NEWS: 'Googlebot-News',
GOOGLE_BOT_VIDEO: 'Googlebot-Video',
GOOGLE_CLOUDVERTEXBOT: 'Google-CloudVertexBot',
GOOGLE_INSPECTIONTOOL: 'Google-InspectionTool',
GOOGLE_OTHER: 'GoogleOther',
GOOGLE_OTHER_IMAGE: 'GoogleOther-Image',
@ -525,16 +526,16 @@ const Extension = Object.freeze({
MICROSOFT_ADIDXBOT: 'adidxbot',
MOJEEK_BOT: 'MojeekBot',
MOZ_DOTBOT: 'DotBot',
OMGILI: 'omgili',
OMGILI_BOT: 'omgilibot',
ONCRAWL: 'OnCrawl',
ONESPOT_SCRAPERBOT: 'Onespot-ScraperBot',
OPENAI_GPTBOT: 'GPTBot',
OPENAI_SEARCH: 'OAI-SearchBot',
PERPLEXITY_BOT: 'PerplexityBot',
QIHOO_360_SPIDER: '360Spider',
QWANT_BOT: 'Qwantbot',
REPLICATE_BOT: 'Replicate-Bot',
RUNPOD_BOT: 'RunPod-Bot',
SB_INTUITIONS_BOT: 'SBIntuitionsBot',
SEEKPORT_BOT: 'SeekportBot',
SEMRUSH_BOT: 'SemrushBot',
SEMRUSH_BOT_BACKLINK: 'SemrushBot-BA',
@ -549,8 +550,12 @@ const Extension = Object.freeze({
TOGETHER_BOT: 'Together-Bot',
TURNITIN_BOT: 'TurnitinBot',
TWIN_AGENT: 'TwinAgent',
XAI_BOT: 'xAI-Bot',
VERCEL_V0BOT: 'v0bot',
WEBZIO: 'webzio',
WEBZIO_EXTENDED: 'Webzio-Extended',
WEBZIO_OMGILI: 'omgili',
WEBZIO_OMGILI_BOT: 'omgilibot',
XAI_BOT: 'xAI-Bot',
YAHOO_JAPAN: 'Y!J-BRW',
YAHOO_SLURP: 'Yahoo! Slurp',
YANDEX_ACCESSIBILITY_BOT: 'YandexAccessibilityBot',
@ -586,6 +591,7 @@ const Extension = Object.freeze({
YETI: 'Yeti',
YISOU_SPIDER: 'YisouSpider',
YOU_BOT: 'YouBot',
ZHIPU_CHATGLM_SPIDER: 'ChatGLM-Spider',
ZUM_BOT: 'ZumBot'
},
Emails: {
@ -624,7 +630,7 @@ const Extension = Object.freeze({
GOOGLE_CHROME_LIGHTHOUSE: 'Chrome-Lighthouse',
GOOGLE_FEEDFETCHER: 'FeedFetcher-Google',
GOOGLE_GEMINI_DEEP_RESEARCH: 'Gemini-Deep-Research',
GOOGLE_IMAGE_PROXY: 'GoogleImageProxy',
GOOGLE_IMAGEPROXY: 'GoogleImageProxy',
GOOGLE_PAGERENDERER: 'Google-PageRenderer',
GOOGLE_READ_ALOUD: 'Google-Read-Aloud',
GOOGLE_PRODUCER: 'GoogleProducer',

View File

@ -61,9 +61,10 @@ const Crawlers = Object.freeze({
// Onespot - https://www.onespot.com/identifying-traffic.html
// OpenAI's SearchGPT - https://platform.openai.com/docs/bots
// PerplexityBot - https://perplexity.ai/perplexitybot
// SBIntuitionsBot - https://www.sbintuitions.co.jp/bot/
// SeznamBot - http://napoveda.seznam.cz/seznambot-intro
// YepBot - https://yep.com/yepbot/
/((?:adidx|ahrefs|amazon|bing|brave|cc|contx|coveo|criteo|dot|duckduck(?:go-favicons-)?|exa|facebook|gpt|iask|kagi|kangaroo |linkedin|mj12|mojeek|oai-search|onespot-scraper|perplexity|semrush|seznam|yep)bot)\/([\w\.-]+)/i,
/((?:adidx|ahrefs|amazon|bing|brave|cc|contx|coveo|criteo|dot|duckduck(?:go-favicons-)?|exa|facebook|gpt|iask|kagi|kangaroo |linkedin|mj12|mojeek|oai-search|onespot-scraper|perplexity|sbintuitions|semrush|seznam|yep)bot)\/([\w\.-]+)/i,
// Algolia Crawler
/(algolia crawler(?: renderscript)?)\/?([\w\.]*)/i,
@ -139,8 +140,8 @@ const Crawlers = Object.freeze({
// TurnitinBot - https://www.turnitin.com/robot/crawlerinfo.html
// v0bot - https://vercel.com/docs/bot-management
// Yahoo! Slurp - http://help.yahoo.com/help/us/ysearch/slurp
// Botify / Bytespider / DeepSeekBot / Qihoo 360Spider / SeekportBot
/\b((?:ai2|aspiegel|dataforseo|deepseek|imagesift|petal|seekport|turnitin|v0)bot|360spider-?(?:image|video)?|baidu-ads|botify|bytespider|cohere-training-data-crawler|elastic(?=\/s)|marginalia|siteimprove(?=bot|\.com)|teoma|yahoo! slurp)/i
// Botify / Bytespider / DeepSeekBot / Qihoo 360Spider / SeekportBot / TikTokSpider / webzio
/\b((ai2|aspiegel|dataforseo|deepseek|imagesift|petal|seekport|turnitin|v0)bot|360spider-?(image|video)?|baidu-ads|botify|(byte|tiktok)spider|cohere-training-data-crawler|elastic(?=\/s)|marginalia|siteimprove(?=bot|\.com)|teoma|webzio|yahoo! slurp)/i
],
[NAME, [TYPE, CRAWLER]]
]

View File

@ -88,6 +88,7 @@ const isAIBot = (resultOrUA) => [
'googleother',
'googleother-image',
'googleother-video',
'google-cloudvertexbot',
'google-extended',
// Hive AI
@ -123,6 +124,9 @@ const isAIBot = (resultOrUA) => [
// Runpod
'runpod-bot',
// SB Intuitions
'sbintuitionsbot',
// Semrush
'semrushbot-ocob',

View File

@ -579,6 +579,16 @@
"type" : "crawler"
}
},
{
"desc" : "APIs-Google",
"ua" : "APIs-Google (+https://developers.google.com/webmasters/APIs-Google.html)",
"expect" :
{
"name" : "APIs-Google",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "Googlebot-Video",
"ua" : "Googlebot-Video/1.0",
@ -679,6 +689,16 @@
"type" : "crawler"
}
},
{
"desc" : "Google-CloudVertexBot",
"ua" : "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.7204.183 Mobile Safari/537.36 (compatible; Google-CloudVertexBot; +https://cloud.google.com/enterprise-search)",
"expect" :
{
"name" : "Google-CloudVertexBot",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "Google-Safety",
"ua" : "Google-Safety",
@ -970,6 +990,16 @@
"type" : "crawler"
}
},
{
"desc" : "SBIntuitionsBot",
"ua" : "Mozilla/5.0 (compatible; SBIntuitionsBot/0.1;+https://www.sbintuitions.co.jp/bot/)",
"expect" :
{
"name" : "SBIntuitionsBot",
"version" : "0.1",
"type" : "crawler"
}
},
{
"desc" : "SeekportBot",
"ua" : "Mozilla/5.0 (compatible; SeekportBot; +https://bot.seekport.com)",
@ -1080,6 +1110,16 @@
"type" : "crawler"
}
},
{
"desc" : "TikTokSpider",
"ua" : "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; TikTokSpider; ttspider-feedback@tiktok.com)",
"expect" :
{
"name" : "TikTokSpider",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "Timpibot",
"ua" : "Timpibot/0.8 (+http://www.timpi.io)",
@ -1150,6 +1190,26 @@
"type" : "crawler"
}
},
{
"desc" : "webzio",
"ua" : "webzio (+https://webz.io/bot.html)",
"expect" :
{
"name" : "webzio",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "Webzio-Extended",
"ua" : "Mozilla/5.0 (compatible; Webzio-Extended/1.0; +https://www.webzio.com/bot.html)",
"expect" :
{
"name" : "Webzio-Extended",
"version" : "1.0",
"type" : "crawler"
}
},
{
"desc" : "Yahoo! Japan",
"ua" : "Y!J-BRW/1.0 (https://www.yahoo-help.jp/app/answers/detail/p/595/a_id/42716)",