[extensions][enums] Improve detection for Yandex bots

This commit is contained in:
Faisal Salman
2025-08-30 17:01:05 +07:00
parent 2078b1ec92
commit ce242a362f
4 changed files with 415 additions and 3 deletions

View File

@@ -553,7 +553,35 @@ const Extension = Object.freeze({
VERCEL_V0BOT: 'v0bot',
YAHOO_JAPAN: 'Y!J-BRW',
YAHOO_SLURP: 'Yahoo! Slurp',
YANDEX_ACCESSIBILITY_BOT: 'YandexAccessibilityBot',
YANDEX_ADDITIONAL_BOT: 'YandexAdditionalBot',
YANDEX_ADNET: 'YandexAdNet',
YANDEX_BLOGS: 'YandexBlogs',
YANDEX_BOT: 'YandexBot',
YANDEX_BOT_MIRRORDETECTOR: 'YandexBot MirrorDetector',
YANDEX_COMBOT: 'YandexComBot',
YANDEX_FAVICONS: 'YandexFavicons',
YANDEX_IMAGE_RESIZER: 'YandexImageResizer',
YANDEX_IMAGES: 'YandexImages',
YANDEX_MARKET: 'YandexMarket',
YANDEX_MEDIA: 'YandexMedia',
YANDEX_METRIKA: 'YandexMetrika',
YANDEX_MOBILE_BOT: 'YandexMobileBot',
YANDEX_MOBILE_SCREENSHOT_BOT: 'YandexMobileScreenShotBot',
YANDEX_NEWS: 'YandexNews',
YANDEX_ONTODB: 'YandexOntoDB',
YANDEX_ONTODB_API: 'YandexOntoDBAPI',
YANDEX_PARTNER: 'YandexPartner',
YANDEX_RCA: 'YandexRCA',
YANDEX_RENDERRESOURCES_BOT: 'YandexRenderResourcesBot',
YANDEX_SCREENSHOT_BOT: 'YandexScreenshotBot',
YANDEX_SPRAV_BOT: 'YandexSpravBot',
YANDEX_TRACKER: 'YandexTracker',
YANDEX_VERTICALS: 'YandexVerticals',
YANDEX_VERTIS: 'YandexVertis',
YANDEX_VIDEO: 'YandexVideo',
YANDEX_VIDEO_PARSER: 'YandexVideoParser',
YANDEX_WEBMASTER: 'YandexWebmaster',
YEP_BOT: 'YepBot',
YETI: 'Yeti',
YISOU_SPIDER: 'YisouSpider',
@@ -624,6 +652,15 @@ const Extension = Object.freeze({
VERCEL_BOT: 'Vercelbot',
VERCEL_FLAGS: 'vercelflags',
VERCEL_TRACING: 'verceltracing',
YANDEX_CALENDAR: 'YandexCalendar',
YANDEX_DIRECT: 'YandexDirect',
YANDEX_DIRECTDYN: 'YandexDirectDyn',
YANDEX_DIRECTFETCHER: 'YaDirectFetcher',
YANDEX_FORDOMAIN: 'YandexForDomain',
YANDEX_PAGECHECKER: 'YandexPagechecker',
YANDEX_SEARCHSHOP: 'YandexSearchShop',
YANDEX_SITELINKS: 'YandexSitelinks',
YANDEX_USERPROXY: 'YandexUserproxy',
WHATSAPP: 'WhatsApp',
ZOOMINFO_BOT: 'Zoombot'
},

View File

@@ -109,7 +109,7 @@ const Crawlers = Object.freeze({
/(y!?j-(?:asr|br[uw]|dscv|mmp|vsidx|wsc))\/([\w\.]+)/i,
// Yandex Bots - https://yandex.com/bots
/(yandex(?:(?:mobile)?(?:accessibility|additional|renderresources|screenshot|sprav)?bot|image(?:s|resizer)|video(?:parser)?|blogs|adnet|favicons|fordomain|market|media|metrika|news|ontodb(?:api)?|pagechecker|partner|rca|tracker|turbo|vertis|webmaster|antivirus))\/([\w\.]+)/i,
/(yandex(?:(?:mobile)?(?:accessibility|additional|com|renderresources|screenshot|sprav)?bot(?!.+mirror)|image(?:s|resizer)|adnet|blogs|favicons|market|media|metrika|news|ontodb(?:api)?|partner|rca|tracker|turbo|verti(?:cal)?s|webmaster|video(?:parser)?))\/([\w\.]+)/i,
// Yeti (Naver)
/(yeti)\/([\w\.]+)/i,
@@ -119,9 +119,14 @@ const Crawlers = Object.freeze({
// Freespoke - https://docs.freespoke.com/search/bot/
/((?:aihit|blex|diff|huggingface-|msn|pangu|replicate-|runpod-|timpi|together-|xai-|you|zum)bot|(?:magpie-|velenpublicweb)crawler|(?:chatglm-|line|screaming frog seo |yisou)spider|cotoyogi|firecrawlagent|freespoke|omgili(?:bot)?|openai image downloader|startpageprivateimageproxy|twinagent|webzio-extended)\/?([\w\.]*)/i
],
[NAME, VERSION, [TYPE, CRAWLER]],
[
// YandexBot MirrorDetector
// The outer capture group spans the whole "YandexBot/<version>; MirrorDetector" token,
// while the inner group holds only the version characters that follow the slash.
/(yandexbot\/([\w\.]+); mirrordetector)/i
],
// NOTE(review): presumably the rule engine applies the regex/replacement pair attached to NAME
// to the captured text, removing the slash-through-semicolon part so the reported name reads
// "YandexBot MirrorDetector" - confirm against the mapper implementation.
[[NAME, /\/.+;/ig, ''], VERSION, [TYPE, CRAWLER]],
[
// Google Bots
/((?:adsbot|apis|mediapartners)-google(?:-mobile)?|google-?(?:other|cloudvertexbot|extended|safety))/i,
@@ -260,7 +265,7 @@ const Fetchers = Object.freeze({
// Perplexity-User - https://docs.perplexity.ai/guides/bots
// MistralAI-User - https://docs.mistral.ai/robots/
// Yandex Bots - https://yandex.com/bots
/(asana|ahrefssiteaudit|(?:bing|microsoft)preview|blueno|(?:chatgpt|claude|mistralai|perplexity)-user|cohere-ai|hubspot page fetcher|mastodon|(?:bitly|bufferlinkpreview|discord|duckassist|linkedin|pinterest|reddit|roger|siteaudit|twitter|uptimero|zoom)bot|google-site-verification|iframely|kakaotalk-scrap|meta-externalfetcher|y!?j-dlc|yandex(?:calendar|direct(?:dyn)?|searchshop)|yadirectfetcher)\/([\w\.]+)/i,
/(asana|ahrefssiteaudit|(?:bing|microsoft)preview|blueno|(?:chatgpt|claude|mistralai|perplexity)-user|cohere-ai|hubspot page fetcher|mastodon|(?:bitly|bufferlinkpreview|discord|duckassist|linkedin|pinterest|reddit|roger|siteaudit|twitter|uptimero|zoom)bot|google-site-verification|iframely|kakaotalk-scrap|meta-externalfetcher|y!?j-dlc|yandex(?:calendar|direct(?:dyn)?|fordomain|pagechecker|searchshop)|yadirectfetcher)\/([\w\.]+)/i,
// Bluesky
/(bluesky) cardyb\/([\w\.]+)/i,