[submodule:extensions] Add some new bots: 360Spider, Archive.org Bots, CCBot, DataForSeoBot, DuckAssistBot, Exabot, Google Bots, Meta Bots, MojeekBot, PerplexityBot, PetalBot, TurnitinBot, Yeti, YisouSpider

This commit is contained in:
Faisal Salman 2024-11-10 13:01:46 +07:00
parent 3b3361fe53
commit a0e11b701e
3 changed files with 415 additions and 12 deletions

View File

@ -44,26 +44,36 @@ const Crawlers = Object.freeze({
// Amazonbot - https://developer.amazon.com/amazonbot
// Applebot - http://apple.com/go/applebot
// Bingbot - http://www.bing.com/bingbot.htm
// CCBot - https://commoncrawl.org/faq
// Dotbot - https://moz.com/help/moz-procedures/crawlers/dotbot
// DuckDuckBot - http://duckduckgo.com/duckduckbot.html
// Exabot - http://www.exabot.com/go/robot
// FacebookBot - https://developers.facebook.com/docs/sharing/bot/
// GPTBot - https://platform.openai.com/docs/gptbot
// MJ12bot - https://mj12bot.com/
// OpenAI Search - https://platform.openai.com/docs/bots
// MojeekBot - https://www.mojeek.com/bot.html
// OpenAI's SearchGPT - https://platform.openai.com/docs/bots
// PerplexityBot - https://perplexity.ai/perplexitybot
// SemrushBot - http://www.semrush.com/bot.html
/((?:ahrefs|amazon|apple|bing|dot|duckduck|facebook|gpt|mj12|oai-search|semrush)bot)\/([\w\.]+)/i,
/((?:ahrefs|amazon|apple|bing|cc|dot|duckduck|exa|facebook|gpt|mj12|mojeek|oai-search|perplexity|semrush)bot)\/([\w\.]+)/i,
// Baiduspider - https://help.baidu.com/question?prod_id=99&class=0&id=3001
/(baiduspider)[-imagevdonsfcpr]{0,6}\/([\w\.]+)/i,
// ClaudeBot
// ClaudeBot (Anthropic)
/(claude(?:bot|-web))\/([\w\.]+)/i,
// Coc Coc Bot - https://help.coccoc.com/en/search-engine
/(coccocbot-(?:image|web))\/([\w\.]+)/i,
// Facebook / Meta
// https://developers.facebook.com/docs/sharing/webmasters/web-crawlers
/(facebook(?:externalhit|catalog)|meta-externalagent)\/([\w\.]+)/i,
// Googlebot - http://www.google.com/bot.html
/(google(?:bot|other)(?:-image|-video|-news|-extended)?|(?:storebot-)?google(?:-inspectiontool)?)\/?([\w\.]*)/i,
/(google(?:bot|other|-inspectiontool)(?:-image|-video|-news)?|storebot-google)\/?([\w\.]*)/i,
// Internet Archive (archive.org)
/(ia_archiver|archive\.org_bot)\/?([\w\.]*)/i,
// Sogou Spider
/(sogou (?:pic|head|web|orion|news) spider)\/([\w\.]+)/i,
@ -72,14 +82,29 @@ const Crawlers = Object.freeze({
/(y!?j-(?:asr|br[uw]|dscv|mmp|vsidx|wsc))\/([\w\.]+)/i,
// Yandex Bots - https://yandex.com/bots
/(yandex(?:(?:mobile)?(?:accessibility|additional|renderresources|screenshot|sprav)?bot|image(?:s|resizer)|video(?:parser)?|blogs|adnet|favicons|fordomain|market|media|metrika|news|ontodb(?:api)?|pagechecker|partner|rca|tracker|turbo|vertis|webmaster|antivirus))\/([\w\.]+)/i
/(yandex(?:(?:mobile)?(?:accessibility|additional|renderresources|screenshot|sprav)?bot|image(?:s|resizer)|video(?:parser)?|blogs|adnet|favicons|fordomain|market|media|metrika|news|ontodb(?:api)?|pagechecker|partner|rca|tracker|turbo|vertis|webmaster|antivirus))\/([\w\.]+)/i,
// Yeti (Naver)
/(yeti)\/([\w\.]+)/i,
// YisouSpider
/(yisouspider)\/?([\w\.]*)/i
],
[NAME, VERSION, [TYPE, CRAWLER]],
[
// Google Bots
/((?:adsbot|apis|mediapartners)-google(?:-mobile)?|google-?(?:other|cloudvertexbot|extended|safety))/i,
// Bytespider
// DataForSeoBot - https://dataforseo.com/dataforseo-bot
// Huawei AspiegelBot / PetalBot - https://aspiegel.com/petalbot
// Qihoo 360Spider
// TurnitinBot - https://www.turnitin.com/robot/crawlerinfo.html
// Yahoo! Slurp - http://help.yahoo.com/help/us/ysearch/slurp
[/((?:bytespider|(?=yahoo! )slurp))/i],
/(360spider-?(?:image|video)?|bytespider|(?:aspiegel|dataforseo|petal|turnitin)bot|(?=yahoo! )slurp)/i
],
[NAME, [TYPE, CRAWLER]]
]
});
@ -184,8 +209,15 @@ const Fetchers = Object.freeze({
[
// AhrefsSiteAudit - https://ahrefs.com/robot/site-audit
// ChatGPT-User - https://platform.openai.com/docs/plugins/bot
// DuckAssistBot - https://duckduckgo.com/duckassistbot/
// BingPreview / Discordbot / LinkedInBot / Mastodon / Pinterestbot / Redditbot / Rogerbot / Telegrambot / Twitterbot / UptimeRobot
/(ahrefssiteaudit|bingpreview|chatgpt-user|mastodon|(?:discord|linkedin|pinterest|reddit|roger|telegram|twitter|uptimero)bot)\/([\w\.]+)/i,
/(ahrefssiteaudit|bingpreview|chatgpt-user|mastodon|(?:discord|duckassist|linkedin|pinterest|reddit|roger|telegram|twitter|uptimero)bot)\/([\w\.]+)/i,
// Google Site Verifier
/(google-site-verification)\/([\w\.]+)/i,
// Meta
/(meta-externalfetcher)\/([\w\.]+)/i,
// Slackbot - https://api.slack.com/robots
/(slack(?:bot)?(?:-imgproxy|-linkexpanding)?) ([\w\.]+)/i,
@ -203,7 +235,7 @@ const Fetchers = Object.freeze({
[NAME, VERSION, [TYPE, FETCHER]],
// Google Bots / Snapchat
[/(feedfetcher-google|google-read-aloud|(?=bot; )snapchat)/i],
[/(feedfetcher-google|google(?:-read-aloud|producer)|(?=bot; )snapchat)/i],
[NAME, [TYPE, FETCHER]],
]
});
@ -252,8 +284,8 @@ const MediaPlayers = Object.freeze({
/(flrp)\/([\w\.-]+)/i // Flip Player
], [[NAME, 'Flip Player'], VERSION, [TYPE, MEDIAPLAYER]], [
/(fstream|nativehost|queryseekspider|ia-archiver|facebookexternalhit)/i
// FStream/NativeHost/QuerySeekSpider/IA Archiver/facebookexternalhit
/(fstream|nativehost|queryseekspider)/i
// FStream/NativeHost/QuerySeekSpider
], [NAME, [TYPE, MEDIAPLAYER]], [
/(gstreamer) souphttpsrc.+libsoup\/([\w\.-]+)/i

View File

@ -1,4 +1,44 @@
[
{
"desc" : "360Spider",
"ua" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0); 360Spider",
"expect" :
{
"name" : "360Spider",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "AdsBot",
"ua" : "AdsBot-Google (+http://www.google.com/adsbot.html)",
"expect" :
{
"name" : "AdsBot-Google",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "AdsBot Mobile Web",
"ua" : "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; AdsBot-Google-Mobile; +http://www.google.com/mobile/adsbot.html)",
"expect" :
{
"name" : "AdsBot-Google-Mobile",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "AdSense",
"ua" : "Mediapartners-Google",
"expect" :
{
"name" : "Mediapartners-Google",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "AhrefsBot",
"ua" : "Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)",
@ -49,6 +89,16 @@
"type" : "crawler"
}
},
{
"desc" : "CCBot",
"ua" : "CCBot/1.0 (+https://commoncrawl.org/bot.html)",
"expect" :
{
"name" : "CCBot",
"version" : "1.0",
"type" : "crawler"
}
},
{
"desc" : "Coc Coc Bot (web)",
"ua" : "Mozilla/5.0 (compatible; coccocbot-web/1.0; +http://help.coccoc.com/searchengine)",
@ -79,6 +129,16 @@
"type" : "crawler"
}
},
{
"desc" : "DataForSEO",
"ua" : "Mozilla/5.0 (compatible; DataForSeoBot; +https://dataforseo.com/dataforseo-bot) ",
"expect" :
{
"name" : "DataForSeoBot",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "Dotbot",
"ua" : "Mozilla/5.0 (compatible; DotBot/1.2; +https://opensiteexplorer.org/dotbot; help@moz.com)",
@ -89,6 +149,16 @@
"type" : "crawler"
}
},
{
"desc" : "Exabot",
"ua" : "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)",
"expect" :
{
"name" : "Exabot",
"version" : "3.0",
"type" : "crawler"
}
},
{
"desc" : "FacebookBot",
"ua" : "Mozilla/5.0 (compatible; FacebookBot/1.0; +https://developers.facebook.com/docs/sharing/webmasters/facebookbot/",
@ -99,6 +169,26 @@
"type" : "crawler"
}
},
{
"desc" : "FacebookExternalHit",
"ua" : "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)",
"expect" :
{
"name" : "facebookexternalhit",
"version" : "1.1",
"type" : "crawler"
}
},
{
"desc" : "FacebookCatalog",
"ua" : "facebookcatalog/1.0",
"expect" :
{
"name" : "facebookcatalog",
"version" : "1.0",
"type" : "crawler"
}
},
{
"desc" : "Googlebot-Video",
"ua" : "Googlebot-Video/1.0",
@ -109,6 +199,106 @@
"type" : "crawler"
}
},
{
"desc" : "Googlebot",
"ua" : "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
"expect" :
{
"name" : "Googlebot",
"version" : "2.1",
"type" : "crawler"
}
},
{
"desc" : "Googlebot Image",
"ua" : "Googlebot-Image/1.0",
"expect" :
{
"name" : "Googlebot-Image",
"version" : "1.0",
"type" : "crawler"
}
},
{
"desc" : "Googlebot Video",
"ua" : "Googlebot-Video/1.0",
"expect" :
{
"name" : "Googlebot-Video",
"version" : "1.0",
"type" : "crawler"
}
},
{
"desc" : "Googlebot News",
"ua" : "Googlebot-News/1.0",
"expect" :
{
"name" : "Googlebot-News",
"version" : "1.0",
"type" : "crawler"
}
},
{
"desc" : "Google Storebot",
"ua" : "Storebot-Google/1.0",
"expect" :
{
"name" : "Storebot-Google",
"version" : "1.0",
"type" : "crawler"
}
},
{
"desc" : "Google InspectionTool",
"ua" : "Mozilla/5.0 (compatible; Google-InspectionTool/1.0;)",
"expect" :
{
"name" : "Google-InspectionTool",
"version" : "1.0",
"type" : "crawler"
}
},
{
"desc" : "GoogleOther",
"ua" : "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GoogleOther) Chrome/41.0.2272.96 Safari/537.36",
"expect" :
{
"name" : "GoogleOther",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "GoogleOther-Image",
"ua" : "GoogleOther-Image/1.0",
"expect" :
{
"name" : "GoogleOther-Image",
"version" : "1.0",
"type" : "crawler"
}
},
{
"desc" : "GoogleOther-Video",
"ua" : "GoogleOther-Video/1.0",
"expect" :
{
"name" : "GoogleOther-Video",
"version" : "1.0",
"type" : "crawler"
}
},
{
"desc" : "Google-Safety",
"ua" : "Google-Safety",
"expect" :
{
"name" : "Google-Safety",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "GPTBot",
"ua" : "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)",
@ -119,6 +309,36 @@
"type" : "crawler"
}
},
{
"desc" : "Archive.org Bot",
"ua" : "ia_archiver/8.1 (Windows 2000 1.9; en-US;)",
"expect" :
{
"name" : "ia_archiver",
"version" : "8.1",
"type" : "crawler"
}
},
{
"desc" : "Archive.org Bot",
"ua" : "Mozilla/5.0 (compatible; archive.org_bot/3.3.0 +https://archive.org/details/archive.org_bot)",
"expect" :
{
"name" : "archive.org_bot",
"version" : "3.3.0",
"type" : "crawler"
}
},
{
"desc" : "Meta-ExternalAgent",
"ua" : "meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)",
"expect" :
{
"name" : "meta-externalagent",
"version" : "1.1",
"type" : "crawler"
}
},
{
"desc" : "MJ12bot",
"ua" : "Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http://mj12bot.com/)",
@ -126,6 +346,17 @@
{
"name" : "MJ12bot",
"version" : "v1.4.8",
"major" : "1",
"type" : "crawler"
}
},
{
"desc" : "MojeekBot",
"ua" : "Mozilla/5.0 (compatible; MojeekBot/0.11; +https://www.mojeek.com/bot.html)",
"expect" :
{
"name" : "MojeekBot",
"version" : "0.11",
"type" : "crawler"
}
},
@ -139,6 +370,36 @@
"type" : "crawler"
}
},
{
"desc" : "PerplexityBot",
"ua" : "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)",
"expect" :
{
"name" : "PerplexityBot",
"version" : "1.0",
"type" : "crawler"
}
},
{
"desc" : "PetalBot",
"ua" : "Mozilla/5.0 (compatible;PetalBot; +https://webmaster.petalsearch.com/site/petalbot) ",
"expect" :
{
"name" : "PetalBot",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "PetalBot",
"ua" : "Mozilla/5.0 (Linux; Android 7.0;) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; PetalBot;+https://webmaster.petalsearch.com/site/petalbot) ",
"expect" :
{
"name" : "PetalBot",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "SemrushBot",
"ua" : "Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot.html)",
@ -149,6 +410,16 @@
"type" : "crawler"
}
},
{
"desc" : "TurnitinBot",
"ua" : "TurnitinBot (https://turnitin.com/robot/crawlerinfo.html)",
"expect" :
{
"name" : "TurnitinBot",
"version" : "undefined",
"type" : "crawler"
}
},
{
"desc" : "Yahoo! Japan",
"ua" : "Y!J-BRW/1.0 (https://www.yahoo-help.jp/app/answers/detail/p/595/a_id/42716)",
@ -168,5 +439,35 @@
"version" : "3.0",
"type" : "crawler"
}
},
{
"desc" : "Yeti",
"ua" : "Mozilla/5.0 (compatible; Yeti/1.1; +http://naver.me/spd)",
"expect" :
{
"name" : "Yeti",
"version" : "1.1",
"type" : "crawler"
}
},
{
"desc" : "YisouSpider",
"ua" : "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36",
"expect" :
{
"name" : "YisouSpider",
"version" : "5.0",
"type" : "crawler"
}
},
{
"desc" : "YisouSpider",
"ua" : "YisouSpider",
"expect" :
{
"name" : "YisouSpider",
"version" : "undefined",
"type" : "crawler"
}
}
]

View File

@ -29,6 +29,76 @@
"type" : "fetcher"
}
},
{
"desc" : "DuckAssistBot",
"ua" : "DuckAssistBot/1.2; (+http://duckduckgo.com/duckassistbot.html)",
"expect" :
{
"name" : "DuckAssistBot",
"version" : "1.2",
"type" : "fetcher"
}
},
{
"desc" : "Google FeedFetcher",
"ua" : "FeedFetcher-Google; (+http://www.google.com/feedfetcher.html)",
"expect" :
{
"name" : "FeedFetcher-Google",
"version" : "undefined",
"type" : "fetcher"
}
},
{
"desc" : "Google Read Aloud - Mobile agent",
"ua" : "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36 (compatible; Google-Read-Aloud; +https://support.google.com/webmasters/answer/1061943)",
"expect" :
{
"name" : "Google-Read-Aloud",
"version" : "undefined",
"type" : "fetcher"
}
},
{
"desc" : "Google Read Aloud - Desktop agent",
"ua" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36 (compatible; Google-Read-Aloud; +https://support.google.com/webmasters/answer/1061943)",
"expect" :
{
"name" : "Google-Read-Aloud",
"version" : "undefined",
"type" : "fetcher"
}
},
{
"desc" : "Google Publisher Center",
"ua" : "GoogleProducer; (+https://developers.google.com/search/docs/crawling-indexing/google-producer)",
"expect" :
{
"name" : "GoogleProducer",
"version" : "undefined",
"type" : "fetcher"
}
},
{
"desc" : "Google Site Verifier",
"ua" : "Mozilla/5.0 (compatible; Google-Site-Verification/1.0)",
"expect" :
{
"name" : "Google-Site-Verification",
"version" : "1.0",
"type" : "fetcher"
}
},
{
"desc" : "Meta-ExternalFetcher",
"ua" : "meta-externalfetcher/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)",
"expect" :
{
"name" : "meta-externalfetcher",
"version" : "1.1",
"type" : "fetcher"
}
},
{
"desc" : "Rogerbot",
"ua" : "Mozilla/5.0 (compatible; rogerBot/1.0; UrlCrawler; http://www.seomoz.org/dp/rogerbot)",