处理请求头文件 agent_deny.conf(文件名需与下文 include 的 agent_deny.conf 保持一致)
# Block common crawlers based in mainland China (enable/disable as needed).
# "~*" makes the regex match case-insensitively, as a substring of the User-Agent.
# NOTE(review): the bare token "spider" already matches every UA containing
# "spider" (Baiduspider, Sogou spider, Sosospider, ...), so most explicit entries
# here are redundant and ANY UA containing "spider" is denied — confirm intended.
if ($http_user_agent ~* "qihoobot|Yahoo! Slurp China|Baiduspider|Baiduspider-image|spider|Sogou spider|Sogou web spider|Sogou inst spider|Sogou spider2|Sogou blog|Sogou News Spider|Sogou Orion spider|ChinasoSpider|Sosospider|YoudaoBot|yisouspider|EasouSpider|Tomato Bot|Scooter") {
return 403;
}
# Block common overseas crawlers (enable/disable as needed).
# "~*" matches case-insensitively as a substring of the User-Agent.
# NOTE(review): short tokens such as "Fast", "Slurp" and "Yandex" can match
# unintended user agents as substrings — verify against your traffic.
if ($http_user_agent ~* "Googlebot|Googlebot-Mobile|AdsBot-Google|Googlebot-Image|Mediapartners-Google|Adsbot-Google|Feedfetcher-Google|Yahoo! Slurp|MSNBot|Catall Spider|ArchitextSpider|AcoiRobot|Applebot|Bingbot|Discordbot|Twitterbot|facebookexternalhit|ia_archiver|LinkedInBot|Naverbot|Pinterestbot|seznambot|Slurp|teoma|TelegramBot|Yandex|Yeti|Infoseek|Lycos|Gulliver|Fast|Grabber") {
return 403;
}
# Block specific user agents, and requests with an empty/missing User-Agent.
# "~" (without "*") matches case-SENSITIVELY — that is why both "Java" and
# "java/" appear in the list. The trailing "^$" alternative matches when
# $http_user_agent is empty (header absent or blank).
# NOTE(review): "BOT/0.1" contains an unescaped "." which matches any character
# — in practice it still only catches the intended UAs; confirm if stricter
# matching is wanted.
if ($http_user_agent ~ "WinHttp|WebZIP|FetchURL|node-superagent|java/|Bytespider|FeedDemon|Jullo|JikeSpider|Indy Library|Alexa Toolbar|AskTbFXTV|AhrefsBot|CrawlDaddy|CoolpadWebkit|Java|Feedly|Apache-HttpAsyncClient|UniversalFeedParser|ApacheBench|Microsoft URL Control|Swiftbot|ZmEu|oBot|jaunty|Python-urllib|lightDeckReports Bot|YYSpider|DigExt|HttpClient|MJ12bot|heritrix|Ezooms|BOT/0.1|YandexBot|FlightDeckReports|Linguee Bot|iaskspider|^$") {
return 403;
}
# Reject every request whose method is not GET, HEAD or POST
# (e.g. PUT, DELETE, OPTIONS, TRACE are answered with 403).
if ($request_method !~ "^(GET|POST|HEAD)$") {
    return 403;
}
# Block scraping tools such as Scrapy (case-insensitive substring match).
# The commented-out variant below is a broader alternative that also blocks
# Python, Java, Wget, Curl, generic "Spider" UAs and Postman — enable with care.
#if ($http_user_agent ~* (Python|Java|Wget|Scrapy|Curl|HttpClient|Spider|PostmanRuntime)) {
if ($http_user_agent ~* (Scrapy|HttpClient)) {
return 403;
}
# Block a single IP address:
#deny 123.123.123.123
# Use with caution — blocks the entire /8 range (123.0.0.0 - 123.255.255.255):
#deny 123.0.0.0/8
# Use with caution — blocks the entire /16 range (123.123.0.0 - 123.123.255.255):
#deny 123.123.0.0/16
# Use with caution — blocks the entire /24 range (123.123.123.0 - 123.123.123.255):
#deny 123.123.123.0/24
# Usage: inside the server{} section of your site's xxx.conf, add:
#include agent_deny.conf;
nginx.conf引入agent_deny.conf
http {
#....
# Pull in the crawler-UA blocking rules defined in agent_deny.conf.
include agent_deny.conf;
#....
# Serve robots.txt inline; "Disallow: /" asks compliant crawlers to skip the site.
# NOTE(review): location{} is only valid inside a server{} context — the #....
# placeholders presumably elide the enclosing server block; confirm when copying.
location = /robots.txt {
# Fix: the original set "default_type text/html;" and then used add_header to
# append a second "Content-Type: text/plain; charset=UTF-8" header, so the
# response carried two conflicting Content-Type headers. Setting default_type
# to the intended type emits exactly one correct header.
default_type "text/plain; charset=UTF-8";
return 200 "User-agent: *\nDisallow: /";
}
}
nginx.conf 添加并发连接数与请求频率限制
http {
#....
# Rate limiting via nginx's leaky-bucket limiters.
# Shared-memory zone keyed by client IP for counting concurrent connections (10 MB).
limit_conn_zone $binary_remote_addr zone=conip:10m;
# Shared-memory zone keyed by client IP for request-rate limiting, 10 requests/second.
limit_req_zone $binary_remote_addr zone=reqip:10m rate=10r/s;
# NOTE(review): location{} is only valid inside a server{} context — the #....
# placeholders presumably elide the enclosing server block; confirm when copying.
location / {
# At most 2 concurrent connections per client IP.
limit_conn conip 2;
# Allow a burst of up to 20 requests; "nodelay" serves the burst without throttling it.
limit_req zone=reqip burst=20 nodelay;
# limit_rate 512k;# cap per-connection download rate
# ...
# #allow 192.168.1.0/24; # allow only this LAN range
# #deny all; # deny every other IP
# proxy_set_header Host $host;
# proxy_set_header X-Real-IP $remote_addr;
# proxy_set_header REMOTE-HOST $remote_addr;
# proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
#....
}
重启 nginx 服务后,校验是否拦截爬虫 UA(以下三条请求分别命中空 UA、指定 UA、境内爬虫规则,均应返回 403 Forbidden):
$ curl -I -A '' www.test.com
$ curl -X GET -I -A 'YYSpider' www.test.com
$ curl -X GET -I -A 'Baiduspider' www.test.com