Nginx配置HTTPS和反爬虫配置
deny_agent.config
# Block scraping tools: answer 403 when the User-Agent contains
# Scrapy, Curl or HttpClient (`~*` makes the match case-insensitive).
if ($http_user_agent ~* (Scrapy|Curl|HttpClient)) {
    return 403;
}
# Block the listed User-Agents as well as requests with an empty User-Agent.
# `~` is a case-sensitive match; the final `^$` alternative is what catches
# the empty User-Agent.
if ($http_user_agent ~ "Bytespider|FeedDemon|JikeSpider|Indy Library|Alexa Toolbar|AskTbFXTV|AhrefsBot|CrawlDaddy|CoolpadWebkit|Java|Feedly|UniversalFeedParser|ApacheBench|Microsoft URL Control|Swiftbot|ZmEu|oBot|jaunty|Python-urllib|lightDeckReports Bot|YYSpider|DigExt|YisouSpider|HttpClient|MJ12bot|heritrix|EasouSpider|Ezooms|^$") {
    return 403;
}
# Only GET, HEAD and POST are allowed; any other request method gets 403.
if ($request_method !~ ^(GET|HEAD|POST)$) {
    return 403;
}
nginx.conf
# User and group nginx runs as; leave it unset on Windows.
# user nobody;

# Number of worker processes (usually the CPU count, or twice the CPU count).
worker_processes 4;

events {
    # Maximum number of connections.
    worker_connections 1024;
}
http{
# Load-balancing notes:
#   round-robin - each request is handed to a different backend in turn,
#                 and a backend that goes down is dropped automatically.
#   ip_hash     - every visitor is pinned to one backend, which solves
#                 session-affinity problems.
#   weight      - polling weight (default 1); raise it for the servers
#                 that can take more traffic.
upstream web_servers {
    ip_hash;
    server 172.16.0.2:8081 weight=5;
}
# Plain-HTTP entry point: its only job is to send clients to HTTPS.
# Keeping it in its own server block means port 80 is never treated as a
# TLS socket and no `if` is needed to detect the scheme.
server {
    listen 80;
    server_name hlx.pub www.hlx.pub;

    # Anti-crawler rules; these `if` checks are listed before the redirect,
    # so blocked clients get 403 instead of being redirected.
    include deny_agent.config;

    # Force the jump to https.
    return 301 https://$server_name$request_uri;
}

# HTTPS site
server {
    # TLS is enabled with the `ssl` parameter of `listen`. The standalone
    # `ssl on;` directive is deprecated (nginx 1.15.0) and no longer exists
    # in nginx 1.25.1+, where it makes the configuration fail to load.
    listen 443 ssl;
    server_name hlx.pub www.hlx.pub;
    charset utf-8;

    # Anti-crawler rules
    include deny_agent.config;

    # Certificate and private key
    ssl_certificate cert/1_hlx.pub_bundle.crt;
    ssl_certificate_key cert/2_hlx.pub.key;
    ssl_session_timeout 5m;
    ssl_ciphers ECDHE-RSA-AES128-GCM-SHA256:ECDHE:ECDH:AES:HIGH:!NULL:!aNULL:!MD5:!ADH:!RC4;
    # TLSv1 and TLSv1.1 are deprecated (RFC 8996) and are no longer offered.
    # NOTE(review): append TLSv1.3 here if the installed nginx/OpenSSL
    # build supports it.
    ssl_protocols TLSv1.2;
    ssl_prefer_server_ciphers on;

    # A plain-HTTP request sent to the TLS port raises nginx error 497;
    # redirect such clients to the https URL.
    error_page 497 https://$server_name$request_uri;

    # Application
    location /video {
        # Forward to the backend pool under the same /video prefix.
        proxy_pass http://web_servers/video;
        proxy_redirect off;
        proxy_set_header Host $http_host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        # Upload size limit
        client_max_body_size 500m;
        client_body_buffer_size 128k;

        proxy_connect_timeout 60s;
        proxy_send_timeout 60s;
        proxy_read_timeout 60s;

        proxy_buffer_size 4k;
        proxy_buffers 4 32k;
        proxy_busy_buffers_size 64k;
        proxy_temp_file_write_size 64k;
    }
} # END OF server
}
测试
模拟宜搜蜘蛛的抓取:
curl -I -A 'YisouSpider' https://hlx.pub/video/api/test/
结果返回403
模拟UA为空的抓取:
curl -I -A '' https://hlx.pub/video/api/test/
结果返回403
模拟百度蜘蛛的抓取:
curl -I -A 'Baiduspider' https://hlx.pub/video/api/test/
结果返回200