robots.txt:降低爬虫程序被网站的反爬虫机制封禁的风险
使用python自带robotparser
参考书:python网络爬虫实战 吕云翔 张扬
```RobotParser.py
import urllib.robotparser as urobot
import requests
import urllib
# --- Method 1: check robots.txt inline before fetching with requests ---
url = "https://www.taobao.com/"
rp = urobot.RobotFileParser()
# `url` already ends with "/"; strip it so we don't build ".../​/robots.txt".
rp.set_url(url.rstrip("/") + "/robots.txt")
rp.read()  # network I/O: downloads and parses the site's robots.txt
user_agent = 'Googlebot'
# Only fetch the target URL if robots.txt allows this user agent.
if rp.can_fetch(user_agent, 'https://www.taobao.com/item/'):
    site = requests.get(url)
    print("seem good")
else:
    print("cannot scrape because robots.txt banned you!")
# Method 2: wrap the robots.txt check in a reusable function.
def url_robots(url, newurl, user_agent):
    """Fetch *newurl* only if the site's robots.txt allows *user_agent*.

    Args:
        url: site root, e.g. "https://www.taobao.com/" (trailing "/" ok).
        newurl: the concrete URL we want to crawl.
        user_agent: user-agent string matched against the robots.txt rules.

    Returns:
        True if robots.txt permits the fetch (and it was performed),
        False otherwise. (Original returned None; True/False is
        backward-compatible for callers that ignored the result.)
    """
    rp = urobot.RobotFileParser()
    # Strip a trailing "/" so we never build ".../​/robots.txt".
    rp.set_url(url.rstrip("/") + "/robots.txt")
    rp.read()  # network I/O: download + parse robots.txt
    if rp.can_fetch(user_agent, newurl):
        # NOTE(review): the response object is never read or closed here —
        # this only proves the URL is reachable.
        urllib.request.urlopen(newurl)
        print("seem good")
        return True
    print("cannot scrape because robots.txt banned you!")
    return False
# Method 2 driver: same site/agent as method 1, but via the helper.
url="https://www.taobao.com/"
user_agent ='Googlebot'
newurl='https://www.taobao.com/item/'
# `test` only records that the check ran (the helper prints its verdict).
test=url_robots(url,newurl,user_agent)
```
运行结果:两种方法均输出 "seem good"(robots.txt 允许该 user-agent 抓取)。