file.close()
‘’’
if self.cache:
#如果有缓存方式,缓存网页
self.cache[url] = result
print(url,“页面下载完成”)
return result[“html”]
def download(self, url, headers, proxy, num_retries, data=None):
    """Download one page and return its HTML together with the HTTP status code.

    Parameters:
        url: address of the page to fetch.
        headers: dict of HTTP request headers (may be None; treated as empty).
        proxy: optional proxy mapping for urllib's ProxyHandler; None to skip.
        num_retries: how many times to retry when the server answers 5xx.
        data: optional POST payload; the request is a GET when None.

    Returns:
        dict with keys "html" (page text, "" on failure) and "code"
        (int status code, or None when no HTTP code is available).
    """
    # Build the request; fall back to an empty header dict.
    request = urllib.request.Request(url, data, headers or {})
    # NOTE(review): hard-coded session cookie carried over from the original
    # code -- almost certainly stale; confirm whether it is still needed.
    request.add_header("Cookie", "finger=7360d3c2; UM_distinctid=15c59703db998-0f42b4b61afaa1-5393662-100200-15c59703dbcc1d; pgv_pvi=653650944; fts=1496149148; sid=bgsv74pg; buvid3=56812A21-4322-4C70-BF18-E6D646EA78694004infoc; CNZZDATA2724999=cnzz_eid%3D214248390-1496147515-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1496805293")
    request.add_header("Upgrade-Insecure-Requests", "1")
    opener = self.opener or urllib.request.build_opener()
    if proxy:
        # Route the request through the supplied proxy.
        opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxy))
    try:
        # Fetch the page.
        response = opener.open(request)
        print("code是", response.code)
        html = response.read().decode()
        code = response.code
    except Exception as e:
        print("下载出现错误", str(e))
        html = ""
        if hasattr(e, "code"):
            code = e.code
            # 5xx means a server-side error: retry up to num_retries times.
            # (Was `500 < code < 600`, which wrongly skipped HTTP 500 itself.)
            if num_retries > 0 and 500 <= code < 600:
                return self.download(url, headers, proxy, num_retries - 1, data)
        else:
            # No HTTP status available (e.g. DNS failure, unknown scheme).
            code = None
    return {"html": html, "code": code}
class Throttle:
‘’’
按照延时,请求,代理IP等下载网页,处理网页中的link的类
‘’’
def __init__(self, delay):
self