import urllib.request as ur
from urllib.error import URLError, ContentTooShortError, HTTPError
import itertools
def download(url, num_retries=2, user_agent='wswp', charset='utf-8'):
    print('Downloading:', url)
    request = ur.Request(url)
    # Add request headers (typically 'Cookie' and 'User-Agent').
    # The default User-Agent is wswp (Web Scraping with Python).
    request.add_header('User-Agent', user_agent)
    # Use try...except to handle errors outside our control:
    try:
        resp = ur.urlopen(request)
        # headers.get_content_charset() reads the charset declared in the HTTP response.
        cs = resp.headers.get_content_charset()
        # Fall back to the default charset (utf-8) if none is declared.
        if not cs:
            cs = charset
        # decode() turns the raw bytes into text using that encoding.
        html = resp.read().decode(cs)
    except (URLError, ContentTooShortError, HTTPError) as e:
        # e.reason describes why the request failed.
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            # hasattr(object, name) checks whether the object has the given attribute.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # Broadly, 4xx errors come from the request itself and 5xx errors
                # from the server, so retry only on 5xx, at most num_retries times.
                return download(url, num_retries - 1, user_agent, charset)
    return html
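download() is already usable on its own. A quick check (a minimal sketch; http://example.com is just a placeholder, substitute any page you are allowed to fetch):

# Fetch one page; download() returns the decoded HTML, or None on failure.
page = download('http://example.com')
if page is not None:
    print('Fetched', len(page), 'characters')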
def crawl_sitemap(url, max_errors=5):
    num_errors = 0
    # itertools provides "infinite" iterators: itertools.count(1) yields 1, 2, 3, ...
    # and itertools.cycle('ABC') yields A, B, C, A, B, C, ...
    for pg in itertools.count(1):
        pg_url = '{}{}'.format(url, pg)
        html = download(pg_url)
        if html is None:
            # Count consecutive failures so that gaps in the ID sequence
            # don't end the crawl prematurely; stop after max_errors in a row.
            num_errors += 1
            if num_errors == max_errors:
                break
        else:
            num_errors = 0
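With both functions in place, ID traversal just means appending 1, 2, 3, ... to a base URL until max_errors consecutive pages fail. A minimal usage sketch (the base URL is hypothetical; replace it with the ID-based URL pattern of the site you are crawling):

# Pages are assumed to live at .../view/1, .../view/2, ... (hypothetical pattern).
crawl_sitemap('http://example.com/view/', max_errors=5)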