前言
在写爬虫爬取网页数据时,由于频繁访问会被限制 IP,返回错误响应码以及要求验证码验证;
这里简单实现:返回错误响应码时调用函数自动重连;
可以加上代理服务器,参数有注释;
代理服务可以自己搭建,也可以购买收费的强力代理服务。
简单封装几个函数,生成header,重写get,解析网页信息并保存。
之前写过搭建免费ip池实现动态代理:传送门
下面贴出代码:
# Third-party: BeautifulSoup (HTML parsing) and requests (HTTP client).
from bs4 import BeautifulSoup
import time
import random
import requests
import re
import sys
import importlib
# NOTE(review): duplicate `import sys` — harmless but redundant.
import sys
# NOTE(review): reload(sys) is a Python 2 legacy hack (it was used to call
# sys.setdefaultencoding); on Python 3 it has no useful effect and can be removed.
importlib.reload(sys)
class Common(object):
    """Crawler helper utilities.

    Provides: random request headers (``randHeader``), a retrying GET wrapper
    around ``requests`` with optional proxy support (``getURL``), and several
    BeautifulSoup extraction helpers (``getSoup``/``getText``/``getContent``/
    ``getReplace``/``getNumber``).
    """

    def randHeader(self):
        """Build a request-header dict with a randomly chosen User-Agent.

        :return: dict with 'Connection', 'Accept', 'Accept-Language',
                 'User-Agent' keys suitable for ``requests.get(headers=...)``.
        """
        head_connection = ['Keep-Alive', 'close']
        head_accept = ['text/html, application/xhtml+xml, */*',
                       'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8']
        head_accept_language = ['zh-CN,fr-FR;q=0.5',
                                'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
        head_user_agent = ['Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                           'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                           'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
                           'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                           'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                           'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
                           'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
                           ]
        # NOTE: a hard-coded session cookie previously lived here (commented
        # out) — removed, as committed cookies are a credential leak.
        return {
            'Connection': head_connection[0],
            'Accept': head_accept[0],
            'Accept-Language': head_accept_language[1],
            # random.choice is the idiomatic equivalent of
            # head_user_agent[random.randrange(0, len(head_user_agent))]
            'User-Agent': random.choice(head_user_agent),
        }

    def getCurrentTime(self, type=0):
        """Return the current local time in one of several formats.

        :param type: 0 -> '[YYYY-MM-DD HH:MM:SS]' (default, used for logging),
                     1 -> unix timestamp as int,
                     2 -> 'YYYY-MM-DD',
                     3 -> 'HH:MM:SS'
        :return: formatted string (or int for type 1); None for unknown type.
        """
        now = time.localtime(time.time())
        switcher = {
            0: time.strftime('[%Y-%m-%d %H:%M:%S]', now),
            1: int(time.time()),
            2: time.strftime('%Y-%m-%d', now),
            3: time.strftime('%H:%M:%S', now),
        }
        return switcher.get(type)

    def getURL(self, url, redirects=False, tries_num=6, sleep_time=0.5,
               time_out=1000, max_retry=6, isproxy=False):
        """GET ``url`` with a random header, retrying on any requests error.

        On each failure the sleep time and timeout are increased by 10 and the
        call recurses until ``tries_num`` is exhausted.

        :param url: target URL
        :param redirects: passed to ``allow_redirects``
        :param tries_num: remaining retry budget (decremented on recursion)
        :param sleep_time: seconds to sleep before a retry (grows per retry)
        :param time_out: connection timeout in seconds (grows per retry)
        :param max_retry: original retry budget, only used to log the attempt #
        :param isproxy: when True, route through the proxy configured below
        :return: a ``requests.Response`` on success, or None when all
                 retries are exhausted (callers must check for None).
        """
        # Proxy server settings — fill in to use, and pass isproxy=True.
        # (Previously this dict was built but never passed to requests — bug.)
        proxyHost = ""
        proxyPort = ""
        proxyUser = ""
        proxyPass = ""
        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }
        # proxies=None is equivalent to omitting the argument entirely.
        proxies = {"http": proxyMeta, "https": proxyMeta} if isproxy else None
        try:
            # The original created a requests.Session() here and immediately
            # overwrote it — dead code, removed.
            res = requests.get(url, headers=self.randHeader(),
                               allow_redirects=redirects, proxies=proxies,
                               timeout=time_out)
            res.raise_for_status()  # raise for non-2xx status codes
            return res
        except requests.RequestException as e:
            # Back off: longer sleep and timeout on every retry.
            sleep_time += 10
            time_out += 10
            tries_num -= 1
            if tries_num > 0:
                time.sleep(sleep_time)
                print(self.getCurrentTime(), url, 'URL Connection Error: 第',
                      max_retry - tries_num, u'次 Retry Connection', e)
                # BUG FIX: the original passed the *string* 'False' here,
                # which is truthy and silently re-enabled redirects on retry.
                return self.getURL(url, redirects, tries_num, sleep_time,
                                   time_out, max_retry, isproxy)
        return None

    def getSoup(self, response, htmlcode="html.parser", decode="UTF-8"):
        """Parse a requests response body into a BeautifulSoup tree.

        :param response: a ``requests.Response``
        :param htmlcode: parser name handed to BeautifulSoup
        :param decode: unused; kept for backward compatibility (``response.text``
                       is already decoded by requests)
        :return: BeautifulSoup object
        """
        return BeautifulSoup(response.text, htmlcode)

    def getReplace(self, str, old, new):
        """Regex-substitute pattern ``old`` with ``new`` inside ``str``.

        :param str: input text (param name kept for caller compatibility,
                    despite shadowing the builtin)
        :param old: regular-expression pattern to replace
        :param new: replacement text
        :return: substituted string, or "" when ``str`` is falsy.
        """
        if not str:
            return ""
        # re.sub compiles and caches the pattern internally; equivalent to
        # the original explicit re.compile(...).sub(...).
        return re.sub(old, new, str)

    def getText(self, str, type="text", num=0):
        """Extract stripped text from a bs4 node or result set.

        :param str: a bs4 tag (type="text") or a result set (any other type)
        :param type: "text" -> call get_text() on ``str`` directly;
                     anything else -> use element ``num`` of ``str``
        :param num: index into the result set for the non-"text" case
        :return: stripped text, or the placeholder '暂无数据' when ``str`` is
                 falsy or extraction fails (errors are logged, never raised).
        """
        ret = "暂无数据"
        if str:
            try:
                if type == "text":
                    ret = str.get_text().strip()
                else:
                    ret = str[num].get_text().strip()
            except Exception as e:
                print('getText', str, Exception, ":", e)
        return ret

    def getContent(self, strs, key, symbol=':', type=0):
        """Find the first element of ``strs`` whose text contains ``key``.

        :param strs: iterable of bs4 nodes (each supports get_text())
        :param key: substring to search for
        :param symbol: separator used to split "label: value" style text
        :param type: 0 -> return the part after ``symbol``; else whole text
        :return: matched text, or '暂无数据' when nothing matches.
        """
        for node in strs:
            try:
                text = node.get_text().strip()
                if key in text:
                    return text.split(symbol)[1] if type == 0 else text
            except Exception as e:
                # BUG FIX: was mislabelled 'getText' in the log output.
                print('getContent', node, Exception, ":", e)
        return "暂无数据"

    def getNumber(self, str, type="int"):
        """Extract the first number found in ``str``.

        :param str: input text (name kept for caller compatibility)
        :param type: "int" or "float" conversion of the first match
        :return: the parsed number; on no match or conversion failure the
                 original input is returned unchanged (legacy fallback that
                 callers may rely on — e.g. int("3.14") fails and falls back).
        """
        ret = str
        try:
            match = re.findall(r"\d+\.?\d*", str)
            if match:
                if type == "int":
                    ret = int(match[0])
                elif type == "float":
                    ret = float(match[0])
        except Exception as e:
            print('getNumber:', str, Exception, ":", e)
        return ret