I spent the whole morning today looking into how to keep a crawler's IP from getting banned.
The key pieces are Selenium's Proxy and ProxyType.
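The core pattern is small: wrap the proxy address in a Proxy object marked ProxyType.MANUAL, merge it into the browser's DesiredCapabilities, and pass those when creating the driver. A minimal sketch of just that part (assuming Selenium 3.x with Chrome; the address below is only a placeholder taken from my proxy pool):

from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# placeholder proxy address; replace with one from your own pool
proxy = Proxy({'proxyType': ProxyType.MANUAL, 'httpProxy': '10.6.198.114:808'})
caps = DesiredCapabilities.CHROME.copy()
proxy.add_to_capabilities(caps)                        # fold the proxy settings into the capabilities
driver = webdriver.Chrome(desired_capabilities=caps)   # Selenium 3.x style keyword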
Here is the full script:
#-*- coding:utf-8 -*-
# Filename: 最终版搜狗数据 (final Sogou data scraper)
# Author: Guan
# Datetime: 2018/12/8
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium import webdriver
import random
import requests
from bs4 import BeautifulSoup
import time
import re
import happybase
from hashlib import md5
import datetime
# # HBase connection settings (disabled for now)
# HBASE_HOST = '10.8.23.6'
# HBASE_PORT = 9090
# HBASE_TABLE = 'bt_t77_sougou'
#
# host = HBASE_HOST
# port = HBASE_PORT
# table_name = HBASE_TABLE
#
# connection = happybase.Connection(host=host, port=port)
# Build a Chrome driver that goes through a randomly chosen proxy
def get_ip():
    # Pool of candidate proxy addresses
    proxie = ['10.6.198.114:808', '10.6.198.115:808', '10.6.198.117:808', '10.6.198.118:808', '10.6.198.172:808']
    # Keep trying until a driver starts successfully
    while True:
        # Pick a proxy at random
        ip = random.choice(proxie)
        print("proxy IP:", ip)
        # Manual proxy type
        print("proxy type:", ProxyType.MANUAL)
        try:
            # Build a Proxy object
            proxy = Proxy({
                # proxy mode
                'proxyType': ProxyType.MANUAL,
                # HTTP proxy (the chosen address)
                'httpProxy': ip
            })
            # Start from Chrome's default capabilities
            desired_capabilities = DesiredCapabilities.CHROME.copy()
            # Fold the proxy settings into the capabilities
            proxy.add_to_capabilities(desired_capabilities)
            # Launch a browser instance that uses the proxy
            driver1 = webdriver.Chrome(
                desired_capabilities=desired_capabilities
            )
            return driver1
        except Exception as e:
            # This proxy could not start the browser; loop and try the next one
            print("proxy IP request failed: " + ip)
def get_url(url):
    return url
# Fetch the page HTML with requests, sending browser-like headers
def get_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
        "Cookie": "CXID=2C7D3DCAAA31333F0CA6B9F1D42B448E; SUID=07E1EB7C5B68860A5BEA43270009A690; ad=oujQFyllll2bf6GXlllllVs$yDolllllKnsPxZllllylllllRv7ll5@@@@@@@@@@; ABTEST=7|1543394206|v17; SUV=1543394206395727; browerV=3; osV=1; pgv_pvi=6458014720; SUIR=A2444FD8A4A1D8F49FA7A706A50AC1B8; sst0=544; sct=219; PHPSESSID=psls2mcv8gs1r4h3grcnpct066; UM_distinctid=1678c33873e0-05c303d0d58b1e-75133b4f-144000-1678c338740405; CNZZDATA1271442956=429218166-1544242285-%7C1544242285; Hm_lvt_f5df380d5163c1cc4823c8d33ec5fa49=1544100309,1544245446; Hm_lpvt_f5df380d5163c1cc4823c8d33ec5fa49=1544245770; IPLOC=CN1100; ld=tkllllllll2b@On0lllllVZazJZlllllKnsPxZlllxlllllljllll5@@@@@@@@@@; SNUID=FB2B97C1F9FF8687B5AEF045FAAAF705; seccodeRight=success; successCount=1|Sat, 08 Dec 2018 07:07:10 GMT"
    }
    response = requests.get(url=url, headers=headers).content.decode()
    return response
# Parse the result page
def get_cont(html):
    soup = BeautifulSoup(html, 'lxml')
    # Result layout 1
    regulation1 = soup.select('div[class="rb"]')
    reg_list = []
    # Relies on the module-level `url` set in the main loop
    cx_url = get_url(url)
    for i in regulation1:
        reg_cont = {}
        # Pull the model name back out of the query URL
        reg_cont['chexing'] = re.sub('&from', '', (re.split('=', cx_url, maxsplit=2))[1])
        reg_cont['title'] = i.find_all('h3')[0].text.strip()
        reg_cont['cont'] = i.select('div[class="ft"]')[0].text.strip()
        reg_cont['source'] = re.split('-', (i.find_all('cite')[0].text.strip()), maxsplit=1)[0]
        try:
            # Strip the '翻译此页' ("translate this page") suffix Sogou appends
            reg_cont['pub_date'] = re.sub('翻译此页', '', (re.split('-', (re.split('-', (i.find_all('cite')[0].text.strip()), maxsplit=1)[1]), maxsplit=1)[1]))
        except Exception:
            reg_cont['pub_date'] = re.sub('翻译此页', '', (re.split('-', (i.find_all('cite')[0].text.strip()), maxsplit=1)[1]))
        reg_list.append(reg_cont)
    # Result layout 2
    regulation2 = soup.select('div[style="width:548px"]')
    reg_list2 = []
    for j in regulation2:
        reg_cont2 = {}
        reg_cont2['chexing'] = re.sub('&from', '', (re.split('=', cx_url, maxsplit=2))[1])
        reg_cont2['title'] = j.find_all('h3')[0].text.strip()
        reg_cont2['cont'] = j.find_all('p')[0].text.strip()
        reg_cont2['source'] = re.split('-', j.find_all('cite')[0].text.strip(), maxsplit=1)[0]
        try:
            reg_cont2['pub_date'] = re.split('-', (re.split('-', j.find_all('cite')[0].text.strip(), maxsplit=1)[1]), maxsplit=1)[1].strip()
        except Exception:
            reg_cont2['pub_date'] = re.split('-', j.find_all('cite')[0].text.strip(), maxsplit=1)[1].strip()
        reg_list2.append(reg_cont2)
    # Merge the results of both layouts
    news = reg_list + reg_list2
    # Debug output:
    for new in news:
        print(new)
    # Write to HBase (disabled for now)
    # for new in news:
    #     # print(new)
    #     rowkey = datetime.datetime.now().strftime('%Y%m%d')
    #     connection.open()
    #     table = connection.table(table_name)
    #     table.put(md5(rowkey.encode('utf-8')).hexdigest(),
    #               {
    #                   'cf1:chexing': new['chexing'],
    #                   'cf1:title': new['title'],
    #                   'cf1:cont': new['cont'],
    #                   'cf1:source': new['source'],
    #                   'cf1:pub_date': new['pub_date'],
    #               }
    #               )
    #     connection.close()
if __name__ == '__main__':
    # Read the list of car models from the config file
    file = open(r'D:\公司文件\.PyCharmCE2018.2\config\scratches\拓展\搜狗数据\搜狗配置车型', encoding='utf-8')
    cont = file.readlines()
    chexing1 = []
    for i in cont:
        new_chexing = i.split(',')
        for j in new_chexing:
            chexing1.append(j)
    print(chexing1)
    for cx in chexing1:
        url = 'https://www.sogou.com/web?query=%s&from=index-nologin&sugsuv=1543394206395727&tsn=1' % cx
        # Open the search page through a proxied browser
        driver = get_ip()
        driver.get(url)
        get_url(url)
        # Fetch and parse the first result page
        html = get_html(url)
        get_cont(html)
        while True:
            time.sleep(1)
            driver.implicitly_wait(3)
            try:
                # Click the "next page" button
                driver.find_element_by_xpath('//*[@id="sogou_next"]').click()
                # Fetch and parse the page we just navigated to
                url = driver.current_url
                get_url(url)
                html = get_html(url)
                get_cont(html)
            except Exception:
                # No "next page" button left: move on to the next model
                print('next model')
                break
        driver.quit()
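One thing worth noting about the script above: only the Selenium driver goes through the proxy, while get_html() re-fetches every page with requests, which talks to Sogou directly. If those requests should also come from the proxy pool, requests accepts a proxies dict. A minimal sketch of a proxied variant (get_html_proxied is a hypothetical helper I'm adding here; whether these internal proxies also tunnel HTTPS is an assumption):

import random
import requests

def get_html_proxied(url, proxy_pool):
    # pick an address from the same pool used in get_ip()
    ip = random.choice(proxy_pool)
    # route both schemes through the proxy; HTTPS tunneling support is assumed
    proxies = {'http': 'http://' + ip, 'https': 'http://' + ip}
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36"}
    return requests.get(url, headers=headers, proxies=proxies, timeout=10).content.decode()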