Problems to solve:
1) Recognizing the CAPTCHA
2) Capturing the CAPTCHA image from the page
3) Building an IP pool
4) Handling the page's lazy loading
Step 1: CAPTCHA recognition

I first tried writing my own recognition algorithm, but it was too complex and I never got it working. Searching online, I found the "云打码" (YunDaMa) platform, which can do the recognition with a claimed accuracy of up to 98%. Below is the Python wrapper for its API:
```python
# coding=utf-8
import json
import time
import requests


class YDMHttp:
    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files=None):
        response = self.post_url(self.apiurl, fields, files)
        response = json.loads(response)
        return response

    def balance(self):
        data = {'method': 'balance', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if response:
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            else:
                return response['balance']
        else:
            return -9001

    def login(self):
        data = {'method': 'login', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if response:
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            else:
                return response['uid']
        else:
            return -9001

    def upload(self, filename, codetype, timeout):
        data = {'method': 'upload', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey,
                'codetype': str(codetype), 'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if response:
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            else:
                return response['cid']
        else:
            return -9001

    def result(self, cid):
        data = {'method': 'result', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        cid = self.upload(filename, codetype, timeout)
        if cid > 0:
            # Poll once per second until the result is ready or we time out
            for _ in range(0, timeout):
                result = self.result(cid)
                if result != '':
                    return cid, result
                else:
                    time.sleep(1)
            return -3003, ''
        else:
            return cid, ''

    def report(self, cid):
        data = {'method': 'report', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
        response = self.request(data)
        if response:
            return response['ret']
        else:
            return -9001

    def post_url(self, url, fields, files=None):
        files = files or {}
        # Open each file handle before posting the multipart request
        for key in files:
            files[key] = open(files[key], 'rb')
        res = requests.post(url, files=files, data=fields)
        return res.text


def use_ydm(filename):
    username = ''                  # username
    password = ''                  # password
    app_id = '****'                # software ID (placeholder)
    app_key = '*****************'  # software key (placeholder)
    code_type = 1004               # CAPTCHA type
    timeout = 60                   # timeout in seconds
    yundama = YDMHttp(username, password, app_id, app_key)  # initialize
    balance = yundama.balance()                             # check point balance
    print('Your point balance is {}'.format(balance))
    cid, result = yundama.decode(filename, code_type, timeout)  # start recognition
    print('Recognition result: {}'.format(result))
    return result


if __name__ == '__main__':
    use_ydm(r'F:\py_code\验证码\1.png')
```
Step 2: Capturing the CAPTCHA from the page

Why do I say "capture" rather than "crawl" here? If you fetched the CAPTCHA image with a crawler, you would effectively be sending a fresh request to the site, and since the CAPTCHA is generated randomly, the image you downloaded would no longer match the one on the current page.

The capture idea is therefore: take a screenshot of the whole login page first, then crop out the CAPTCHA based on where it sits in the page:
```python
import time
from PIL import Image


def jietu(browser):
    browser.save_screenshot('screenshot.png')  # full-page screenshot; now crop the region
    pic = Image.open('screenshot.png')
    element = browser.find_element_by_css_selector('#vcodeImg')  # locate the CAPTCHA image
    left = element.location['x']
    top = element.location['y']
    right = element.location['x'] + element.size['width']
    bottom = element.location['y'] + element.size['height']
    pic = pic.crop((left, top, right, bottom))  # crop to the CAPTCHA region
    # Enlarge the image, otherwise recognition fails; resize() returns a new image
    pic = pic.resize((140, 53), Image.BILINEAR)
    pic.save('1.png')
    time.sleep(1)
```
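Putting the two steps together, the login flow might look like the minimal sketch below. Only `#vcodeImg` comes from the code above; the login URL and the other locators (`username`, `password`, `#vcode`, `loginBtn`) are hypothetical placeholders you would replace with the real ones.

```python
# Hedged sketch: screenshot the CAPTCHA, recognize it, then log in.
# All locators except #vcodeImg are assumptions, not the site's real IDs.
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('http://example.com/login')  # placeholder login URL

jietu(browser)            # screenshot the page and crop the CAPTCHA to 1.png
code = use_ydm('1.png')   # send the cropped image to YunDaMa for recognition

browser.find_element_by_id('username').send_keys('your_username')
browser.find_element_by_id('password').send_keys('your_password')
browser.find_element_by_css_selector('#vcode').send_keys(code)  # CAPTCHA input (assumed selector)
browser.find_element_by_id('loginBtn').click()
```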
Step 3: Building an IP pool

This step is simple. When you crawl the same site repeatedly, it can recognize your IP and ban it, at which point you need other IPs to switch to.
```python
import urllib.request
from bs4 import BeautifulSoup
import socket


# Validating the proxies is fairly slow
def page(number, url):
    user_agent = 'IP'  # placeholder User-Agent string
    headers = {'User-agent': user_agent}
    temp = []
    for i in range(1, number):
        ipurl = url + str(i)  # page index, not the page count
        request = urllib.request.Request(ipurl, headers=headers)
        content = urllib.request.urlopen(request).read()
        bs = BeautifulSoup(content, 'html.parser')
        res = bs.find_all('tr')
        for item in res:
            try:
                tds = item.find_all('td')
                address = tds[1].text
                port = tds[2].text
                result = address + ':' + port
                temp.append(result)
                # print(temp)
            except IndexError:
                pass
    return temp


def IPpool(temp):
    socket.setdefaulttimeout(2)
    pool = []  # renamed so it does not shadow the function itself
    for row in temp:
        proxy_handler = urllib.request.ProxyHandler({"http": row})
        opener = urllib.request.build_opener(proxy_handler)
        urllib.request.install_opener(opener)
        try:
            # Keep only proxies that can actually fetch a page
            html = urllib.request.urlopen('http://www.baidu.com')
            pool.append(row)
        except Exception:
            continue
    print(pool)
    return pool


url = 'http://www.xicidaili.com/nn/'
temp = page(2, url)
IPpool(temp)
```
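Once the pool is built, later requests can rotate through it. A minimal sketch, assuming `pool` is the validated list returned by `IPpool()` above and the target URL is a placeholder:

```python
# Hedged sketch: pick a random validated proxy for each request.
import random
import urllib.request


def fetch_via_pool(target, pool):
    proxy = random.choice(pool)  # a random "ip:port" entry from the pool
    handler = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(handler)
    return opener.open(target, timeout=5).read()


# pool = IPpool(temp)
# html = fetch_via_pool('http://www.baidu.com', pool)
```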
Step 4: Handling lazy loading

When crawling this site you will run into incomplete results: you are scraping what is in the page source, but some content is not there yet. It is like QQ空间 (Qzone), where the page keeps loading more as you scroll down:

```python
browser.execute_script("window.scrollBy(0, 10000)")
time.sleep(3)
```

This snippet scrolls the page down; the distance is up to you. You must then sleep for a while, and how long depends heavily on your network speed. When a Selenium crawler fails to pick up content, a common cause is that your code runs fast while your connection cannot keep up, so the content simply has not loaded yet.
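Instead of guessing one fixed scroll distance, a more robust variant is to keep scrolling until the page height stops growing. A minimal sketch under that assumption:

```python
# Hedged sketch: scroll until document height stops changing,
# i.e. until lazy loading has (apparently) finished.
import time


def scroll_to_bottom(browser, pause=3):
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(pause)  # give new content time to load; depends on network speed
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # no new content appeared; assume the page is fully loaded
        last_height = new_height
```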