使用selenium模拟浏览器爬取,详情页使用requests请求爬取
关键点:
- 数据内容是嵌入子框架iframe的,要switch_to 子框架里面
- 请求详情页的url分为两种,一种直接用URLID构造,另一种要提交参数dbname,dbcode,filename
- 详情页有一个更多的按钮要点击才会展示全文
- 当翻页次数过多会跳出英文数字验证码,开始是一次,后面会不断跳验证码,需要分析两种情况的截图
- 当翻页次数太多时,它干脆就不生成数据了,估计是ip问题(未解决)
关于中途跳验证码问题:
由于验证码的url是每次访问都会变的,所以不能直接拿这个url来下载验证码
- 进行全屏截图,并保存
- 用selenium获取验证码元素的css的长和宽,使用ps或者fw等处理图片的软件,量出验证码上边到全屏截图最上面的距离top,验证码左边到全屏截图最左边的距离left,和验证码在全屏截图中的size
- 用selenium获取到的css的长和宽 和 验证码在全屏截图中的长和宽 取商,得到他们的比例系数,根据这个系数乘以前面得到的 top 和 left 并根据 top 和 left 构造 right 和 bottom
- 交给打码平台
- 对于连续跳验证码,可能会出现验证码识别错误的情况,这种情况下页面中会多出一段文字,导致验证码截图位置不对;这种情况要分开判断:多截图一次,对 top 加以调整,再截图就可以了
from selenium import webdriver
from PIL import Image
import requests
from hashlib import md5
import time
from lxml import etree
import re
import csv
import codecs
import sys
class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    The constructor stores the credentials that are sent with every request;
    `PostPic` uploads an image for recognition and `ReportError` reports a
    wrongly recognised image by its ID.
    """

    UPLOAD_URL = 'http://upload.chaojiying.net/Upload/Processing.php'
    REPORT_URL = 'http://upload.chaojiying.net/Upload/ReportError.php'

    def __init__(self, username, password, soft_id, timeout=60):
        """Store the credentials and the per-request settings.

        username: account name, sent as 'user'.
        password: plain-text password; only its MD5 hex digest is kept and
            sent (as 'pass2').
        soft_id: software ID, sent as 'softid'.
        timeout: seconds passed to `requests.post` as `timeout`, so a stalled
            service raises instead of blocking forever. Pass None to wait
            indefinitely.
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload an image for recognition and return the decoded JSON reply.

        im: image bytes.
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {'codetype': codetype}
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(self.UPLOAD_URL, data=params, files=files,
                          headers=self.headers, timeout=self.timeout)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised image and return the decoded JSON reply.

        im_id: ID of the image whose answer was wrong.
        """
        params = {'id': im_id}
        params.update(self.base_params)
        r = requests.post(self.REPORT_URL, data=params,
                          headers=self.headers, timeout=self.timeout)
        return r.json()
def get_check_code(first):
    """Crop the captcha out of a saved full-page screenshot and get it solved.

    first: True  -> read 'screenshot.png' and use the default vertical offset
                    (first captcha attempt, page layout unchanged).
           False -> read 'screenshot1.png' and use a larger vertical offset,
                    because after a wrong answer the page shows an extra
                    error message that pushes the captcha image down.

    The cropped image is written to 'check_code_img.png', uploaded through
    `Chaojiying_Client.PostPic` with code type 1902, and the 'pic_str' field
    of the reply is returned (None when the reply has no such field).
    """
    # Offsets and sizes were measured by hand in an image editor and are
    # converted to screenshot pixels with this empirically tuned factor.
    # For reference, selenium reported the captcha element at
    # (470, 47)-(533, 69), i.e. 63 x 22, which was measured as 2.29 x 0.81.
    ratio = 28.25
    left = 31.4 * ratio
    top = (5.36 if first else 6.65) * ratio
    right = left + 2.29 * ratio
    bottom = top + 0.81 * ratio

    screenshot_path = 'screenshot.png' if first else 'screenshot1.png'
    # Context managers make sure both image files are closed again.
    with Image.open(screenshot_path) as screenshot:
        screenshot.crop((left, top, right, bottom)).save('check_code_img.png')

    # Replace the placeholders with a real account, password and software ID
    # (generated in the Chaojiying user centre).
    chaojiying = Chaojiying_Client('超级鹰账号', '密码', '软件ID')
    with open('check_code_img.png', 'rb') as image_file:
        image_bytes = image_file.read()

    info = chaojiying.PostPic(image_bytes, 1902)
    print(info)
    pic_str = info.get('pic_str')
    print(pic_str)
    return pic_str
# Trailing "更多" / "还原" (expand / collapse) link text that follows the abstract.
_MORE_SUFFIX = re.compile(r'\s*更多\s*还原\s*$')


def _strip_label(text, label):
    """Drop a leading `label` and its ASCII or full-width colon from `text`."""
    if text.startswith(label):
        text = text[len(label):]
    return text.lstrip(':: \t\r\n').strip()


def request_get_detail(url):
    """Download one article detail page and extract its metadata.

    url: address of the detail page.

    Returns the tuple (cn_, fundation, keywords, work_place, abstract):
      cn_        '是' if the page contains '中文核心期刊', otherwise '否'
      fundation  text of the paragraph starting with '基金', or None
      keywords   text of the paragraph starting with '关键词', or None
      work_place first organisation link text in the title block, or None
      abstract   text of the paragraph starting with '摘要', or None
    Returns the string 'error' when the page cannot be fetched, is not
    answered with HTTP 200, cannot be decoded, or is empty.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
    }
    try:
        # `headers` must be passed by keyword: the second positional argument
        # of requests.get is `params` (the query string).
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            return 'error'
        html = response.content.decode()
    except (requests.RequestException, UnicodeDecodeError):
        return 'error'
    if not html.strip():
        return 'error'

    cn_ = '是' if '中文核心期刊' in html else '否'

    selector = etree.HTML(html)
    if selector is None:
        return 'error'

    orgs = selector.xpath("//div[@class='wxTitle']/div[@class='orgn']/span/a/text()")
    work_place = orgs[0] if orgs else None

    abstract = None
    fundation = None
    keywords = None
    for p in selector.xpath('//div[@class="wxBaseinfo"]//p'):
        text = p.xpath('string(.)').strip()
        # Remove labels as prefixes / suffixes; str.strip(chars) would strip
        # a *set of characters* and could eat real content at either end.
        if text.startswith('摘要'):
            abstract = _strip_label(_MORE_SUFFIX.sub('', text), '摘要')
        elif text.startswith('关键词'):
            keywords = _strip_label(text, '关键词')
        elif text.startswith('基金'):
            fundation = _strip_label(text, '基金')
    return (cn_, fundation, keywords, work_place, abstract)
def get_parm(href):
    """Build the detail-page URL for a search-result link.

    href: the link of a result row, e.g.
        /kns/detail/detail.aspx?QueryID=4&CurRec=1&recid=&FileName=ZDJY20191101006
        &DbName=CAPJLAST&DbCode=CJFQ&yx=Y&pr=&URLID=11.3792.G4.20191101.1204.014&bsm=

    Two cases:
      * the link carries a non-empty URLID ->
            https://kns.cnki.net/KCMS/detail/<URLID>.html
      * otherwise ->
            https://kns.cnki.net/KCMS/detail/detail.aspx?dbcode=..&dbname=..&filename=..
        built from the link's DbCode, DbName and FileName parameters
        (a missing parameter becomes an empty value).

    Parameter names are matched case-insensitively and may appear anywhere in
    the query string.
    """
    from urllib.parse import parse_qsl, quote, urlencode, urlsplit

    query = urlsplit(href or '').query
    # parse_qsl drops blank values, so an empty "URLID=" counts as absent.
    params = {name.lower(): value for name, value in parse_qsl(query)}

    url_id = params.get('urlid')
    if url_id:
        return 'https://kns.cnki.net/KCMS/detail/{}.html'.format(quote(url_id))

    detail_query = urlencode({
        'dbcode': params.get('dbcode', ''),
        'dbname': params.get('dbname', ''),
        'filename': params.get('filename', ''),
    })
    return 'https://kns.cnki.net/KCMS/detail/detail.aspx?' + detail_query
# Locator-based lookups (`find_element(By.X, ...)`) and `switch_to.frame`
# are available in both Selenium 3 and Selenium 4; the older
# `find_element_by_*` / `switch_to_frame` helpers were removed in Selenium 4.
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By


def _find_captcha_input(driver):
    """Return the captcha text box if the current page shows one, else None."""
    driver.implicitly_wait(3)
    try:
        return driver.find_element(By.ID, 'CheckCode')
    except WebDriverException:
        return None


def _solve_captchas(driver):
    """Keep submitting captcha answers until the captcha box is gone."""
    captcha_input = _find_captcha_input(driver)
    if captcha_input is None:
        print('未出现验证码')
        return
    attempts = 0
    while captcha_input is not None:
        attempts += 1
        print('出现验证码')
        time.sleep(3)
        # Raises if the captcha image itself is missing from the page.
        driver.find_element(By.ID, 'CheckCodeImg')
        # Scroll to the top so the captcha sits at the position that
        # get_check_code expects inside the screenshot.
        driver.execute_script('var action=document.documentElement.scrollTop=0')
        # After a wrong answer the page shows an extra message that shifts the
        # captcha, so retries use a second screenshot file and a different
        # crop offset (see get_check_code).
        is_first_attempt = attempts == 1
        driver.save_screenshot('screenshot.png' if is_first_attempt else 'screenshot1.png')
        time.sleep(1)
        code_str = get_check_code(first=is_first_attempt)
        print(f'验证码为{code_str}')
        captcha_input.clear()
        captcha_input.send_keys(code_str)
        time.sleep(1)
        submit_button = driver.find_element(By.CSS_SELECTOR, 'input[value=提交]')
        time.sleep(1)
        print(submit_button)
        submit_button.click()
        time.sleep(2)
        captcha_input = _find_captcha_input(driver)
        if captcha_input is None:
            time.sleep(2)


def _scrape_result_rows(driver, writer):
    """Write one CSV row per entry of the result table on the current page."""
    # The first <tr> is the table header.
    rows = driver.find_elements(
        By.XPATH, '//table[@class="GridTableContent"]/tbody/tr')[1:]
    for row in rows:
        # Fields available directly in the result list.
        link = row.find_element(By.XPATH, 'td[2]/a')
        href = link.get_attribute('href')
        title = link.text
        auctor = row.find_element(By.XPATH, 'td[3]').text.strip()
        orgin = row.find_element(By.XPATH, 'td[4]').text.strip()
        release_time = row.find_element(By.XPATH, 'td[5]').text.strip()
        flag = row.find_element(By.XPATH, 'td[6]').text.strip()
        real_orgin = None
        if flag not in ('硕士', '博士'):
            # Anything that is not a master's / doctoral thesis is recorded
            # as a journal article, and only then is the source column kept.
            flag = '期刊'
            real_orgin = orgin
        url = get_parm(href)
        # Fields that are only available on the detail page.
        detail = request_get_detail(url)
        if detail == 'error':
            detail = [None] * 5
        writer.writerow([title, auctor, flag, real_orgin, release_time, *detail, url])


content = input('请输入想要搜索的关键词')
# 'utf-8-sig' writes the BOM only when writing starts at offset 0, so
# re-running with the same keyword appends rows without a second BOM.
with open('{}.csv'.format(content), 'a', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    if f.tell() == 0:
        # Header row only for a new (empty) file.
        writer.writerow([
            '文章名字', '作者', '文献类别', '期刊', '发表时间', '是否中文核心', '基金来源', '关键字', '单位', '摘要', '网址'
        ])

    # To run without a visible window, create webdriver.ChromeOptions(), add
    # the '--headless' and '--disable-gpu' arguments and pass it to Chrome().
    driver = webdriver.Chrome()
    try:
        driver.get("https://www.cnki.net/")
        driver.maximize_window()
        driver.implicitly_wait(10)
        search_box = driver.find_element(By.ID, 'txt_SearchText')
        search_box.clear()
        search_box.send_keys(content)
        driver.find_element(By.CLASS_NAME, 'search-btn').click()

        # The result list lives inside an iframe.
        driver.switch_to.frame(driver.find_element(By.ID, 'iframeResult'))
        driver.find_element(By.LINK_TEXT, '相关度').click()
        time.sleep(3)
        # Show 50 results per page.
        driver.find_element(By.LINK_TEXT, '50').click()
        time.sleep(3)

        while True:
            _solve_captchas(driver)
            _scrape_result_rows(driver, writer)
            try:
                next_page = driver.find_element(By.PARTIAL_LINK_TEXT, '下一页')
            except WebDriverException:
                # No "next page" link: keep a screenshot of the final state.
                driver.save_screenshot('exit_screemshot.png')
                break
            next_page.click()
    finally:
        # Always release the browser, even when scraping fails half-way.
        driver.quit()