知网spider

使用selenium模拟浏览器爬取,详情页使用requests请求爬取

关键点:

  • 数据内容是嵌入子框架iframe的,要switch_to 子框架里面
  • 请求详情页的url分为两种,一种直接用URLID构造,另一种要提交参数dbname,dbcode,filename
  • 详情页有一个更多的按钮要点击才会展示全文
  • 当翻页次数过多会跳出英文数字验证码,开始是一次,后面会不断跳验证码,需要分析两种情况的截图
  • 当翻页次数太多,它干脆就不生成数据了,估计是ip问题(未解决)

关于中途跳验证码问题:
由于验证码的url是每次访问都会变的,所以不能直接拿这个url来下载验证码

  1. 进行全屏截图,并保存
  2. 用selenium获取验证码元素的css的长和宽,使用ps或者fw等处理图片的软件,量出验证码上边到全屏截图最上面的距离top,验证码左边到全屏截图最左边的距离left,和验证码在全屏截图中的size
  3. 用selenium获取到的css的长和宽,除以验证码在全屏截图中量得的长和宽,得到两者的比例系数;用这个系数乘以前面量得的 top 和 left,再分别加上按同一系数换算后的宽和高,就得到 right 和 bottom
  4. 交给打码平台
  5. 对于连续跳验证码,可能会出现验证码识别错误的情况,这种情况页面中会多了段文字,导致验证码截图不对,这种情况分开判断,多截图一次,对 top 加以调整,再截图就可以了
import codecs
import csv
import re
import sys
import time
from hashlib import md5

import requests
from lxml import etree
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service."""

    _UPLOAD_HOST = 'http://upload.chaojiying.net/Upload/'

    def __init__(self, username, password, soft_id):
        """Remember the account details and build the per-request defaults.

        Only the hex MD5 digest of the UTF-8 encoded password is stored; it
        is what gets sent to the service in the ``pass2`` field.
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        # Fields included in the form data of every request.
        self.base_params = dict(
            user=self.username,
            pass2=self.password,
            softid=self.soft_id,
        )
        self.headers = dict([
            ('Connection', 'Keep-Alive'),
            ('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)'),
        ])

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON reply.

        im: raw image bytes.
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        form = {'codetype': codetype, **self.base_params}
        upload = {'userfile': ('ccc.jpg', im)}
        reply = requests.post(
            self._UPLOAD_HOST + 'Processing.php',
            data=form,
            files=upload,
            headers=self.headers,
        )
        return reply.json()

    def ReportError(self, im_id):
        """Report a wrongly solved captcha and return the decoded JSON reply.

        im_id: image ID of the captcha being reported.
        """
        form = {'id': im_id, **self.base_params}
        reply = requests.post(
            self._UPLOAD_HOST + 'ReportError.php',
            data=form,
            headers=self.headers,
        )
        return reply.json()


def get_check_code(first):
    """Crop the captcha out of the saved page screenshot and have it solved.

    Args:
        first: True for the first captcha shown on a page; the crop is taken
            from ``screenshot.png``. False after a wrong answer, when the
            page has gained an error notice that moves the captcha down; the
            crop is then taken from ``screenshot1.png`` with a larger top
            offset.

    Returns:
        The ``pic_str`` field of the solving service's JSON reply, or None
        when the reply has no such field.
    """
    # Hand-tuned factor converting the offsets/sizes measured in an image
    # editor into screenshot pixels (element size 63x22 <-> measured
    # 2.29x0.81).
    ratio = 28.25

    left = 31.4 * ratio
    top = (5.36 if first else 6.65) * ratio
    right = left + 2.29 * ratio
    bottom = top + 0.81 * ratio

    screenshot_path = 'screenshot.png' if first else 'screenshot1.png'
    # Context managers make sure both image files are closed again.
    with Image.open(screenshot_path) as screenshot:
        screenshot.crop((left, top, right, bottom)).save('check_code_img.png')

    # Placeholders: account name, password and the software ID generated in
    # the service's user centre.
    chaojiying = Chaojiying_Client('超级鹰账号', '密码', '软件ID')
    with open(r'check_code_img.png', 'rb') as image_file:
        image_bytes = image_file.read()

    info = chaojiying.PostPic(image_bytes, 1902)
    print(info)
    pic_str = info.get('pic_str')
    print(pic_str)
    return pic_str


def _strip_field_label(text, label):
    """Drop the leading ``label`` and any colon after it, then trim whitespace."""
    return text[len(label):].lstrip().lstrip('::').strip()


def _strip_trailing_markers(text, markers=('更多', '还原')):
    """Remove whole trailing occurrences of ``markers`` from ``text``.

    The markers are presumably the captions of the page's expand/collapse
    links that end up at the end of the abstract paragraph.
    """
    text = text.rstrip()
    trimmed = True
    while trimmed:
        trimmed = False
        for marker in markers:
            if text.endswith(marker):
                text = text[:-len(marker)].rstrip()
                trimmed = True
    return text


def request_get_detail(url, timeout=10):
    """Download an article detail page and extract its metadata.

    Args:
        url: address of the detail page.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A tuple ``(cn_, fundation, keywords, work_place, abstract)`` where
        ``cn_`` is '是' when the page mentions '中文核心期刊' and '否'
        otherwise, and every other item is a string or None when the page
        does not contain it. The string ``'error'`` is returned instead when
        the request fails, the status code is not 200, or the body cannot be
        decoded.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
    }
    try:
        # ``headers`` must be passed by keyword: the second positional
        # argument of requests.get is ``params`` (the query string).
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            return 'error'
        html = response.content.decode()
    except (requests.RequestException, UnicodeDecodeError):
        return 'error'

    cn_ = '是' if '中文核心期刊' in html else '否'

    selector = etree.HTML(html)
    work_place = selector.xpath("//div[@class='wxTitle']/div[@class='orgn']/span/a/text()")
    work_place = work_place[0] if work_place else None

    abstract = None
    fundation = None
    keywords = None
    for p in selector.xpath('//div[@class="wxBaseinfo"]//p'):
        text = p.xpath('string(.)').strip()
        # Labels are removed as whole prefixes; str.strip() with a string
        # argument removes any of its characters and could eat real content.
        if text.startswith('摘要'):
            abstract = _strip_field_label(_strip_trailing_markers(text), '摘要')
        elif text.startswith('关键词'):
            keywords = _strip_field_label(text, '关键词')
        elif text.startswith('基金'):
            fundation = _strip_field_label(text, '基金')
    return (cn_, fundation, keywords, work_place, abstract)

def get_parm(href):
    """Build the detail-page URL for a search-result link.

    Result links come in two flavours:

    * with a non-empty ``URLID`` query parameter, e.g.
      ``...&DbCode=CJFQ&yx=Y&pr=&URLID=11.3792.G4.20191101.1204.014&bsm=``,
      which maps to ``https://kns.cnki.net/KCMS/detail/<URLID>.html``;
    * without one (parameter empty or absent), which maps to
      ``https://kns.cnki.net/KCMS/detail/detail.aspx?dbcode=..&dbname=..&filename=..``
      built from the link's ``DbCode``, ``DbName`` and ``FileName``.

    Args:
        href: the ``href`` of a result link.

    Returns:
        The detail-page URL. A parameter missing from ``href`` contributes an
        empty value to the URL.
    """
    def query_value(name):
        # Match the parameter whether it follows '?' or '&' and whether or
        # not another parameter comes after it.
        match = re.search(r'[?&]' + re.escape(name) + r'=([^&]*)', href)
        return match.group(1) if match else ''

    url_id = query_value('URLID')
    if url_id:
        return 'https://kns.cnki.net/KCMS/detail/{}.html'.format(url_id)

    db_code = query_value('DbCode')
    db_name = query_value('DbName')
    file_name = query_value('FileName')
    return f'https://kns.cnki.net/KCMS/detail/detail.aspx?dbcode={db_code}&dbname={db_name}&filename={file_name}'

content = input('请输入想要搜索的关键词')

# Results are appended to "<keyword>.csv". The file is opened once; it is
# closed at the end of the script.
f = open('{}.csv'.format(content), 'a+', newline='', encoding='utf8')
writer = csv.writer(f)
# A stream opened for appending starts at the end of the file, so offset 0
# means the file is new or empty. Only then write the UTF-8 BOM and the
# header row; repeating them on later runs would put a stray BOM and a
# duplicate header in the middle of the data.
if f.tell() == 0:
    f.write(codecs.BOM_UTF8.decode('utf8'))
    # Columns: title, author, document type, journal, publication date,
    # core-journal flag, funding, keywords, affiliation, abstract, url
    writer.writerow([
        '文章名字', '作者', '文献类别', '期刊', '发表时间', '是否中文核心', '基金来源', '关键字', '单位', '摘要', '网址'
    ])

# To run without a visible window, build the driver with headless options
# instead. NOTE(review): the captcha crop offsets in get_check_code were
# measured on screenshots of a maximized window; re-check them if the
# window size changes.
# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
# driver = webdriver.Chrome(options=chrome_options)

driver = webdriver.Chrome()
driver.get("https://www.cnki.net/")
driver.maximize_window()
driver.implicitly_wait(10)

# Type the keyword into the search box and submit the search.
search_content = driver.find_element(By.ID, 'txt_SearchText')
search_content.clear()
search_content.send_keys(content)
driver.find_element(By.CLASS_NAME, 'search-btn').click()

# The result list lives inside an iframe; all later lookups happen there.
driver.switch_to.frame(driver.find_element(By.ID, 'iframeResult'))
# Click the "相关度" sort link.
driver.find_element(By.LINK_TEXT, '相关度').click()
time.sleep(3)
# Show 50 results per page.
driver.find_element(By.LINK_TEXT, '50').click()
time.sleep(3)

next_page = driver.find_element(By.PARTIAL_LINK_TEXT, '下一页')
# True until the first results page has been processed.
first = True

# Walk through the result pages. The try/finally guarantees that the CSV file
# is closed and the browser is shut down even when a page fails midway.
try:
    while next_page:
        # The first results page is already on screen; only later iterations
        # have to advance to the next page.
        if first:
            first = False
        else:
            next_page.click()

        # Use a short implicit wait so probing for the captcha does not stall
        # for long when none is shown.
        driver.implicitly_wait(3)
        try:
            input_ = driver.find_element(By.ID, 'CheckCode')
        except NoSuchElementException:
            input_ = None
            print('未出现验证码')

        # Number of captcha attempts made on this page.
        check_code_counter = 0
        while input_:
            check_code_counter += 1
            print('出现验证码')
            time.sleep(3)
            # Raises if the captcha image is not on the page.
            driver.find_element(By.ID, 'CheckCodeImg')
            # Scroll to the top so the captcha sits at a fixed position in
            # the screenshot.
            driver.execute_script('var action=document.documentElement.scrollTop=0')

            # The first attempt and the retries are saved to different files
            # because get_check_code crops them at different top offsets.
            is_first_attempt = check_code_counter == 1
            driver.save_screenshot('screenshot.png' if is_first_attempt else 'screenshot1.png')

            # How the crop rectangle in get_check_code was derived:
            # the element's location and size as reported by selenium were
            # (left, top, right, bottom) = (470, 47, 533, 69), i.e. a size of
            # (63, 22). The same captcha measured in an image editor on the
            # full-page screenshot was (2.3, 0.81). The two sizes are
            # proportional (63 / 2.3 = 27.39, 22 / 0.81 = 27.16); the final
            # factor is close to these values and was tuned by trial. The
            # offsets measured in the editor times that factor give the crop
            # box in screenshot pixels.

            # Hand the cropped captcha to the solving service.
            time.sleep(1)
            code_str = get_check_code(first=is_first_attempt)
            print(f'验证码为{code_str}')

            input_.clear()
            # The service may return no answer; send_keys(None) would raise,
            # so submit an empty answer and let the page ask again.
            if code_str:
                input_.send_keys(code_str)
            time.sleep(1)
            submit_button = driver.find_element(By.CSS_SELECTOR, 'input[value=提交]')
            time.sleep(1)
            print(submit_button)
            submit_button.click()
            time.sleep(2)

            # Keep solving until the captcha input is gone.
            try:
                input_ = driver.find_element(By.ID, 'CheckCode')
            except NoSuchElementException:
                time.sleep(2)
                break

        # The first table row is skipped (it is the table's first row, not a
        # result).
        rows = driver.find_elements(By.XPATH, '//table[@class="GridTableContent"]/tbody/tr')[1:]
        for element in rows:
            # Fields available directly in the result list.
            a_element = element.find_element(By.XPATH, 'td[2]/a')
            href = a_element.get_attribute('href')
            title = a_element.text
            author = element.find_element(By.XPATH, 'td[3]').text.strip()
            origin = element.find_element(By.XPATH, 'td[4]').text.strip()
            release_time = element.find_element(By.XPATH, 'td[5]').text.strip()
            flag = element.find_element(By.XPATH, 'td[6]').text.strip()
            real_origin = None
            # Everything that is not a master's or doctoral thesis is
            # recorded as a journal article, with its source column kept.
            if flag not in ('硕士', '博士'):
                flag = '期刊'
                real_origin = origin
            url = get_parm(href)
            # Fields that are only available on the detail page.
            detail = request_get_detail(url)
            if detail == 'error':
                detail = [None] * 5
            writer.writerow([title, author, flag, real_origin, release_time, *detail, url])

        try:
            next_page = driver.find_element(By.PARTIAL_LINK_TEXT, '下一页')
        except NoSuchElementException:
            # No next-page link: keep a screenshot of the final state and stop.
            driver.save_screenshot('exit_screemshot.png')
            next_page = None
finally:
    f.close()
    driver.quit()
  • 2
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值