爬取起点中文网字体反爬取

     参考文章:https://www.jianshu.com/p/fbc99cf4d557 

       个人比较喜欢看小说,于是想爬取小说网站——起点中文网。在定位页面数据的过程中遇到了字体反爬,咨询了旁边的前端大神之后,把解决方法记录如下。

当前页面接口返回的html源码

<p class="update"><span ><style>@font-face { font-family: YBbHmMyQ; src: url('https://qidian.gtimg.com/qd_anti_spider/YBbHmMyQ.eot?') format('eot'); src: url('https://qidian.gtimg.com/qd_anti_spider/YBbHmMyQ.woff') format('woff'), url('https://qidian.gtimg.com/qd_anti_spider/YBbHmMyQ.ttf') format('truetype'); } .YBbHmMyQ { font-family: 'YBbHmMyQ' !important;     display: initial !important; color: inherit !important; vertical-align: initial !important; }</style><span class="YBbHmMyQ">&#100320;&#100313;&#100320;&#100316;&#100315;&#100317;&#100319;</span>万字</span>

 

第一步:获取当前页面的字体文件链接,可以通过正则获取

    start_url = 'https://www.qidian.com/finish?action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page=1'
    # Fetch the HTML of the listing page.
    response = requests.get(start_url).text
    # Extract the font-file URL from the inline @font-face rule: the pattern
    # captures the quoted URL located between the 'woff' entry and the
    # 'truetype' format marker.
    # The search runs on the HTML fetched above; this snippet defines no
    # separate `fonturl` variable to search in.
    url = re.search('woff.*?url.*?\'(.+?)\'.*?truetype', response).group(1)

第二步:通过fontTools模块获取当前字体映射关系

def get_font(url, timeout=10):
    """Download a font file and return its best Unicode character map.

    Args:
        url: Address of the font file to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The result of ``TTFont.getBestCmap()``; callers index it by integer
        code point to obtain a glyph name.

    Raises:
        requests.exceptions.RequestException: On a network failure, a
            timeout, or a non-success HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Stop on HTTP errors instead of handing an error page to the font parser.
    response.raise_for_status()
    font = TTFont(BytesIO(response.content))
    try:
        return font.getBestCmap()
    finally:
        # Close the font even when reading the cmap fails.
        font.close()

第三步:通过映射关系可以看到,每个字符编码对应的是数字的英文名称(如 one、two、period),因此创建一个 dict,把英文名称转换为对应的数字字符

def get_encode(cmap, values):
    """Decode font-obfuscated character references into digit text.

    Args:
        cmap: Mapping of code point (int) to glyph name, for example
            ``{100320: 'one'}``.
        values: Decimal character references joined by ``;``, such as
            ``'&#100320;&#100313'``. A trailing ``;``, empty segments and
            surrounding whitespace are tolerated.

    Returns:
        The decoded string made of digits and ``.``; an empty string when
        ``values`` contains no references.

    Raises:
        KeyError: If a code point is missing from ``cmap`` or its glyph name
            is neither a digit name nor ``period``.
        ValueError: If the text after the two-character ``&#`` prefix of a
            segment is not a decimal number.
    """
    WORD_MAP = {'zero':'0','one':'1','two':'2','three':'3','four':'4','five':'5','six':'6','seven':'7','eight':'8','nine':'9','period':'.'}
    decoded = []
    for ref in values.split(';'):
        ref = ref.strip()
        if not ref:
            # Empty piece produced by a trailing or doubled ';'.
            continue
        code_point = int(ref[2:])  # drop the leading '&#'
        decoded.append(WORD_MAP[cmap[code_point]])
    return ''.join(decoded)

第四步:然后就是通过pyquery进行数据提取

def get_index(start_url):
    """Scrape one listing page and return its books with decoded word counts.

    Args:
        start_url: URL of the listing page to fetch.

    Returns:
        A list with one dict per ``.all-img-list li`` element, holding the
        keys ``img``, ``bookname``, ``author``, ``classes``, ``content`` and
        ``number`` (the word count decoded through the page's font).

    Raises:
        ValueError: If the page lacks the obfuscated-number span or the
            font URL cannot be found in its inline style.
        IndexError: If the page lists more books than encoded numbers.
    """
    # Fetch the HTML of the listing page.
    response = requests.get(start_url).text
    doc = pq(response)
    # The class of the inner span is the name of the page's obfuscation font.
    classattr = doc('p.update > span > span').attr('class')
    if not classattr:
        raise ValueError('no obfuscated number span found in %s' % start_url)
    # Escape the class name so it is always matched literally.
    pattern = '</style><span.*?%s.*?>(.*?)</span>' % re.escape(classattr)
    # All encoded number strings on the page, in document order.
    numberlist = re.findall(pattern, response)
    # Text of the inline <style> element that declares the font.
    fonturl = doc('p.update > span > style').text()
    # Extract the font-file URL from that style text.
    match = re.search('woff.*?url.*?\'(.+?)\'.*?truetype', fonturl)
    if match is None:
        raise ValueError('no font URL found in %s' % start_url)
    cmap = get_font(match.group(1))
    items = []
    # Books and encoded numbers are paired by position on the page.
    for i, book in enumerate(doc('.all-img-list li').items()):
        items.append({
            'img': 'http:' + book('.book-img-box a img').attr('src'),
            'bookname': book('.book-mid-info h4 a').text(),
            'author': book('.name').text(),
            'classes': book('p.author > a:nth-child(4)').text(),
            'content': book('.intro').text(),
            # Strip only a trailing ';' so no digit is lost when it is absent.
            'number': get_encode(cmap, numberlist[i].rstrip(';')),
        })
    return items

第五步:将数据存入 MongoDB

# MongoDB handles: server at 127.0.0.1, database "qidian", collection "finish".
client = pymongo.MongoClient('127.0.0.1')
db = client['qidian']
p = db['finish']
def mongo(item):
    """Store one scraped book record in the module-level collection ``p``.

    Args:
        item: Dict describing a single book.
    """
    # insert_one replaces Collection.insert, which PyMongo deprecated in 3.0
    # and removed in 4.0.
    p.insert_one(item)

 

附当前爬虫文件源码

#coding=utf-8
'''
Created on 2018年8月23日

@author: Administrator
'''
import requests,json,time,re
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
from fontTools.ttLib import TTFont
from io import BytesIO
import pymongo

# MongoDB handles: server at 127.0.0.1, database "qidian", collection "finish".
client = pymongo.MongoClient('127.0.0.1')
db = client['qidian']
p = db['finish']


# Listing URL without its page number; main() appends the number.
start_url = 'https://www.qidian.com/finish?action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page='

def get_font(url, timeout=10):
    """Download a font file and return its best Unicode character map.

    Args:
        url: Address of the font file to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The result of ``TTFont.getBestCmap()``; callers index it by integer
        code point to obtain a glyph name.

    Raises:
        requests.exceptions.RequestException: On a network failure, a
            timeout, or a non-success HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Stop on HTTP errors instead of handing an error page to the font parser.
    response.raise_for_status()
    font = TTFont(BytesIO(response.content))
    try:
        return font.getBestCmap()
    finally:
        # Close the font even when reading the cmap fails.
        font.close()

def get_encode(cmap, values):
    """Decode font-obfuscated character references into digit text.

    Args:
        cmap: Mapping of code point (int) to glyph name, for example
            ``{100320: 'one'}``.
        values: Decimal character references joined by ``;``, such as
            ``'&#100320;&#100313'``. A trailing ``;``, empty segments and
            surrounding whitespace are tolerated.

    Returns:
        The decoded string made of digits and ``.``; an empty string when
        ``values`` contains no references.

    Raises:
        KeyError: If a code point is missing from ``cmap`` or its glyph name
            is neither a digit name nor ``period``.
        ValueError: If the text after the two-character ``&#`` prefix of a
            segment is not a decimal number.
    """
    WORD_MAP = {'zero':'0','one':'1','two':'2','three':'3','four':'4','five':'5','six':'6','seven':'7','eight':'8','nine':'9','period':'.'}
    decoded = []
    for ref in values.split(';'):
        ref = ref.strip()
        if not ref:
            # Empty piece produced by a trailing or doubled ';'.
            continue
        code_point = int(ref[2:])  # drop the leading '&#'
        decoded.append(WORD_MAP[cmap[code_point]])
    return ''.join(decoded)

def get_index(start_url):
    """Scrape one listing page and store each book with its decoded word count.

    Every ``.all-img-list li`` element becomes a dict with the keys ``img``,
    ``bookname``, ``author``, ``classes``, ``content`` and ``number``, which
    is passed to ``mongo``.

    Args:
        start_url: URL of the listing page to fetch.

    Raises:
        ValueError: If the page lacks the obfuscated-number span or the
            font URL cannot be found in its inline style.
        IndexError: If the page lists more books than encoded numbers.
    """
    # Fetch the HTML of the listing page.
    response = requests.get(start_url).text
    doc = pq(response)
    # The class of the inner span is the name of the page's obfuscation font.
    classattr = doc('p.update > span > span').attr('class')
    if not classattr:
        raise ValueError('no obfuscated number span found in %s' % start_url)
    # Escape the class name so it is always matched literally.
    pattern = '</style><span.*?%s.*?>(.*?)</span>' % re.escape(classattr)
    # All encoded number strings on the page, in document order.
    numberlist = re.findall(pattern, response)
    # Text of the inline <style> element that declares the font.
    fonturl = doc('p.update > span > style').text()
    # Extract the font-file URL from that style text.
    match = re.search('woff.*?url.*?\'(.+?)\'.*?truetype', fonturl)
    if match is None:
        raise ValueError('no font URL found in %s' % start_url)
    cmap = get_font(match.group(1))
    # Books and encoded numbers are paired by position on the page.
    for i, book in enumerate(doc('.all-img-list li').items()):
        item = {
            'img': 'http:' + book('.book-img-box a img').attr('src'),
            'bookname': book('.book-mid-info h4 a').text(),
            'author': book('.name').text(),
            'classes': book('p.author > a:nth-child(4)').text(),
            'content': book('.intro').text(),
            # Strip only a trailing ';' so no digit is lost when it is absent.
            'number': get_encode(cmap, numberlist[i].rstrip(';')),
        }
        mongo(item)


def mongo(item):
    """Store one scraped book record in the module-level collection ``p``.

    Args:
        item: Dict describing a single book.
    """
    # insert_one replaces Collection.insert, which PyMongo deprecated in 3.0
    # and removed in 4.0.
    p.insert_one(item)


    
def main():
    """Scrape listing pages 1 through 999, one ``get_index`` call per page."""
    for page_no in range(1, 1000):
        # The page number is appended to the end of the base listing URL.
        get_index('{}{}'.format(start_url, page_no))


if __name__ == '__main__':
    main()
    
    
    

 

针对月票榜的月票数字体反爬,对 get_index 做如下修改

def get_index(start_url):
    """Scrape one monthly-ticket ranking page and store each book.

    Every ``.book-img-text li`` element becomes a dict with the keys ``img``,
    ``bookname``, ``author``, ``classes``, ``content``, ``number`` (decoded
    through the page's font) and ``font_url``, which is passed to ``mongo``.

    Args:
        start_url: URL of the ranking page to fetch.

    Raises:
        ValueError: If no font-family declaration is found in the page's
            inline style.
        IndexError: If the page lists more books than encoded numbers.
    """
    # Imported locally so the call below does not rely on a module-level
    # import of urllib3.
    import urllib3

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    # Fetch the HTML of the ranking page.
    response = requests.get(start_url).text
    doc = pq(response)
    # Text of the inline <style> element that declares the font.
    fonturl = doc('div.total > p > span > style').text()
    # The font-family name is also the font file's base name.
    match = re.search('font-family: (.+?); src', fonturl)
    if match is None:
        raise ValueError('no font-family declaration found in %s' % start_url)
    addr = match.group(1)
    url = 'https://qidian.gtimg.com/qd_anti_spider/{addr}.woff'.format(addr=addr)
    cmap = get_font(url)
    print(cmap)
    # Spans carrying the font name hold the encoded numbers; the name is
    # escaped so it is always matched literally.
    pattern = '</style><span.*?%s.*?>(.*?)</span>' % re.escape(addr)
    # All encoded number strings on the page, in document order.
    numberlist = re.findall(pattern, response)
    print('numberlist: ', numberlist)
    # Retained debug output: the book index always starts at 0.
    print('i: ', 0)
    # Books and encoded numbers are paired by position on the page.
    for i, book in enumerate(doc('.book-img-text li').items()):
        item = {
            'img': 'http:' + book('.book-img-box a img').attr('src'),
            'bookname': book('.book-mid-info h4 a').text(),
            'author': book('.name').text(),
            'classes': book('p.author > a:nth-child(4)').text(),
            'content': book('.intro').text(),
            # Strip only a trailing ';' so no digit is lost when it is absent.
            'number': get_encode(cmap, numberlist[i].rstrip(';')),
            'font_url': url,
        }
        mongo(item)

 

  • 7
    点赞
  • 32
    收藏
    觉得还不错? 一键收藏
  • 10
    评论
评论 10
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值