Crawler Techniques

There is nothing particularly deep about Python crawlers; the real effort goes into parsing what the target site returns and into studying its anti-crawling mechanisms.


Crawler demo

The following example crawls free novels from Qidian. Let's go straight to the code (this example comes from reference [1]):

# coding=utf-8
import urllib2
import sys
from bs4 import BeautifulSoup

# Set the default encoding to utf-8 (Python 2 only)
reload(sys)
sys.setdefaultencoding('utf-8')

class YuewenFreeSpider:

    def __init__(self):
        pass

    # Fetch the content of one chapter
    def get_chapter_content(self, file, url):
        try:
            book_content_res = urllib2.urlopen(url)
            book_content_soup = BeautifulSoup(book_content_res.read(), "html.parser")
            file.write(book_content_soup.select("h3[class='j_chapterName'] span")[0].string + '\n')
            for p in book_content_soup.select(".j_readContent p"):
                file.write(p.next + '\n')
        except Exception as e:
            # On error, retry the same chapter
            print(e)
            self.get_chapter_content(file, url)
        else:
            chapter_next = book_content_soup.select("a#j_chapterNext")[0]
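            # "书末页" is the literal label Qidian shows on the "next" link of the last chapter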
            if chapter_next.string != "书末页":
                next_url = "https:" + chapter_next["href"]
                self.get_chapter_content(file, next_url)

    # Fetch the content of every book on the current page
    def get_current_url_books(self, url):
        response = urllib2.urlopen(url)
        the_page = response.read()
        soup = BeautifulSoup(the_page, "html.parser")
        book_arr = soup.select("ul[class='all-img-list cf'] > li")
        global start_index
        if start_index > 0:
            book_arr = book_arr[start_index:]
            start_index = 0
        for book in book_arr:
            book_cover = book.select("div[class='book-mid-info'] h4 > a")[0]
            print "书名:" + book_cover.string
            # 先创建.txt文件,然后获取文本内容写入
            book_file = open("/home/username/crawler/books/" + book_cover.string + ".txt", "a+")
            bres = urllib2.urlopen("https:" + book_cover['href'])
            bsoup = BeautifulSoup(bres.read(), "html.parser")
            book_content_href = bsoup.select("a[class='red-btn J-getJumpUrl']")[0]["href"]
            self.get_chapter_content(book_file, "https:" + book_content_href)
            book_file.close()
        next_page = soup.select("a.lbf-pagination-next")[0]
        return next_page["href"]


# Set where to start downloading from
url = "//www.qidian.com/free/all?orderId=&vip=hidden&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=1&page=1"


if __name__ == '__main__':
    start_index = 0
    # Loop until there is no next page
    while True:
        if url.startswith("//"):
            url = YuewenFreeSpider().get_current_url_books("https:" + url)
        else:
            break

Two libraries are used here: urllib2 and BeautifulSoup. urllib2 is an HTTP request library; the official description of BeautifulSoup is:

  • Beautiful Soup provides simple, Pythonic functions for navigating, searching, and modifying the parse tree. It is a toolkit that extracts the data you need by parsing the document; because it is simple, a complete application takes very little code.

  • Beautiful Soup automatically converts input documents to Unicode and output documents to UTF-8. You don't need to think about encodings unless the document doesn't declare one; in that case Beautiful Soup cannot detect the encoding automatically, and you only need to state the original encoding.

Working with parsers such as lxml and html5lib, Beautiful Soup gives you the flexibility to choose between different parsing strategies and raw speed.
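
As a quick illustration of the BeautifulSoup calls used above, here is a minimal, self-contained sketch run against invented HTML (the markup below only mimics the structure the spider selects against; it is not Qidian's real page):

from bs4 import BeautifulSoup

# Invented HTML resembling the book list the spider parses
html = """
<ul class="all-img-list cf">
  <li><div class="book-mid-info"><h4><a href="//example.com/book/1">Book One</a></h4></div></li>
  <li><div class="book-mid-info"><h4><a href="//example.com/book/2">Book Two</a></h4></div></li>
</ul>
"""

soup = BeautifulSoup(html, "html.parser")
# The same CSS selectors used in get_current_url_books()
for li in soup.select("ul[class='all-img-list cf'] > li"):
    a = li.select("div[class='book-mid-info'] h4 > a")[0]
    print(a.string + " -> https:" + a["href"])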

Crawling contracted novels

The example in the previous section showed how to crawl free novels. What if you want to crawl the details of contracted (paid) novels? Here is an example:

# coding=utf-8
import urllib2
import sys
import time
import logging
import requests
from bs4 import BeautifulSoup
from lxml import etree
import json


class YuewenSpider:

    def __init__(self):
        self.session = requests.session()

    @staticmethod
    def get_url(url_key):
        url = {
            'search': 'https://www.qidian.com/search?kw=',
            'category': 'https://book.qidian.com/ajax/book/category?',
            'vipreader': 'https://vipreader.qidian.com/chapter/',
            'subscribe': 'https://vipreader.qidian.com/ajax/subscribe/subscribe?',
            'getSubscribe': 'https://vipreader.qidian.com/ajax/subscribe/getSubscribe?'
        }
        return url[url_key]

    @staticmethod
    def get_header_and_token():
        header = dict()
        header['Accept'] = 'text/html,application/xhtml+xml,application/xml;' \
                           'q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'
        header['Accept-Encoding'] = 'gzip, deflate, br'
        header['Accept-Language'] = 'zh-CN,zh;q=0.9'
        header['Cache-Control'] = 'max-age=0'
        header['Connection'] = 'keep-alive'
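        # Hard-coded cookie string; the _csrfToken value is parsed out of it further below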
        header['cookie'] = '_csrfToken=AYnkW4AoqnnkKTIjgbgbndUM8qQ2rikJA2gqBWvt; newstatisticUUID=1564129077_1845515318; qdrs=0%7C3%7C0%7C0%7C1; showSectionCommentGuide=1; qdgd=1; se_ref=baidu; se_ref_bid=1015221208; gender=male; e1=%7B%22pid%22%3A%22qd_P_Searchresult%22%2C%22eid%22%3A%22qd_S05%22%2C%22l1%22%3A3%7D; e2=%7B%22pid%22%3A%22qd_P_Searchresult%22%2C%22eid%22%3A%22qd_S05%22%2C%22l1%22%3A3%7D; rcr=1013432302%2C1015221208%2C1015741318%2C1014139104%2C1015129326%2C1015055967%2C1015235392%2C1015336641%2C1015835395%2C1010868264%2C1015444718%2C1010144088; lrbc=1013432302%7C432810477%7C0; pageOps=1'
        header['Host'] = 'www.qidian.com'
        header['Upgrade-Insecure-Requests'] = '1'
        header['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                               'Chrome/75.0.3770.100 Safari/537.36'
        token = ''
        cookie = header['cookie']
        if cookie:
            cookie_pair = cookie.split(";")
            for p in cookie_pair:
                item = p.split("=")
                if item[0].strip() == "_csrfToken":
                    token = item[1]
        return header, token

    # Based on the update time, get the book id and the list of chapters that need updating
    def spider_chapter_list(self, book_title, book_author, updated_time):
        try:
            book = self.search_book(book_title, book_author)
            if not book:
                return None

            data = self.get_header_and_token()
            header, token = data[0], data[1]

            spider_book_chapters = dict()

            chapter_list = []

            # Crawl the chapter list and check whether any chapters have been updated
            url = self.get_url(url_key='category') + '_csrfToken=' + token + '&bookId=' + book['id']
            r = self.session.get(url, headers=header)
            detail = json.loads(r.content)
            if detail['code'] == 1:
                raise Exception("get chapter list info failed")
            else:
                # Iterate over volumes, newest first
                for d in detail['data']['vs'][::-1]:
                    # Iterate over chapters, newest first
                    for c in d['cs'][::-1]:
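                        # Convert the chapter's update-time string to a Unix timestamp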
                        t = int(time.mktime(time.strptime(c['uT'], '%Y-%m-%d %H:%M:%S')))
                        if t > updated_time:
                            chapter = dict()
                            chapter['chapter_title'] = c['cN']
                            chapter['chapter_url'] = str(book['id']) + "/" + str(c['id'])
                            chapter['chapter_cnt'] = c['cnt']  # chapter word count
                            chapter['chapter_id'] = c['id']
                            chapter['free'] = c['sS']
                            chapter['updated_time'] = t
                            chapter['uuid'] = c['uuid']
                            chapter_list.append(chapter)
                        else:
                            break
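                    # for-else: `continue` runs only if the inner loop finished without
                    # breaking; otherwise fall through and break out of the volume loop too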
                    else:
                        continue
                    break
            if chapter_list:
                spider_book_chapters['book_id'] = int(book['id'])
                spider_book_chapters['chapters'] = chapter_list[::-1]
            return spider_book_chapters

        except Exception as e:
            logging.error("YueWenSpider-spider_chapter_list except: %s", e)
            return None


    # Search for a book by title and author
    def search_book(self, book_title, book_author):
        url = self.get_url(url_key='search') + book_title
        r = self.session.get(url, headers=self.get_header_and_token()[0])
        selector = etree.HTML(r.content)
        # Extract the book ids
        book_ids = selector.xpath('//li[@class="res-book-item"]/@data-bid')
        # Extract the book titles and author names
        book_titles = selector.xpath('//li[@class="res-book-item"]//div[@class="book-mid-info"]/h4/a//text()')
        book_authors = selector.xpath('//div[@class="book-mid-info"]/p[@class="author"]/a[1]/text()')
        if not book_ids or not book_titles or not book_authors:
            return None
        for i in range(len(book_ids)):
            book = dict()
            book['id'] = book_ids[i] if book_ids[i] else ''
            book['author'] = book_authors[i] if book_authors[i] else ''
            book['title'] = book_titles[i] if book_titles[i] else ''
            if book['title'] == book_title and book['author'] == book_author:
                return book
        return None

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format='%(asctime)s %(levelno)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    start_time = int(time.time()) - 3600 * 24
    new_chapter_list = YuewenSpider().spider_chapter_list(u"九星毒奶", u"育", start_time)
    print new_chapter_list

Once the list of chapters that need updating has been crawled, the next step is to subscribe to the actual chapter content. The full code is omitted here; the main flow is: first get the chapter price, then query the account balance, and finally place the subscription.

Get the price:

url = self.get_url('vipreader') + spider_chapter['chapter_url']
header['Host'] = 'vipreader.qidian.com'
r = self.session.get(url, headers=header, verify=False)
selector = etree.HTML(r.text)
price = selector.xpath('//a[@class="single j_subscribeBtn"]/span/i/text()')

Query the account balance:

price = int(price[0])
# Prepare the request parameters
data = dict()
data['bookId'] = book_id
data['chapterPrice'] = price
chapters = dict()
chapters['chapterId'] = spider_chapter['chapter_id']
chapters['chapterCnt'] = spider_chapter['chapter_cnt']
chapters['price'] = price
chapters['uuid'] = spider_chapter['uuid']
data['chapters'] = [chapters]
data['isRenew'] = 0
data['chapterCnt'] = 1
data['isBuyAll'] = 0
# Query the balance information
url = self.get_url(url_key='getSubscribe') + token
r = self.session.post(url, headers=header, data=json.dumps(data))

Finally, subscribe to the chapter:

# Place the subscription
url = self.get_url(url_key='subscribe') + token
r = self.session.post(url, headers=header, data=json.dumps(data))

bs and lxml

In this section, lxml is used to parse the HTML.
BeautifulSoup is a library, whereas XPath is a technology; the most commonly used XPath library in Python is lxml.

Comparing the two, lxml beats BeautifulSoup on performance, while BeautifulSoup is somewhat easier to use.

On performance, BeautifulSoup and lxml work on different principles. BeautifulSoup is DOM-based: it loads the whole document and parses the entire DOM tree, so its time and memory overhead are much higher. lxml only traverses the parts it needs; it is also written in C, while BeautifulSoup is written in Python, so the performance gap is naturally large.

On ease of use, BeautifulSoup is simple to work with, its API is very friendly, and it supports CSS selectors. lxml's XPath expressions are more cumbersome to write, so development is less efficient than with BeautifulSoup.

For example:

title = soup.select('.content div.title h3')

The same selection written in XPath is more awkward:

title = tree.xpath("//*[@class='content']//div[@class='title']//h3")
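
To make the comparison concrete, here is a minimal runnable sketch (again with invented HTML) that extracts the same heading with both libraries:

from bs4 import BeautifulSoup
from lxml import etree

html = "<div class='content'><div class='title'><h3>Chapter One</h3></div></div>"

# BeautifulSoup with a CSS selector
soup = BeautifulSoup(html, "html.parser")
print(soup.select(".content div.title h3")[0].string)

# lxml with the equivalent XPath
tree = etree.HTML(html)
print(tree.xpath("//*[@class='content']//div[@class='title']//h3/text()")[0])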

This post has walked through practical examples of crawling. In many cases, though, content sites have anti-crawling mechanisms, such as blocking an IP that sends too many requests in a short time; that is where proxies come in. Proxies will be covered in the next blog post.
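
As a small preview, here is a minimal sketch of routing requests through a proxy with the requests library; the proxy address below is only a placeholder, and proxy-pool management and rotation are out of scope here:

import requests

# Placeholder proxy address; substitute a real proxy from your own pool
proxies = {
    "http": "http://127.0.0.1:8080",
    "https": "http://127.0.0.1:8080",
}

r = requests.get("https://www.qidian.com", proxies=proxies, timeout=10)
print(r.status_code)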

References

[1] https://www.jianshu.com/p/1d658f67fbdf
