使用 Python 简单抓取小说网站动态加载页面的数据

1. 打开页面,观察是否是动态加载页面

9.png

2. 查找用于加载正文内容的隐藏链接

1.png

3. 找出该隐藏链接所需的请求参数

2.png

4. 在网页源码中找到对应的参数

3.png

5. 从源码中正则提取相关参数
# Pull the request parameters out of the page source. Both values are needed
# later, so both matches are checked before either one is dereferenced.
sign = re.search(r'RP\.sign = "(.*?)";', html, re.M | re.I)
book_id = re.search(r'book_id":"(.*?)",', html, re.M | re.I)
if sign and book_id:
    print('book_id %s' % book_id.group(1))
    print('sign %s' % sign.group(1))
else:
    print('no match')
6. 拼接隐藏链接
# Fill the extracted book id and sign into the hidden content endpoint.
# The chapter id is fixed to 1 at this step of the walkthrough.
contentURL = (
    'http://book.km.com/index.php?c=catch&a=getContent'
    '&book_id=%s&chapter_id=1&sign=%s'
) % (book_id.group(1), sign.group(1))
7. 正式开始抓取页面中的数据
#!/usr/bin/python
#-*- coding: utf-8 -*-
#encoding=utf-8


# Packages the methods used for uploading in one place

import requests
import htmllib
import re
import urllib2
import json
import leancloud
import codecs
import string
from bs4 import BeautifulSoup
from lxml import etree
import sys
# Python 2-only idiom: reload sys so that setdefaultencoding is reachable
# again, then make UTF-8 the process-wide default for implicit str/unicode
# conversions. Neither `reload` as a builtin nor `sys.setdefaultencoding`
# exists on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')



class bookcom:
    """Crawl novels from book.km.com and save them to LeanCloud.

    The crawl walks three levels: the listing page, each novel's chapter
    list, and each chapter page. The chapter text is not in the chapter page
    itself; it is fetched from a separate endpoint whose parameters
    (book id, chapter id, sign) are extracted from the chapter page source.

    The ``Novel*`` attributes hold the record currently being processed and
    are overwritten as the crawl advances.
    """

    # Fields of the novel / chapter currently being processed.
    NovelName = ''
    NovelImageUrl = ''
    NovelType = ''
    NovelChapter = ''
    NovelChapterContent = ''
    NovelChapterId = ''

    # Patterns are compiled once instead of on every loop iteration.
    # The dots are escaped so they only match a literal '.'.
    _BOOK_KEY_RE = re.compile(r'/shuku/(.*?)\.html', re.M | re.I | re.S)
    _SIGN_RE = re.compile(r'RP\.sign = "(.*?)";', re.M | re.I)
    _BOOK_ID_RE = re.compile(r'book_id":"(.*?)",', re.M | re.I)
    _CHAPTER_ID_RE = re.compile(r'"id":"(.*?)",', re.M | re.I)

    @staticmethod
    def _first(values):
        """Return the first item of an xpath result list, or None if empty."""
        return values[0] if values else None

    @staticmethod
    def _extract_book_key(href):
        """Return the key between '/shuku/' and '.html' in *href*, or None."""
        if not href:
            return None
        found = bookcom._BOOK_KEY_RE.search(href)
        return found.group(1) if found else None

    @staticmethod
    def _extract_content_params(page_source):
        """Return (book_id, chapter_id, sign) found in *page_source*.

        Returns None when any of the three values is missing, so the caller
        never dereferences a failed regex match.
        """
        sign = bookcom._SIGN_RE.search(page_source)
        book_id = bookcom._BOOK_ID_RE.search(page_source)
        chapter_id = bookcom._CHAPTER_ID_RE.search(page_source)
        if not (sign and book_id and chapter_id):
            return None
        return book_id.group(1), chapter_id.group(1), sign.group(1)

    @staticmethod
    def _build_content_url(book_id, chapter_id, sign):
        """Return the URL of the hidden endpoint that serves chapter text."""
        return ('http://book.km.com/index.php?c=catch&a=getContent'
                '&book_id=%s&chapter_id=%s&sign=%s') % (book_id, chapter_id, sign)

    # Fetch the novel list
    def bookcomGetList(self):
        """Walk the listing page, saving each novel and crawling its chapters.

        Entries missing a title, cover image or recognisable link are
        skipped instead of aborting the whole crawl.
        """
        html = requests.get('http://book.km.com/shuku_0_0_0_1_0_0_1.html')
        pythonEtree = etree.HTML(html.text)
        for each in pythonEtree.xpath('//div[@class="imgbox"]/a'):
            name = self._first(each.xpath('img/@alt'))
            # The image URL is read from the '_src' attribute of the <img>.
            image_url = self._first(each.xpath('img/@_src'))
            book_key = self._extract_book_key(self._first(each.xpath('@href')))
            if name is None or image_url is None or book_key is None:
                print('skip list entry: missing name, image or link')
                continue

            self.NovelName = name
            self.NovelImageUrl = image_url
            self.NovelType = '免费'

            URLTwo = 'http://book.km.com/chapterlist/%s.html' % book_key
            # Save the list record before descending into its chapters.
            self.bookcomGetListSave()
            self.bookcomGetNovelList(URLTwo)

    # Fetch the chapter links
    def bookcomGetNovelList(self, URL):
        """Crawl every chapter linked from the chapter-list page at *URL*."""
        # requests.get's second positional argument is ``params``; passing
        # 'GET' there would append '?GET' to the URL, so only the URL is sent.
        html = requests.get(URL)
        pythonEtree = etree.HTML(html.text)
        chapter_links = pythonEtree.xpath(
            '//ul[@class="catalog_list clearfix"]/li/a')

        for each in chapter_links:
            title = self._first(each.xpath('text()'))
            href = self._first(each.xpath('@href'))
            if title is None or href is None:
                print('skip chapter link: missing title or href')
                continue
            self.NovelChapter = title
            novelContentUrl = 'http://book.km.com%s' % href
            self.bookcomGetNovelContent(novelContentUrl, URL)

    # Fetch the chapter content
    def bookcomGetNovelContent(self, URL, SuperURl):
        """Fetch one chapter's text and save it.

        *URL* is the chapter page; *SuperURl* is the chapter-list page it
        was reached from and is sent as the Referer of the first request.
        The chapter is skipped when the page source lacks the parameters
        needed for the content request.
        """
        # Browser-like headers for the chapter page request.
        # NOTE(review): the Cookie values are hard-coded literals, presumably
        # captured from a browser session — they may need refreshing.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:53.0) Gecko/20100101 Firefox/53.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Cookie': 'HTTP_REFERER=book.km.com; _ga=GA1.2.1574141621.1496282865; UM_distinctid=15c61689cad25e-04f45024755e92-49526a-fa000-15c61689cae39d; CNZZDATA30085487=cnzz_eid%3D2096924603-1496280564-%26ntime%3D1496291364; book_history=%5B%22b1343939%22%2C%22b1413544%22%5D; bdshare_firstime=1496282865098; Hm_lvt_b2e5ac9401b5820ffa4e9fa608593a5b=1496282865; Hm_lpvt_b2e5ac9401b5820ffa4e9fa608593a5b=1496296670; HTTP_REFERER=book.km.com',
            'Referer': SuperURl,
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
            'Host': 'book.km.com',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
        }

        html = requests.get(URL, headers=headers)

        # Extract the parameters the content endpoint requires.
        params = self._extract_content_params(html.text)
        if params is None:
            print('skip chapter, request parameters not found: %s' % URL)
            return
        book_id, chapter_id, sign = params
        self.NovelChapterId = chapter_id

        # Headers for the content request; it is marked as an XMLHttpRequest
        # and uses the chapter page as its Referer.
        contentHeader = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'Cookie': '_ga=GA1.2.1574141621.1496282865; UM_distinctid=15c61689cad25e-04f45024755e92-49526a-fa000-15c61689cae39d; CNZZDATA30085487=cnzz_eid%3D2096924603-1496280564-%26ntime%3D1496800002; book_history=%5B%22b1343939%22%2C%22b1413544%22%5D; bdshare_firstime=1496282865098; Hm_lvt_b2e5ac9401b5820ffa4e9fa608593a5b=1496282865,1496631345; _gat=1',
            'Host': 'book.km.com',
            'Referer': URL,
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:53.0) Gecko/20100101 Firefox/53.0',
            'X-Requested-With': 'XMLHttpRequest',
        }

        # Build the hidden endpoint URL and fetch the chapter text.
        contentURL = self._build_content_url(book_id, chapter_id, sign)
        contentHtml = requests.get(contentURL, headers=contentHeader)
        self.NovelChapterContent = contentHtml.text

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(self.NovelName)
        print(self.NovelImageUrl)
        print(self.NovelType)
        print(self.NovelChapter)
        print(self.NovelChapterId)
        print(self.NovelChapterContent)
        self.bookcomChapterSave()

    def bookcomChapterSave(self):
        """Save the current chapter as a 'XuanHuanContent' LeanCloud object."""
        Todo = leancloud.Object.extend('XuanHuanContent')
        todo = Todo()
        todo.set('NovelName', self.NovelName)
        todo.set('NovelChapterId', self.NovelChapterId)
        todo.set('NovelChapter', self.NovelChapter)
        todo.set('NovelChapterContent', self.NovelChapterContent)
        todo.save()

    def bookcomGetListSave(self):
        """Save the current novel as a 'XuanHuanList' LeanCloud object."""
        Todo = leancloud.Object.extend('XuanHuanList')
        todo = Todo()
        todo.set('NovelImageUrl', self.NovelImageUrl)
        todo.set('NovelType', self.NovelType)
        todo.set('NovelName', self.NovelName)
        todo.save()

# Script entry: initialise the LeanCloud SDK, then crawl from the listing page.
# NOTE(review): both init arguments are empty strings here — presumably
# placeholders for the application's credentials; fill them in before running.
leancloud.init("", "")
Book = bookcom()
Book.bookcomGetList()
# Alternative entry point that crawls a single chapter-list page:
# Book.bookcomGetNovelList('http://book.km.com/chapterlist/940750.html')
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值