Python Crawler Example 1: Scraping a Web Novel

Preface

In the era of big data, the internet holds an enormous amount of information. For an individual, some of that data is valuable and some is not. How, then, do we locate and quickly obtain the large amounts of data we need from this sea of information? A crawler is an excellent way to collect it. In this article I use scraping a web novel as an example to share some experience with Python crawlers and a few problems I ran into along the way.

I. Libraries required by the crawler

requests: a third-party Python library for sending HTTP requests.
Install it with: pip install requests
The full scripts below also use lxml (XPath parsing), xlwt/xlrd (writing and reading .xls files) and chardet (encoding detection); they can be installed the same way, e.g. pip install lxml xlwt xlrd chardet.
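
A minimal request looks like the sketch below (the URL and header values here are placeholders used only to illustrate the call pattern, not the novel site used later):

import requests

# Placeholder page, used only to demonstrate requests.get.
url = "https://example.com/"
headers = {"User-Agent": "Mozilla/5.0"}  # many sites refuse requests that carry no User-Agent

response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)   # 200 means the request succeeded
print(response.text[:200])    # first 200 characters of the returned HTML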

II. Page analysis

1. Analyzing the chapter-list page

Take scraping the novel 《深空**》 from a certain site as an example. Target URL: www.****.in/book/12793/
Chapter pagination and source analysis
Pick any chapter and inspect the page: you can find the links and names of every chapter of the novel. From this we get the XPath of the chapter links: //div[@class="panel panel-default"]/dl/dd/a/@href (a bit lost? No problem, we can look up the XPath of a single chapter directly in Chrome's developer tools, as shown below).

[Figure: demo of copying a chapter's XPath in the browser]

e.g. the XPath copied for one chapter is /html/body/div[2]/div[2]/dl/dd[1]/a
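
To see how the class-based XPath relates to the absolute one copied from the browser, here is a small self-contained sketch that runs the same query against a made-up HTML fragment shaped like the chapter-list markup (the fragment is illustrative; the real page has the same structure):

from lxml import etree

# Made-up fragment mimicking the chapter list described above.
html_text = '''
<div class="panel panel-default">
  <dl>
    <dd class="col-md-3"><a href="38415404.html" title="第一章 旧土">第一章 旧土</a></dd>
    <dd class="col-md-3"><a href="38415405.html" title="第二章">第二章</a></dd>
  </dl>
</div>
'''
html = etree.HTML(html_text)

# The class-based XPath returns every chapter href at once.
print(html.xpath('//div[@class="panel panel-default"]/dl/dd/a/@href'))
# ['38415404.html', '38415405.html']

# Indexing dd[1] narrows it down to the first chapter only.
print(html.xpath('//div[@class="panel panel-default"]/dl/dd[1]/a/@href'))
# ['38415404.html']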

The code for grabbing the chapter links is as follows (example):

url = "https://www.xbxwx.in/book/12793/"  # 网址
response = requests.get(url, headers=headers)
response.encoding = 'GB2312'
html = etree.HTML(response.text)
url_j_list = ['https://www.xbxwx.in/book/12793/' + x for x in html.xpath('//div[@class="panel panel-default"]/dl/dd/a/@href')]
url_List.append(url_j_list)

2. Analyzing the chapter-content page

In the same way, we get the XPath of the chapter text: //*[@id="htmlContent"]/text()
The code for grabbing the chapter text is as follows (example):

rep = requests.get(url_x, headers=headers)
# let chardet detect the page encoding from the raw bytes instead of hard-coding it
encoding = chardet.detect(rep.content)['encoding']
rep.encoding = encoding
dom = etree.HTML(rep.text)
# chapter title XPath: //*[@id="content"]/div[1]/h1/text()
name = dom.xpath('//*[@id="content"]/div[1]/h1/text()')[0]
text = dom.xpath('//*[@id="htmlContent"]/text()')
# url_x, path1 and i are defined in the full script in section III
with open(path1 + '/' + f'{name}_{i + 1}.txt', 'w', encoding='utf-8') as f:
    for con in text:
        f.write(con)
    print(f'{name}_{i + 1} downloaded')
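
A note on encoding: script 1 below simply hard-codes response.encoding = 'GB2312', while the content scraper lets chardet guess the encoding from the raw bytes, which is more robust if the site serves pages in a mix of encodings. The pattern in isolation (the URL here is a placeholder):

import chardet
import requests

resp = requests.get("https://example.com/")      # placeholder URL
guess = chardet.detect(resp.content)             # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}
resp.encoding = guess['encoding'] or 'utf-8'     # fall back to UTF-8 if detection returns None
print(guess)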

III. Full code examples

1. Collecting the chapter URLs

import requests
from lxml import etree
import xlwt


path = r'D:\Python_project\xiaoshuo'  # output directory (not used in this script; script 2 writes the txt files)
headers = {
    "Referer": "https://www.***.in/book/12793/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"
}


def get_urls():
    # The site splits the chapter list across several index pages, i.e.:
    '''
    chapters 1-60:   https://www.***.in/book/12793/
    chapters 60-120: https://www.***.in/book/12793/index_2.html
    page source:
    <select class="form-control" onchange="window.location=this.value;">
      <option value="/book/12793/">1</option>
      <option value="/book/12793/index_2.html">2</option>
      ......
      <option value="/book/12793/index_25.html" selected="">25 (last page)</option>
    </select>

    /html/body/div[2]/div[2]/div[2]/select/option[1]
    '''

    url = "https://www.***.in/book/12793/"  # 网址
    response = requests.get(url, headers=headers)
    response.encoding = 'GB2312'
    html = etree.HTML(response.text)

    url_N_id = ['https://www.***.in' + x for x in html.xpath('//select[@class="form-control"]/option/@value')]  # the option values already start with '/'
    number = len(url_N_id)
    url_List = []
    for j in range(number):
        # chapter URL list on the j-th index page
        '''
        Getting a chapter URL: inspect the page, locate the chapter link element, then copy its XPath.
        <dd class="col-md-3">
         <a href="38415404.html" title="第一章 旧土">第一章 旧土</a>
        </dd>
        Right-click -> Copy XPath: xpath = /html/body/div[2]/div[2]/dl/dd[1]/a
        i.e. the href of that element ==> "38415404.html"
        '''
        if j != 0:
            url_j_id = url_N_id[j]
            response = requests.get(url_j_id, headers=headers)
            response.encoding = 'GB2312'
            html = etree.HTML(response.text)

        url_j_list = ['https://www.***.in/book/12793/' + x for x in html.xpath('//div[@class="panel panel-default"]/dl/dd/a/@href')]
        url_List.append(url_j_list)

    # flatten the per-page lists into a single list of chapter URLs
    url_list = []
    for page_urls in url_List:
        url_list.extend(page_urls)

    return url_list


def main():
    urls = get_urls()

    # create the workbook object that stores the chapter URLs
    workbook = xlwt.Workbook(encoding='GB2312')
    Sheet_name = workbook.add_sheet('深空彼岸小说章节网站')
    Headers = ['序号', '网址']  # column headers: index number and URL
    for index, Header in enumerate(Headers):
        Sheet_name.write(0, index, Header)

    for index, url in enumerate(urls):
        Sheet_name.write(index+1, 0, index+1)
        Sheet_name.write(index + 1, 1, url)

    workbook.save('xiaoshuo.xls')


if __name__ == '__main__':
    main()

2. Scraping the chapter content

import requests
from lxml import etree
import xlrd
import time
import random
import chardet  # automatically detect the text encoding


path = r'D:\Python_project\xiaoshuo'      # output directory for single-page chapters (must already exist)
path1 = r'D:\Python_project\xiaoshuo\xs'  # output directory for chapters that span several pages (must already exist)
headers = {
    "Referer": "https://www.***.in/book/12793/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"
}



def get_text(url):
    rep = requests.get(url, headers=headers)
    # rep.encoding = 'GB2312'  # fixed encoding, replaced by chardet detection below
    encoding = chardet.detect(rep.content)['encoding']
    rep.encoding = encoding
    dom = etree.HTML(rep.text)
    # //*[@id="content"]/div[1]/h1/text()
    # 因为有章节分为上下页,还需下载下一页
    # 下一页网址:
    '''
    eg:第一页 https://www.***.in/book/12793/38415404.html
        第二页 https://www.***.in/book/12793/38415404_2.html
        第X页 //*[@id="content"]/div[1]/h1/small
    '''
    # 章节页数
    strnum = dom.xpath('//*[@id="content"]/div[1]/h1/small/text()')
    if len(strnum) == 0:
        # single-page chapter: the page was already fetched and parsed above
        name = dom.xpath('//*[@id="content"]/div[1]/h1/text()')[0]   # chapter title
        text = dom.xpath('//*[@id="htmlContent"]/text()')            # chapter body
        with open(path + '\\' + f'{name}.txt', 'w', encoding='utf-8') as f:
            for con in text:
                f.write(con)
            print(f'{name} downloaded')
    else:
        # total page count: on this site it is the 4th character of the <small> text
        str1 = strnum[0][3:4]
        num = int(str1)
        for i in range(num):
            if i == 0:
                url_x = url
            else:
                # pages after the first append '_2', '_3', ... before '.html'
                url_x = url[:-5] + '_' + str(i + 1) + '.html'
            rep = requests.get(url_x, headers=headers)
            # rep.encoding = 'GB2312'  # fixed encoding, replaced by chardet detection
            encoding = chardet.detect(rep.content)['encoding']
            rep.encoding = encoding
            dom = etree.HTML(rep.text)
            name = dom.xpath('//*[@id="content"]/div[1]/h1/text()')[0]   # chapter title
            text = dom.xpath('//*[@id="htmlContent"]/text()')            # page body
            with open(path1 + '/' + f'{name}_{i + 1}.txt', 'w', encoding='utf-8') as f:
                for con in text:
                    f.write(con)
                print(f'{name}_{i + 1} downloaded')


def main():

    # read the chapter URLs stored in the xls file generated by script 1
    workbook = xlrd.open_workbook('xiaoshuo.xls')
    Sheet_name = workbook.sheet_by_name('深空彼岸小说章节网站')
    print(Sheet_name.name, Sheet_name.ncols, Sheet_name.nrows)

    # get the first sheet
    Sheet1 = workbook.sheet_by_index(0)

    # get the values in the second column (the URLs)
    cols = Sheet1.col_values(1)
    # print(urls)
    urls = cols[1264:1269]  # download only this slice of chapters; adjust the range as needed

    for url in urls:
        get_text(url)
        time.sleep(random.randint(1, 3))  # pause 1-3 seconds between chapters to avoid hammering the site

if __name__ == '__main__':
    main()


Summary

That is all for this introduction to Python crawlers. This article only presented a simple case of scraping a web novel; I will continue to publish more Python crawler application examples. Thanks for reading.
