python爬虫简易到进阶实战篇——（2）

最新推荐文章于 2024-06-24 14:39:42 发布

量子莉萝

最新推荐文章于 2024-06-24 14:39:42 发布

阅读量1.3k

点赞数 2

分类专栏：初级　文章标签：新人练级

本文链接：https://blog.csdn.net/usernameisone/article/details/81977415

版权

初级专栏收录该内容

18 篇文章 0 订阅

订阅专栏

python简易实战（2）——小说爬取

第二篇文章简单介绍python中re模块在爬虫中的实战应用，希望我们一同进步。
因为对于这些模块我也没有深入研究，所以不对基础内容进行讲解。

上一章我们讲解了python中requests+BeautifulSoup的结合应用，这次我们使用requests+re模块对笔趣阁小说网进行爬取。
废话不多说，直接上代码与注释：

import requests
import re


# Index page of the novel; to scrape another novel from the same site,
# only this URL needs to change.
url = 'https://www.xxbiquge.com/20_20339/'
try:
    # The request itself is inside the try so connection failures are reported
    # as well, and the timeout keeps a stalled server from hanging the script.
    response = requests.get(url, timeout=10)
    response.raise_for_status()     # turn HTTP error statuses into exceptions
except requests.RequestException as exc:
    # Nothing useful can be parsed after a failed index request, so stop here
    # with a clear message.
    raise SystemExit('Failed to fetch index page %s: %s' % (url, exc))
response.encoding = response.apparent_encoding   # decode with the encoding detected from the body
html = response.text

# Non-greedy match: the text from '<div id="list">' up to the first '</div>'.
list_blocks = re.findall('<div id="list">.*?</div>', html, re.S)
# The novel's name is taken from the first <h1> element.
titles = re.findall('<h1>(.*?)</h1>', html, re.S)
if not list_blocks or not titles:
    raise SystemExit('Unexpected page layout at %s: chapter list or title not found' % url)
div = list_blocks[0]
# Each match is a (relative link, chapter name) tuple.
chapter_info_list = re.findall('<a href="(.*?)">(.*?)</a>', div, re.S)
title = titles[0]
# Uncomment the fb lines (here and at the end of the loop) to save the text to a file.
#fb = open('%s.txt'%title,'w',encoding='utf-8')
for chapter_url, chapter_title in chapter_info_list:
    # The links on the index page are relative, so prepend the site root.
    chapter_url = 'https://www.xxbiquge.com%s' % chapter_url
    try:
        chapter_response = requests.get(chapter_url, timeout=10)
        chapter_response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable chapter should not abort the whole download.
        print('Skipping %s: %s' % (chapter_url, exc))
        continue
    chapter_response.encoding = chapter_response.apparent_encoding
    chapter_html = chapter_response.text
    content_matches = re.findall('<div id="content">(.*?)</div>', chapter_html, re.S)
    if not content_matches:
        # The page does not contain the expected content block.
        print('Skipping %s: chapter content not found' % chapter_url)
        continue
    # Strip the indentation entities, the line-break tags and the 'readx()' text.
    chapter_content = (
        content_matches[0]
        .replace('&nbsp;&nbsp;&nbsp;&nbsp;', '')
        .replace('<br /><br />', '')
        .replace('readx()', '')
    )
    print(chapter_title)
    print(chapter_url)
    print(chapter_content)
    # fb.write(chapter_title)
    # fb.write('\n')
    # fb.write(chapter_content)

如果想要写入文本的话，将代码中被注释掉的 fb 相关语句（open 以及三行 fb.write）行首的 # 删去，就能简单地将数据写入txt文本，这不失为一种下载小说的方式。

为了巩固上一章的内容，我再发一段用requests+BeautifulSoup爬取笔趣阁小说网的代码，但是这段代码没有注释，需要同学自己进行理解：（另外这段代码采用了函数形式，可能会比较不好理解。）

#__author:'lwq'
#date: 2018/8/5
import  requests
from bs4 import BeautifulSoup
import os

def getHtml(url, timeout=10):
    """Download ``url`` and return the response body as text.

    The body is decoded with the encoding that requests detects from the
    content (``apparent_encoding``).

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request may block indefinitely.

    Returns:
        The page text, or ``None`` when the server answers with an HTTP
        error status.

    Raises:
        requests.RequestException: For connection failures and timeouts,
            which are raised by ``requests.get`` itself.
    """
    response = requests.get(url, timeout=timeout)
    try:
        response.raise_for_status()
    except requests.HTTPError:
        # Catch only HTTP status errors so that KeyboardInterrupt and genuine
        # bugs still surface instead of being silently discarded.
        return None
    response.encoding = response.apparent_encoding
    return response.text

def dealHtml(page):
    """Extract the novel name and the chapter links from an index page.

    Args:
        page: HTML text of the novel's index page.

    Returns:
        A list whose first item is the text of the page's first ``<h1>``
        element, followed by one ``'<absolute url>,<link text>'`` string for
        every anchor that is kept.
    """
    soup = BeautifulSoup(page, 'lxml')
    anchors = soup.find_all('a', title='')
    book_name = soup.find('h1').text
    # The first 15 matched anchors are dropped; presumably they are site
    # navigation rather than chapter links -- verify against the live page.
    chapter_entries = [
        'http://www.biquge.com.tw' + link['href'] + ',' + link.text
        for link in anchors[15:]
    ]
    return [book_name] + chapter_entries

def dealHtml2(page):
    """Return the text inside the ``<div id="content">`` element of ``page``.

    Args:
        page: HTML text of a chapter page.

    Returns:
        The ``.text`` of the first ``div`` whose ``id`` is ``content``.
    """
    content_div = BeautifulSoup(page, 'lxml').find('div', id="content")
    return content_div.text

def writePage(title, word, encoding=None):
    """Append ``word`` to the text file named after ``title``.

    The file lives in the ``笔趣阁小说`` folder under the current working
    directory; the folder is created on first use.

    Args:
        title: File name without the ``.txt`` extension.
        word: Text to append to the file.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default encoding.

    Raises:
        OSError: If the folder cannot be created or the file cannot be opened.
    """
    # exist_ok tolerates an already existing folder while still reporting
    # real failures such as missing permissions.
    os.makedirs('笔趣阁小说//', exist_ok=True)
    with open('笔趣阁小说//%s.txt' % title, 'a', encoding=encoding) as fb:
        fb.write(word)


if __name__ == '__main__':
    # Index page of the novel to download.
    url = 'http://www.biquge.com.tw/1_1958/'
    page = getHtml(url)
    if page is None:
        # getHtml returns None when the page could not be retrieved.
        raise SystemExit('Failed to fetch index page: ' + url)
    # dealHtml returns [book name, 'url,chapter title', ...]. Parse the index
    # once here instead of re-parsing it for every write inside the loop.
    book_name, *chapters = dealHtml(page)
    for chapter in chapters:
        # Split on the first comma only, so chapter titles that contain
        # commas are kept intact.
        url_text, _, chapter_title = chapter.partition(',')
        page2 = getHtml(url_text)
        if page2 is None:
            print('Skipping %s: page could not be fetched' % url_text)
            continue
        header = '-----------' + chapter_title + '-----------\n'
        writePage(book_name, header)
        print(header)
        word = dealHtml2(page2)
        print(word)
        # The GBK round trip drops every character that GBK cannot represent
        # before the text is written to the file.
        writePage(book_name, word.encode('gbk', 'ignore').decode('gbk') + '\n')