Python学习(一) 爬取一整篇小说

梦道长生

已于 2023-12-06 23:15:16 修改

阅读量1.8k

点赞数 5

文章标签： python 爬虫

于 2023-07-31 18:47:21 首次发布

本文链接：https://blog.csdn.net/qq_63036514/article/details/132028058

版权

自己编写的代码版本2
主要实现爬取网络小说功能，可实现搜索爬取
缺点：容易被反爬，爬取网页时有时会返回502；部分网页异常时还可能出现连接超时。

学习到的新内容：
1. requests.Session() 会话对象
可以在多次请求之间保存并复用 headers、proxies、cookies 等设置，不必每次请求都重新传入。
示例：
import requests
headers = {
*****
}
proxies = {
*****
}
session = requests.Session()
session.headers = headers
session.proxies = proxies
def main():
    url = 'http://www.baidu.com'
    response = session.get(url)
    print(response)
if __name__ == '__main__':
    main()

2.requests.adapters中的HTTPAdapter, Retry
可以实现断线重连的效果
实现代码：
from requests.adapters import HTTPAdapter, Retry

session = requests.Session()
session.headers = headers
session.proxies = proxies

# 创建一个Retry对象，设置重试的参数
retries = Retry(total=5, # total=5：重连次数5次
backoff_factor=0.1, # backoff_factor=0.1：退避系数，重试间隔约为 backoff_factor × 2^(已重试次数) 秒，逐次翻倍
# status_forcelist=[500, 502, 503, 504]：当返回为[500, 502, 503, 504]时重连
status_forcelist=[500, 502, 503, 504])
# 把Retry对象传给HTTPAdapter，然后把HTTPAdapter挂载到session上
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))  # 下面示例的 url 是 https，也需要挂载，重试才会生效

url = 'https://www.baidu.com'
response = session.get(url)

3.asyncio,aiofiles异步爬虫
asyncio 负责调度协程，aiofiles 负责异步写文件，用来提高爬取与保存的效率。

# 导入模块
# 本来想用lxml解析的，结果无法返回数据
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
import requests
import asyncio
import aiofiles
import pandas
import random
import os


def novelSearch():  # 书籍搜索
    """Prompt for a keyword, search the site, and print the matching books.

    Each usable result row is appended to the module-level ``cones`` list as a
    dict with the keys '文章名称', '链接', '最新章节', '作者', '字数', '更新' and
    '状态'; the book link is also appended to ``book_url_z``. The accumulated
    results are then printed as a pandas DataFrame. Nothing is returned.

    If the request fails or the result table is missing from the page, a
    message is printed and the function returns without touching the lists.
    """
    print('部分内容无法爬取可能原网站服务器崩坏')
    print('可搜书名和作者，请您少字也别输错字。')
    name = input('输入想搜索的书名：')

    url = 'http://www.biquge5200.cc/modules/article/search.php'

    try:
        # Passing the keyword through ``params`` lets requests escape characters
        # such as '&' or '#' that would otherwise corrupt the query string.
        # The timeout keeps a stalled connection from blocking forever.
        response = session.get(url, params={'searchkey': name}, timeout=15)
    except requests.RequestException as err:
        print(f'搜索请求失败：{err}')
        return
    html = response.text
    response.close()

    page = BeautifulSoup(html, features='lxml')
    table = page.find('table', align="center")
    if table is None:
        # No result table: nothing matched, or the site returned an error page.
        print('未找到搜索结果，请检查输入的书名或稍后重试。')
        return

    # The first <tr> is skipped (presumably the table header — confirm against
    # the live page).
    for row in table.find_all('tr')[1:]:
        cells = row('td')
        link = cells[0].find('a') if cells else None
        # Skip malformed rows instead of failing on a missing cell or link.
        if len(cells) < 6 or link is None or not link.get('href'):
            continue

        # Hrefs are assumed to be site-relative, as the original concatenation
        # implied.
        book_url = 'http://www.biquge5200.cc' + link['href']
        cones.append({
            '文章名称': cells[0].text,
            '链接': book_url,
            '最新章节': cells[1].text,
            '作者': cells[2].text,
            '字数': cells[3].text,
            '更新': cells[4].text,
            '状态': cells[5].text,
        })
        book_url_z.append(book_url)

    # Show every column rather than letting pandas elide the middle ones.
    pandas.set_option('display.max_columns', None)
    print('搜索结果：仅显示前50条，请输入更详细的搜索条件，缩写搜索范围。')
    print(pandas.DataFrame(cones))


def novelAnalysis():  # 书籍子链接解析
    """Prompt for a book and author, then queue that book's chapter links.

    The module-level ``cones`` list (filled by the search step) is scanned for
    entries whose '文章名称' and '作者' both equal the user's input, ignoring
    surrounding whitespace. For every match the book page is fetched and each
    chapter link inside ``<div id="list">`` is appended to the module-level
    ``book_title_download`` list as a dict with the keys '文章名称',
    '章节名称' and '章节链接'. Nothing is returned.

    A message is printed when no search result matches, when the request
    fails, or when the chapter list cannot be found on the page.
    """
    download_name = input('输入想下载的书名：').strip()
    download_author = input('输入文章作者名称：').strip()

    matched = False
    for dc in cones:
        if (download_name != dc['文章名称'].strip()
                or download_author != dc['作者'].strip()):
            continue
        matched = True

        try:
            # The timeout keeps a stalled connection from blocking forever.
            response = session.get(dc['链接'], timeout=15)
        except requests.RequestException as err:
            print(f'章节目录请求失败：{err}')
            continue
        html = response.text
        response.close()

        page = BeautifulSoup(html, features='lxml')
        chapter_list = page.find('div', id="list")
        if chapter_list is None:
            # The page did not contain the expected chapter container.
            print('未能解析章节目录，页面结构异常或请求被拦截。')
            continue

        # The first nine links are skipped (presumably a "latest chapters"
        # section that duplicates the full list — confirm against the page).
        anchors = chapter_list.find_all('a')[9:]
        print(f'---------共{len(anchors)}章----------')

        for anchor in anchors:
            # Read the href attribute directly instead of relying on it being
            # the first quoted value in the serialized tag.
            href = anchor.get('href')
            if not href:
                continue
            book_title_download.append({
                '文章名称': dc['文章名称'],
                '章节名称': anchor.text,
                '章节链接': 'http://www.biquge5200.cc' + href,
            })

    if not matched:
        # Without this message the download stage would run on an empty queue
        # and still report success.
        print('未在搜索结果中找到该书，请确认书名和作者与搜索结果完全一致。')


async def book_download():  # 书籍内容解析并下载
    """Download every queued chapter and append it to the book's text file.

    For each entry of the module-level ``book_title_download`` list (dicts with
    the keys '文章名称', '章节名称' and '章节链接') the chapter page is fetched
    through the module-level ``session``, the text of the children of
    ``<div id="content">`` is extracted, and the chapter title followed by
    that text is appended to ``小说/<book name>.txt`` in UTF-8.

    A chapter whose request raises is reported and skipped so one bad page does
    not abort the whole download.

    NOTE: ``session.get`` is a blocking call, so chapters are still fetched one
    after another; only the file write and the pause between chapters are
    awaited.
    """
    # requests has no default timeout; without one a stalled connection blocks
    # forever, which shows up as the program freezing on long downloads.
    request_timeout = 15  # seconds

    # Create the output directory once instead of checking on every chapter.
    os.makedirs('小说', exist_ok=True)

    for i in book_title_download:
        url = i['章节链接']
        book_title = i['章节名称']

        try:
            response = session.get(url, timeout=request_timeout)  # proxies=random.choice(proxies)
            # Compare the numeric status code. The previous check compared the
            # Response object with a string, which is always unequal and made
            # every chapter be fetched twice.
            if response.status_code != 200:
                response.close()
                response = session.get(url, timeout=request_timeout)
        except requests.RequestException as err:
            print(f'{book_title}  --下载失败：{err}')
            continue
        html = response.text
        response.close()

        page = BeautifulSoup(html, features='lxml')
        content = page.find('div', id="content")

        # Collect the pieces and join once rather than growing a string.
        parts = [f'{book_title}\n']
        if content is not None:
            for child in content:
                # get_text() takes a separator, not a tag name: passing 'p'
                # would insert a literal "p" between a child's text fragments.
                parts.append(f'{child.get_text()}\n')
        else:
            print("Can not find div tag with id content")
        book_read = ''.join(parts)

        async with aiofiles.open(f"小说/{i['文章名称']}.txt", 'a', encoding='utf-8') as download:  # append chapter
            await download.write(book_read)
            print(f'{book_title}  --下载成功')
            # Short pause between chapters to go easier on the server.
            await asyncio.sleep(.5)
    print('全部下载完成!')


if __name__ == '__main__':  # Script entry point

    # Shared state: the three stages below read and fill these module globals.
    cones = []                # search results, one dict per book
    book_url_z = []           # book page links collected by the search
    book_title_download = []  # chapters queued for download

    # Pool of User-Agent strings; one is picked at random per run to make the
    # client look less like a script.
    headers_list = [
        {
            'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 10; SM-G981B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (iPad; CPU OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.77 Mobile/15E148 Safari/604.1'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.109 Safari/537.36 CrKey/1.54.248666'
        }, {
            'user-agent': 'Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.188 Safari/537.36 CrKey/1.54.250320'
        }, {
            'user-agent': 'Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/10.0.9.2372 Mobile Safari/537.10+'
        }, {
            'user-agent': 'Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/7.2.1.0 Safari/536.2+'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/102.0.0.0 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)'
        }, {
            'user-agent': 'Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.158 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 11; Pixel 3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
        }, {
            'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
        }, {
            'user-agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
        }
    ]

    headers = {
        'Host': 'www.biquge5200.cc',
        'Referer': 'http://www.biquge5200.cc/',
        'Upgrade-Insecure-Requests': '1',
        # Each pool entry is a dict, so take its string value. Formatting the
        # dict itself would send its repr ("{'user-agent': ...}") as the
        # User-Agent header.
        'User-Agent': random.choice(headers_list)['user-agent'],
    }

    session = requests.Session()
    # Merge into the session's existing headers so the requests defaults
    # (Accept, Accept-Encoding, ...) are kept instead of being discarded.
    session.headers.update(headers)

    # Retry policy: up to 10 attempts with growing back-off whenever the server
    # answers with one of the listed 5xx status codes.
    retries = Retry(total=10,
                    backoff_factor=0.5,
                    status_forcelist=[500, 502, 503, 504])
    # Mount the retrying adapter for both schemes so the policy also applies
    # if a request goes to an https URL.
    adapter = HTTPAdapter(max_retries=retries)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    # Pipeline: search -> resolve chapter links -> download chapters.
    novelSearch()
    novelAnalysis()
    asyncio.run(book_download())

基本的功能差不多是实现了，就是爬取所需要的时间有点长。另外，不知道为什么在爬取很长的内容时，程序有可能会出现卡死的现象（准备去学一下 debug，检查一下原因）。