利用python爬取《三国演义》全本

最新推荐文章于 2023-11-29 00:03:35 发布

DQM_Venus

最新推荐文章于 2023-11-29 00:03:35 发布

阅读量2.2k

点赞数 3

文章标签： python

本文链接：https://blog.csdn.net/weixin_51703259/article/details/114847645

版权

# -*- coding: utf-8 -*-
# @ 2021/3/10 19:53
# @ 三国演义（txt）.py
# @ DQMNB

import re
import os
import urllib.request
import urllib.error
from time import sleep
from tqdm import tqdm

# Captures the chapter title held in the page's <h2 class="grap--h2"> heading.
findbiaoti = re.compile(r'<h2 class="grap--h2">(.*?)</h2>')
# Captures the contents of every attribute-less <div>; DOTALL lets "." match
# newlines so a multi-line body is captured as a single group.
findwenben = re.compile(r'<div>(.*?)</div>', flags=re.DOTALL)


def main():
    """Scrape every chapter and save each one as a text file.

    Makes sure the ``三国演义`` output directory exists in the current working
    directory, downloads the chapters through ``getData`` and writes them to
    disk through ``saveData``.
    """
    # exist_ok=True replaces the exists()-then-mkdir() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs('三国演义', exist_ok=True)
    baseurl = 'http://sanguo.5000yan.com/'
    biaotidata, wenbendata = getData(baseurl)
    saveData(biaotidata, wenbendata)


def getData(baseurl, start=965, stop=1085):
    """Download chapter pages and extract each chapter's title and text.

    Args:
        baseurl: Site root; each page URL is built as ``baseurl + "<n>.html"``.
        start: First page number to fetch (inclusive).
        stop: Page number to stop before (exclusive).

    Returns:
        A ``(biaotidata, wenbendata)`` tuple of two equal-length lists.
        ``biaotidata[k]`` is a chapter title and ``wenbendata[k]`` is that
        title, a blank line, and the chapter text. Pages that could not be
        downloaded or that contain no title are skipped, so the lists may
        hold fewer than ``stop - start`` entries.
    """
    # Entities that are stripped (not translated) from the extracted text.
    entities = ("&ldquo;", "&lsquo;", "&rdquo;", "&rsquo;", "&nbsp;")

    biaotidata = []
    wenbendata = []
    for i in tqdm(range(start, stop), '开始爬取'):
        url = baseurl + str(i) + ".html"
        html = askURL(url)
        # Pause after every request, successful or not, to go easy on the server.
        sleep(0.5)
        if not html:
            # askURL returns an empty value when the download failed.
            tqdm.write('下载失败，已跳过：' + url)
            continue

        # Decode once and reuse the text for both regex searches.
        text = html.decode('utf-8', errors='ignore')
        titles = findbiaoti.findall(text)
        if not titles:
            # Without a title there is no file name to save the chapter under.
            tqdm.write('未找到标题，已跳过：' + url)
            continue
        biaoti = titles[0]

        wenben = ''.join(findwenben.findall(text))
        for entity in entities:
            wenben = wenben.replace(entity, '')

        biaotidata.append(biaoti)
        wenbendata.append(biaoti + "\n\n" + wenben)
    return biaotidata, wenbendata


def askURL(url, timeout=10):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The response body as ``bytes``. If the request fails, the error is
        printed and empty ``bytes`` (``b""``) is returned, so the return type
        is the same on success and on failure.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 Edg/86.0.622.63"}
    request = urllib.request.Request(url, headers=head)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read()
    except OSError as e:
        # URLError and HTTPError are OSError subclasses; catching OSError also
        # covers read timeouts and connection resets raised while reading.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        else:
            print(e)
    return b""


def saveData(biaotidata, wenbendata):
    """Write each chapter to ``./三国演义/<title>.txt``.

    Args:
        biaotidata: Chapter titles; each one becomes a file name.
        wenbendata: Chapter texts, in the same order as ``biaotidata``.

    The ``./三国演义/`` directory must already exist. Titles and texts are
    paired by position, and only as many files are written as there are
    complete pairs.
    """
    # Materialise the pairs so tqdm knows the total and can draw a real bar;
    # pairing with zip avoids assuming there are exactly 120 chapters.
    chapters = list(zip(biaotidata, wenbendata))
    for biaoti, wenben in tqdm(chapters, '爬取成功，正在保存'):
        # Explicit UTF-8 keeps the output independent of the platform locale,
        # which may be unable to encode the Chinese text.
        with open('./三国演义/' + biaoti + '.txt', 'w', encoding='utf-8') as f:
            f.write(wenben)


# Run the scraper only when this file is executed as a script, so that
# importing it does not start downloading.
if __name__ == "__main__":
    main()
    print("保存成功，爬取完毕！")

 1. 不建议抓取太多数据，容易对服务器造成负载，浅尝辄止即可。
 2. 本文仅用于交流学习，未经作者允许，禁止转载，更勿做其他用途，违者必究。

DQM_Venus

关注

3
点赞
踩
12

收藏

觉得还不错？一键收藏
1
评论
利用python爬取《三国演义》全本

# -*- coding: utf-8 -*-# @ 2021/3/10 19:53# @ 三国演义（txt）.py# @ DQMNBimport reimport osimport urllib.requestimport urllib.errorfrom time import sleepfrom tqdm import tqdmfindbiaoti = re.compile(r'<h2 class="grap--h2">(.*?)</h2>')find
复制链接

扫一扫