# -*- coding: utf-8 -*-
# @ 2021/3/10 19:53
# @ 三国演义(txt).py
# @ DQMNB
import re
import os
import urllib.request
import urllib.error
from time import sleep
from tqdm import tqdm
# Chapter title: captures the text inside <h2 class="grap--h2">...</h2>.
findbiaoti = re.compile(r'<h2 class="grap--h2">(.*?)</h2>')
# Chapter body: captures the content of every <div>...</div>; re.S lets "."
# span newlines so multi-line blocks are matched as a whole.
findwenben = re.compile(r'<div>(.*?)</div>', re.S)
def main():
    """Scrape all chapters and save them under the ``三国演义`` directory.

    Creates the output directory if needed, downloads the chapter pages via
    ``getData`` and writes them to disk via ``saveData``.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('三国演义', exist_ok=True)
    baseurl = 'http://sanguo.5000yan.com/'
    biaotidata, wenbendata = getData(baseurl)
    saveData(biaotidata, wenbendata)
def getData(baseurl, start=965, stop=1085):
    """Download chapter pages and extract each chapter's title and text.

    Pages are requested as ``<baseurl><n>.html`` for every ``n`` in
    ``range(start, stop)``. A page that cannot be downloaded, or that
    contains no title heading, is reported and skipped, so the two returned
    lists always stay aligned with each other.

    Args:
        baseurl: Site root that the page number and ``.html`` are appended to.
        start: First page number to fetch (inclusive).
        stop: Page number to stop before (exclusive).

    Returns:
        A ``(biaotidata, wenbendata)`` tuple of two equally long lists:
        the chapter titles, and the chapter texts (each one prefixed with
        its title and a blank line).
    """
    # Curly quotes and spaces are deleted from the text in a single pass.
    strip_table = str.maketrans('', '', "“‘”’ ")
    biaotidata = []
    wenbendata = []
    for i in tqdm(range(start, stop), '开始爬取'):
        url = baseurl + str(i) + ".html"
        html = askURL(url)
        sleep(0.5)  # pause between requests to limit load on the server
        if not html:
            # askURL already printed the error; an empty result has nothing
            # to parse (and a str result would not even have .decode()).
            print('skipped ' + url + ': download failed')
            continue
        # Decode once and reuse the text for both patterns.
        if isinstance(html, bytes):
            page = html.decode('utf-8', errors='ignore')
        else:
            page = html
        titles = findbiaoti.findall(page)
        if not titles:
            print('skipped ' + url + ': no chapter title found')
            continue
        biaoti = titles[0]
        wenben = ''.join(findwenben.findall(page)).translate(strip_table)
        biaotidata.append(biaoti)
        wenbendata.append(biaoti + "\n\n" + wenben)
    return biaotidata, wenbendata
def askURL(url):
    """Fetch *url* with a browser-like User-Agent and return the raw body.

    Args:
        url: Address to request.

    Returns:
        The response body as ``bytes``. If the request fails with
        ``urllib.error.URLError``, the error code and/or reason are printed
        and ``b""`` is returned, so the return type is always ``bytes``.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 Edg/86.0.622.63"}
    request = urllib.request.Request(url, headers=head)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request) as response:
            return response.read()
    except urllib.error.URLError as e:
        # HTTPError carries a status code; plain URLError only a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return b""
def saveData(biaotidata, wenbendata):
    """Write every chapter to ``./三国演义/<title>.txt`` as UTF-8 text.

    Args:
        biaotidata: Chapter titles; each one becomes a file name.
        wenbendata: Chapter texts, in the same order as ``biaotidata``.

    Titles and texts are paired positionally; if the lists differ in
    length, the extra entries of the longer one are ignored.
    """
    os.makedirs('三国演义', exist_ok=True)
    # Iterate over what was actually collected instead of a fixed count, so
    # a shorter result list cannot raise IndexError. Materialised as a list
    # so the progress bar knows the total.
    chapters = list(zip(biaotidata, wenbendata))
    for biaoti, wenben in tqdm(chapters, '爬取成功,正在保存'):
        # Explicit encoding: the platform default may not be UTF-8.
        with open('./三国演义/' + biaoti + '.txt', 'w', encoding='utf-8') as f:
            f.write(wenben)
# Script entry: run the whole scrape-and-save pipeline, then report success.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so this
# also runs whenever the module is imported.
main()
print("保存成功,爬取完毕!")
# 1. 不建议抓取太多数据,容易对服务器造成负载,浅尝辄止即可。
# 2. 本文仅用于交流学习,未经作者允许,禁止转载,更勿做其他用途,违者必究。
# 利用python爬取《三国演义》全本
# 最新推荐文章于 2023-04-02 20:22:30 发布