# -*- coding: utf-8 -*-
# @Time : 2020/10/16 10:23
# @Author : VIC1
# @File : novel.py
# @Software : PyCharm
# ____________________________
import gzip
import os
import re  # regular expressions for text matching
import time
import urllib.error  # HTTP error types
import urllib.request  # build requests, fetch page data

from bs4 import BeautifulSoup  # HTML parsing / data extraction
def main():
    """Scrape every chapter of the novel at *baseurl* and save each to disk."""
    baseurl = "http://www.biquge.info/74_74132/"
    # Collect the absolute URL of every chapter from the index page.
    chapter_urls = get1url(baseurl)
    # Download, parse and persist the chapters one by one.
    for chapter_url in chapter_urls:
        chapter = getData(chapter_url)
        saveData(chapter)
        # Throttle requests so the server does not block us.
        time.sleep(4)
    print("爬取完毕")
# Regex capturing each chapter's href attribute on the index page
findHref = re.compile(r'<a href="(.*?)" title=')
# Regex capturing the chapter title inside the <h1> tag
findTitle = re.compile(r'<h1>(.*?)</h1>')
# Regex capturing the chapter body; re.S lets '.' span newlines
findContent = re.compile(r'<div id="content">(.*?)</div>',re.S)
# 爬取每章小说URL
def get1url(baseurl):
    """Scrape the table-of-contents page and return every chapter URL.

    Args:
        baseurl: URL of the novel's index page; chapter hrefs on that page
            are relative and are appended to it.

    Returns:
        A list of absolute chapter URLs, in page order.
    """
    urllist = []
    html = askURL1(baseurl)  # raw HTML of the index page
    soup = BeautifulSoup(html, "html.parser")
    for dd in soup.find_all('dd'):  # each <dd> holds one chapter link
        dd = str(dd)
        # BUGFIX: the original indexed findall(...)[0] unconditionally and
        # raised IndexError for any <dd> without a matching href — skip those.
        matches = re.findall(findHref, dd)
        if not matches:
            continue
        urllist.append(baseurl + matches[0])  # relative href -> absolute URL
    return urllist
# 爬取小说内容
def getData(url):
    """Download one chapter page and extract ``[title, content]``.

    Args:
        url: absolute URL of a single chapter page.

    Returns:
        A list ``[title, content]`` where *content* has HTML artifacts
        stripped and paragraph breaks turned into newlines; an empty list
        when the expected page layout is absent.
    """
    data = []
    html = askURL1(url)  # raw HTML of the chapter page
    soup = BeautifulSoup(html, "html.parser")
    for box_con in soup.find_all('div', class_="box_con"):
        box_con = str(box_con)
        titles = re.findall(findTitle, box_con)
        contents = re.findall(findContent, box_con)
        # BUGFIX: the original indexed [0] unconditionally and raised
        # IndexError on malformed pages — skip them instead.
        if not titles or not contents:
            continue
        data.append(titles[0])  # chapter heading
        # Strip the site's marker comments and normalize whitespace.
        content = contents[0]
        content = content.replace('<!--go-->', '')
        content = content.replace('\xa0', '')
        content = content.replace('<br/><br/>', '\n')
        content = content.replace('<!--over-->', '')
        data.append(content)
    return data
# 保存为txt文件
def saveData(data, directory=r'D:\我师兄实在太稳健了'):
    """Write one chapter to ``<directory>/<title>.txt`` as UTF-8 text.

    Args:
        data: two-element sequence ``[title, content]`` as produced by
            ``getData``; the title doubles as the file name.
        directory: destination folder. Defaults to the original hard-coded
            path so existing callers are unaffected.
    """
    title = data[0]
    content = data[1]
    # BUGFIX: the original open()/close() pair leaked the file handle if
    # write() raised; a context manager always closes it. os.path.join
    # replaces the hard-coded path concatenation.
    with open(os.path.join(directory, '%s.txt' % title), 'w',
              encoding='utf-8') as file:
        file.write(title + '\n' + content)
# 得到一个指定URL的网页内容
def askURL1(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Sends a browser-like User-Agent so the site serves the page, and
    transparently decompresses gzip-encoded responses.

    Args:
        url: the page to fetch.

    Returns:
        The decoded HTML of the page as a str.

    Raises:
        urllib.error.URLError: (or its HTTPError subclass) on network or
            HTTP failure — propagated to the caller.
    """
    head = {
        # BUGFIX: the original value was split with backslash line
        # continuations, which embedded stray spaces/indentation inside the
        # User-Agent header; use implicit string concatenation instead.
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/86.0.4240.75 Safari/537.36"
        )
    }
    request = urllib.request.Request(url, headers=head)
    # Context manager guarantees the connection is closed.
    with urllib.request.urlopen(request) as response:
        raw = response.read()
        # BUGFIX: check the public headers API instead of the private
        # ``response.headers._headers`` attribute.
        if response.headers.get("Content-Encoding") == "gzip":
            raw = gzip.decompress(raw)
    return raw.decode("utf-8")
if __name__ == "__main__": # runs only when executed as a script
    # Entry point: start the scrape.
    main()