Scraping Novels with Python
Reflections
This one was pretty maddening: if you write it the straightforward way, the first few runs work fine, and then, without changing a single line of code, the next run fails...
The likely causes are failed network requests or some anti-scraping mechanism on the site. Either way it makes development really frustrating, so for this kind of script, robustness is absolutely essential!
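To make that robustness point concrete, here is a minimal sketch of the kind of retry wrapper I mean. It is an illustration only, not part of the script below: the name fetch_with_retry and the retry/delay parameters are my own choices.

import random
import time

import requests

def fetch_with_retry(url, retries=3, base_delay=1.0):
    # Hypothetical helper: retry a flaky GET a few times before
    # giving up, sleeping with a little jitter between attempts.
    headers = {'user-agent': 'Mozilla/5.0'}
    for attempt in range(retries):
        try:
            r = requests.get(url, timeout=10, headers=headers)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            time.sleep(base_delay + random.random())
    return None

The full script follows.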
import requests
from bs4 import BeautifulSoup
import os
import re
import random
# Fetch the HTML content of a url
def getHTMLText(url):
    try:
        # Masquerade as a browser
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, timeout=10, headers=kv)
        r.raise_for_status()  # raise HTTPError if the status is not 200
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Return None instead of an error string, so callers can tell
        # a failed request apart from real HTML
        return None
# Save the string filestr to a local text file
def txtDownload(filestr, txtName):
    root = "E://fiction//"
    path = root + txtName + ".txt"
    try:
        # Create the folder if it does not exist yet
        if not os.path.exists(root):
            os.mkdir(root)
        # Only write when no file of the same name exists
        if not os.path.exists(path):
            with open(path, 'w', encoding='utf-8') as fh:
                fh.write(filestr)
            print("Text saved successfully")
        else:
            print("Text already exists")
    except OSError:
        print("Failed to save the text")
# Parse the content of one chapter entry (a "url$name" string)
def htmlAnalyse(chapter, hp):
    html = getHTMLText(hp + '/' + chapter.split('$')[0])
    if html is None:
        print("Request for the chapter page failed")
        return
    soup = BeautifulSoup(html, "html.parser")
    tmp = soup.find_all(id='content')
    if tmp:
        text = tmp[0]
    else:
        print("Did not find the real chapter content")
        return
    filestr = ""
    for string in text.strings:
        # Keep only strings that contain Chinese characters
        if re.search(r'[\u4e00-\u9fa5]', string):
            # Use a regular expression to clean the content:
            # strip non-breaking spaces and line breaks
            st = re.sub(r'[\xa0\r\n]', '', string)
            filestr = filestr + st + '\n'
    txtDownload(filestr, chapter.split('$')[1])
# Parse the home page url into each chapter's url and name, returned
# as a list of entries with url and name separated by '$'
def homeAnalyseUrl(hUrl):
    chapters = []
    homeHtml = getHTMLText(hUrl)
    if homeHtml is None:
        print("Request for the home page failed")
        return chapters
    hsoup = BeautifulSoup(homeHtml, "html.parser")
    tmp = hsoup.find_all(id='list')
    if tmp:
        text = tmp[0]
    else:
        print("The site refused our access")
        print(hsoup.prettify())
        return chapters
    alist = text.find_all('a')
    for a in alist:
        s = str(a.attrs['href']).split('/')[-1]
        chapters.append(s + '$' + a.string)
    print("All chapter urls on the home page parsed")
    return chapters
# Improved version with random crawling: the extra parameter num is
# the number of chapters to pick at random
def homeAnalyseUrl2(hUrl, num):
    chapters = []
    homeHtml = getHTMLText(hUrl)
    if homeHtml is None:
        print("Request for the home page failed")
        return chapters
    hsoup = BeautifulSoup(homeHtml, "html.parser")
    tmp = hsoup.find_all(id='list')
    if tmp:
        text = tmp[0]
    else:
        print("The site refused our access:")
        print(hsoup.prettify())
        return chapters
    alist = text.find_all('a')
    # Sample num distinct chapter indices (capped at the chapter count)
    rid = random.sample(range(len(alist)), min(num, len(alist)))
    for i in rid:
        s = str(alist[i].attrs['href']).split('/')[-1]
        chapters.append(s + '$' + alist[i].string)
    print("Random chapter urls parsed")
    return chapters
if __name__ == "__main__":
    # List of all chapters as "url$name" entries
    # homePage = "http://www.biquge.com.tw/2_2826"
    # homePage = "http://www.biquge.com.tw/18_18186"
    # homePage = "http://www.biquge.com.tw/16_16279"
    # good
    # homePage = "http://www.biquge.com.tw/18_18740"
    homePage = "http://www.biquge.com.tw/14_14297"
    # Crawl every chapter in order
    # urlList = homeAnalyseUrl(homePage)
    # Crawl a random sample of chapters
    spiNum = 10
    urlList = homeAnalyseUrl2(homePage, spiNum)
    if urlList:
        for i in range(len(urlList)):
            print("Pass " + str(i + 1) + ": ", end='')
            htmlAnalyse(urlList[i], homePage)
    else:
        print("We got blocked by the anti-scraping defences")