Web Scraping
Scraping the novel 凡人修仙传 (A Record of a Mortal's Journey to Immortality)
The script below fetches the first 100 chapter pages with a random User-Agent, extracts each chapter's title and body text with XPath, and appends everything to a local txt file.
import os
import requests
from fake_useragent import UserAgent
from lxml import etree

def fanrenxiuxianzhuan(number):
    url = 'http://www.156n.com/html/0/29/' + str(number) + '.html'  # chapter page URL
    ua = UserAgent()  # random User-Agent for each request
    headers = {'user-agent': ua.chrome}
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding  # guard against a misdetected charset
    html = etree.HTML(response.text)
    zhangjie = html.xpath("//div[@class='bookname']/h1/text()")  # chapter title
    txt = html.xpath("//div[@id='content']/p/text()")  # body paragraphs
    fanrentxt = '修仙小说'  # output folder
    fanren = '凡人修仙传.txt'  # output txt file
    if not os.path.exists(fanrentxt):
        os.mkdir(fanrentxt)
    # xpath() returns a list, so join the title into a string before writing
    with open(os.path.join(fanrentxt, fanren), 'a+', encoding='utf-8') as f:
        f.write('-' * 54 + ' ' + ''.join(zhangjie) + ' ' + '-' * 54 + '\n')
        # append the chapter body line by line
        for line in txt:
            f.write(line + '\n')

# Scrape the first 100 chapters.
# Chapter 1 lives at page id 1591, chapter 10 at 15910, chapter 100 at 159100,
# so the chapter number is appended to '159' as a string.
for i in range(1, 101):
    fanrenxiuxianzhuan('159' + str(i))
    print('------------------------ Chapter ' + str(i) + ' downloaded ------------------------')
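The original script also imports BeautifulSoup but never uses it, so that import is dropped above. For reference, here is a minimal sketch of the same extraction done with bs4 instead of lxml; the bookname/content selectors are taken from the XPath expressions above, and parse_with_bs4 is a hypothetical helper, not part of the original script:

from bs4 import BeautifulSoup

def parse_with_bs4(html_text):
    # hypothetical helper: same extraction as the XPath version above
    soup = BeautifulSoup(html_text, 'html.parser')
    # <div class="bookname"><h1> holds the chapter title
    title = soup.select_one('div.bookname h1').get_text(strip=True)
    # each <p> inside <div id="content"> holds one line of the chapter
    lines = [p.get_text(strip=True) for p in soup.select('div#content p')]
    return title, lines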
Final result: the 100 chapters end up appended in order to 修仙小说/凡人修仙传.txt.
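One practical improvement worth sketching: pausing between requests and catching failures keeps one bad page from aborting the whole run. The 0.5-second delay is an arbitrary assumption, not something the original uses:

import time

for i in range(1, 101):
    try:
        fanrenxiuxianzhuan('159' + str(i))
        print('Chapter ' + str(i) + ' downloaded')
    except requests.RequestException as exc:
        # skip the failed chapter instead of crashing the whole loop
        print('Chapter ' + str(i) + ' failed: ' + str(exc))
    time.sleep(0.5)  # be polite to the server between requests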