Python 爬虫之爬取网站小说。由于学习时间不长,许多小细节还理解得不是很清楚,在大佬的帮助下才完成了这篇。
代码仅作参考:
'''
函数式编程
纵横中文网爬取阴阳酒馆小说
'''
# 导入第三方库
import requests
from lxml import etree
import time
import os
from fake_useragent import UserAgent
# Pick one random browser User-Agent string when the module is loaded.
# The same `headers` dict is then passed to each `requests.get` call in this
# file, so the User-Agent stays fixed for the whole run.
ua = UserAgent()
headers = {'User-Agent':ua.random}
# Fetch a page and return its HTML text.
def get_html(url, timeout=10):
    """Download ``url`` and return the response body decoded as text.

    Sleeps one second before the request to throttle the crawl, sends the
    module-level ``headers`` and decodes the raw bytes with the default
    codec of ``bytes.decode()`` (UTF-8).

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded HTML document as ``str``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx status code.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    time.sleep(1)
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of returning an error page's HTML as
    # if it were the requested document.
    response.raise_for_status()
    return response.content.decode()
# Extract the chapter links from the index page.
def paser_html(html):
    """Return the ``href`` values of the links in the chapter list.

    Args:
        html: HTML source of the chapter index page, as text.

    Returns:
        The XPath result list: one ``href`` attribute value for every ``<a>``
        that sits in an ``<li class=" col-4">`` inside
        ``<ul class="chapter-list clearfix">``.
    """
    chapter_link_xpath = '//ul[@class="chapter-list clearfix"]/li[@class=" col-4"]/a/@href'
    return etree.HTML(html).xpath(chapter_link_xpath)
# Download every chapter page and save each one as a text file.
def paser_detail(href, save_dir='阴阳酒馆'):
    """Fetch each chapter URL in ``href`` and write it to a file in ``save_dir``.

    Each chapter goes to ``<save_dir>/<title>.txt``: the title on the first
    line, then one line per text node found in the content paragraphs. Files
    are opened in append mode, so running the crawl again adds to files that
    already exist.

    Args:
        href: Iterable of chapter page URLs.
        save_dir: Directory that receives the text files. It is created when
            missing. The same value is used for creating the directory and
            for building the file paths, so the two can no longer disagree.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race and tolerates reruns.
    os.makedirs(save_dir, exist_ok=True)
    for url in href:
        # get_html already throttles (one-second sleep), sends the shared
        # headers and decodes the body, so the request logic lives in one place.
        page = etree.HTML(get_html(url))
        paragraphs = page.xpath('//div[@class="content"]/p/text()')
        title = page.xpath('//div[@class="title"]/div[@class="title_txtbox"]/text()')
        if not title:
            # Without a title there is no file name to write to; report the
            # page and keep going instead of aborting the crawl with IndexError.
            print('skipped {}: no chapter title found'.format(url))
            continue
        path = os.path.join(save_dir, '{}.txt'.format(title[0]))
        with open(path, 'a', encoding='utf-8') as f:
            # '\n' is translated to the platform line ending in text mode; a
            # bare '\r' would be written verbatim and leave CR-only files.
            f.write(title[0] + '\n')
            f.writelines(paragraph + '\n' for paragraph in paragraphs)
# Entry point of the crawl.
def main():
    """Crawl the chapter index at the hard-coded URL and save every chapter.

    Downloads the index page with ``get_html``, extracts the chapter links
    with ``paser_html`` and hands them to ``paser_detail``.
    """
    index_url = "http://book.zongheng.com/showchapter/894704.html"
    paser_detail(paser_html(get_html(index_url)))
# Start the crawl only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()