A friend of mine has been reading a web novel but found the site's ads unbearable, so they asked me to write a crawler to grab the text for offline reading. A quick test of the target site showed it's all simple static pages, so the script took about ten minutes to write before I started crawling. Recording it here~
Target site: link
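For context, the "quick test" was just fetching one chapter page and checking that the full text already sits in the raw HTML, i.e. no JavaScript rendering involved. A minimal sketch, using the first chapter of part 1:

import requests

# Fetch a single chapter page and eyeball the raw HTML
resp = requests.get("http://www.erhaoshouzhang.cn/1/1.html")
print(resp.status_code)                  # expect 200
print(resp.content.decode("gbk")[:500])  # chapter text should be visible here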
Straight to the Python code:
# coding=utf-8
"""
# :author: Terry Li
# :url: https://blog.csdn.net/qq_42183962
# :copyright: © 2021-present Terry Li
# :motto: I believe that the God rewards the diligent.
"""
import requests
import time
from lxml import etree
def gecko(save_text_path, part=1, chapter=1):
    # Each chapter lives at /<part>/<chapter>.html
    url = "http://www.erhaoshouzhang.cn/{}/{}.html".format(part, chapter)
    resp = requests.get(url)
    print(resp.status_code, chapter)
    if resp.status_code == 200:
        # The site serves GBK-encoded pages
        wb_data = resp.content.decode("gbk")
        html = etree.HTML(wb_data)
        with open(save_text_path, "a", encoding='utf-8-sig', newline='') as f:
            # Chapter title; xpath() always returns a list, so just iterate
            html_title = html.xpath('/html/body/div[2]/div/div[1]/div[1]/h2')
            f.write('第%s章--' % chapter)
            for t in html_title:
                if t.text is not None:
                    f.write(t.text)
            f.write('\r\n')
            # Layout 1: body text in a series of <p> tags
            html_p_data = html.xpath('/html/body/div[2]/div/div[1]/p')
            for node_p in html_p_data:
                if node_p.text is not None:
                    f.write(node_p.text)
                    f.write('\r\n')
            f.write('\r\n')
            # Layout 2: some chapters put the body in a <pre> block
            html_pre_data = html.xpath('/html/body/div[2]/div/div[1]/pre')
            for node_pre in html_pre_data:
                if node_pre.text is not None:
                    f.write(node_pre.text)
                    f.write('\r\n')
            f.write('\r\n')
            # Layout 3: a single <p> broken up by <br>; text() yields
            # each text fragment between the <br> tags
            html_p_br_data = html.xpath('/html/body/div[2]/div/div[1]/p[1]/text()')
            for cache in html_p_br_data:
                # strip the full-width spaces used for paragraph indentation
                cache = cache.replace(u'\u3000', u'')
                f.write(cache)
                f.write('\r\n')
            f.write('\r\n')
    elif resp.status_code == 404:
        print("Crawl failed: the site returned 404")
        # sys.exit()
    else:
        raise Exception("Error, unexpected status %s" % resp.status_code)
    time.sleep(5.0)  # be polite: pause 5 seconds between requests
if __name__ == '__main__':
    """Part 1 has 296 chapters; crawl them page by page with a for loop"""
    for i in range(1, 297):
        # part: the novel has several parts (1, 2, 3, ...), hence a parameter
        # chapter: chapter number within the part
        gecko("./test.txt", part=1, chapter=i)
⭐Oh, and install the lxml library before running:
pip3 install lxml
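requests is also a third-party package, so if it isn't already on your machine:
pip3 install requests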