代码不是很完美, 抓取的数据中有少数几条带有 HTML 标签!
╮(╯▽╰)╭ 只怪自己太菜, 还没有想好怎么把正则表达式写得再完美一点。
抓取代码
#encoding=utf-8
import requests
import json
import re
class NeihanSpider:
    """Scrape joke paragraphs from neihanshu.net and append them to neihan.txt."""

    def __init__(self):
        # Pretend to be a desktop browser so the site serves normal pages.
        self.headers = {"user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"}
        # Page-URL template: "index.html" for page 1, "index_N.html" after that.
        self.url = "http://www.neihanshu.net/text/index{}{}.html"
        self.sum = 0  # running count of jokes saved across all pages

    def get_url_list(self, url):
        """Return the 134 page URLs to crawl.

        The ``url`` parameter is unused; it is kept only so existing
        callers (``run`` passes ``self.url``) keep working.
        """
        url_list = [self.url.format("", "")]  # page 1 has no "_N" suffix
        for page in range(2, 135):
            url_list.append(self.url.format("_", page))
        return url_list

    def parse_url(self, url):
        """GET ``url`` and return the response body decoded as text (UTF-8)."""
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):
        """Extract the joke paragraphs from one page of HTML.

        Fixes two defects in the original pattern: the bare ``"`` inside
        ``[^"]`` terminated the string literal (a SyntaxError), and the
        captured text could still contain inline HTML tags, which are
        now stripped out.
        """
        raw_list = re.findall(
            r'<div class="article-body".*?<p>(.*?)</p>', html_str, re.S
        )
        # Strip leftover inline tags (<br/>, <span>, ...) and surrounding
        # whitespace from each captured paragraph.
        return [re.sub(r"<[^>]+>", "", item).strip() for item in raw_list]

    def save_content_list(self, content_list, page_number):
        """Append each joke to neihan.txt, prefixed with its running number."""
        with open("neihan.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                self.sum += 1  # sequential joke number across the whole crawl
                f.write(str(self.sum) + " " + json.dumps(content, ensure_ascii=False) + "\n\n")
        print("第{}页保存成功".format(page_number + 1))

    def run(self):
        """Crawl every page: fetch, extract, and save the jokes."""
        url_list = self.get_url_list(self.url)
        # enumerate replaces the original O(n) url_list.index(url) lookup.
        for page_index, url in enumerate(url_list):
            html_str = self.parse_url(url)
            content_list = self.get_content_list(html_str)
            self.save_content_list(content_list, page_index)
if __name__ == "__main__":
    # Script entry point: build a spider and crawl every page.
    spider = NeihanSpider()
    spider.run()