# A small web-crawler program written over the course of one day.
# -*- coding: utf-8 -*-
# @Time : 2020/5/9 19:03
# @Author : 贺林森
# @FileName: 抓取霹雳轻小说
# @Software: PyCharm
# @Email : 505384662@qq.com
import requests
import re
import os
# Request headers passed to requests.get(); the User-Agent string identifies
# the client as a desktop Chrome browser on macOS.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.138 Safari/537.36"
    ),
}
# One catalogue entry: a ``<li class=...>...</li>`` element. The match is
# non-greedy so that several entries sitting on the same line of HTML are
# returned one by one instead of being merged into a single match.
_CATALOGUE_ITEM = re.compile(r"<li class=.*?</li>")
# The link inside an entry: group 1 is the href, group 2 is the link text.
_CHAPTER_LINK = re.compile(r'href="(.*?)">(.*?)<')


def _parse_catalogue(html):
    """Return a ``{"title", "url"}`` dict for every chapter link found in *html*."""
    chapters = []
    for item in _CATALOGUE_ITEM.findall(html):
        link = _CHAPTER_LINK.search(item)
        if link:
            chapters.append(
                {
                    "title": link.group(2),
                    # The href is appended to the site root to form the chapter URL.
                    "url": "https://www.linovelib.com" + link.group(1),
                }
            )
    return chapters


def get_catalogue(url, timeout=10):
    """Fetch a catalogue page and return the chapters listed on it.

    Args:
        url: Address of the catalogue page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of dicts, one per chapter link, each with the keys ``"title"``
        (the link text) and ``"url"`` (the href prefixed with the site root).
        The list is empty when the response status is not 200 or when no
        entry matches.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Force UTF-8 so response.text is decoded with that charset.
    response.encoding = "utf-8"
    chapter_info_list = []
    if response.status_code == 200:
        chapter_info_list = _parse_catalogue(response.text)
    print(chapter_info_list)
    return chapter_info_list
# Characters that are not allowed in a file name on Windows and/or POSIX
# (path separators, reserved punctuation and ASCII control characters).
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _safe_filename(title):
    """Return *title* with characters that are invalid in file names replaced by "_"."""
    cleaned = _UNSAFE_FILENAME_CHARS.sub("_", title).strip().rstrip(". ")
    # A title made only of dots/whitespace would otherwise produce an empty name.
    return cleaned or "untitled"


def get_content(chapter_info_list, output_dir="霹雳轻小说", timeout=10):
    """Download each chapter and save its text as ``<title>.txt``.

    Args:
        chapter_info_list: Iterable of dicts with the keys ``"title"`` and
            ``"url"``, as produced by ``get_catalogue``.
        output_dir: Directory the text files are written to. It is created
            on demand the first time a chapter is downloaded successfully.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        None. Chapters whose response status is not 200 are skipped.

    Raises:
        requests.RequestException: If a request fails or times out.
        UnicodeDecodeError: If a page body is not valid UTF-8.
    """
    for chapter_info in chapter_info_list:
        response = requests.get(url=chapter_info["url"], headers=headers, timeout=timeout)
        if response.status_code != 200:
            continue

        os.makedirs(output_dir, exist_ok=True)

        # The first and the last <p> element of the page are dropped, exactly
        # as before (presumably site boilerplate rather than chapter text --
        # verify against the live page).
        paragraphs = re.findall(r"<p>(.*?)</p>", response.content.decode("utf8"))[1:-1]

        # Titles come straight from the web page, so they are sanitised before
        # being used as a file name (a "/" would otherwise point into a
        # non-existent sub-directory and make open() fail).
        file_path = os.path.join(output_dir, _safe_filename(chapter_info["title"]) + ".txt")
        with open(file_path, "w", encoding="utf8") as file:
            file.writelines(paragraph + "\n" for paragraph in paragraphs)
if __name__ == "__main__":
    # Script entry point: build the chapter list from the catalogue page,
    # then download every chapter in that list.
    catalogue = get_catalogue("https://www.linovelib.com/novel/2834/catalog")
    get_content(catalogue)