import os
import requests
from pyquery import PyQuery as pq
import re
# Header fields shared by every request this script sends.  The User-Agent
# imitates a desktop Chrome browser; the original literal began with a doubled
# "M" ("MMozilla"), which is not a valid browser token, so it is corrected here.
_BASE_HEADERS = {
    "referer": "https://read.qidian.com/chapter/0aFVD3Gm6QUz2wAI5wVpbA2/MMVtN_lV-wDM5j8_3RRvhw2",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
}

# Headers for the book catalog page, which is requested from book.qidian.com.
headers = {**_BASE_HEADERS, "Host": "book.qidian.com"}

# Headers for the individual chapter pages, requested from read.qidian.com.
headers1 = {**_BASE_HEADERS, "Host": "read.qidian.com"}

# Root directory under which one sub-directory per chapter is created.
save_path = os.getcwd()
def create_book_dir_name(full_dir_name):
    """Create the directory *full_dir_name*, including missing parents.

    Calling this for a directory that already exists is a no-op.

    Args:
        full_dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which left a
    # window between the check and the creation.
    os.makedirs(full_dir_name, exist_ok=True)
def write_text(save_path, dir_name, word_content_wrap):
    """Write chapter text to ``<save_path>/<dir_name>/<dir_name>.txt``.

    The file is overwritten if it exists, the text is followed by a single
    newline, and the text is also echoed to stdout.

    Args:
        save_path: Root directory that contains the chapter directory.
        dir_name: Chapter directory name; also used as the file's base name.
        word_content_wrap: Text to store.

    Raises:
        OSError: If the file cannot be opened, e.g. because the chapter
            directory does not exist.
    """
    # os.path.join picks the platform's separator (the original hard-coded
    # "\\"), and an explicit UTF-8 encoding keeps non-ASCII text writable
    # regardless of the locale's default encoding.
    file_path = os.path.join(save_path, dir_name, dir_name + ".txt")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(word_content_wrap)
        print(word_content_wrap)
        f.write("\n")
def _fetch_document(url, request_headers, timeout=10):
    """Download *url* and return its body parsed as a PyQuery document.

    Args:
        url: Address of the page to download.
        request_headers: HTTP headers to send with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A PyQuery document built from the response text decoded as UTF-8.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(url, headers=request_headers, timeout=timeout)
    try:
        # Fail loudly instead of saving an error page as chapter text.
        response.raise_for_status()
        response.encoding = "utf-8"
        return pq(response.text)
    finally:
        response.close()


def main(max_chapters=18):
    """Download the first *max_chapters* catalog entries of the book.

    For every chapter link found on the catalog page, a directory named after
    the chapter title is created under ``save_path`` and the chapter's
    paragraph text is written into it.

    Args:
        max_chapters: Number of catalog entries to process.  The default of 18
            matches the original hard-coded slice bounds.
    """
    catalog_doc = _fetch_document(
        "https://book.qidian.com/info/1021338796#Catalog", headers
    )
    # Chapter links and names: .volume .cf li a
    chapter_links = list(catalog_doc(".volume .cf li a").items())[:max_chapters]

    for link in chapter_links:
        # Take title and href from the same anchor so they can never drift out
        # of step, which splitting the concatenated text of all anchors could.
        title = link.text()
        href = link.attr("href")
        if not title or not href:
            continue

        # "<number> <title>" becomes "<number>_<title>".
        dir_name = "_".join(title.split())
        print(dir_name)
        create_book_dir_name(os.path.join(save_path, dir_name))

        # NOTE(review): the original always prefixed "https:", which implies
        # the catalog uses protocol-relative hrefs ("//host/path") - confirm.
        chapter_url = href if href.startswith("http") else "https:" + href
        chapter_doc = _fetch_document(chapter_url, headers1)
        word_content_wrap = chapter_doc(".read-content.j_readContent p").text()
        write_text(save_path, dir_name, word_content_wrap)


if __name__ == "__main__":
    main()
# 起点中文网小说爬虫
# 最新推荐文章于 2024-06-19 18:03:00 发布