# Suggestions for improving or optimizing this script are welcome.
import requests
from pyquery import PyQuery as pq
import json
import sys
def load(name):
    """Download the first Qidian search hit for *name*, chapter by chapter.

    Requests the qidian.com search page for ``name``, reads the ``data-bid``
    attribute of the first ``.book-mid-info h4 a`` element, fetches that
    book's catalog JSON, and hands every chapter to ``vip`` (for the volume
    named 'VIP卷') or ``down`` (for every other volume).  The volume named
    '作品相关' is skipped.

    Args:
        name: Book title to search for.

    Raises:
        ValueError: If the search page yields no book entry with a
            ``data-bid`` attribute.
    """
    # Let requests build the query string so that titles containing
    # characters such as '&', '#' or spaces are encoded correctly.
    search_html = requests.get(
        'https://www.qidian.com/search',
        params={'kw': name},
    ).content.decode('utf8')

    # Only the first search result is used; .items() yields PyQuery wrappers.
    first_hit = next(pq(search_html)('.book-mid-info h4 a').items(), None)
    book_id = first_hit.attr('data-bid') if first_hit is not None else None
    if not book_id:
        raise ValueError(f'no book found for {name!r}')

    # Fetch the chapter catalog as JSON.
    # NOTE(review): the CSRF token is hard-coded; presumably it has to be
    # refreshed if the site starts rejecting it -- confirm.
    catalog_text = requests.get(
        'https://book.qidian.com/ajax/book/category',
        params={
            '_csrfToken': 'KxOyODbbsZHWGtIfUsnDEqI9teZBDuUDC4QJ5YsZ',
            'bookId': book_id,
        },
    ).content.decode('utf8')
    volumes = json.loads(catalog_text)["data"]["vs"]

    for volume in volumes:
        chapters = volume["cs"]      # chapter records: url id(s) and title
        volume_name = volume["vN"]   # volume title
        if volume_name == '作品相关':
            continue
        if volume_name == 'VIP卷':
            for chapter in chapters:
                chapter_id = chapter["id"]    # "id" addresses a VIP chapter
                chapter_title = chapter["cN"]
                vip({chapter_title: str(chapter_id)}, book_id)
        else:
            for chapter in chapters:
                chapter_path = chapter["cU"]  # "cU" addresses a free chapter
                chapter_title = chapter["cN"]
                down({chapter_title: chapter_path})
# Request headers sent with VIP chapter requests.  The Cookie value must be
# filled in with the cookie produced by logging in with your own account.
headers = {
    'Cookie': '',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KH'
                  'TML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
}
# Subscribed VIP chapters
def vip(ID_url, data_eid, output_path=r'D:\Users\MSI-PC\Desktop\123.txt'):
    """Append the VIP chapters listed in *ID_url* to the output text file.

    For each chapter the page is fetched with the module-level ``headers``,
    the chapter title is written, and then every ``<p>`` inside
    ``.read-content.j_readContent`` is written on its own line.  If the
    combined paragraph text is not longer than 1500 characters the chapter is
    treated as not subscribed: a notice is written and the program exits.

    Args:
        ID_url: Mapping of chapter title -> chapter id.
        data_eid: Book id used in the chapter URL.
        output_path: File the text is appended to (UTF-8).  Defaults to the
            path that was previously hard-coded.

    Raises:
        SystemExit: When a chapter with at least one paragraph has 1500 or
            fewer characters of text.
    """
    for title, chapter_id in ID_url.items():
        chapter_url = f'https://vipreader.qidian.com/chapter/{data_eid}/{chapter_id}'
        page = requests.get(chapter_url, headers=headers).content.decode('utf8')
        paragraphs = pq(page)(".read-content.j_readContent p")

        # Open the output once per chapter instead of once per paragraph.
        with open(output_path, encoding='utf8', mode='a+') as f:
            f.write(str(title) + '\n')
            if len(paragraphs) == 0:
                # No paragraphs matched: only the title is written.
                continue
            # The length check does not depend on the paragraph being
            # written, so evaluate it once rather than on every iteration.
            if len(paragraphs.text()) <= 1500:
                f.write('对不起,未订阅' + '\n')
                # SystemExit propagates through the with-block, which still
                # closes (and flushes) the file.
                sys.exit()
            # Iterating yields the raw elements; .text is the element's
            # leading text.  Written one per line to mirror the page layout.
            for paragraph in paragraphs:
                f.write(str(paragraph.text) + '\n')
# Free chapters
def down(ID_url, output_path=r'D:\Users\MSI-PC\Desktop\123.txt'):
    """Append the free chapters listed in *ID_url* to the output text file.

    For each chapter the page is fetched, the chapter title is written, and
    then every ``<p>`` inside ``.read-content.j_readContent`` is written on
    its own line.

    Args:
        ID_url: Mapping of chapter title -> chapter URL suffix (appended to
            ``https://read.qidian.com/chapter/``).
        output_path: File the text is appended to (UTF-8).  Defaults to the
            path that was previously hard-coded.
    """
    for title, chapter_path in ID_url.items():
        chapter_url = 'https://read.qidian.com/chapter/' + chapter_path
        page = requests.get(chapter_url).content.decode('utf8')
        paragraphs = pq(page)(".read-content.j_readContent p")

        # Open the output once per chapter instead of once per paragraph.
        with open(output_path, encoding='utf8', mode='a+') as f:
            f.write(str(title) + '\n')
            # Iterating yields the raw elements; .text is the element's
            # leading text.
            for paragraph in paragraphs:
                f.write(str(paragraph.text) + '\n')
if __name__ == '__main__':
    # Script entry point: read the book title from stdin, then download it.
    namebook = input('请输入小说名称: ')
    load(name=namebook)