# Qidian (起点中文网) novel crawler written in Python.
# Note: for learning purposes only; do not use it for anything else.
import requests
import re
import os
from lxml import etree
# Browser-like request headers passed to the HTTP requests made by this script.
head = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/78.0.3904.97 Safari/537.36"
    ),
}
def get_page(book_id):
    """Download every chapter of a Qidian novel into a folder named after it.

    The chapter catalogue is fetched from the ``ajax/book/category`` endpoint,
    the novel title is scraped from the book's info page, and each chapter
    page is saved as ``./<novel name>/<chapter name>.txt`` (UTF-8 encoded).

    Errors are reported with ``print`` instead of being raised: a failure
    while fetching the catalogue or the title aborts the whole download,
    while a failure on a single chapter only skips that chapter.

    Args:
        book_id: The Qidian book id as a string, e.g. ``'1013414929'``.
    """
    # Catalogue endpoint, e.g.
    # https://book.qidian.com/ajax/book/category?bookId=1013414929
    b_url = 'https://book.qidian.com/ajax/book/category?bookId='
    # Chapter pages, e.g.
    # https://read.qidian.com/chapter/9KwLON5H3DQKgXB091LLaA2/EftiSjrby1j6ItTi_ILQ7A2
    b_p_url = 'https://read.qidian.com/chapter/'
    # Seconds to wait for a response so a stalled connection cannot hang forever.
    timeout = 30

    try:
        r = requests.get(b_url + book_id, headers=head, timeout=timeout)
        r.raise_for_status()
        r.encoding = 'utf-8'
        print('获取完成!。。。')
        # The captured "cU" field of each catalogue entry is the chapter's
        # URL suffix, appended to b_p_url below.
        page_list = re.findall(r'{"uuid":\d+,"cN":".+?","uT":".+?","cnt":\d+,"cU":"(.+?)","id":\d+,"sS":\d}', r.text)

        # Scrape the novel title from the info page; it names the output folder.
        n_url = 'https://book.qidian.com/info/' + book_id
        r1 = requests.get(n_url, headers=head, timeout=timeout)
        r1.raise_for_status()
        r1.encoding = 'utf-8'
        novel_name = _safe_filename(
            etree.HTML(r1.text).xpath('/html/body/div/div[6]/div[1]/div[2]/h1/em/text()')[0]
        )
        # exist_ok lets a re-run reuse the folder instead of aborting.
        os.makedirs('./%s' % novel_name, exist_ok=True)
    except Exception as result:
        # Without the catalogue or the output folder nothing can be saved.
        print(result)
        return

    for each in page_list:
        d_url = b_p_url + each
        try:
            # The keyword must be ``headers``; ``header`` makes requests.get
            # raise TypeError for every chapter.
            r2 = requests.get(d_url, headers=head, timeout=timeout)
            r2.raise_for_status()
            r2.encoding = 'utf-8'
            # Parse the page once and reuse the tree for both queries.
            doc = etree.HTML(r2.text)
            # Chapter body: one text node per paragraph.
            content = '\n'.join(
                doc.xpath('//div[@class="read-content j_readContent"]/p/text()')
            )
            # Chapter title: the second match of this XPath.
            p_name = _safe_filename(doc.xpath('//div//h3/span/text()[1]')[1])
            # Explicit UTF-8 so Chinese text is written correctly regardless
            # of the platform's default encoding.
            with open('./%s/%s.txt' % (novel_name, p_name), 'w', encoding='utf-8') as f:
                f.write(content)
            print("finish")
        except Exception as results:
            # Best effort: report the failure and move on to the next chapter.
            print(results)


def _safe_filename(name):
    """Return *name* stripped, with characters invalid in file names replaced by '_'."""
    return re.sub(r'[\\/:*?"<>|\r\n\t]', '_', name).strip()
def main():
    """Pull the book id out of the hard-coded catalogue link and download the novel."""
    # The link could instead be read interactively with input(); a book page
    # link such as https://book.qidian.com/info/1013414929#catlog carries the
    # same id.
    catalogue_link = 'https:https://book.qidian.com/ajax/book/category?bookId=1013414929'
    # The first run of digits found in the link is used as the book id.
    digit_runs = re.findall(r'(\d+)', catalogue_link)
    get_page(digit_runs[0])
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# Reproduction without permission is prohibited.