python爬虫爬取起点小说_python3爬虫-使用requests爬取起点小说

最新推荐文章于 2024-06-19 18:03:00 发布

weixin_39523280

最新推荐文章于 2024-06-19 18:03:00 发布

阅读量462

点赞数

文章标签： python爬虫爬取起点小说

import requests

from lxml import etree

from urllib import parse

import os, time

def get_page_html(url):
    """Send a GET request to ``url`` and return the response on HTTP 200.

    Relies on the module-level ``session``, ``headers`` and ``timeout``
    values that the script entry point sets up.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response object when the status code is 200; ``None`` when the
        request fails or any other status code comes back.
    """
    # The request itself is what can raise (timeouts, connection errors),
    # so it has to live inside the ``try``.
    try:
        response = session.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None

    if response.status_code == 200:
        return response
    return None

def get_next_url(resoponse):
    """Extract the absolute URL of the next chapter from a fetched page.

    Args:
        resoponse: Response returned by ``get_page_html`` (may be ``None``).

    Returns:
        The next-chapter URL joined against ``resoponse.url``, or ``None``
        when there is no usable response or the page has no matching link.
    """
    if not resoponse:
        return None

    selector = etree.HTML(resoponse.text)
    # NOTE(review): the id below is all lowercase and XPath matching is
    # case-sensitive -- confirm the exact id against the live page markup.
    hrefs = selector.xpath("//a[@id='j_chapternext']/@href")
    if not hrefs:
        return None

    # The href may be relative or scheme-less, so resolve it against the
    # URL of the page it was found on.
    return parse.urljoin(resoponse.url, hrefs[0])

def xs_content(resoponse):
    """Extract the chapter title and body text from a fetched page.

    Args:
        resoponse: Response returned by ``get_page_html`` (may be ``None``).

    Returns:
        A ``(title, paragraphs)`` tuple, where ``paragraphs`` is the list of
        text nodes found in the reading area, or ``None`` when there is no
        usable response or the title heading cannot be found.
    """
    if not resoponse:
        return None

    selector = etree.HTML(resoponse.text)
    # NOTE(review): the class names in these expressions are all lowercase
    # and XPath matching is case-sensitive -- confirm against the live page.
    titles = selector.xpath("//h3[@class='j_chaptername']/text()")
    if not titles:
        # Without a title there is no file name to store the chapter under.
        return None

    paragraphs = selector.xpath(
        "//div[contains(@class,'read-content') and contains(@class,'j_readcontent')]//p/text()")
    return titles[0], paragraphs

def write_to_txt(info_tuple: tuple, base_dir=None):
    """Write one chapter to ``<directory>/<title>.txt``, one line per paragraph.

    An existing file for the same title is left untouched, so chapters that
    were already saved are not overwritten.

    Args:
        info_tuple: ``(title, lines)`` as returned by ``xs_content``; a falsy
            value makes the call a no-op.
        base_dir: Directory to write into. Defaults to the module-level
            ``base_path`` when omitted.
    """
    if not info_tuple:
        return

    title, lines = info_tuple[0], info_tuple[1]
    directory = base_path if base_dir is None else base_dir
    # Check the same path that is written, i.e. including the ".txt" suffix.
    target = os.path.join(directory, title) + ".txt"
    if os.path.exists(target):
        return

    # The output directory may not exist yet on a first run.
    os.makedirs(directory, exist_ok=True)
    with open(target, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)

def run(url):
    """Crawl chapters starting at ``url``, following next-chapter links.

    Each page is fetched, its chapter is written to disk, and the crawl
    moves on to the next-chapter link. It stops when a page yields no
    chapter content or has no next-chapter link.

    Args:
        url: Address of the first chapter to fetch.
    """
    # Iterate instead of recursing once per chapter: a long novel would
    # otherwise run into the interpreter's recursion limit.
    while url:
        html = get_page_html(url)
        next_url = get_next_url(html)
        info_tupe = xs_content(html)
        if not info_tupe:
            break

        print("正在写入")
        write_to_txt(info_tupe)
        print("正在爬取%s" % info_tupe[0])

        # The last chapter has no next link; it is still written above.
        if not next_url:
            break

        # Delay the next request to reduce the load on the server.
        time.sleep(sleep_time)
        print("正在爬取%s" % next_url)
        url = next_url

if __name__ == '__main__':
    # Script configuration. These names are read as module-level globals by
    # the crawler functions in this file.
    sleep_time = 5  # seconds to wait between chapter requests
    timeout = 5  # timeout passed to every request
    base_path = r"d:\图片\lszj"  # directory where the chapter files are stored

    # URL of the first chapter of the novel to crawl (here: chapter one of
    # "斗破苍穹").
    # NOTE(review): the path tokens are all lowercase; confirm the exact
    # casing against the site before running.
    url = "https://read.qidian.com/chapter/8iw8dkb_ztxrzk4x-cujuw2/fwjwroiobhn4p8iew--ppw2"

    headers = {
        # Referer is sent as an absolute URL rather than a bare host name.
        "referer": "https://read.qidian.com/",
        "user-agent": "mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) chrome/72.0.3626.121 safari/537.36",
    }

    print('开始运行爬虫')
    # The context manager closes the session's connections when the crawl
    # ends, including when it ends with an exception.
    with requests.Session() as session:
        run(url)

weixin_39523280

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python爬虫爬取起点小说_python3爬虫-使用requests爬取起点小说

import requests; from lxml import etree; from urllib import parse; import os, time; def get_page_html(url): '''向url发送请求''' resoponse = session.get(url, headers=headers, timeout=timeout); try: if resoponse.status_c...
复制链接

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。