Python 3 crawler: scraping Qidian novels with requests

import requests
from lxml import etree
from urllib import parse
import os, time


def get_page_html(url):
    '''Send a request to the given URL and return the response on success.'''
    try:
        response = session.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response
    except Exception:
        return None


def get_next_url(response):
    '''Extract the URL of the next chapter from the response.'''
    if response:
        try:
            selector = etree.HTML(response.text)
            url = selector.xpath("//a[@id='j_chapterNext']/@href")[0]
            next_url = parse.urljoin(response.url, url)
            return next_url
        except IndexError:
            return None


def xs_content(response):
    '''Extract the chapter title and the chapter's paragraphs.'''
    if response:
        selector = etree.HTML(response.text)
        title = selector.xpath("//h3[@class='j_chapterName']/text()")[0]
        content_xpath = selector.xpath(
            "//div[contains(@class,'read-content') and contains(@class,'j_readContent')]//p/text()")
        return title, content_xpath


def write_to_txt(info_tuple: tuple):
    '''Write one chapter (title, paragraphs) to a .txt file named after the title.'''
    if not info_tuple:
        return
    path = os.path.join(base_path, info_tuple[0])
    if not os.path.exists(path + ".txt"):
        with open(path + ".txt", "wt", encoding="utf-8") as f:
            for line in info_tuple[1]:
                f.write(line + "\n")
            f.flush()


def run(url):
    '''Crawl one chapter, save it, then move on to the next chapter.'''
    html = get_page_html(url)
    next_url = get_next_url(html)
    info_tuple = xs_content(html)
    if next_url and info_tuple:
        print("Writing chapter to disk")
        write_to_txt(info_tuple)
        time.sleep(sleep_time)  # Delay between requests to reduce load on the server.
        print("Crawling %s" % info_tuple[0])
        print("Crawling %s" % next_url)
        run(next_url)


if __name__ == '__main__':
    session = requests.Session()
    sleep_time = 5
    timeout = 5
    base_path = r"d:\图片\lszj"  # Directory where the chapter files are saved
    url = "https://read.qidian.com/chapter/8iw8dkb_ztxrzk4x-cujuw2/fwjwroiobhn4p8iew--ppw2"  # URL of the first chapter of the novel to crawl (here, chapter 1 of 斗破苍穹, Battle Through the Heavens)
    headers = {
        "Referer": "read.qidian.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }
    print("Starting the crawler")
    run(url)
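One caveat with run() above: it calls itself once per chapter, so each chapter adds a frame to the call stack, and CPython's default recursion limit is about 1000. A novel with more chapters than that will eventually raise RecursionError. Below is a minimal sketch of an iterative variant (run_iterative is a new name introduced here for illustration); it assumes the same helper functions and module-level variables defined in the script above.

def run_iterative(start_url):
    '''Same crawl loop as run(), expressed as a while loop instead of recursion.'''
    url = start_url
    while url:
        html = get_page_html(url)
        info_tuple = xs_content(html)
        next_url = get_next_url(html)
        if info_tuple:
            print("Writing chapter %s" % info_tuple[0])
            write_to_txt(info_tuple)
        time.sleep(sleep_time)  # Same politeness delay as in run()
        url = next_url  # Becomes None when there is no "next chapter" link, which ends the loop

Calling run_iterative(url) in place of run(url) keeps the behavior of the original script while removing the stack-depth limit.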
