Python thread pool: the wait method can wait for all thread-pool tasks to complete, but the program does not exit after they finish...
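For context before the code: concurrent.futures.wait only blocks until the futures explicitly passed to it have finished; it knows nothing about futures submitted later from inside a running task, and it never shuts the pool down. A minimal sketch of that behavior (the parent/child task names here are illustrative, not part of the spider below):

from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

executor = ThreadPoolExecutor(max_workers=2)

def child():
    print('child running')

def parent():
    # This future is never added to task_list, so wait() below cannot see it
    executor.submit(child)

task_list = [executor.submit(parent)]
# Returns as soon as parent() completes, even if child() is still queued
wait(task_list, return_when=ALL_COMPLETED)
print('wait() returned')
# Only an explicit shutdown joins the worker threads
executor.shutdown(wait=True)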

#!/usr/bin/python3
# _*_ coding:utf-8 _*_
# __author__ : __ajiang__
# 2020/5/1

import os
import re

import requests
from urllib import parse
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

from scrapy import Selector
from fake_useragent import UserAgent

from hanhan_spider.models import *

# Base URL of the biquge novel site
domain = 'https://www.yqzww.net/'

executor = ThreadPoolExecutor(max_workers=10)

def get_chapter_id(url):
    """
    :param url: chapter URL
    :return: chapter ID
    """
    # Escape the dot so ".html" is matched literally
    res = re.match(r'.*/([0-9]+)\.html', url)
    if res:
        return int(res.group(1))
    return None

def parse_novel_detail(novel_url):
    """
    :param novel_url: novel detail page URL
    :return: None
    """
    headers['User-Agent'] = ua.random
    novel_detail_html = requests.get(novel_url, headers=headers)
    novel_detail_html.encoding = 'gbk'
    novel_detail_text = novel_detail_html.text
    sel = Selector(text=novel_detail_text)
    novel = NovelContent()
    # The novel ID is the numeric part of the "book_xxxxx" path segment
    novel_id = novel_url.split('/')[-2]
    _novel_id = novel_id.split('_')[1]
    novel.novel_id = _novel_id
    novel_status = sel.xpath('//meta[@property="og:novel:status"]/@content').extract()
    if novel_status:
        novel.status = novel_status[0]
    novel_image = sel.xpath('//meta[@property="og:image"]/@content').extract()
    if novel_image:
        novel.image = novel_image[0]
    novel_name = sel.xpath('//meta[@property="og:novel:book_name"]/@content').extract()
    if novel_name:
        novel.name = novel_name[0]
        print('Start crawling novel: {}'.format(novel_name[0]))
    novel_author = sel.xpath('//meta[@property="og:novel:author"]/@content').extract()
    if novel_author:
        novel.author = novel_author[0]
    novel_update_time = sel.xpath('//meta[@property="og:novel:update_time"]/@content').extract()
    if novel_update_time:
        novel.last_update = datetime.strptime(novel_update_time[0], '%Y-%m-%d %H:%M:%S')
    novel_description = sel.xpath('//meta[@property="og:description"]/@content').extract()
    if novel_description:
        novel.description = novel_description[0]
    # Check whether this novel is already in the database
    _novel = NovelContent.select().where(NovelContent.novel_id == _novel_id)
    if _novel:
        novel.save()
    else:
        novel.save(force_insert=True)
    print('Finished crawling novel {}'.format(novel_name[0]))
    # Whether newly added chapters need re-crawling is not decided here;
    # all of that is handed over to the chapter-parsing function
    novel_chapter_urls = sel.xpath('//div[@class="article-list"]//dd')
    for novel_chapter_url in novel_chapter_urls:
        if novel_chapter_url.xpath('.//a/@href').extract():
            chapter_url = novel_chapter_url.xpath('.//a/@href').extract()[0]
            executor.submit(parse_novel_chapter, parse.urljoin(novel_url, chapter_url))

def parse_novel_chapter(chapter_url):
    """
    :param chapter_url: chapter page URL
    :return: None
    """
    headers['User-Agent'] = ua.random
    html = requests.get(chapter_url, headers=headers)
    # The site serves gbk-encoded pages
    html.encoding = 'gbk'
    page_text = html.text
    sel = Selector(text=page_text)
    chapter = NovelChapter()
    # Extract the chapter ID from the chapter URL
    result_chapter = re.match(r'.*/([0-9]+)\.html', chapter_url)
    chapter_id = 0
    if result_chapter:
        chapter_id = int(result_chapter.group(1))
        result_chapter = NovelChapter.select().where(NovelChapter.chapter_id == chapter_id)
    result_novel = re.match(r'.*/.*_([0-9]+)/.*\.html', chapter_url)
    novel_id = 0
    if result_novel:
        novel_id = int(result_novel.group(1))
        res_novel = NovelContent.select().where(NovelContent.novel_id == novel_id)
    # The anchor texts below are the site's Chinese "next chapter" / "previous chapter" links
    chapter_next_btn = sel.xpath('//div[@class="bottem"]//a[contains(text(), "下一章")]/@href').extract()
    chapter_pre_btn = sel.xpath('//div[@class="bottem"]//a[contains(text(), "上一章")]/@href').extract()
    if not result_chapter:
        chapter.chapter_id = chapter_id
        try:
            if result_novel:
                # .get() returns a model instance; the bare query object
                # cannot be assigned to the foreign key
                chapter.novel = res_novel.get()
        except Exception:
            chapter.novel = novel_id
        chapter_title = sel.xpath('//div[@class="bookname"]/h1/text()').extract()
        if chapter_title:
            chapter.title = chapter_title[0]
            print('Start crawling chapter: {}'.format(chapter_title[0]))
        chapter_content = sel.xpath('//div[@id="content"]').extract()
        if chapter_content:
            chapter.content = chapter_content[0]
        # Guard against empty lists before indexing [0]
        if chapter_next_btn and '.html' in chapter_next_btn[0]:
            chapter.next_chapter = get_chapter_id(chapter_next_btn[0])
        if chapter_pre_btn and '.html' in chapter_pre_btn[0]:
            chapter.pre_chapter = get_chapter_id(chapter_pre_btn[0])
        chapter.save(force_insert=True)
        print('Finished crawling chapter {}'.format(chapter_title[0]))
    # Queue the next chapter for parsing
    if chapter_next_btn and '.html' in chapter_next_btn[0]:
        executor.submit(parse_novel_chapter, parse.urljoin(domain, chapter_next_btn[0]))
    else:
        return False

if __name__ == '__main__':
    # Use a random User-Agent for every request
    ua_location = os.path.dirname(os.path.dirname(__file__)) + '/fake-useragent.json'
    ua = UserAgent(path=ua_location)
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en,zh;q=0.9,ar;q=0.8,zh-CN;q=0.7,zh-TW;q=0.6,zh-HK;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "www.yqzww.net",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "cross-site",
        "Upgrade-Insecure-Requests": "1"
    }
    # Task list: submit the seed task to the thread pool
    task_list = [executor.submit(parse_novel_detail, 'https://www.yqzww.net/book_87570/')]
    wait(task_list, return_when=ALL_COMPLETED)
    # parse_novel_chapter('https://www.yqzww.net/book_87570/33206060.html')

Teacher, this is a spider I wrote that crawls a novel's details and its chapter list. At the end I used

wait(task_list, return_when=ALL_COMPLETED)

but when it reaches the last chapter the program does not exit. At the end of the parse_novel_chapter function I have this check:

# Queue the next chapter for parsing
if chapter_next_btn and '.html' in chapter_next_btn[0]:
    executor.submit(parse_novel_chapter, parse.urljoin(domain, chapter_next_btn[0]))
else:
    return False

When execution reaches return False the program just stalls and never exits. Should I be submitting some kind of signal to the executor? Could you please take a look?
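A note on what wait() actually covers here: task_list contains only the single parse_novel_detail future, and the chapter futures submitted recursively inside parse_novel_chapter are never added to it, so nothing in the main thread ever accounts for them; the pool is also never shut down explicitly. One common pattern for this submit-from-inside-a-task situation is to record every future in a shared list and keep waiting until a full round of waiting produces no new futures, then shut the pool down. A minimal sketch, not the author's code (submit_tracked and wait_for_everything are hypothetical helpers introduced here for illustration):

import threading
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

executor = ThreadPoolExecutor(max_workers=10)
all_futures = []                 # every future ever submitted
futures_lock = threading.Lock()

def submit_tracked(fn, *args):
    # Submit a task and remember its future so the main thread can wait on it.
    # Use this instead of executor.submit everywhere, including inside tasks.
    future = executor.submit(fn, *args)
    with futures_lock:
        all_futures.append(future)
    return future

def wait_for_everything():
    # Tasks submit new tasks, so a single wait() is not enough: keep waiting
    # until one round completes without the futures list growing.
    while True:
        with futures_lock:
            pending = list(all_futures)
        wait(pending, return_when=ALL_COMPLETED)
        with futures_lock:
            if len(all_futures) == len(pending):
                break  # no new tasks were submitted during this round
    executor.shutdown(wait=True)  # join the workers so the process can exit

Note that calling executor.shutdown() directly after the original wait(task_list, ...) would not help: once shutdown has been called, the recursive executor.submit inside parse_novel_chapter raises RuntimeError, so the chapter chain would be cut off instead of allowed to finish.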
