import os
import threading
from queue import Empty, Queue

import requests
from lxml import etree
class Biquge(threading.Thread):
def __init__(self,url=None,name=None,q_novels=None):
super().__init__()
self.url = url
self.name = name
self.q_novel = q_novels
self.proxies = self.get_proxies()
# self.parse()
def get_proxies(self):
try:
response = requests.get('http://localhost:5000/get')
proxy = response.text
proxies = {
'http': 'http://' + proxy
}
return proxies
except Exception:
return None
def get_xpath_by_requests(self,url, proxies):
'''
:param url:
:param proxies: 代理字典
:return:
'''
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
'Cookie': '_abcde_qweasd=0; _abcde_qweasd=0; bdshare_firstime=1577178973028; Hm_lvt_169609146ffe5972484b0957bd1b46d6=1577178973,1577186563,1577186739,1577235413; BAIDU_SSP_lcr=https://www.baidu.com/link?url=AvLJGcMiHKBXi90P2T0xOluezhPz2PeeTLAbP75dmma&wd=&eqid=e131d391001338d8000000025e02b3d2; Hm_lpvt_169609146ffe5972484b0957bd1b46d6=1577235422',
'Referer': 'http://www.xbiquge.la/'
}
response = requests.get(url, headers=headers, proxies=proxies)
return etree.HTML(response.content.decode('utf-8'))
except Exception:
new_proxies = self.get_proxies()
print('更换{}代理ip!'.format(new_proxies))
return self.get_xpath_by_requests(url, new_proxies)
def get_text(self,text):
if text:
return text[0]
return ''
def write_to_txt(self,text, book_name):
filename = './book/' + book_name
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
os.mkdir(dirname)
with open(filename, 'a+', encoding='utf-8') as fp:
fp.write(text)
def parse_chapter(self,url):
url = 'http://www.xbiquge.la' + url
html = self.get_xpath_by_requests(url, self.proxies)
chapter_name = self.get_text(html.xpath('//div[@class="bookname"]/h1/text()'))
book_name = self.get_text(html.xpath('//div[@class="con_top"]/a[last()]/text()'))
# print(chapter_name,book_name)
contents = html.xpath('//div[@id="content"]/text()')
# print(type(contents))
# content = ''
content = ''.join(contents)
text = chapter_name + r'\n' + content
self.write_to_txt(text, book_name)
# print(url)
# print(''.join(contents))
def parse_novel(self,url):
# 获取页面xpath对象
html = self.get_xpath_by_requests(url, self.proxies)
chapters = html.xpath('//div[@id="list"]/dl/dd/a/@href')
# print(chapters)
for chapter in chapters:
self.parse_chapter(chapter)
def get_novels(self):
html = self.get_xpath_by_requests(self.url, self.proxies)
novel_urls = html.xpath('//span[@class="s2"]/a/@href')
# print(novel_urls)
return novel_urls
# for url in novel_urls:
# self.parse_novel(url)
def run(self):
while True:
if self.q_novel.empty():
break
novel_url = self.q_novel.get()
print('======={}==========@{}'.format(novel_url,self.name))
self.parse_novel(novel_url)
if __name__ == '__main__':
    base_url = 'http://www.xbiquge.la/xuanhuanxiaoshuo/'

    # Scrape the listing page once, synchronously, for every novel link.
    lister = Biquge(url=base_url)
    novel_urls = lister.get_novels()

    # Load all novel URLs into the shared work queue.
    q_novels = Queue()
    for novel_url in novel_urls:
        q_novels.put(novel_url)

    # Start one named worker thread per entry; each drains the queue.
    crawl_list = ['aa', 'bb', 'cc', 'dd']
    for worker_name in crawl_list:
        worker = Biquge(name=worker_name, q_novels=q_novels)
        worker.start()