#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/12/7 15:06
# @Author : huni
# @File : 高质量多线程.py
# @Software: PyCharm
import os
import re
import time
from queue import Empty, Queue
from threading import Thread
from urllib import parse

import requests
from lxml import etree
# Fetch listing pages.
class CrawlInfo(Thread):
    """Worker thread: takes listing-page URLs from ``url_queue``, fetches
    each one, and puts the HTML text onto ``html_queue``.

    Relies on the module-level ``headers`` dict defined in ``__main__``.
    """

    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        # Shared work queues: URLs still to fetch / fetched HTML documents.
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        # Drain the queue with a non-blocking get().  The original
        # ``while not empty(): get()`` pattern races between threads: two
        # workers can both see one remaining item and the loser blocks
        # forever inside get().
        while True:
            try:
                url = self.url_queue.get(block=False)
            except Empty:
                break
            response = requests.get(url=url, headers=headers)
            if response.status_code == 200:
                # Only enqueue pages that were fetched successfully.
                self.html_queue.put(response.text)
# Extract the album list from each listing page.
class ParseInfo(Thread):
    """Worker thread: parses listing HTML from ``html_queue``, extracts the
    album links, fetches each album page, and puts its HTML onto
    ``detail_queue``.

    Relies on the module-level ``headers`` dict defined in ``__main__``.
    """

    def __init__(self, html_queue, detail_queue):
        Thread.__init__(self)
        self.html_queue = html_queue
        self.detail_queue = detail_queue

    def run(self):
        # Non-blocking drain; see CrawlInfo.run for why empty()+get() races.
        while True:
            try:
                listing_html = self.html_queue.get(block=False)
            except Empty:
                break
            tree = etree.HTML(listing_html)
            # set() deduplicates repeated album links on the same page.
            href_set = set(tree.xpath('//ul[@class="photo_ul"]//@href'))
            for href in href_set:
                album_url = 'https://www.nvshens.org' + href
                album_html = requests.get(url=album_url, headers=headers).text
                # BUG FIX: the original wrote to the module-level global
                # ``detail_queue`` instead of the instance attribute, so the
                # class only worked by accident when run from this script.
                self.detail_queue.put(album_html)
# Walk an album's gallery pages and download every image.
class DetailInfo(Thread):
    """Worker thread: consumes album-page HTML from ``detail_queue``, reads
    the photo count and title, then downloads every full-size image into a
    per-album directory under the module-level ``search_path``.
    """

    def __init__(self, detail_queue):
        Thread.__init__(self)
        self.detail_queue = detail_queue

    def run(self):
        # Per-thread headers: the site appears to require a Referer.
        headers = {
            'Referer': 'https://www.nvshens.org/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        # Non-blocking drain; see CrawlInfo.run for why empty()+get() races.
        while True:
            try:
                album_html = self.detail_queue.get(block=False)
            except Empty:
                break
            tree = etree.HTML(album_html)
            # First number in the #dinfo span — presumably the total photo
            # count of the album; TODO confirm against a live page.
            photo_count = int(re.findall(r'(\d+)', tree.xpath('//*[@id="dinfo"]/span/text()')[0])[0])
            page_href = tree.xpath('//*[@id="pages"]/a[2]/@href')[0]
            title = tree.xpath('/html/head/title/text()')[0]
            title_path = search_path + '/' + title
            # makedirs(exist_ok=True) instead of exists()+mkdir(): with 10
            # worker threads the check-then-create pattern can race and
            # raise FileExistsError.
            os.makedirs(title_path, exist_ok=True)
            # // 3 + 2 suggests roughly 3 photos per gallery page, plus one
            # extra page for the remainder — TODO confirm.
            for page in range(1, photo_count // 3 + 2):
                page_url = 'https://www.nvshens.org' + page_href.replace(page_href.split('/')[-1], f'{page}.html')
                page_html = requests.get(url=page_url, headers=headers).text
                page_tree = etree.HTML(page_html)
                for src in page_tree.xpath('//ul[@id="hgallery"]//@src'):
                    # Rewrite the thumbnail URL into the full-size image URL.
                    full_src = src.replace('img', 't1').replace('/s', '')
                    jpg_data = requests.get(url=full_src, headers=headers).content
                    jpg_name = full_src.split('/')[-1]
                    jpg_path = title_path + '/' + jpg_name
                    with open(jpg_path, 'wb') as fp:
                        fp.write(jpg_data)
                    print(jpg_name, '下载完成')
if __name__ == '__main__':
    start = time.time()
    headers = {
        'Connection': 'keep-alive',
        'Referer': 'https://www.nvshens.org/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }
    # Shared pipeline queues: listing URLs -> listing HTML -> album HTML.
    url_queue = Queue()
    html_queue = Queue()
    detail_queue = Queue()

    # Search keyword; change this to crawl a different subject.
    kw = 'xxx'
    keyword = parse.quote(kw, encoding='utf-8')
    search_url = f'https://www.nvshens.org/girl/search.aspx?name={keyword}'
    resp = requests.get(url=search_url, headers=headers).text
    find_link = re.findall(r'<a style="line-height:19px;" href=\'(.*?)\' target=', resp)
    url = 'https://www.nvshens.org' + find_link[0] + 'album/'

    # Output directory, read by the DetailInfo workers via the module global.
    # makedirs creates the parent './搜索' too — plain mkdir would fail when
    # it does not exist yet.
    search_path = './搜索' + f'/{kw}'
    os.makedirs(search_path, exist_ok=True)

    # Read the result count ("共N部") and enqueue every listing page
    # (apparently 30 albums per page — TODO confirm).
    response = requests.get(url=url, headers=headers).text
    tree = etree.HTML(response)
    search_info = tree.xpath('//*[@id="post"]/div[2]/div/text()')[0]
    page_num = int(re.findall(r'共(.*?)部', search_info)[0])
    for i in range(1, page_num // 30 + 2):
        url_queue.put(url + f'{i}.html')

    # Stage 1: fetch listing pages with 5 threads, wait for completion.
    crawl_list = [CrawlInfo(url_queue, html_queue) for _ in range(5)]
    for crawl in crawl_list:
        crawl.start()
    for crawl in crawl_list:
        crawl.join()

    # Stage 2: parse listings / fetch album pages with 10 threads.
    # NOTE: loop variable renamed from ``parse`` — the original shadowed the
    # imported ``urllib.parse`` module.
    parse_list = [ParseInfo(html_queue, detail_queue) for _ in range(10)]
    for parser_thread in parse_list:
        parser_thread.start()
    for parser_thread in parse_list:
        parser_thread.join()

    # Stage 3: download images with 10 threads.
    detail_list = [DetailInfo(detail_queue) for _ in range(10)]
    for detail in detail_list:
        detail.start()
    for detail in detail_list:
        detail.join()

    # Total elapsed time in seconds.
    print(time.time() - start)
# python 爬虫 全站高质量数据内容快速爬取 要素过多 建议收藏
# 最新推荐文章于 2024-05-06 13:16:59 发布