Python material library: a multithreaded Python crawler for material and rar downloads

A multithreaded material-download crawler: it fetches each material's rar package, preview images, and text description, and applies the classic producer-consumer pattern. It does not yet download everything and tends to hang, so treat it as a work in progress awaiting fixes, shared here for reference and study; if you have a better multithreading approach, feel free to discuss!

A few key points:

1. Fetching the material images

A detail page may have no images, a single image, or several, so every case has to be handled (a standalone sketch of this step follows the snippet below).

Fetch the image sources with XPath:

imgs = req.xpath('//div[@class="contentinfo"]/table//@src')

Check whether any images exist:

if imgs:

Iterate over the images:

for img in imgs:

Get each image's file extension:

suffix = os.path.splitext(img)[1]
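Putting those pieces together, a minimal standalone sketch of the image-extraction step might look like this (the URL prefix and XPath are the ones used by the crawler below; extract_images is a hypothetical helper name):

import os
import requests
from lxml import etree

def extract_images(detail_url, headers=None):
    """Return a list of (image_url, local_name) pairs; empty if the page has no images."""
    html = requests.get(detail_url, headers=headers, timeout=6).content.decode("gbk")
    req = etree.HTML(html)
    # Zero, one, or many src attributes may match
    imgs = req.xpath('//div[@class="contentinfo"]/table//@src')
    results = []
    for i, img in enumerate(imgs, 1):
        suffix = os.path.splitext(img)[1]  # e.g. '.jpg' or '.png'
        results.append((f'http://www.uimaker.com{img}', f'{i}{suffix}'))
    return results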

2. Checking whether the material can be downloaded (download permission)

If it can, grab the download data, namely the download link and the package name; otherwise return an empty value. A wrapped-up sketch of this check follows the snippet below.

if int(req.xpath('//div[@class="download"]/dl[@class="downlink"]/dd[1]/b/text()')[0]) == 0:

down_url = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dt/li/a/@href')[0]

down_name = f'{h2}/{h2}.rar'

down_data = down_url, down_name

print(down_data)

else:

down_data = []
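Wrapped up as a function, the check could look like this (get_download_info is a hypothetical name; it assumes, as the crawler does, that the first dd/b inside the downlink block holds the points required, and that 0 means freely downloadable):

def get_download_info(req, h2):
    """Return (down_url, down_name) when downloadable, else an empty list.

    req is an already-parsed lxml.etree.HTML tree of the detail page.
    """
    points = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dd[1]/b/text()')
    if points and int(points[0]) == 0:
        down_url = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dt/li/a/@href')[0]
        return down_url, f'{h2}/{h2}.rar'  # package saved as <title>/<title>.rar
    return []                              # no permission: empty, as in the crawler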

3. Using the queue, the usual caveat

A queue passes exactly one object per put(), so the three result sets are packed into a single tuple (a short demonstration follows the snippet below):

data = text_data, img_data, down_data

self.down_queue.put(data)
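A minimal demonstration of that pack/unpack round-trip (the values are hypothetical):

from queue import Queue

q = Queue()
text_data = ('Some title', 'Some title\nDescription text')
img_data = [('http://www.uimaker.com/img/1.jpg', '1.jpg')]
down_data = ('http://www.uimaker.com/pkg.rar', 'Some title/Some title.rar')

q.put((text_data, img_data, down_data))    # one object in...
text_data, img_data, down_data = q.get()   # ...three values back out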

4. A few settings that keep the program running

a. A sleep delay between requests:

time.sleep(1)

b. Sizing the queue containers

As a rule, size them generously; whichever queue receives more items gets the larger capacity (the reason is demonstrated just after these lines):

page_queue = Queue(1000)

down_queue = Queue(1500)
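Why capacity matters: a bounded Queue makes put() block once it is full, stalling the producers. A tiny demonstration of that behavior:

from queue import Queue, Full

q = Queue(2)  # capacity of two items
q.put('a')
q.put('b')
try:
    q.put('c', timeout=0.1)  # full: raises queue.Full after the timeout
except Full:
    print('queue is full; a plain put() here would block the producer')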

Run output, crawl results, and downloaded material samples: (screenshots omitted)

Full source code:

# -*- coding: UTF-8 -*-
# 20200516 by WeChat official account: 二爷记
import os
import time
import threading
from queue import Queue

import requests
from lxml import etree
from fake_useragent import UserAgent


# Producer: crawls list pages and detail pages, pushes parsed data onto down_queue
class Producer(threading.Thread):
    def __init__(self, page_queue, down_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.down_queue = down_queue
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse(url)

    def parse(self, url):
        print(f'>>> Crawling list page {url} ...')
        response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk")
        time.sleep(1)
        req = etree.HTML(response)
        urllist = req.xpath('//dl[@class="imglist"]/dt/ul[@class="listimg"]/li/span[@class="listpic"]/a/@href')
        print(len(urllist))
        print(urllist)
        for href in urllist:
            try:
                self.parse_page(href)
            except Exception as e:
                print(f'Failed to fetch detail-page data, error: {e}')

    def parse_page(self, url):
        print(f'>>> Crawling detail page {url} ...')
        response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk")
        time.sleep(1)
        req = etree.HTML(response)
        h2 = req.xpath('//div[@class="arcinfo"]/h2/text()')[0]
        print(h2)
        article = req.xpath('//div[@class="contentinfo"]/table//text()')
        article = ''.join(article).strip()
        print(article)
        texts = f'{h2}\n{article}'
        text_data = h2, texts
        # Collect every image; initialize the list so pages without
        # images don't leave img_data unbound further down
        img_data = []
        imgs = req.xpath('//div[@class="contentinfo"]/table//@src')
        if imgs:
            i = 1
            for img in imgs:
                img_url = f'http://www.uimaker.com{img}'
                suffix = os.path.splitext(img)[1]
                img_name = f'{i}{suffix}'
                img_data.append((img_url, img_name))
                print((img_url, img_name))
                i = i + 1
        # A required-points value of 0 means the package can be downloaded
        if int(req.xpath('//div[@class="download"]/dl[@class="downlink"]/dd[1]/b/text()')[0]) == 0:
            down_url = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dt/li/a/@href')[0]
            down_name = f'{h2}/{h2}.rar'
            down_data = down_url, down_name
            print(down_data)
        else:
            down_data = []
        # The queue takes a single object, so pack everything into one tuple
        data = text_data, img_data, down_data
        self.down_queue.put(data)


# Consumer: pulls parsed data off down_queue and writes text, images and rar files
class Consumer(threading.Thread):
    def __init__(self, page_queue, down_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.down_queue = down_queue
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    def run(self):
        while True:
            # Note: this empty() check races with the producers, so a consumer
            # can still block forever in get() below; that is the hang
            # described at the top of this post
            if self.page_queue.empty() and self.down_queue.empty():
                break
            text_data, img_data, down_data = self.down_queue.get()
            h2, texts = text_data
            os.makedirs(f'{h2}/', exist_ok=True)  # create the output directory
            self.get_text(h2, texts)
            for img_url, img_name in img_data:
                self.get_downimg(h2, img_url, img_name)
            if down_data:
                down_url, down_name = down_data
                self.down(down_url, down_name)

    # Save the text description
    def get_text(self, h2, texts):
        print("Saving text content...")
        with open(f'{h2}/{h2}.txt', 'w', encoding="utf-8") as f:
            f.write(texts)
        print(">>> Text content saved!")

    # Download an image
    def get_downimg(self, h2, img_url, img_name):
        print("Downloading image...")
        r = requests.get(img_url, headers=self.headers, timeout=6)
        time.sleep(1)
        with open(f'{h2}/{img_name}', 'wb') as f:
            f.write(r.content)
        print(">>> Image downloaded!")

    # Download the rar material package
    def down(self, down_url, down_name):
        print("Downloading material package...")
        r = requests.get(down_url, headers=self.headers, timeout=6)
        time.sleep(1)
        with open(down_name, 'wb') as f:
            f.write(r.content)
        print(">>> Material package downloaded!")


def main():
    page_queue = Queue(1000)
    down_queue = Queue(1500)
    for i in range(1, 71):
        url = f"http://www.uimaker.com/uimakerdown/list_36_{i}.html"
        print(f'>>> Queuing list page {i}: {url}')
        page_queue.put(url)
    for x in range(3):
        t = Producer(page_queue, down_queue)
        t.start()
    for x in range(6):
        t = Consumer(page_queue, down_queue)
        t.start()


if __name__ == '__main__':
    main()
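On the hang mentioned at the top: the consumer's empty() check races with the producers, so a consumer can pass the check and then block forever in get() on a queue that never receives another item. One common remedy, sketched here as a suggestion rather than as part of the original code, is to shut consumers down with sentinel values instead:

SENTINEL = None  # pushed once per consumer after all producers finish

def consumer_loop(down_queue):
    while True:
        data = down_queue.get()
        if data is SENTINEL:
            break  # producers are done and the queue is drained
        # ... save text, images and rar package as Consumer.run does ...

# In main(), after join()ing all producer threads:
#     for _ in range(number_of_consumers):
#         down_queue.put(SENTINEL)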
