"""
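Multithreaded image crawler: downloads Baidu image-search results for
"彭于晏" (Eddie Peng) using producer and consumer threads.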
"""
import os
import re
import threading
from queue import Empty
from queue import Queue
from urllib import request
from urllib.parse import urlencode

import requests
class Procuder(threading.Thread):
    """Producer thread: turns search-result pages into image download jobs.

    Each thread takes page URLs from ``page_queue``, fetches the JSON
    response, and puts one ``(image_url, filename)`` tuple per usable result
    on ``imag_queue``. Downloading is left entirely to the consumer threads;
    the producer no longer downloads the image itself, which previously
    caused every image to be fetched twice.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    # Seconds to wait for a page response, so a stalled connection cannot
    # hang the thread forever.
    request_timeout = 10
    # Matches every character that is NOT an ASCII letter/digit or a
    # character in U+4E00..U+9FA5; used to strip punctuation from titles so
    # they can serve as file names. Compiled once instead of per item.
    _unsafe_chars = re.compile("[^0-9A-Za-z\u4e00-\u9fa5]")

    def __init__(self, page_queue, imag_queue, *args, **kwargs):
        """Store the two shared queues; other arguments go to ``Thread``."""
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.imag_queue = imag_queue

    def run(self):
        """Process page URLs until ``page_queue`` is drained."""
        while True:
            # A non-blocking get avoids the check-then-get race: with
            # ``empty()`` followed by a blocking ``get()``, another thread
            # could take the last URL in between and leave this one blocked.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.pares_page(url)

    def pares_page(self, url):
        """Fetch one result page and enqueue a download job per image.

        Args:
            url: Full URL of a search-result page that returns JSON.

        Network errors, non-200 responses and malformed JSON are reported
        (or silently skipped for non-200, as before) so that one bad page
        does not kill the thread.
        """
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.request_timeout
            )
        except requests.RequestException as exc:
            print(url + ' 请求失败: ' + str(exc))
            return
        response.encoding = 'utf-8'
        if response.status_code != 200:
            return
        try:
            payload = response.json()
        except ValueError as exc:
            # requests raises a ValueError subclass when the body is not JSON.
            print(url + ' 解析失败: ' + str(exc))
            return

        for item in payload.get('data') or []:
            image = item.get('hoverURL')
            if not image:
                # Missing or empty URL: nothing to download for this entry.
                print('None')
                continue
            # The title may be absent; treat that as an empty name rather
            # than passing None to the regex.
            name = self._unsafe_chars.sub('', item.get('fromPageTitleEnc') or '')
            suffix = os.path.splitext(image)[1]
            # Hand the (url, filename) tuple to the consumers.
            self.imag_queue.put((image, name + suffix))
class Consumer(threading.Thread):
    """Consumer thread: downloads the images queued on ``imag_queue``.

    Jobs are ``(image_url, filename)`` tuples; each image is saved as
    ``images/<filename>``.
    """

    # Seconds to wait for a job before checking whether any work remains.
    # NOTE: this is a heuristic. If a producer has already taken the last
    # page but needs longer than this to enqueue its images, the consumer
    # may exit early; raise the value if that is observed.
    idle_timeout = 10

    def __init__(self, page_queue, imag_queue, *args, **kwargs):
        """Store the two shared queues; other arguments go to ``Thread``."""
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.imag_queue = imag_queue

    def run(self):
        """Download queued images until no more work is expected."""
        while True:
            # A timed get replaces the ``empty()``-then-blocking-``get()``
            # pattern, which could block forever when another consumer took
            # the last job between the check and the get.
            try:
                image, filename = self.imag_queue.get(timeout=self.idle_timeout)
            except Empty:
                if self.page_queue.empty():
                    # No jobs arrived and no pages are left to produce more.
                    break
                continue
            try:
                request.urlretrieve(image, 'images/' + filename)  # download and save
            except (OSError, ValueError) as exc:
                # URLError/HTTPError and file errors are OSError subclasses;
                # ValueError covers malformed URLs. Skip the image instead
                # of letting one failure terminate the thread.
                print(filename + '下载失败: ' + str(exc))
                continue
            print(filename + '下载完成')
def main(keyword='彭于晏图片', pages=30):
    """Queue the search-result pages and start the worker threads.

    Args:
        keyword: Search term sent to the image search endpoint.
        pages: Number of result pages to queue.

    Starts 10 producer threads and 10 consumer threads. The threads are not
    joined here; they are non-daemon threads.
    """
    # Never smaller than the number of URLs queued below, so put() cannot
    # block before any worker thread is running.
    page_queue = Queue(max(200, pages))  # page URL queue capacity
    imag_queue = Queue(1000)  # image job queue capacity

    # The consumers save to images/<filename>; make sure the folder exists.
    os.makedirs('images', exist_ok=True)

    for page in range(pages):
        data = {
            # The original author notes that whitespace in this value makes
            # the server return a different page's source.
            "tn": "resultjson_com",
            # Stray leading spaces removed from "ipn" and "word".
            "ipn": "rj",
            "ct": "201326592",
            "queryWord": keyword,
            "ie": "utf-8",
            "oe": "utf-8",
            "adpicid": "",
            "copyright": "",
            "word": keyword,
            # Result offset, advancing by 30 per page. The previous
            # ``pn = pn * x`` became 0 on the first iteration and stayed 0,
            # so every queued URL pointed at the same first page.
            # NOTE(review): assumes 30 results per request - confirm.
            "pn": page * 30,
        }
        page_queue.put("https://image.baidu.com/search/acjson?" + urlencode(data))

    for _ in range(10):  # producer threads
        Procuder(page_queue, imag_queue).start()
    for _ in range(10):  # consumer threads
        Consumer(page_queue, imag_queue).start()
# Start the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()
# Multithreaded image crawling
# (Web-page residue) Latest recommended article published on 2024-04-19 10:19:46