多线程
效率
在线程中,访问一些全局变量, 加锁是一个经常的过程。如果你是想把一些数据存储到某队列中,那么python内置了一个线程安全模块叫作queue模块。Python中的queue模块中提供了同步线程安全的队列类, 包括FIFO(先进先出) 队列Queue , LIFO(后入先出) 队列LifoQueue 。这些队列都实现了锁原理(可以理解为原子操作, 即要么不做,要么都做完),能够在多线程中直接使用。可以使用队列来实现线程间的同步。
例:食堂打饭 先进先出
容器里放书 先进后出
初始化Queue(maxsize):创建一个先进先出的队列
empty():判断队列是否为空
full():判断队列是否满了
get():从队列中取一个数据
put():将一个数据放到队列中
普通方式爬取王者荣耀高清壁纸.py
# @ Time : 2021/3/7 18:58
# @ Author : Ellen
# https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page=0&iOrder=0&iSortNumClose=1&jsoncallback=jQuery17100193411045169809_1615112629840&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1615112630011
# from urllib import parse
# img = parse.unquote('http%3A%2F%2Fshp%2Eqpic%2Ecn%2Fishow%2F2735030216%2F1614674580%5F84828260%5F31404%5FsProdImgNo%5F1%2Ejpg%2F200')
# img = parse.unquote('http%3A%2F%2Fshp%2Eqpic%2Ecn%2Fishow%2F2735030216%2F1614674580%5F84828260%5F31404%5FsProdImgNo%5F2%2Ejpg%2F200')
#
# print(img)
# 第一步 确定目标url
# 第二步 获取高清的url把后缀200改成0
# 第三步 实现翻页 page有参数 它是从0开始的
import requests
from urllib import parse
import os
from urllib import request
# Request headers sent with every API call: a desktop-Chrome User-Agent,
# plus a pvp.qq.com referer (presumably required by the wallpaper API —
# TODO confirm whether requests succeed without it).
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
    'referer': 'https://pvp.qq.com/'
}
# 定义一个函数来处理图片的url
# Helper that turns one wallpaper record into full-resolution image URLs.
def extract_images(data):
    """Extract the eight full-size image URLs from one wallpaper record.

    The workList API returns percent-encoded thumbnail URLs under the keys
    ``sProdImgNo_1`` .. ``sProdImgNo_8``; each decoded URL ends with ``/200``
    (the thumbnail size). Rewriting that trailing ``200`` to ``0`` yields
    the full-resolution image.

    :param data: dict for one wallpaper item from the workList API response.
    :return: list of eight decoded, full-resolution image URLs.
    """
    image_urls = []
    for x in range(1, 9):
        url = parse.unquote(data['sProdImgNo_%d' % x])
        # Bug fix: only rewrite the trailing size suffix. The original
        # blanket replace('200', '0') corrupted any '200' appearing
        # elsewhere in the URL (e.g. inside a path segment or timestamp).
        if url.endswith('/200'):
            url = url[:-3] + '0'
        image_urls.append(url)
    return image_urls
def main():
    """Fetch one page of the wallpaper list and download every skin's images.

    Steps: request the workList API (with the jsoncallback parameter removed
    so the response is plain JSON), then for each skin create a folder under
    ``image/<skin name>`` and save its eight images as 1.jpg .. 8.jpg.
    """
    # Current page URL; the jsoncallback query parameter has been removed
    # so resp.json() can parse the body directly.
    page_url = 'https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page=0&iOrder=0&iSortNumClose=1&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1615193303228'
    resp = requests.get(page_url, headers=headers)
    # Parse the JSON body
    result = resp.json()
    datas = result['List']
    for data in datas:
        # Full-resolution image URLs for this skin
        image_urls = extract_images(data)
        # Decoded skin name, e.g. 露娜-瓷语鉴心
        name = parse.unquote(data['sProdName'])
        # Folder such as image/露娜-瓷语鉴心
        dirpath = os.path.join('image', name)
        # Bug fix: os.mkdir raised FileNotFoundError when the 'image'
        # parent was missing, and FileExistsError on re-runs; makedirs
        # with exist_ok=True handles both cases.
        os.makedirs(dirpath, exist_ok=True)
        # Save the images as 1.jpg, 2.jpg, ... inside the skin folder
        for index, image_url in enumerate(image_urls):
            request.urlretrieve(image_url, os.path.join(dirpath, '%d.jpg' % (index + 1)))
        print('%s下载完成' % name)


if __name__ == '__main__':
    main()
多线程方式爬取王者荣耀.py
# @ Time : 2021/3/8 20:26
# @ Author : Ellen
import requests
from urllib import parse
import os
from urllib import request
import threading
import queue
# Request headers sent with every page request: a desktop-Chrome User-Agent
# so the API treats us like a browser.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
}
# 队列 1 每一页的url地址 生产者--> 从队列1里面请求每一页的url 解析数据 获取每一页图片的url图片及名字
# 队列 2 将每张图片的url和名字放在队列2中
# 消费者 请求每张图片的url 从队列2里面下载并保存图片
# 定义生产者
# Producer: consumes page URLs from queue 1, parses each page's JSON and
# pushes one download job per image onto queue 2.
class Producer(threading.Thread):
    """Producer thread.

    Pulls page URLs from ``page_queue``, requests and parses each page,
    then pushes ``{'image_url': ..., 'image_path': ...}`` download jobs
    onto ``image_queue`` for the consumers.
    """

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue    # queue of page URLs still to fetch
        self.image_queue = image_queue  # queue of per-image download jobs

    def run(self) -> None:
        while True:
            # Bug fix: the original 'while not empty(): get()' pattern is
            # racy with several producers — another thread can drain the
            # queue between the check and the blocking get(), deadlocking
            # this one. A non-blocking get with queue.Empty is safe.
            try:
                page_url = self.page_queue.get(block=False)
            except queue.Empty:
                break
            resp = requests.get(page_url, headers=headers)
            result = resp.json()
            datas = result['List']
            for data in datas:
                image_urls = extract_images(data)
                # Strip the '1:1' prefix some names carry, e.g.
                # '1:1等身雕塑.铠' -> '等身雕塑.铠'
                name = parse.unquote(data['sProdName']).replace('1:1', ' ').strip()
                dir_path = os.path.join('image', name)
                # Bug fix: exists()+mkdir was racy between producers and
                # failed when the 'image' parent did not exist; makedirs
                # with exist_ok=True handles both.
                os.makedirs(dir_path, exist_ok=True)
                # Queue one download job per image: 1.jpg, 2.jpg, ...
                for index, image_url in enumerate(image_urls):
                    self.image_queue.put(
                        {'image_url': image_url,
                         'image_path': os.path.join(dir_path, "%d.jpg" % (index + 1))})
# 定义消费者
# Consumer: pulls download jobs from queue 2 and saves each image to disk.
class Consumer(threading.Thread):
    """Consumer thread.

    Takes ``{'image_url': ..., 'image_path': ...}`` jobs from
    ``image_queue`` and downloads each image to its path. Exits once the
    queue has stayed empty for 10 seconds (the producers are then assumed
    to be finished).
    """

    def __init__(self, image_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.image_queue = image_queue  # queue of per-image download jobs

    def run(self) -> None:
        while True:
            try:
                # Block up to 10s so slow producers can keep feeding us.
                image_obj = self.image_queue.get(timeout=10)
            except queue.Empty:
                # Narrowed from a bare except: only a drained queue ends
                # the thread; other errors should not be silently fatal.
                break
            # Image URL and the local path to save it under
            image_url = image_obj['image_url']
            image_path = image_obj['image_path']
            try:
                request.urlretrieve(image_url, image_path)
                print(image_path + '下载完成!')
            except OSError as e:
                # Bug fix: the original printed '下载完成!' (success) in
                # this failure branch too, masking every failed download.
                print('%s下载失败: %s' % (image_path, e))
def extract_images(data):
    """Extract the eight full-size image URLs from one wallpaper record.

    The workList API returns percent-encoded thumbnail URLs under the keys
    ``sProdImgNo_1`` .. ``sProdImgNo_8``; each decoded URL ends with ``/200``
    (the thumbnail size). Rewriting that trailing ``200`` to ``0`` yields
    the full-resolution image.

    :param data: dict for one wallpaper item from the workList API response.
    :return: list of eight decoded, full-resolution image URLs.
    """
    image_urls = []
    for x in range(1, 9):
        url = parse.unquote(data['sProdImgNo_%d' % x])
        # Bug fix: only rewrite the trailing size suffix. The original
        # blanket replace('200', '0') corrupted any '200' appearing
        # elsewhere in the URL (e.g. inside a path segment or timestamp).
        if url.endswith('/200'):
            url = url[:-3] + '0'
        image_urls.append(url)
    return image_urls
def main():
    """Fill the page queue, then run 3 producers and 5 consumers to
    download all wallpaper images concurrently."""
    # Queue 1: the 10 page URLs to crawl
    page_queue = queue.Queue(10)
    # Queue 2: per-image download jobs produced from those pages
    image_queue = queue.Queue(1000)
    # Robustness: create the output root up front so directory creation
    # inside the producers cannot fail on a missing parent.
    os.makedirs('image', exist_ok=True)
    # Current URL; the jsoncallback parameter has been removed so the
    # response parses as plain JSON. 'page' is 0-based.
    for x in range(0, 10):
        page_url = 'https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page={page}&iOrder=0&iSortNumClose=1&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1615193303228'.format(
            page=x)
        # Queue up this page for the producers
        page_queue.put(page_url)
    threads = []
    # Start 3 producer threads
    for _ in range(3):
        th = Producer(page_queue, image_queue)
        th.start()
        threads.append(th)
    # Start 5 consumer threads
    for _ in range(5):
        th = Consumer(image_queue)
        th.start()
        threads.append(th)
    # Bug fix: the original never joined its threads; join so main() does
    # not return while downloads are still in flight.
    for th in threads:
        th.join()


if __name__ == '__main__':
    main()