1、多线程爬虫
①生产者和消费者模式是多线程开发中常见的一种模式。通过生产者和消费者模式,可以使每个线程的代码达到高内聚的目标,线程管理更加方便,程序分工更加明确。
②生产者线程专门用来生产一些数据,然后存放到容器(中间变量)中;消费者再从这个中间容器中取出数据进行消费。
1.1 Lock版的生产者和消费者
# Shared wallet for the Lock-based producer/consumer demo.
gMoney = 0
# Number of production rounds performed so far (the demo stops at 10).
gTime = 0
# Plain mutex guarding gMoney and gTime.
gLock = threading.Lock()
class Product(threading.Thread):
    """Producer thread for the Lock-based demo.

    Deposits a random amount (1-10) into the shared wallet ``gMoney`` on
    each round, up to a global total of 10 rounds tracked by ``gTime``.
    All shared state is guarded by ``gLock``.
    """

    def run(self):
        global gMoney, gTime
        while True:
            gLock.acquire()
            # Check the round counter *under* the lock: the original
            # unsynchronized `while gTime < 10` test let several producers
            # pass the check simultaneously and overshoot the 10-round limit.
            if gTime >= 10:
                gLock.release()
                break
            money = random.randint(1, 10)
            gMoney += money
            print('%s生成了%d元钱' % (threading.current_thread().name, money))
            gTime += 1
            gLock.release()
            # Sleep *outside* the critical section; the original slept while
            # holding the lock, blocking every other thread for a second.
            time.sleep(1)
class Custom(threading.Thread):
    """Consumer thread for the Lock-based demo.

    Repeatedly tries to withdraw a random amount (1-100) from the shared
    wallet ``gMoney``; exits once production is finished (``gTime`` >= 10)
    and the balance cannot cover the requested amount.
    """

    def run(self):
        global gMoney, gTime
        finished = False
        while not finished:
            with gLock:
                amount = random.randint(1, 100)
                if amount <= gMoney:
                    # Enough balance: withdraw it.
                    gMoney -= amount
                    print('%s消费了%d元钱' % (threading.current_thread().name, amount))
                elif gTime >= 10:
                    # Producers are done and we cannot afford this amount.
                    finished = True
                else:
                    print('%s消费了%d元钱,但是余额只有%d' % (threading.current_thread().name, amount, gMoney))
            if not finished:
                time.sleep(1)
def main():
    """Launch three producers and five consumers for the Lock demo."""
    for n in range(1, 4):
        Product(name='生产者%d号' % n).start()
    for n in range(5):
        Custom(name='消费者%d号' % n).start()


if __name__ == '__main__':
    main()
1.2 Condition版的生产者和消费者
# Shared wallet for the Condition-based producer/consumer demo.
gMoney = 0
# Number of production rounds performed so far (the demo stops at 10).
gTimes = 0
# Condition variable: provides mutual exclusion plus wait/notify, so
# consumers can sleep until a producer deposits money.
gCond = threading.Condition()
# Producer for the Condition-based demo.
class Producer(threading.Thread):
    """Deposits a random amount (0-100) each round and wakes all waiting
    consumers via the shared condition; stops after 10 global rounds."""

    def run(self) -> None:
        global gMoney, gTimes
        done = False
        while not done:
            with gCond:
                if gTimes >= 10:
                    done = True
                else:
                    earned = random.randint(0, 100)  # 0 <= earned <= 100
                    gMoney += earned
                    gTimes += 1
                    print('%s生成了%d元钱,剩余%d元' % (threading.current_thread().name, earned, gMoney))
                    # Wake every consumer blocked in gCond.wait().
                    gCond.notify_all()
            if not done:
                time.sleep(1)
# Consumer for the Condition-based demo: waits on the condition while broke.
class Consumer(threading.Thread):
    """Withdraws a random amount per round, sleeping on ``gCond`` until a
    producer deposits enough; exits once production has finished and the
    balance can no longer cover the request."""

    def run(self) -> None:
        global gMoney
        while True:
            gCond.acquire()  # lock the shared state
            money = random.randint(0, 100)  # 0 <= money <= 100
            # Standard condition-variable pattern: re-check the predicate
            # after every wakeup, since another consumer may have spent
            # the freshly produced money first.
            while gMoney < money:
                if gTimes >= 10:
                    gCond.release()
                    # `break` would only leave the inner while-loop; `return`
                    # ends the whole thread once production is over.
                    return
                print('%s消费了%d元钱,但是余额只有%d元了。生产者也不在生成了' % (threading.current_thread().name, money, gMoney))
                gCond.wait()
            # Enough balance: consume.
            gMoney -= money
            print('%s消费了%d元钱,剩余%d元' % (threading.current_thread().name, money, gMoney))
            gCond.release()  # unlock
            time.sleep(1)
def main():
    """Launch five producers and five consumers for the Condition demo."""
    for n in range(5):
        Producer(name='生产者%d号' % n).start()
    for n in range(5):
        Consumer(name='消费者%d号' % n).start()


if __name__ == '__main__':
    main()
2、普通方法爬取王者荣耀首页图片
import requests
import os
import random
from urllib import parse
from urllib import request
# Pool of browser User-Agent headers; one is chosen at random and merged
# with the wallpaper-page referer to build the request headers.
headers_list = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'},
    {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'},
]
header = dict(random.choice(headers_list), referer='https://pvp.qq.com/web201605/wallpaper.shtml')
def get_url(data):
    """Return the 8 image URLs of one wallpaper record.

    Each ``sProdImgNo_N`` field is URL-encoded; decoding and swapping the
    '200' (thumbnail size) path segment for '0' yields the full-size URL.
    """
    return [
        parse.unquote(data['sProdImgNo_%d' % n]).replace('200', '0')
        for n in range(1, 9)
    ]
def main():
    """Download the first 20 wallpaper albums listed by the pvp.qq.com API.

    Fetches one page of the wallpaper list, then saves every image of
    every album under ablum/<album name>/<index>.jpg.
    """
    base_url = 'https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page=0&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1597325146548'
    # NOTE(review): verify=False disables TLS certificate checking.
    response = requests.get(base_url, headers=header, verify=False).json()
    datas = response['List']
    print(datas, len(datas))  # 20 records per page
    for data in datas:
        urls = get_url(data)
        name = parse.unquote(data['sProdName'])
        dirpath = os.path.join('ablum', name)
        # makedirs(exist_ok=True) fixes two crashes of the original
        # os.mkdir: the missing 'ablum' parent directory, and re-running
        # the script when the album folder already exists.
        os.makedirs(dirpath, exist_ok=True)
        for index, image_url in enumerate(urls):
            request.urlretrieve(image_url, os.path.join(dirpath, '%d.jpg' % (index + 1)))
            print('%s第%d张图片下载完成!' % (name, index + 1))


if __name__ == '__main__':
    main()
3、利用多线程爬取王者荣耀的图片
import requests
from urllib import parse
import os
from urllib import request
import threading
import queue
# Shared request headers: a desktop browser User-Agent plus the wallpaper
# page as referer (presumably required by the endpoint — verify if removed).
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'referer': 'https://pvp.qq.com/web201605/wallpaper.shtml'
}
class Producer(threading.Thread):
    """Fetches wallpaper-list pages and queues every image for download.

    Pulls page URLs from ``page_queue``, parses the JSON response, and
    pushes one ``{'image_url', 'image_path'}`` job per image onto
    ``image_queue`` for the Consumer threads.
    """

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue    # queue.Queue of list-API page URLs
        self.image_queue = image_queue  # queue.Queue of download jobs

    def run(self) -> None:
        while True:
            # get_nowait() instead of `while not empty(): get()`: with
            # several producers the queue can be drained between the
            # empty() check and the blocking get(), hanging this thread.
            try:
                page_url = self.page_queue.get_nowait()
            except queue.Empty:
                break
            resp = requests.get(page_url, headers=headers)
            result = resp.json()
            datas = result['List']
            for data in datas:
                image_urls = extract_images(data)
                # Strip '1:1' — ':' is illegal in Windows directory names
                # (FileNotFoundError: ... '1:1等身雕塑·铠').
                name = parse.unquote(data['sProdName']).replace('1:1', '').strip()
                dir_path = os.path.join('image', name)
                # makedirs(exist_ok=True) also creates the missing 'image'
                # parent and is race-free when several producers hit the
                # same album; the original exists()/mkdir pair was neither.
                os.makedirs(dir_path, exist_ok=True)
                # Queue every image of this album for the consumers.
                for index, image_url in enumerate(image_urls):
                    self.image_queue.put(
                        {'image_url': image_url,
                         'image_path': os.path.join(dir_path, '%d.jpg' % (index + 1))})
# Consumer: downloads the image jobs queued by the Producer threads.
class Consumer(threading.Thread):
    """Pops ``{'image_url', 'image_path'}`` jobs off ``image_queue`` and
    downloads each image; exits after the queue stays empty for 10s,
    which signals that all producers have finished."""

    def __init__(self, image_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.image_queue = image_queue  # queue.Queue of download jobs

    def run(self) -> None:
        while True:
            # Catch only queue.Empty here; the original bare `except:` also
            # swallowed KeyboardInterrupt/SystemExit and any real bug.
            try:
                image_obj = self.image_queue.get(timeout=10)
            except queue.Empty:
                break
            image_url = image_obj.get('image_url')
            image_path = image_obj.get('image_path')
            try:
                request.urlretrieve(image_url, image_path)
                print(image_path, '下载完成!')
            except Exception:
                # Best-effort: skip the failed image and keep consuming.
                # (Original message said '下载视频!' — "download video" —
                # an obvious typo for "download failed".)
                print(image_path + '下载失败!')
# Build the list of full-resolution image URLs for one wallpaper record.
def extract_images(data):
    """Return the 8 image URLs of *data*.

    Each ``sProdImgNo_N`` field is URL-encoded; decoding and swapping the
    '200' (thumbnail size) path segment for '0' yields the full-size URL.
    """
    return [
        parse.unquote(data['sProdImgNo_%d' % n]).replace('200', '0')
        for n in range(1, 9)
    ]
def main():
    """Wire up the queues and launch 3 producer and 5 consumer threads."""
    page_queue = queue.Queue(18)    # one URL per list page
    image_queue = queue.Queue(1000)  # download jobs produced from the pages
    list_url = 'https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page={page}&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1597325146548'
    # Pre-fill the page queue before any thread starts.
    for page in range(18):
        page_queue.put(list_url.format(page=page))
    for _ in range(3):
        Producer(page_queue, image_queue).start()
    for _ in range(5):
        Consumer(image_queue).start()


if __name__ == '__main__':
    main()