将上一次的内容做一点延伸:
一、单线程的下载
import requests
from bs4 import BeautifulSoup
from urllib import request
import os
#用来存储所有的页面的url
PAGE_URLS=[]
#首先对请求身份进行伪装
def parse_page(page_url):
    """Fetch one listing page and download every meme image it contains.

    page_url: URL of a doutula.com listing page.
    Side effects: writes one file per image into the local "images" directory.
    """
    # Spoof a desktop browser so the site does not reject the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    response = requests.get(page_url, headers=headers)
    text = response.text
    soup = BeautifulSoup(text, "lxml")
    # Lazily-loaded images carry the real URL in the data-original attribute.
    img_list = soup.find_all("img", attrs={"class": "img-responsive lazy image_dta"})
    # Create the target directory up front; urlretrieve fails if it is missing.
    os.makedirs("images", exist_ok=True)
    for img in img_list:
        img_url = img["data-original"]
        # e.g. https://ws2.sinaimg.cn/bmiddle/9150ede5gy1g0saavmreuj20250250sh.jpg
        # -> the last path segment is used as the local file name.
        filename = img_url.split("/")[-1]
        fullpath = os.path.join("images", filename)
        request.urlretrieve(img_url, fullpath)
def main():
    """Collect the listing-page URLs, then scrape each one sequentially."""
    # Step 1: build the URLs for listing pages 1 through 4.
    for page_no in range(1, 5):
        PAGE_URLS.append("https://www.doutula.com/photo/list/?page=" + str(page_no))
    # Step 2: download the images from every collected page.
    for url in PAGE_URLS:
        print(url)
        parse_page(url)


if __name__ == "__main__":
    main()
二、多线程的下载
引例:
import time
import threading
def greet(index):
    """Print a numbered hello message, then pause for half a second."""
    message = "helloworld-%d" % index
    print(message)
    time.sleep(0.5)
def line_run():
    """Greet five times sequentially on the current thread."""
    for i in range(5):
        greet(i)
def thread_run():
    """Greet five times concurrently: one worker thread per greeting."""
    for i in range(5):
        worker = threading.Thread(target=greet, args=[i])
        worker.start()


if __name__ == "__main__":
    # line_run()  # sequential variant, kept for comparison
    thread_run()
代码说明:
import time
import threading
引入时间库与多线程库。
def greet(index):
print("helloworld-%d"%index)
time.sleep(0.5)
定义一个打招呼的函数,执行完以后休息0.5秒
def line_run():
for x in range(5):
greet(x)
定义一个单线程函数,执行5次打招呼函数
def thread_run():
for x in range(5):
th=threading. Thread(target=greet,args=[x])
th.start()
定义一个多线程函数,以threading方式执行5次打招呼函数
if __name__ =="__main__":
# line_run()
thread_run()
初始化,可选择运行哪一种执行方式。
生产者和消费者模式
形成“高内聚、低耦合”的模式。
import time
import threading
import random
# Shared account balance, mutated concurrently by producer and consumer threads.
gMoney=0
# Lock guarding every read-modify-write of gMoney.
gLock=threading.Lock()
def produter():  # NOTE(review): name is a typo for "producer"; kept because callers use it
    """Producer thread body: forever deposit a random amount into gMoney.

    Runs until the process exits; sleeps 0.5 s between deposits.
    Fix: the original printed gMoney AFTER releasing the lock, so the
    logged balance could already have been changed by another thread.
    The balance is now snapshotted while the lock is held.
    """
    global gMoney
    while True:
        money = random.randint(0, 100)
        # Mutate and snapshot the balance under the lock so the logged
        # value matches the balance this deposit actually produced.
        with gLock:
            gMoney += money
            balance = gMoney
        print("%s生产者生产了%s元钱,剩余%s元钱" % (threading.current_thread(), money, balance))
        time.sleep(0.5)
def consumer():
    """Consumer thread body: forever try to withdraw a random amount.

    Runs until the process exits; sleeps 0.5 s between attempts.
    Fix: the original checked `gMoney >= money` BEFORE acquiring the
    lock (check-then-act race), so two consumers could both pass the
    check and overdraw the balance. The check, the withdrawal, and the
    balance snapshot now happen under one lock acquisition.
    """
    global gMoney
    while True:
        money = random.randint(0, 100)
        with gLock:
            sufficient = gMoney >= money
            if sufficient:
                gMoney -= money
            balance = gMoney
        if sufficient:
            print("%s消费者消费了%s元钱,剩余%s元钱" % (threading.current_thread(), money, balance))
        else:
            print("%s消费者想消费%s元钱,但是余额不足!剩余%s元钱!" % (threading.current_thread(), money, balance))
        time.sleep(0.5)
if __name__ == "__main__":
    # Spawn five producer threads followed by five consumer threads.
    for _ in range(5):
        threading.Thread(target=produter).start()
    for _ in range(5):
        threading.Thread(target=consumer).start()
代码说明:
gMoney=0
gLock=threading.Lock()
定义两个全局变量
def produter():
global gMoney
while True:
money=random. randint(0,100)
gLock.acquire()
#上锁
gMoney += money
gLock.release()
#立即释放
print("%s生产者生产了%s元钱,剩余%s元钱" %(threading.current_thread(),money,gMoney))
time.sleep(0.5)
定义生产者;声明全局变量gMoney;随机产生money值,上锁;修改money值;取消上锁;打印明细
def consumer():
global gMoney
while True:
money=random.randint(0,100)
if gMoney >=money:
gLock.acquire()
# 上锁
gMoney-=money
gLock.release()
# 立即释放
print("%s消费者消费了%s元钱,剩余%s元钱"%(threading.current_thread(),money,gMoney))
else:
print("%s消费者想消费%s元钱,但是余额不足!剩余%s元钱!" %(threading.current_thread(),money,gMoney))
time.sleep(0.5)
消费者函数,同上。
涉及共享数据(脏数据)时要加锁;上锁的目的,即防止一个操作尚未完成时,另一个操作同时进行,引发数据破坏问题。
多线程—生产者与消费者模式
import requests
from bs4 import BeautifulSoup
from urllib import request
import os
import threading
import time
# Browser User-Agent sent with every page request.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
# Work queue of listing-page URLs (drained by producer threads).
PAGE_URLS=[]
# Work queue of image URLs (filled by producers, drained by consumers).
IMG_URLS=[]
gLock=threading.Lock()
# Producer: extracts meme image links from the listing pages.
# Consumer: downloads the image files from the collected links.
# Shared state: the two global lists above, guarded by gLock.
def producer():
    """Producer thread body: pop listing pages and collect image URLs.

    Exits when PAGE_URLS is exhausted.
    Fix: the original appended to the shared IMG_URLS list WITHOUT
    holding gLock, breaking this file's own convention that all access
    to the shared lists is lock-protected (consumer pops under the lock).
    """
    global PAGE_URLS
    global IMG_URLS
    while True:
        with gLock:
            if len(PAGE_URLS) == 0:
                break  # no pages left: this producer is done
            page_url = PAGE_URLS.pop()
        response = requests.get(page_url, headers=headers)
        soup = BeautifulSoup(response.text, "lxml")
        # Lazily-loaded images carry the real URL in data-original.
        img_list = soup.find_all("img", attrs={"class": "img-responsive lazy image_dta"})
        for img in img_list:
            with gLock:
                IMG_URLS.append(img["data-original"])
def consumer():
    """Consumer thread body: pop image URLs and download the files.

    Exits once both shared lists are empty.
    Fix: create the "images" directory up front — urlretrieve fails if
    the target directory is missing.
    NOTE(review): a consumer can observe both lists momentarily empty
    while producers are still fetching pages and exit early; main()
    mitigates this with a 0.5 s head start for the producers — confirm
    this is acceptable or switch to a Queue with sentinels.
    """
    global PAGE_URLS
    global IMG_URLS
    # Ensure the download directory exists before the first write.
    os.makedirs("images", exist_ok=True)
    while True:
        with gLock:
            if len(IMG_URLS) == 0 and len(PAGE_URLS) == 0:
                break
            img_url = IMG_URLS.pop() if IMG_URLS else ""
        if img_url:
            filename = img_url.split("/")[-1]
            fullpath = os.path.join("images", filename)
            request.urlretrieve(img_url, fullpath)
            print("%s下载完成" % filename)
def main():
    """Seed the page queue, then run producer and consumer thread pools."""
    # Listing pages 1 and 2 form the initial work queue.
    for page_no in range(1, 3):
        PAGE_URLS.append("https://www.doutula.com/photo/list/?page=" + str(page_no))
    # Five producer threads harvest image URLs.
    for _ in range(5):
        threading.Thread(target=producer).start()
    # Give the producers a head start before consumers check the queues.
    time.sleep(0.5)
    # Five consumer threads download the images.
    for _ in range(5):
        threading.Thread(target=consumer).start()


if __name__ == "__main__":
    main()