# 斗图网 (doutula.com) image scraper
# 单线程 — single-threaded version
import os
import re
import time
import requests
import urllib.request
def parse_page(url):
    """Download every image referenced on one list page into ``images/``.

    The page is fetched with a browser-like User-Agent, image URLs are taken
    from ``data-original`` attributes and file names from ``alt`` attributes.
    Errors are printed rather than raised: a failed page request skips the
    page, and a failed image download skips only that image.

    Args:
        url: Address of the list page to scrape.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
    }
    try:
        # A timeout keeps a stalled server from hanging the whole script.
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        # urlretrieve does not create missing directories itself.
        os.makedirs("images", exist_ok=True)
    except (requests.RequestException, OSError) as e:
        print(e)
        return

    text = resp.text
    img_urls = re.findall(r'data-original="(.*?)"', text)
    img_names = re.findall(r' alt="(.*?)"', text)
    # NOTE(review): URLs and names are matched independently and paired by
    # position; this assumes every alt attribute belongs to an image that
    # also has data-original — confirm against the page markup.
    for img_url, alt_text in zip(img_urls, img_names):
        # Drop characters that are not allowed in file names on common
        # file systems (and the dot, so the real extension is unambiguous).
        stem = re.sub(r'[\\/:*?"<>|.]', "", alt_text)
        img_name = stem + os.path.splitext(img_url)[1]
        try:
            urllib.request.urlretrieve(img_url, os.path.join("images", img_name))
        except (OSError, ValueError) as e:
            # One broken image must not abort the rest of the page.
            print(e)
            continue
        print(img_name + " 下载完成")
def main():
    """Scrape list pages 0-4 sequentially and print the elapsed whole seconds."""
    # perf_counter is monotonic, so the measurement cannot go negative if the
    # system clock is adjusted while the downloads run.
    started = time.perf_counter()
    # NOTE(review): range(5) requests page=0 through page=4 — confirm that
    # page 0 is a distinct page on the site.
    for i in range(5):
        url = rf"http://www.doutula.com/photo/list/?page={i}"
        parse_page(url)
    # Subtract first, then truncate: truncating each timestamp separately
    # could report a full second for a fraction of a second of work.
    t = int(time.perf_counter() - started)
    print(f"共耗时{t}")


if __name__ == '__main__':
    main()
# 多线程 — multi-threaded version
import os
import re
import time
import requests
import urllib.request
import threading
from queue import Queue
class Producer(threading.Thread):
    """Worker thread that turns list-page URLs into image download jobs.

    A producer repeatedly takes a page URL from ``page_queue``, fetches the
    page, and puts one ``(image_url, file_name)`` tuple per image found on
    ``image_queue``. The thread finishes once ``page_queue`` has no more URLs.
    """

    # Browser-like User-Agent sent with every page request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
    }
    # Proxy mapping passed to requests for every page request.
    # NOTE(review): hard-coded proxy address — confirm it is still reachable.
    proxys = {
        "http": "http://60.5.254.169:8081"
    }
    # Seconds to wait for a page response before giving up on that page.
    timeout = 10
    # Characters stripped from alt text before it is used as a file name:
    # those rejected by common file systems, plus the dot so the extension
    # taken from the URL stays unambiguous.
    _UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|.]')

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``threading.Thread``.

        Args:
            page_queue: Queue of list-page URLs still to be fetched.
            image_queue: Queue receiving ``(image_url, file_name)`` tuples.
        """
        super().__init__(*args, **kwargs)
        self.image_queue = image_queue
        self.page_queue = page_queue

    def run(self):
        """Process page URLs until the page queue is drained."""
        from queue import Empty

        while True:
            # get_nowait() makes "is there work?" and "take it" one atomic
            # step; a separate empty() check followed by a blocking get()
            # can hang forever when another producer takes the last URL.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            try:
                self.parse_page(url)
            except requests.RequestException as exc:
                # A failed page must not kill the thread; move on.
                print(f"{url} 请求失败: {exc}")
                continue
            print(url)

    def parse_page(self, url):
        """Fetch one list page and enqueue a download job for each image.

        Args:
            url: Address of the list page.

        Raises:
            requests.RequestException: If the request fails, times out, or
                returns an HTTP error status.
        """
        resp = requests.get(
            url, headers=self.headers, proxies=self.proxys, timeout=self.timeout
        )
        resp.raise_for_status()
        self._queue_images(resp.text)

    def _queue_images(self, text):
        """Extract image URLs and names from page HTML and enqueue them."""
        img_urls = re.findall(r'data-original="(.*?)"', text)
        img_names = re.findall(r' alt="(.*?)"', text)
        # NOTE(review): URLs and names are matched independently and paired
        # by position; this assumes every alt attribute belongs to an image
        # that also has data-original — confirm against the page markup.
        for img_url, alt_text in zip(img_urls, img_names):
            stem = self._UNSAFE_NAME_CHARS.sub("", alt_text)
            img_name = stem + os.path.splitext(img_url)[1]
            self.image_queue.put((img_url, img_name))
class Consumer(threading.Thread):
    """Worker thread that downloads queued images into ``images/``.

    A consumer repeatedly takes an ``(image_url, file_name)`` tuple from
    ``image_queue`` and saves the image as ``images/<file_name>``. It stops
    once no job has arrived for ``idle_timeout`` seconds and ``page_queue``
    is empty.
    """

    # Seconds to wait for a new job before checking whether to stop. Waiting
    # (instead of exiting the moment both queues look empty) gives a page
    # that is still being parsed time to deliver its images. This is a
    # heuristic: a page that takes longer than this to parse is still missed.
    idle_timeout = 15

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``threading.Thread``.

        Args:
            page_queue: Queue of list-page URLs still to be fetched.
            image_queue: Queue supplying ``(image_url, file_name)`` tuples.
        """
        super().__init__(*args, **kwargs)
        self.image_queue = image_queue
        self.page_queue = page_queue

    def run(self) -> None:
        """Download queued images until the work has dried up."""
        from queue import Empty

        while True:
            # A timed get() cannot hang forever when another consumer takes
            # the last job, unlike an empty() check followed by a blocking
            # get().
            try:
                img_url, filename = self.image_queue.get(timeout=self.idle_timeout)
            except Empty:
                if self.page_queue.empty():
                    break
                continue
            try:
                # urlretrieve does not create missing directories itself.
                os.makedirs("images", exist_ok=True)
                urllib.request.urlretrieve(img_url, os.path.join("images", filename))
            except (OSError, ValueError) as exc:
                # One broken image must not kill the worker thread.
                print(f"{filename} 下载失败: {exc}")
                continue
            print(filename + " 下载完成")
def main():
    """Queue list pages 2-9, then start five producers and three consumers."""
    print("程序开始")
    page_queue = Queue(500)
    img_queue = Queue(1000)

    # Fill the page queue before any worker starts.
    page_urls = (
        rf"http://www.doutula.com/photo/list/?page={i}" for i in range(2, 10)
    )
    for page_url in page_urls:
        page_queue.put(page_url)

    # Producers are started first, then consumers; each thread is started
    # as soon as it is created.
    for worker_cls, count in ((Producer, 5), (Consumer, 3)):
        for _ in range(count):
            worker = worker_cls(page_queue, img_queue)
            worker.start()


if __name__ == '__main__':
    main()