"""Multithreaded scraper that downloads meme images from doutula.com.

Producer threads pop listing-page URLs, parse each page with BeautifulSoup,
and push the image URLs they find onto a shared list.  Consumer threads pop
image URLs from that list and download the files into ./images.
"""

import os
import threading
import time
import urllib.request  # explicit: `import urllib` alone does not guarantee urllib.request exists

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent so the site serves the normal HTML pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/68.0.3440.106 Safari/537.36'
}
BASE_PAGE_URL = 'http://www.doutula.com/photo/list/?page='

# Work queues shared between producers and consumers; gLock guards both.
PAGE_URL_LIST = [BASE_PAGE_URL + str(x) for x in range(1, 870)]
URL_LIST = []
gLock = threading.Lock()


def producer():
    """Fetch listing pages and append discovered image URLs to URL_LIST.

    Returns (ending the thread) once PAGE_URL_LIST is exhausted.
    """
    while True:
        with gLock:
            if not PAGE_URL_LIST:
                return
            page_url = PAGE_URL_LIST.pop()
        # Network I/O and HTML parsing happen OUTSIDE the lock so the
        # producer threads can actually run in parallel (the original
        # held the lock across requests.get, serializing everything).
        response = requests.get(page_url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        img_list = soup.find_all(
            'img', attrs={'class': 'img-responsive lazy image_dta'})
        with gLock:
            for img in img_list:
                URL_LIST.append(img['data-original'])


def customer():
    """Pop image URLs from URL_LIST and download each into ./images.

    Runs forever (worker-style loop), matching the original design; the
    process must be terminated externally once downloads are done.
    """
    # The original crashed with FileNotFoundError if ./images was missing.
    os.makedirs('images', exist_ok=True)
    while True:
        with gLock:
            url = URL_LIST.pop() if URL_LIST else None
        if url is None:
            # Brief sleep instead of a 100%-CPU busy-wait on the lock.
            time.sleep(0.1)
            continue
        filename = url.split('/')[-1]
        path = os.path.join('images', filename)
        print(path)
        urllib.request.urlretrieve(url, filename=path)


def main():
    """Start 3 producer threads and 5 consumer threads."""
    for _ in range(3):
        threading.Thread(target=producer).start()
    for _ in range(5):
        threading.Thread(target=customer).start()


if __name__ == '__main__':
    main()
爬虫斗图
最新推荐文章于 2020-04-25 16:25:38 发布