Blocking

A first version: worker threads share one Queue of page numbers, and each thread keeps taking a page, fetching it with Selenium, and saving the HTML until the queue is drained.
import os
import threading
import time
from queue import Queue, Empty

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

class Tencent(threading.Thread):
    def __init__(self, q, name):
        super().__init__()
        self.q = q
        self.name = name

    def save_to_html(self, html_str, filename):
        dirname = os.path.dirname(filename)
        # makedirs with exist_ok=True is safe when several threads try to
        # create the directory at the same time
        os.makedirs(dirname, exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as fp:
            fp.write(html_str)
        print('Download finished!', filename)

    def get_content_by_selenium(self, url):
        # PhantomJS only exists in old Selenium releases; see the headless
        # Chrome note after this block
        driver = webdriver.PhantomJS()
        wait = WebDriverWait(driver, 20)
        driver.get(url)
        wait.until(EC.presence_of_all_elements_located(
            (By.XPATH, '//div[@class="recruit-list"]')))
        html_str = driver.page_source
        driver.quit()  # release the browser instead of leaking one per page
        return html_str

    def download(self, i):
        base_url = 'https://careers.tencent.com/search.html?index=%s'
        html_str = self.get_content_by_selenium(base_url % i)
        self.save_to_html(html_str, './tencent/{}.html'.format(i))

    def run(self):
        while True:
            # get_nowait() avoids the race where another thread empties the
            # queue between an empty() check and a blocking get()
            try:
                i = self.q.get_nowait()
            except Empty:
                break
            print('============ page {} ============@{}'.format(i, self.name))
            self.download(i)

if __name__ == '__main__':
    start = time.time()
    q = Queue()
    for i in range(1, 20):
        q.put(i)
    crawl_list = ['aa', 'bb', 'cc', 'dd']  # one name per worker thread
    join_list = []
    for crawl in crawl_list:
        t = Tencent(q, crawl)
        t.start()
        join_list.append(t)
    for t in join_list:
        t.join()
    print(time.time() - start)
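Note: PhantomJS is deprecated and its driver was removed in Selenium 4, so webdriver.PhantomJS() only runs on old Selenium releases. A minimal sketch of the usual replacement, headless Chrome, assuming Chrome and a matching chromedriver are installed locally (the helper name make_headless_driver is illustrative, not from the code above):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def make_headless_driver():
    # Chrome with no visible window; a drop-in replacement for PhantomJS
    options = Options()
    options.add_argument('--headless')
    return webdriver.Chrome(options=options)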
Queue

queue.Queue is thread-safe. get() and put() block by default; passing block=False (or calling get_nowait()) makes a get on an empty queue raise queue.Empty immediately instead of waiting.
from queue import Queue

q = Queue()
for i in range(100):
    q.put(i)
while True:
    if q.empty():
        break
    print(q.get())
# the queue is empty now, so a non-blocking get raises queue.Empty
print(q.get(block=False))
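The same rules apply to put() once the queue is bounded. A standalone sketch of both directions (not part of the crawler):

from queue import Queue, Empty, Full

q = Queue(maxsize=2)        # bounded queue: put() can block or fail too
q.put(1)
q.put(2)
try:
    q.put(3, block=False)   # buffer is full -> raises queue.Full
except Full:
    print('queue is full')
print(q.get())              # 1 (FIFO order)
print(q.get())              # 2
try:
    q.get(timeout=0.1)      # waits up to 0.1 s, then raises queue.Empty
except Empty:
    print('queue is empty')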
Shared buffer

The producer-consumer version: producer threads download pages and push the raw HTML into a shared buffer queue (q_html), while consumer threads pull from that buffer and write the files. A global flag tells the consumers that every producer has finished, so they stop once the buffer is also empty.
import os
import threading
import time
from queue import Queue, Empty

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

class Productor(threading.Thread):
    def __init__(self, name, q):
        super().__init__()
        self.q_page = q
        self.name = name

    def get_content_by_selenium(self, url):
        driver = webdriver.PhantomJS()
        wait = WebDriverWait(driver, 20)
        driver.get(url)
        wait.until(EC.presence_of_all_elements_located(
            (By.XPATH, '//div[@class="recruit-list"]')))
        html_str = driver.page_source
        driver.quit()
        return html_str

    def download(self, page):
        base_url = 'https://careers.tencent.com/search.html?index=%s'
        html_str = self.get_content_by_selenium(base_url % page)
        return html_str

    def run(self):
        while True:
            # non-blocking get: avoids hanging when another producer takes
            # the last page between the empty() check and the get()
            try:
                page = self.q_page.get_nowait()
            except Empty:
                break
            html_str = self.download(page)
            print('======== productor page {} =========@{}'.format(page, self.name))
            # Queue() is unbounded here, so put() never blocks
            q_html.put(html_str)

class Consumer(threading.Thread):
    def __init__(self, crawl):
        super().__init__()
        self.name = crawl

    def save_to_html(self, html_str, filename):
        dirname = os.path.dirname(filename)
        os.makedirs(dirname, exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as fp:
            fp.write(html_str)
        print('Download finished!', filename)

    def run(self):
        while True:
            # stop only when every producer is done AND the buffer is drained
            if q_html.empty() and flag:
                break
            try:
                html_str = q_html.get(block=False)
            except Empty:
                continue  # buffer momentarily empty; producers still running
            print('==== consumer ========@{}'.format(self.name))
            # millisecond timestamp as a (mostly) unique filename
            self.save_to_html(html_str, './tencent/{}.html'.format(int(time.time() * 1000)))

if __name__ == '__main__':
    q_html = Queue()
    flag = False  # flipped to True once all producers have been joined
    q_page = Queue()
    for i in range(1, 20):
        q_page.put(i)
    crawl_P = ['aa', 'bb', 'cc', 'dd']
    join_p = []
    for crawl in crawl_P:
        t = Productor(crawl, q_page)
        t.start()
        join_p.append(t)
    crawl_C = ['11', '22', '33', '44']
    join_C = []
    for crawl in crawl_C:
        t = Consumer(crawl)
        t.start()
        join_C.append(t)
    for p in join_p:
        p.join()
    flag = True
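A common alternative to the global flag is a sentinel per consumer: after the producers are joined, the main thread puts one None into the buffer for each consumer, and a consumer exits when it receives one. This lets consumers use a plain blocking get() instead of busy-waiting. A standalone sketch of the pattern (the produce/consume names are illustrative, not from the code above):

import threading
from queue import Queue

buf = Queue()

def produce(n):
    for i in range(n):
        buf.put(i)

def consume():
    while True:
        item = buf.get()   # blocking get: no busy-wait loop needed
        if item is None:   # sentinel -> all work is done
            break
        print('got', item)

producers = [threading.Thread(target=produce, args=(5,)) for _ in range(2)]
consumers = [threading.Thread(target=consume) for _ in range(3)]
for t in producers + consumers:
    t.start()
for t in producers:
    t.join()
for _ in consumers:
    buf.put(None)          # one sentinel per consumer
for t in consumers:
    t.join()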