import json
import queue
import threading
import time
import urllib
import urllib.request

from queue import Queue

from bs4 import BeautifulSoup as bs
# Shared shutdown flag: main() sets this True once pageQueue drains so
# crawler threads can leave their run() loops.
CRAWL_EXIT = False
class ThreadCrawl(threading.Thread):
    """Crawler thread: pulls page numbers from pageQueue, downloads the
    corresponding job-listing page, and pushes the raw HTML onto dataQueue.

    Exits its run() loop when the module-level CRAWL_EXIT flag is True.
    """

    def __init__(self, threadName, pageQueue, dataQueue):
        """
        threadName -- label used in start/stop log messages
        pageQueue  -- Queue of int page numbers to fetch
        dataQueue  -- Queue receiving downloaded HTML strings
        """
        threading.Thread.__init__(self)
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        # Browser-like headers so the site serves the normal page.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }

    def run(self):
        print("启动 " + self.threadName)
        while not CRAWL_EXIT:
            try:
                # Non-blocking get: raises queue.Empty when no pages remain,
                # letting the loop re-check CRAWL_EXIT instead of hanging.
                page = self.pageQueue.get(False)
            except queue.Empty:
                continue
            try:
                # BUG FIX: original was print("page: %d", page), which printed
                # the literal format string and the number as two values
                # instead of applying the % formatting.
                print("page: %d" % page)
                url = 'https://job.e0575.com/list.php?page=' + str(page)
                request = urllib.request.Request(url, headers=self.headers)
                response = urllib.request.urlopen(request)
                content = response.read().decode('utf-8')
                self.dataQueue.put(content)
            except Exception as e:
                # Best-effort: log the failure and keep crawling other pages.
                print(e)
        print("结束 " + self.threadName)
# Shared shutdown flag: main() sets this True once dataQueue drains so
# parser threads can leave their run() loops.
PARSE_EXIT = False
class ThreadParse(threading.Thread):
    """Parser thread: pulls raw HTML from dataQueue, extracts the job
    postings, and appends them to the shared output file (one JSON object
    per line), with writes serialised by `lock`.

    Exits its run() loop when the module-level PARSE_EXIT flag is True.
    """

    def __init__(self, threadName, dataQueue, localFile, lock):
        """
        threadName -- label used in start/stop log messages
        dataQueue  -- Queue of HTML strings produced by the crawler threads
        localFile  -- file object opened in binary append mode, shared by
                      all parser threads
        lock       -- threading.Lock guarding writes to localFile
        """
        super(ThreadParse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.localFile = localFile
        self.lock = lock

    def parse(self, html):
        """Extract every job posting from one listing page and write each
        as a JSON object to the shared output file."""
        soup = bs(html, 'lxml')
        # Listing rows alternate between the "bg1" and "bg3" CSS classes.
        rows = soup.select('li[class="bg1"]') + soup.select('li[class="bg3"]')
        for site in rows:
            name = site.find('span').text
            detailLink = site.find('a').attrs['href']
            # Slices strip fixed-width label text around the values;
            # offsets match the site's markup — TODO confirm against a
            # live page if the site layout changes.
            wage = site.select('.dd1')[0].text[7:-5]
            publishTime = site.select('.dd2')[0].text[7:-5]
            companyname = site.find('a', {'class': None}).text[7:-6]
            workrequest = site.find('a').attrs['title'].replace('\u3000', ' ')
            item = {
                '职位名称': name,
                '详情链接': detailLink,
                '工作薪酬': wage,
                '发布时间': publishTime,
                '发布公司': companyname,
                # BUG FIX: workrequest was computed but never written out.
                '工作要求': workrequest,
            }
            with self.lock:
                # BUG FIX: original wrote str(item) — a Python repr with
                # single quotes — to job.json, which is not valid JSON.
                # Use the (previously unused) json module instead.
                self.localFile.write(
                    (json.dumps(item, ensure_ascii=False) + '\n').encode('utf-8'))

    def run(self):
        print("启动" + self.threadName)
        while not PARSE_EXIT:
            try:
                # BUG FIX: the original blocking get() could wait forever
                # once the queue drained, so run() never re-checked
                # PARSE_EXIT and main()'s join() deadlocked.  A short
                # timeout lets the loop observe the exit flag.
                html = self.dataQueue.get(True, 1)
            except queue.Empty:
                continue
            try:
                self.parse(html)
            except Exception as e:
                # Best-effort: log and continue with the next page.
                print(e)
        print("结束" + self.threadName)
def main():
    """Wire up the crawl/parse pipeline: fill the page queue, start three
    crawler and three parser threads, wait for both queues to drain, then
    signal the threads to exit and close the output file.
    """
    global CRAWL_EXIT, PARSE_EXIT

    pageQueue = Queue(20)   # page numbers waiting to be crawled
    dataQueue = Queue()     # raw HTML handed from crawlers to parsers
    threadCrawls = []
    threadParses = []

    # Pages 1..20 of the job listing site.
    for page in range(1, 21):
        pageQueue.put(page)

    # Binary append mode: parser threads write pre-encoded bytes.
    localFile = open('job.json', 'ab')
    lock = threading.Lock()

    for threadName in ["采集线程1号", "采集线程2号", "采集线程3号"]:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadCrawls.append(thread)

    for threadName in ["解析线程1号", "解析线程2号", "解析线程3号"]:
        thread = ThreadParse(threadName, dataQueue, localFile, lock)
        thread.start()
        threadParses.append(thread)

    # BUG FIX: the original `while not q.empty(): pass` loops busy-waited
    # at 100% CPU; poll with a short sleep instead.
    while not pageQueue.empty():
        time.sleep(0.1)
    CRAWL_EXIT = True
    print("pageQueue为空")
    for thread in threadCrawls:
        thread.join()

    while not dataQueue.empty():
        time.sleep(0.1)
    print("dataQueue为空")
    PARSE_EXIT = True
    for thread in threadParses:
        thread.join()

    # Take the lock so no parser is mid-write when the file closes.
    with lock:
        localFile.close()
# Script entry point: only run the pipeline when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()