import json
import queue
import threading
import time
import urllib
import urllib.request

from queue import Queue

from bs4 import BeautifulSoup as bs
# Shared shutdown flag: main() sets this True once pageQueue drains so
# crawler threads can leave their run() loops.
CRAWL_EXIT = False
class ThreadCrawl(threading.Thread):
    """Crawler thread: pulls page numbers from pageQueue, downloads the
    corresponding job-listing page, and pushes the raw HTML onto dataQueue.

    Exits its run() loop when the module-level CRAWL_EXIT flag is True.
    """

    def __init__(self, threadName, pageQueue, dataQueue):
        """
        threadName -- label used in start/stop log messages
        pageQueue  -- Queue of int page numbers to fetch
        dataQueue  -- Queue receiving downloaded HTML strings
        """
        threading.Thread.__init__(self)
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        # Browser-like headers so the site serves the normal page.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }

    def run(self):
        print("启动 " + self.threadName)
        while not CRAWL_EXIT:
            try:
                # Non-blocking get: raises queue.Empty when no pages remain,
                # letting the loop re-check CRAWL_EXIT instead of hanging.
                page = self.pageQueue.get(False)
            except queue.Empty:
                continue
            try:
                # BUG FIX: original was print("page: %d", page), which printed
                # the literal format string and the number as two values
                # instead of applying the % formatting.
                print("page: %d" % page)
                url = 'https://job.e0575.com/list.php?page=' + str(page)
                request = urllib.request.Request(url, headers=self.headers)
                response = urllib.request.urlopen(request)
                content = response.read().decode('utf-8')
                self.dataQueue.put(content)
            except Exception as e:
                # Best-effort: log the failure and keep crawling other pages.
                print(e)
        print("结束 " + self.threadName)
# Shared shutdown flag: main() sets this True once dataQueue drains so
# parser threads can leave their run() loops.
PARSE_EXIT = False
class ThreadParse(threading.Thread):
    """Parser thread: pulls raw HTML from dataQueue, extracts the job
    postings, and appends them to the shared output file (one JSON object
    per line), with writes serialised by `lock`.

    Exits its run() loop when the module-level PARSE_EXIT flag is True.
    """

    def __init__(self, threadName, dataQueue, localFile, lock):
        """
        threadName -- label used in start/stop log messages
        dataQueue  -- Queue of HTML strings produced by the crawler threads
        localFile  -- file object opened in binary append mode, shared by
                      all parser threads
        lock       -- threading.Lock guarding writes to localFile
        """
        super(ThreadParse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.localFile = localFile
        self.lock = lock

    def parse(self, html):
        """Extract every job posting from one listing page and write each
        as a JSON object to the shared output file."""
        soup = bs(html, 'lxml')
        # Listing rows alternate between the "bg1" and "bg3" CSS classes.
        rows = soup.select('li[class="bg1"]') + soup.select('li[class="bg3"]')
        for site in rows:
            name = site.find('span').text
            detailLink = site.find('a').attrs['href']
            # Slices strip fixed-width label text around the values;
            # offsets match the site's markup — TODO confirm against a
            # live page if the site layout changes.
            wage = site.select('.dd1')[0].text[7:-5]
            publishTime = site.select('.dd2')[0].text[7:-5]
            companyname = site.find('a', {'class': None}).text[7:-6]
            workrequest = site.find('a').attrs['title'].replace('\u3000', ' ')
            item = {
                '职位名称': name,
                '详情链接': detailLink,
                '工作薪酬': wage,
                '发布时间': publishTime,
                '发布公司': companyname,
                # BUG FIX: workrequest was computed but never written out.
                '工作要求': workrequest,
            }
            with self.lock:
                # BUG FIX: original wrote str(item) — a Python repr with
                # single quotes — to job.json, which is not valid JSON.
                # Use the (previously unused) json module instead.
                self.localFile.write(
                    (json.dumps(item, ensure_ascii=False) + '\n').encode('utf-8'))

    def run(self):
        print("启动" + self.threadName)
        while not PARSE_EXIT:
            try:
                # BUG FIX: the original blocking get() could wait forever
                # once the queue drained, so run() never re-checked
                # PARSE_EXIT and main()'s join() deadlocked.  A short
                # timeout lets the loop observe the exit flag.
                html = self.dataQueue.get(True, 1)
            except queue.Empty:
                continue
            try:
                self.parse(html)
            except Exception as e:
                # Best-effort: log and continue with the next page.
                print(e)
        print("结束" + self.threadName)
def main():
    """Wire up the crawl/parse pipeline: fill the page queue, start three
    crawler and three parser threads, wait for both queues to drain, then
    signal the threads to exit and close the output file.
    """
    global CRAWL_EXIT, PARSE_EXIT

    pageQueue = Queue(20)   # page numbers waiting to be crawled
    dataQueue = Queue()     # raw HTML handed from crawlers to parsers
    threadCrawls = []
    threadParses = []

    # Pages 1..20 of the job listing site.
    for page in range(1, 21):
        pageQueue.put(page)

    # Binary append mode: parser threads write pre-encoded bytes.
    localFile = open('job.json', 'ab')
    lock = threading.Lock()

    for threadName in ["采集线程1号", "采集线程2号", "采集线程3号"]:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadCrawls.append(thread)

    for threadName in ["解析线程1号", "解析线程2号", "解析线程3号"]:
        thread = ThreadParse(threadName, dataQueue, localFile, lock)
        thread.start()
        threadParses.append(thread)

    # BUG FIX: the original `while not q.empty(): pass` loops busy-waited
    # at 100% CPU; poll with a short sleep instead.
    while not pageQueue.empty():
        time.sleep(0.1)
    CRAWL_EXIT = True
    print("pageQueue为空")
    for thread in threadCrawls:
        thread.join()

    while not dataQueue.empty():
        time.sleep(0.1)
    print("dataQueue为空")
    PARSE_EXIT = True
    for thread in threadParses:
        thread.join()

    # Take the lock so no parser is mid-write when the file closes.
    with lock:
        localFile.close()
# Script entry point: only run the pipeline when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()