Python 爬虫实战:多线程爬取前程无忧(51job)招聘信息

import re
import threading
import time
from queue import Empty, Queue

import requests

# Browser-like User-Agent so 51job serves normal pages instead of
# rejecting the requests as a bot.
HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}


# 自定义线程--生产者
# Producer thread: pulls search-result page URLs from pageurl_queue,
# scrapes every job posting linked from each page, and puts the parsed
# fields onto jobinfo_queue as an 11-tuple of strings.
# NOTE(review): class name "Procuder" (sic, Producer) is kept unchanged
# so existing callers keep working.
class Procuder(threading.Thread):
    def __init__(self, pageurl_queue, jobinfo_queue, *args, **kwargs):
        """Store the shared URL queue and the shared job-info queue."""
        super(Procuder, self).__init__(*args, **kwargs)
        self.pageurl_queue = pageurl_queue
        self.jobinfo_queue = jobinfo_queue

    def run(self):
        """Consume listing-page URLs until the queue is exhausted."""
        while True:
            # Non-blocking get: the original empty()-then-get() pair could
            # deadlock when another producer drained the queue between the
            # two calls.
            try:
                url = self.pageurl_queue.get(block=False)
            except Empty:
                break
            try:
                self.parse_page(url)
            except requests.RequestException:
                # One unreachable listing page must not kill the thread.
                continue

    def parse_page(self, url):
        """Fetch one search-results page and scrape each job linked on it."""
        # timeout keeps a hung connection from stalling the thread forever
        resp = requests.get(url, headers=HEADERS, timeout=10)
        resp.encoding = "gbk"  # 51job serves GBK-encoded HTML
        text = resp.text
        # Extract the detail-page URL of every posting on this results page.
        jobs_url = re.findall('<div class="el">.*?<a target="_blank".*?href="(.*?)".*?>', text, re.DOTALL)
        for job_url in jobs_url:
            try:
                self.parse_job_info(job_url)
            except (requests.RequestException, IndexError):
                # Skip postings that fail to download or whose markup does
                # not match the regexes below — findall(...)[0] raises
                # IndexError on a miss and previously crashed the thread.
                continue

    def parse_job_info(self, url):
        """Fetch one job-detail page and push its fields onto jobinfo_queue.

        Raises requests.RequestException on network errors and IndexError
        when the page layout does not match the scraping regexes; both are
        handled by the caller (parse_page).
        """
        resp = requests.get(url, headers=HEADERS, timeout=10)
        resp.encoding = "gbk"  # 51job serves GBK-encoded HTML
        text = resp.text
        # The "msg ltype" title attribute holds
        # "city | experience | degree | headcount | publish-date".
        info = re.findall(r'<p class="msg ltype" title="(.*?)"', text, re.DOTALL)
        if len(info) > 0:
            all_info = re.sub("&nbsp;", "", info[0])
            infos = all_info.split("|")
            # Only keep pages whose 5th field is a publish date ("…发布").
            if len(infos) >= 5 and infos[4].find("发布") >= 0:
                jobname = re.findall(r'<div class="cn">.*?>(.*?)<input', text, re.DOTALL)[0].split(' ', 1)[0]
                if jobname == "": jobname = "null"
                companyname = re.findall(r'<p class="cname">.*?title="(.*?)"', text, re.DOTALL)[0]
                if companyname == "": companyname = "null"
                companytype = re.findall(r'<div class="com_tag".*?title="(.*?)"', text, re.DOTALL)[0]
                if companytype == "": companytype = "null"
                companysize = re.findall(r'<div class="com_tag".*?</p>.*?title="(.*?)"', text, re.DOTALL)[0]
                if companysize == "": companysize = "null"
                companysalary = re.findall(r'<div class="cn">.*?<strong>(.*?)<', text, re.DOTALL)[0]
                if companysalary == "": companysalary = "null"
                companycity = infos[0]
                workingExp = infos[1]
                edulevel = infos[2]
                needperson = infos[3]
                createdata = infos[4]
                welfare = re.findall(r' <div class="t1">(.*?)<div', text, re.DOTALL)[0]
                if welfare == "" or welfare.isspace(): welfare = "null"
                # Strip whitespace and turn the welfare-tag markup into a
                # comma-separated list.
                welfare = re.sub("\n", "", welfare)
                welfare = re.sub(" ", "", welfare)
                welfare = re.sub("<.*?>", "/", welfare)
                welfare = re.sub("//", ",", welfare)
                welfare = re.sub("/", "", welfare)
                welfare = re.sub("\r", "", welfare)
                # Hand the finished record to the consumer threads.
                self.jobinfo_queue.put((jobname, companyname, companytype, companysize, companycity, companysalary,
                                        edulevel, workingExp, welfare, needperson, createdata))


# 自定义线程--消费者
# Consumer thread: drains 11-field job records from jobinfo_queue and
# appends them to qcwy.txt, fields separated by '\001', one record per line.
class Consumer(threading.Thread):
    def __init__(self, pageurl_queue, jobinfo_queue, *args, **kwargs):
        """Store the shared URL queue and the shared job-info queue."""
        super(Consumer, self).__init__(*args, **kwargs)
        self.pageurl_queue = pageurl_queue
        self.jobinfo_queue = jobinfo_queue

    def run(self):
        """Write queued records to qcwy.txt until producers are done."""
        while True:
            # Non-blocking get: the original empty()-checks-then-get()
            # could block forever when another consumer took the last
            # item between the check and the blocking get().
            try:
                values = self.jobinfo_queue.get(block=False)
            except Empty:
                # No data right now.  Stop only once the producers have
                # also exhausted the URL queue; otherwise poll again.
                if self.pageurl_queue.empty():
                    break
                time.sleep(0.1)
                continue
            # Append one '\001'-delimited record (join replaces the
            # original hand-written 11-way concatenation).
            with open("qcwy.txt", "a+", encoding="utf-8", newline="") as f:
                f.write("\001".join(values) + "\n")
                print("完成")


# 根据url返回页数
def return_pages(url):
    """Return the total number of result pages for a search URL, as a string.

    Raises requests.RequestException on network failure and IndexError when
    the pagination markup is not found on the page.
    """
    # timeout keeps a dead connection from hanging main()'s seeding loop
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.encoding = "gbk"  # 51job serves GBK-encoded HTML
    text = resp.text
    # Pagination footer looks like "<span>…</span>&nbsp;/&nbsp;123<…";
    # capture the total page count after the slash.
    page = re.findall('<div class="rt">.*?</span>&nbsp;/&nbsp;(.*?)<', text, re.DOTALL)[0]
    return page.strip()


def main():
    """Seed the URL queue for every area/page, then start the worker threads."""
    # Queues shared between producer and consumer threads.
    pageurl_queue = Queue(200000)
    jobinfo_queue = Queue(200000)
    start_url = "https://search.51job.com/list/{},000000,0000,00,9,99,%2520,2,1.html"
    info_url = "https://search.51job.com/list/{},000000,0000,00,9,99,%2520,2,{}.html"

    # 51job area codes of the provinces/cities to crawl.
    # (Original had tab-indented comment lines here; normalized to spaces.)
    city_code = ['010000', '020000', '030000', '050000', '060000', '070000',
                 '080000', '090000', '100000', '110000', '120000', '130000',
                 '140000', '150000', '160000', '170000', '180000', '190000',
                 '200000', '210000', '220000', '230000', '240000', '250000',
                 '260000', '270000', '280000', '290000', '300000', '310000',
                 '320000', '110200', '030200', '040000', '080200', '180200',
                 '200200', '070200', '090200', '030800', '230300', '230200',
                 '080300', '170200', '070300', '250200', '190200', '150200',
                 '120300', '120200', '220200', '240200']
    # Enqueue every results-page URL for every area.
    for code in city_code:
        for page in range(1, int(return_pages(start_url.format(code))) + 1):
            pageurl_queue.put(info_url.format(code, page))
    # Start 100 producer threads.
    for _ in range(100):
        Procuder(pageurl_queue, jobinfo_queue).start()
    # Give producers a head start so consumers don't see both queues
    # empty before any work has been produced.
    time.sleep(8)
    # Start 100 consumer threads.
    for _ in range(100):
        Consumer(pageurl_queue, jobinfo_queue).start()


# Script entry point: run the crawler only when executed directly.
if __name__ == '__main__':
    main()
发布了1 篇原创文章 · 获赞 0 · 访问量 380
展开阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 大白 设计师: CSDN官方博客

分享到微信朋友圈

×

扫一扫,手机浏览