Crawler Basics 2: Crawling 51job Listings with Multiple Threads

Learning notes:
1. requests mind map of basic usage
2. Python multithreading with the threading module
3. The Queue module for thread-safe queues

A simple mind map of the basic usage, drawn with Baidu Naotu.
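The mind map image itself is not reproduced here, so as a rough stand-in, the sketch below shows the requests basics it covers: a GET request with custom headers, checking the status code, setting the encoding, and reading the response text. The URL is the same 51job search page used later; the timeout value is just an illustrative choice.

# -*- coding: utf-8 -*-
import requests

# Illustrative target: the first page of the 51job search used below
url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36",
}

# Send a GET request with custom headers and a timeout
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)   # HTTP status code, e.g. 200
response.encoding = 'gbk'     # 51job pages are GBK-encoded
print(response.text[:200])    # first 200 characters of the decoded HTML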

# -*- coding: utf-8 -*-
import time
import threading
# Use queue.Queue (thread-safe, for threads) rather than multiprocessing.Queue,
# since the workers here are threads, not processes
from queue import Queue, Empty

import requests
from lxml import etree


class CrawlPage(threading.Thread):
    def __init__(self, page_queue, data_queue, thread_name):
        super(CrawlPage, self).__init__()
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.thread_name = thread_name
        # Default request headers
        self.header = {
            "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "search.51job.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36",
        }

    def run(self):
        print("Current page-crawling thread: {}".format(self.thread_name))
        # 1. Keep pulling page numbers from page_queue until it is drained
        # 2. Fetch each page with requests and put the HTML into data_queue
        while not page_flag:
            try:
                page = self.page_queue.get(block=False)
                page_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html'.format(page)
                response = requests.get(url=page_url, headers=self.header)
                print("Fetching URL: {}".format(page_url), "response status code: {}".format(response.status_code))
                # 51job pages are GBK-encoded, so set the encoding before reading .text
                response.encoding = 'gbk'
                self.data_queue.put(response.text)
            except Empty:
                # page_queue is momentarily empty; loop again until page_flag is set
                pass
            except requests.RequestException as e:
                print("Request for page {} failed: {}".format(page, e))


class CrawlData(threading.Thread):
    def __init__(self, thread_name, data_queue):
        super(CrawlData, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue

    def run(self):
        print("当前处理文本数据的线程为:{}".format(self.thread_name))
        while not data_flag:
            try:
                print("当前剩余数据量为{}".format(self.data_queue.qsize()))
                text = self.data_queue.get(block=False)
                html = etree.HTML(text)
                all_div = html.xpath("//div[@id='resultList']//div[@class='el']")
                info_list = []
                for item in all_div:
                    info = {}
                    # xpath() returns a list, so take the first (index 0) match
                    info['job_name'] = item.xpath("./p/span/a/@title")[0]
                    info['company_name'] = item.xpath(".//span[@class='t2']/a/@title")[0]
                    info['company_address'] = item.xpath(".//span[@class='t3']/text()")[0]
                    # The salary field may be missing, so guard it with try/except
                    try:
                        info['money'] = item.xpath(".//span[@class='t4']/text()")[0]
                    except:
                        info['money'] = 'no data'
                    info['date'] = item.xpath(".//span[@class='t5']/text()")[0]
                    info_list.append(info)
                print("当前处理的线程为:{},解析出的数据为:{}".format(self.thread_name,info_list))
                time.sleep(1)
            except Empty:
                # data_queue is momentarily empty; loop again until data_flag is set
                pass
            except IndexError:
                # The page did not match the expected XPath layout; skip it
                pass


# Two global flags; they end the while loops once the queues have been drained
page_flag = False
data_flag = False


def main():
    # Create the queues that hold page numbers and page HTML
    page_queue = Queue()
    data_queue = Queue()

    # Put the page numbers into the queue
    for page in range(1, 10):
        page_queue.put(page)
    print("当前队列中共有页码数为:{}".format(page_queue.qsize()))

    # Start the page-crawling threads
    global page_flag
    page_thread_name = ['page-crawler-1', 'page-crawler-2', 'page-crawler-3']
    page_crawl_list = []
    for page_thread in page_thread_name:
        page_crawl = CrawlPage(page_queue, data_queue, page_thread)
        page_crawl.start()
        page_crawl_list.append(page_crawl)

    # Busy-wait in the main thread until every page number has been taken from page_queue
    while not page_queue.empty():
        pass
    
    # Once page_queue is empty, set the flag to True to end the while loop in CrawlPage.run()
    page_flag = True
    for page_crawl_join in page_crawl_list:
        page_crawl_join.join()
        print(page_crawl_join.thread_name + " has finished crawling pages!")
    print("Total items now in data_queue: {}".format(data_queue.qsize()))

    # Start three text-processing threads
    crawl_thread_name = ["parser-1", "parser-2", "parser-3"]
    crawl_data_list = []
    for crawl_data_name in crawl_thread_name:
        crawl_data = CrawlData(crawl_data_name, data_queue)
        crawl_data.start()
        crawl_data_list.append(crawl_data)


    # Busy-wait in the main thread until data_queue has been drained
    while not data_queue.empty():
        pass

    global data_flag
    # Once the data has been consumed, set the flag so the parsing threads exit
    data_flag = True
    for crawl_data_join in crawl_data_list:
        crawl_data_join.join()
        print("线程{}结束".format(crawl_data_join.thread_name))

    # The queue is empty, so HTML parsing is finished
    print("Items left in data_queue: {}".format(data_queue.qsize()))


if __name__ == '__main__':
    main()
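A note on the design: the two busy-wait loops in main() (while not queue.empty(): pass) keep the CPU spinning while they wait. A common alternative is to let queue.Queue coordinate shutdown with task_done()/join() and sentinel values instead of the global flags. The sketch below shows that pattern in isolation; the worker function, names, and print are placeholders, not part of the crawler above.

import threading
from queue import Queue

def worker(q):
    while True:
        item = q.get()           # blocks until an item is available
        if item is None:         # sentinel value: no more work for this worker
            q.task_done()
            break
        print("processing page", item)
        q.task_done()            # mark this item as finished

q = Queue()
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(3)]
for t in threads:
    t.start()

for page in range(1, 10):       # same page range as the crawler above
    q.put(page)

q.join()                         # returns once every put() item has been task_done()

for _ in threads:
    q.put(None)                  # one sentinel per worker so each loop can exit
for t in threads:
    t.join()

With this pattern there is no need for page_flag/data_flag or for the busy-wait loops, since q.join() blocks until all queued work has been marked done.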

I wrote the code myself, referring to other learning sites.
