Python笔记-多线程爬虫实例

如下，进程池两个进程（代码使用 multiprocessing.Pool(2)，是进程池而非线程池）：

线程池关键代码:

源码如下:

import re, multiprocessing
import requests, time

class HandleLaGou(object):
    """Crawler for python job listings on lagou.com, routed through a paid HTTP proxy.

    Keeps a single requests session so lagou's anti-crawler cookies persist
    across requests within one worker process.
    """

    def __init__(self):
        self.laGou_session = requests.session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        # List of city names scraped from the all-city page (str until populated).
        self.city_list = ""

    def handle_city(self):
        """Fetch the nationwide city list page and store city names in self.city_list."""
        city_search = re.compile(r'zhaopin/">(.*?)</a>')
        city_url = "https://www.lagou.com/jobs/allCity.html"
        city_result = self.handle_request(method="GET", url=city_url)
        self.city_list = city_search.findall(city_result)
        # Start the per-city crawls with a fresh cookie jar.
        self.laGou_session.cookies.clear()

    def handle_city_job(self, city):
        """Crawl every result page of python jobs for *city* and print each Ajax response."""
        first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % city
        first_response = self.handle_request(method="GET", url=first_request_url)
        total_page_search = re.compile(r'class="span\stotalNum">(\d+)</span>')
        match = total_page_search.search(first_response)
        if match is None:
            # Page layout changed or the city has no listings - nothing to crawl.
            return
        total_page = int(match.group(1))
        for page in range(1, total_page + 1):
            data = {
                "pn": page,
                "kd": "python"
            }
            page_url = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false" % city
            # lagou rejects the Ajax POST unless the Referer matches the listing
            # page. Was `referer_url.encode()` (bytes); headers should be str.
            self.header['Referer'] = first_request_url
            response = self.handle_request(method="POST", url=page_url, data=data, info=city)
            print(response)

    def handle_request(self, method, url, data=None, info=None):
        """Perform a GET or POST through the proxy, retrying until an unblocked page arrives.

        Returns the response body as text. Retries (after resetting cookies and
        sleeping) on network errors and whenever lagou returns its
        "too frequent" block page. Fix over the original: the POST path now
        returns instead of looping forever, and GET responses get the same
        encoding fix and block-page check as POST.
        """
        # NOTE(review): proxy credentials are hard-coded; move to config/env.
        proxyinfo = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": "http-dyn.abuyun.com",
            "port": 9020,
            "user": "V21C9SWA4CQ3FSHD",
            "pass": "1DF3191F6103Q34",
        }
        proxy = {
            "http": proxyinfo,
            "https": proxyinfo
        }
        while True:
            try:
                if method == "GET":
                    response = self.laGou_session.get(url=url, headers=self.header, proxies=proxy, timeout=6)
                else:
                    response = self.laGou_session.post(url=url, headers=self.header, data=data, proxies=proxy, timeout=6)
            except requests.exceptions.RequestException:
                # Network/proxy failure: reset the session and try again.
                self._reset_session(info)
                continue

            response.encoding = 'utf-8'
            if '频繁' in response.text:
                # Anti-crawler block page ("too frequent"): reset and retry.
                self._reset_session(info)
                continue
            return response.text

    def _reset_session(self, info):
        """Clear cookies, re-warm the session via the listing page, and back off.

        NOTE(review): when *info* is None (e.g. failures during handle_city)
        the warm-up URL contains the literal string "None", matching the
        original code's behavior - confirm this is intended.
        """
        self.laGou_session.cookies.clear()
        first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % info
        self.handle_request(method="GET", url=first_request_url)
        time.sleep(10)


if __name__ == '__main__':
    laGou = HandleLaGou()
    # Populate laGou.city_list before fanning out the per-city crawls.
    laGou.handle_city()

    # Crawl the site with a pool of 2 worker processes (multiprocessing, not
    # threads): each city's job pages are fetched by handle_city_job.
    # NOTE(review): apply_async discards results and silently swallows worker
    # exceptions; keep the AsyncResult handles if failures must be surfaced.
    pool = multiprocessing.Pool(2)
    for city in laGou.city_list:
        pool.apply_async(laGou.handle_city_job, args=(city,))

    pool.close()
    pool.join()

 

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

IT1995

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值