High-Performance Crawling with Coroutines

A coroutine crawler example:

import gevent
from gevent import monkey
monkey.patch_all(thread=False)

import time
import requests
from bs4 import BeautifulSoup
import pandas as pd


class Crawler:
    def __init__(self, excelData: dict):
        self.excelData = excelData

    def getHeaders(self):
        # Note: getHeaders() is called from Spider.spider() but was never
        # defined in the original code; this minimal User-Agent is an assumption.
        return {'User-Agent': 'Mozilla/5.0'}

    def parseHTML(self, html):
        """
        Parse the HTML with BeautifulSoup.
        :param html: page source to parse
        :return: list of result dicts
        """
        print('start parse')
        bs = BeautifulSoup(html, "html.parser")  # build the soup object
        tbody = bs.find_all(class_="searchArea")  # the search-result area
        excel_author = '{LastName}, {FirstName}'.format(LastName=self.excelData.get('LastName'),
                                                        FirstName=self.excelData.get('FirstName'))

        print(f'{excel_author} has {len(tbody)} author search results in total')
        content_list = []
        for i, tr in enumerate(tbody, start=1):  # enumerate instead of the O(n^2) tbody.index(tr)
            print(f'checking result {i} for {excel_author}')
            th_input = tr.find('th').find('input')
            name = th_input.get('data-name')

            data = {
                'author': name,
            }
            content_list.append(data)

        print('over parse')
        return content_list

class Spider:
    def __init__(self, file):
        self.file = file

    def getExcelData(self, file):
        print('start readExcel')
        # header=0: the first row is the header and is stripped automatically;
        # keep_default_na=False: empty cells become '' instead of NaN
        raw_data = pd.read_excel(file, header=0, keep_default_na=False)  # use the parameter (the original read self.file and ignored the argument)
        raw_list = raw_data.values.tolist()
        url_list = []
        excelData_list = []
        for raw in raw_list:
            LastName = raw[1]
            if LastName == '':
                continue
            FirstName = raw[2]
            country = raw[8]  # read in the original but never used below
            email = raw[5]

            excelData_list.append({
                'LastName': LastName,
                'FirstName': FirstName,
                'email': email,
            })

            url = 'https://www.baidu.com'  # placeholder URL from the original post
            url_list.append(url)

        print('over readExcel')
        return url_list, excelData_list

    def spider(self, args: tuple) -> list:
        url, excelData = args
        # the crawl task: fetch the page and parse its content
        c = Crawler(excelData=excelData)
        try:
            res = requests.get(url, headers=c.getHeaders())
        except Exception as e:
            print(e)
            return []  # the original implicitly returned None here
        else:
            content_list = []
            if res.status_code == 200:
                content = res.text
                # process the page data...
                content_list = c.parseHTML(html=content)

            return content_list

    def crawl_with_gevent(self, args: tuple):
        urls, excelDatas = args
        # spawn one greenlet per URL (a bounded-pool variant is sketched below)
        jobs = [gevent.spawn(self.spider, (url, data)) for url, data in zip(urls, excelDatas)]
        # wait for all greenlets to finish
        gevent.joinall(jobs)
        results = [job.value for job in jobs]
        print(results)
        # process the crawl results...
        return results

    def run(self):
        self.crawl_with_gevent(self.getExcelData(self.file))


if __name__ == '__main__':
    start = time.perf_counter()

    file_path = r'E:\1.xlsx'  # raw string so '\1' is not treated as an escape; '.xlx' in the original looks like a typo for '.xlsx'
    Spider(file_path).run()   # the original '.run' without parentheses never actually called run()

    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
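
Side note: the comment in crawl_with_gevent talks about a pool, but gevent.spawn actually creates one unbounded greenlet per URL. For long URL lists, gevent.pool.Pool caps the concurrency. A minimal sketch, where the pool size of 10 is an arbitrary choice:

from gevent.pool import Pool

def crawl_with_pool(self, args: tuple, size: int = 10):
    urls, excelDatas = args
    pool = Pool(size)  # at most `size` greenlets run at the same time
    jobs = [pool.spawn(self.spider, (u, d)) for u, d in zip(urls, excelDatas)]
    gevent.joinall(jobs)
    return [job.value for job in jobs]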

Problems encountered

Imports

Note that these imports, and the monkey patching in particular, must go at the very top of the file:

import gevent
from gevent import monkey
monkey.patch_all(thread=False)
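
As I understand it (an inference, not stated in the original), patch_all() rewrites blocking primitives such as socket and ssl in place, and gevent emits a MonkeyPatchWarning when those modules were already imported before the patch. A quick self-check sketch, reusing the placeholder URL from the example:

from gevent import monkey
monkey.patch_all(thread=False)  # must run before socket users like requests are imported

import gevent
import requests  # now picks up gevent's cooperative sockets

# these five requests overlap, so the total time is close to one round trip
jobs = [gevent.spawn(requests.get, 'https://www.baidu.com') for _ in range(5)]
gevent.joinall(jobs)
print([job.value.status_code for job in jobs])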

Invocation

The entry-point call must be placed under if __name__ == '__main__':, otherwise it will not run:

if __name__ == '__main__':
    start = time.perf_counter()

    file_path = r'E:\1.xlsx'
    Spider(file_path).run()

    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))

Running

Debugging does not work: running under the debugger behaves exactly like a normal run, and breakpoints are never hit.
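
A possible workaround that the original post does not mention: pydevd-based debuggers (PyCharm, VS Code) only trace code running inside greenlets when gevent support is enabled via the environment variable GEVENT_SUPPORT=True; PyCharm exposes the same switch as the "Gevent compatible" checkbox under Settings > Python Debugger. This is general gevent debugging advice, not something verified against this exact script.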

Error output

2020-08-01 22:06:12:034678: ----------------------------------------
2020-08-01 22:06:50:875038: Exception happened during processing of request from ('127.0.0.1', 55207)
2020-08-01 22:06:50:875038: Traceback (most recent call last):
2020-08-01 22:06:50:884013:   File "e:\python36\lib\socketserver.py", line 654, in process_request_thread
2020-08-01 22:06:50:885011:     self.finish_request(request, client_address)
2020-08-01 22:06:50:885011:   File "e:\python36\lib\socketserver.py", line 364, in finish_request
2020-08-01 22:06:50:885011:     self.RequestHandlerClass(request, client_address, self)
2020-08-01 22:06:50:885011:   File "e:\python36\lib\socketserver.py", line 724, in __init__
2020-08-01 22:06:50:885011:     self.handle()
2020-08-01 22:06:50:885011:   File "E:\work_space\warclouds\version01\openstack_api_server\venv\lib\site-packages\werkzeug\serving.py", line 345, in handle
2020-08-01 22:06:50:885011:     BaseHTTPRequestHandler.handle(self)
2020-08-01 22:06:50:885011:   File "e:\python36\lib\http\server.py", line 418, in handle
2020-08-01 22:06:50:885011:     self.handle_one_request()
2020-08-01 22:06:50:885011:   File "E:\work_space\warclouds\version01\openstack_api_server\venv\lib\site-packages\werkzeug\serving.py", line 375, in handle_one_request
2020-08-01 22:06:50:885011:     self.raw_requestline = self.rfile.readline()
2020-08-01 22:06:50:885011:   File "e:\python36\lib\socket.py", line 586, in readinto
2020-08-01 22:06:50:885011:     return self._sock.recv_into(b)
2020-08-01 22:06:50:885011:   File "e:\python36\lib\site-packages\gevent\_socket3.py", line 502, in recv_into
2020-08-01 22:06:50:885011:     self._wait(self._read_event)
2020-08-01 22:06:50:885011:   File "src\\gevent\\_hub_primitives.py", line 317, in gevent._gevent_c_hub_primitives.wait_on_socket
2020-08-01 22:06:50:885011:   File "src\\gevent\\_hub_primitives.py", line 322, in gevent._gevent_c_hub_primitives.wait_on_socket
2020-08-01 22:06:50:885011:   File "src\\gevent\\_hub_primitives.py", line 304, in gevent._gevent_c_hub_primitives._primitive_wait
2020-08-01 22:06:50:885011:   File "src\\gevent\\_hub_primitives.py", line 46, in gevent._gevent_c_hub_primitives.WaitOperationsGreenlet.wait
2020-08-01 22:06:50:885011:   File "src\\gevent\\_hub_primitives.py", line 46, in gevent._gevent_c_hub_primitives.WaitOperationsGreenlet.wait
2020-08-01 22:06:50:885011:   File "src\\gevent\\_hub_primitives.py", line 55, in gevent._gevent_c_hub_primitives.WaitOperationsGreenlet.wait
2020-08-01 22:06:50:885011:   File "src\\gevent\\_waiter.py", line 151, in gevent._gevent_c_waiter.Waiter.get
2020-08-01 22:06:50:885011:   File "src\\gevent\\_greenlet_primitives.py", line 61, in gevent._gevent_c_greenlet_primitives.SwitchOutGreenletWithLoop.switch
2020-08-01 22:06:50:885011:   File "src\\gevent\\_greenlet_primitives.py", line 61, in gevent._gevent_c_greenlet_primitives.SwitchOutGreenletWithLoop.switch
2020-08-01 22:06:50:885011:   File "src\\gevent\\_greenlet_primitives.py", line 65, in gevent._gevent_c_greenlet_primitives.SwitchOutGreenletWithLoop.switch
2020-08-01 22:06:50:886007:   File "src\\gevent\\_gevent_c_greenlet_primitives.pxd", line 35, in gevent._gevent_c_greenlet_primitives._greenlet_switch
2020-08-01 22:06:50:889000: greenlet.error: cannot switch to a different thread

The error above is raised every single time, but it does not affect the results. (My reading of the traceback, an inference rather than something stated in the original: monkey.patch_all(thread=False) leaves threading unpatched, so when a real OS thread, here werkzeug's threaded dev server, blocks on a gevent-patched socket, gevent refuses to switch greenlets across threads.)
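
If the error matters in your setup, one possible mitigation, offered as an assumption rather than a verified fix, is to patch threading as well so that no unpatched OS thread ever blocks on a cooperative socket:

from gevent import monkey
monkey.patch_all()  # thread=True by default; avoids the cross-thread switch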

Summary

This was a first attempt at writing a program with gevent coroutines. Personally, I find it inferior to implementing coroutines with aiohttp + asyncio, and the gevent maintainers offer no explanation for the error above; they only say it may be fixed in the future and recommend using aiohttp instead.
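
For comparison, here is a minimal sketch of the aiohttp + asyncio approach mentioned above. It is an illustration under my own assumptions (the placeholder URL from the example, five concurrent fetches), not the author's code:

import asyncio
import aiohttp

async def fetch(session, url):
    # one coroutine per URL; awaiting the response yields to the event loop
    async with session.get(url) as resp:
        return await resp.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, u) for u in urls))

if __name__ == '__main__':
    urls = ['https://www.baidu.com'] * 5
    loop = asyncio.get_event_loop()  # Python 3.6 compatible; on 3.7+ asyncio.run(main(urls)) also works
    pages = loop.run_until_complete(main(urls))
    print(f'fetched {len(pages)} pages')

No monkey patching is needed here, which also sidesteps the debugger and cross-thread problems described above.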
