A coroutine crawler example with gevent
import gevent
from gevent import monkey
monkey.patch_all(thread=False)

import random
import time

import requests
from bs4 import BeautifulSoup
import pandas as pd


class Crawler:
    def __init__(self, excelData: dict):
        self.excelData = excelData

    def getHeaders(self) -> dict:
        # getHeaders() is called by Spider.spider() but was missing from the
        # original listing; this is a minimal stand-in that picks a random User-Agent.
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
        ]
        return {'User-Agent': random.choice(user_agents)}

    def parseHTML(self, html):
        """
        Parse the HTML with BeautifulSoup.
        :param html: page source
        :return: list of extracted author records
        """
        print('start parse')
        bs = BeautifulSoup(html, "html.parser")   # build the soup object
        tbody = bs.find_all(class_="searchArea")  # the search-result area
        excel_author = '{LastName}, {FirstName}'.format(LastName=self.excelData.get('LastName'),
                                                        FirstName=self.excelData.get('FirstName'))
        print(f'{excel_author}: {len(tbody)} author search results in total')
        content_list = []
        for index, tr in enumerate(tbody, start=1):
            print(f'Checking result {index} for {excel_author}')
            th_input = tr.find('th').find('input')
            name = th_input.get('data-name')
            data = {
                'author': name,
            }
            content_list.append(data)
        print('over parse')
        return content_list


class Spider:
    def __init__(self, file):
        self.file = file

    def getExcelData(self):
        print('start readExcel')
        # header=0: the first row is the header, so it is stripped automatically;
        # keep_default_na=False: empty cells become the string '' instead of NaN
        raw_data = pd.read_excel(self.file, header=0, keep_default_na=False)
        raw_list = raw_data.values.tolist()
        url_list = []
        excelData_list = []
        for raw in raw_list:
            LastName = raw[1]
            if LastName == '':  # skip rows without a last name
                continue
            FirstName = raw[2]
            country = raw[8]  # read but not used further
            email = raw[5]
            excelData_list.append({
                'LastName': LastName,
                'FirstName': FirstName,
                'email': email,
            })
            url = 'https://www.baidu.com'  # placeholder for the real search URL
            url_list.append(url)
        print('over readExcel')
        return url_list, excelData_list

    def spider(self, args: tuple) -> list:
        # One crawl task: fetch the page and parse it
        url, excelData = args
        c = Crawler(excelData=excelData)
        content_list = []
        try:
            res = requests.get(url, headers=c.getHeaders())
        except Exception as e:
            print(e)
        else:
            if res.status_code == 200:
                content = res.text
                # process the page data...
                content_list = c.parseHTML(html=content)
        return content_list

    def crawl_with_gevent(self, args: tuple):
        urls = args[0]
        excelDatas = args[1]
        # spawn one greenlet per URL
        jobs = [gevent.spawn(self.spider, (urls[i], excelDatas[i])) for i in range(len(urls))]
        # wait for all greenlets to finish
        gevent.joinall(jobs)
        results = [job.value for job in jobs]
        print(results)
        # process the crawl results...
        return results

    def run(self):
        self.crawl_with_gevent(self.getExcelData())


if __name__ == '__main__':
    start = time.perf_counter()
    file_path = r'E:\1.xlsx'
    Spider(file_path).run()
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
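One caveat on crawl_with_gevent: the original comment called this a "coroutine pool", but gevent.spawn starts one greenlet per URL with no upper bound. A bounded drop-in variant would use gevent.pool.Pool; the sketch below is my own, and the pool size of 10 is an arbitrary assumption:

from gevent.pool import Pool

def crawl_with_pool(self, args: tuple):
    urls, excelDatas = args
    pool = Pool(10)  # at most 10 greenlets run concurrently
    # pool.map blocks until every task has finished, like gevent.joinall
    results = pool.map(self.spider, list(zip(urls, excelDatas)))
    print(results)
    return results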
Problems encountered
Imports
Note that these imports must come first: monkey.patch_all() rewrites the standard library's blocking modules, so it has to run before anything that imports socket (requests, urllib3, and so on). Patching later leaves those modules bound to the unpatched originals.
import gevent
from gevent import monkey
monkey.patch_all(thread=False)
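To verify that the patch actually took effect, gevent ships a helper for exactly this; a minimal sketch, run before any other network imports:

from gevent import monkey
monkey.patch_all(thread=False)  # must run before requests/socket are imported

import socket
print(monkey.is_module_patched('socket'))  # True: socket is now cooperative
print(socket.socket)                       # gevent's socket class, not the stdlib one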
Invocation
The call to the entry point must sit under an if __name__ == '__main__': guard, otherwise it did not get invoked in my runs — and note that run must actually be called with parentheses, not just referenced:
if __name__ == '__main__':
    start = time.perf_counter()
    file_path = r'E:\1.xlsx'
    Spider(file_path).run()
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
Running
Debugging does not work: running under the debugger behaves exactly like a normal run, and breakpoints are never hit.
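If the IDE is PyCharm or another pydevd-based debugger, there is a gevent-compatibility switch. This is an assumption about your setup rather than something from the original run, but it is worth trying:

# PyCharm: Settings > Build, Execution, Deployment > Python Debugger > "Gevent compatible"
# or set this in the run configuration's environment before the debugger starts:
GEVENT_SUPPORT=True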
Errors
2020-08-01 22:06:12:034678: ----------------------------------------
2020-08-01 22:06:50:875038: Exception happened during processing of request from ('127.0.0.1', 55207)
2020-08-01 22:06:50:875038: Traceback (most recent call last):
2020-08-01 22:06:50:884013: File "e:\python36\lib\socketserver.py", line 654, in process_request_thread
2020-08-01 22:06:50:885011: self.finish_request(request, client_address)
2020-08-01 22:06:50:885011: File "e:\python36\lib\socketserver.py", line 364, in finish_request
2020-08-01 22:06:50:885011: self.RequestHandlerClass(request, client_address, self)
2020-08-01 22:06:50:885011: File "e:\python36\lib\socketserver.py", line 724, in __init__
2020-08-01 22:06:50:885011: self.handle()
2020-08-01 22:06:50:885011: File "E:\work_space\warclouds\version01\openstack_api_server\venv\lib\site-packages\werkzeug\serving.py", line 345, in handle
2020-08-01 22:06:50:885011: BaseHTTPRequestHandler.handle(self)
2020-08-01 22:06:50:885011: File "e:\python36\lib\http\server.py", line 418, in handle
2020-08-01 22:06:50:885011: self.handle_one_request()
2020-08-01 22:06:50:885011: File "E:\work_space\warclouds\version01\openstack_api_server\venv\lib\site-packages\werkzeug\serving.py", line 375, in handle_one_request
2020-08-01 22:06:50:885011: self.raw_requestline = self.rfile.readline()
2020-08-01 22:06:50:885011: File "e:\python36\lib\socket.py", line 586, in readinto
2020-08-01 22:06:50:885011: return self._sock.recv_into(b)
2020-08-01 22:06:50:885011: File "e:\python36\lib\site-packages\gevent\_socket3.py", line 502, in recv_into
2020-08-01 22:06:50:885011: self._wait(self._read_event)
2020-08-01 22:06:50:885011: File "src\\gevent\\_hub_primitives.py", line 317, in gevent._gevent_c_hub_primitives.wait_on_socket
2020-08-01 22:06:50:885011: File "src\\gevent\\_hub_primitives.py", line 322, in gevent._gevent_c_hub_primitives.wait_on_socket
2020-08-01 22:06:50:885011: File "src\\gevent\\_hub_primitives.py", line 304, in gevent._gevent_c_hub_primitives._primitive_wait
2020-08-01 22:06:50:885011: File "src\\gevent\\_hub_primitives.py", line 46, in gevent._gevent_c_hub_primitives.WaitOperationsGreenlet.wait
2020-08-01 22:06:50:885011: File "src\\gevent\\_hub_primitives.py", line 46, in gevent._gevent_c_hub_primitives.WaitOperationsGreenlet.wait
2020-08-01 22:06:50:885011: File "src\\gevent\\_hub_primitives.py", line 55, in gevent._gevent_c_hub_primitives.WaitOperationsGreenlet.wait
2020-08-01 22:06:50:885011: File "src\\gevent\\_waiter.py", line 151, in gevent._gevent_c_waiter.Waiter.get
2020-08-01 22:06:50:885011: File "src\\gevent\\_greenlet_primitives.py", line 61, in gevent._gevent_c_greenlet_primitives.SwitchOutGreenletWithLoop.switch
2020-08-01 22:06:50:885011: File "src\\gevent\\_greenlet_primitives.py", line 61, in gevent._gevent_c_greenlet_primitives.SwitchOutGreenletWithLoop.switch
2020-08-01 22:06:50:885011: File "src\\gevent\\_greenlet_primitives.py", line 65, in gevent._gevent_c_greenlet_primitives.SwitchOutGreenletWithLoop.switch
2020-08-01 22:06:50:886007: File "src\\gevent\\_gevent_c_greenlet_primitives.pxd", line 35, in gevent._gevent_c_greenlet_primitives._greenlet_switch
2020-08-01 22:06:50:889000: greenlet.error: cannot switch to a different thread
2020-08-01 22:06:50:889000: greenlet.error: cannot switch to a different thread
The error above always shows up, but it does not affect the results. The message itself means a greenlet tried to switch to a hub that lives in a different OS thread — a side effect of mixing real threads with monkey-patched sockets when thread=False; see the sketch below.
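A minimal, self-contained sketch of the mechanism (hypothetical code, not from the crawler): with thread=False, OS threads stay real, but a patched socket parks on the hub greenlet of the thread that created it, so blocking on it from another thread forces exactly this cross-thread switch:

import threading
from gevent import monkey
monkey.patch_all(thread=False)  # sockets become cooperative, threads stay real
import socket

a, b = socket.socketpair()  # patched sockets, bound to the main thread's hub

def worker():
    try:
        a.recv(1)  # would block -> tries to switch to the main thread's hub
    except Exception as e:
        print(repr(e))  # greenlet.error: cannot switch to a different thread

t = threading.Thread(target=worker)
t.start()
t.join()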
Summary
This was a first attempt at writing a program around gevent coroutines. Personally I find it inferior to implementing coroutines with aiohttp and asyncio; moreover, the gevent side offers no real explanation for the error above — only that it may be fixed some day — and recommends aiohttp instead.
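For comparison, a minimal asyncio + aiohttp equivalent of crawl_with_gevent; this is a sketch reusing the Crawler class above, fetch and crawl_with_asyncio are names of my own, and it assumes Python 3.6+:

import asyncio
import aiohttp

async def fetch(session: aiohttp.ClientSession, url: str, excelData: dict) -> list:
    # one crawl task: fetch the page, then reuse Crawler.parseHTML
    c = Crawler(excelData=excelData)
    try:
        async with session.get(url, headers=c.getHeaders()) as res:
            if res.status == 200:
                return c.parseHTML(html=await res.text())
    except Exception as e:
        print(e)
    return []

async def crawl_with_asyncio(urls: list, excelDatas: list) -> list:
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, u, d) for u, d in zip(urls, excelDatas)]
        return list(await asyncio.gather(*tasks))

# loop = asyncio.get_event_loop()
# results = loop.run_until_complete(crawl_with_asyncio(url_list, excelData_list))

No monkey patching is needed, the debugger works normally, and nothing crosses threads, which avoids the greenlet.error above.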