一.用gevent实现多协程爬虫的重点
1.定义爬虫函数
2.用gevent.spawn() 创建任务
3.用gevent.joinall()执行任务
二.用queue模块的重点
1.用Queue()创建队列
2.用put_nowait()存储数据
3.用get_nowait()提取数据
三.queue对象的方法
1.put_nowait():往队列里存储数据
2.get_nowait():从队列里提取数据
3.empty():判断队列是否为空
4.full():判断队列是否为满
5.qsize():返回队列中剩余元素的数量
例子:
#使用多协程和队列,爬取时光网电视剧TOP100的数据(剧名、导演、主演和简介),并用openpyxl模块将数据存储到Excel文件中。
#时光网TOP100链接:http://www.mtime.com/top/tv/top100/
#1.导入包
from gevent import monkey
from gevent.queue import Queue
monkey.patch_all()
from bs4 import BeautifulSoup
import numpy as np
import requests
import gevent
import openpyxl
import xlsxwriter
#以下两种方式解决非法字符问题
#import re
#ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
# 2. Put the tasks into the queue.
# Request headers sent with every page fetch.  They are mandatory, the
# cookie in particular: without it the site answers with HTTP 521.  The
# cookie problem could also be handled some other way.
# NOTE(review): the Cookie value looks like a captured browser session and
# presumably has to be refreshed once it expires -- confirm before running.
headers = {
    'Cookie': 'waf_cookie=6d1f9892-9e25-49b6ce87a4f1adad3811482408cb2b0ce0ab; _userCode_=20203121830565094; _userIdentity_=20203121830561661; _tt_=F3EA8A5DD9B84BA9104330B8FBD89E1A; DefaultCity-CookieKey=364; DefaultDistrict-CookieKey=0; Hm_lvt_6dd1e3b818c756974fb222f0eae5512e=1584009057; __utma=196937584.1740204334.1584009057.1584009057.1584009057.1; __utmc=196937584; __utmz=196937584.1584009057.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lpvt_6dd1e3b818c756974fb222f0eae5512e=1584013457; _ydclearance=7756d7585b4000e9c0cbb745-d9f2-46af-999c-30a9345f3e24-1584023564',
    'Referer': 'http://www.mtime.com/top/tv/top100/',
    'Upgrade-Insecure-Requests': '1',
    'Host': 'www.mtime.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
}
# Shared result list: the workers append one row (a list of strings) per
# scraped entry.
movie_infos = []

# Queue of page URLs to crawl: the landing page first, then the numbered
# pages index-2.html through index-10.html.
work = Queue()
page_urls = ['http://www.mtime.com/top/tv/top100/']
page_urls.extend(
    'http://www.mtime.com/top/tv/top100/index-' + str(page) + '.html'
    for page in range(2, 11)
)
for page_url in page_urls:
    work.put_nowait(page_url)
#3.并发执行查询的任务(两个协程)
def getInfo():
    """Worker coroutine: drain the shared URL queue and scrape each page.

    Repeatedly takes a page URL from the module-level ``work`` queue,
    downloads it using the module-level ``headers``, and for every element
    with class ``mov_con`` appends one row to the module-level
    ``movie_infos`` list.  A row holds the title text (the element with
    class ``px14 pb6``) followed by the text of every ``<p>`` inside the
    entry -- presumably director, cast and synopsis; verify against the
    page markup.  Each string is passed through ``ILLEGAL_CHARACTERS_RE``
    so that openpyxl accepts it as a cell value.

    Pages that cannot be downloaded and entries without a title element
    are skipped, so one bad page does not end the worker and strand the
    URLs still waiting in the queue.

    Returns:
        None.  Results are accumulated in ``movie_infos``.
    """
    while not work.empty():
        url = work.get_nowait()
        try:
            # Without a timeout a stalled connection would block this
            # worker indefinitely.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # Skip this page but keep processing the rest of the queue.
            print('request failed:', url, exc)
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        for movie in soup.find_all(class_='mov_con'):
            title = movie.find(class_='px14 pb6')
            if title is None:
                # find() returns None when nothing matches; reading .text
                # on it would raise AttributeError and kill the worker.
                continue
            row = [ILLEGAL_CHARACTERS_RE.sub(r'', title.text)]
            row.extend(
                ILLEGAL_CHARACTERS_RE.sub(r'', paragraph.text)
                for paragraph in movie.find_all('p')
            )
            movie_infos.append(row)
# Start two greenlets that both run getInfo against the shared queue, then
# wait here until both of them have finished.
task_list = [gevent.spawn(getInfo) for _ in range(2)]
gevent.joinall(task_list)
# 4. Store the data in an Excel workbook.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'movie'

# Progress output: the total number of rows first, then the index of each
# row as it is written to the sheet.
print(len(movie_infos))
for row_index, row in enumerate(movie_infos):
    print(row_index)
    sheet.append(row)

wb.save("c:\\Users\\69505\\Desktop\\movie.xlsx")