# 1. Multi-threaded crawler
# Rewrite of the Maoyan movie crawler:
import json
import re
import time
import requests
from colorama import Fore
from fake_useragent import UserAgent
from requests import HTTPError
import lxml
from lxml import etree
def download_page(url, params=None):
    """Fetch *url* with a random User-Agent and return the body text.

    :param url: page URL to download
    :param params: optional query-string parameters passed to requests.get
    :return: response text on success, ``None`` on any request failure
    """
    try:
        ua = UserAgent()
        headers = {'User-Agent': ua.random}
        # timeout keeps a stalled connection from hanging a worker thread forever
        response = requests.get(url, params=params, headers=headers, timeout=10)
        # requests only raises HTTPError from raise_for_status(); without this
        # call a 4xx/5xx page would be returned as if it were valid content
        response.raise_for_status()
    except requests.RequestException as e:
        # RequestException also covers ConnectionError/Timeout, which the
        # original HTTPError-only clause could never catch
        print(Fore.RED + '[-]爬取网站%s失败:%s' % (url, str(e)))
        return None
    else:
        return response.text
def _remove_prefix(text, prefix):
    """Return *text* with *prefix* removed once from the front, if present."""
    return text[len(prefix):] if text.startswith(prefix) else text


def parse_html(html):
    """Parse a Maoyan board page and yield one dict per movie.

    Each dict has the keys ``index``, ``image``, ``title``, ``actors``
    and ``add_time``.

    :param html: raw HTML text of a board page
    :return: generator of movie dicts
    """
    # An earlier regex-based implementation was removed in favour of XPath.
    # 1) Parse the HTML document with the lxml parser.
    html = etree.HTML(html)
    # 2) Extract movie entries: every <dd> inside <dl class="board-wrapper">.
    movies = html.xpath('//dl[@class="board-wrapper"]/dd')
    for movie in movies:
        # Ranking: text of the <i class="board-index-*"> child.
        index = movie.xpath('./i/text()')[0]
        # Poster URL and title both live on <img class="board-img">.
        image = movie.xpath('.//img[@class="board-img"]/@data-src')[0]
        title = movie.xpath('.//img[@class="board-img"]/@alt')[0]
        # e.g. <p class="star">主演:葛优,巩俐,牛犇</p>
        actors = movie.xpath('.//p[@class="star"]/text()')[0]
        # e.g. <p class="releasetime">上映时间:1994-05-17(法国)</p>
        add_time = movie.xpath('.//p[@class="releasetime"]/text()')[0]
        yield {
            'index': index,
            'image': image,
            'title': title,
            # NOTE: the original used str.lstrip('主演:'), which strips a
            # *character set*, not a prefix — a name starting with 主/演/:
            # would be eaten too. _remove_prefix drops the label exactly once.
            'actors': _remove_prefix(actors.strip(), '主演:'),
            'add_time': _remove_prefix(add_time, '上映时间:')
        }
def save_to_json(data, filename):
    """Append *data* to *filename* as one pretty-printed JSON document.

    The file is opened in UTF-8 with ``ensure_ascii=False`` so Chinese
    titles are written verbatim rather than as escape sequences.

    :param data: a JSON-serialisable dict for one movie
    :param filename: path of the output file (opened in append mode)
    """
    # open(..., encoding=...) supersedes the older codecs.open idiom
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(json.dumps(data, ensure_ascii=False, indent=4) + '\n')
def get_one_page(page=1):
    """Scrape one page (10 movies) of the Maoyan top-100 board and save it.

    :param page: 1-based page number; each page holds 10 movies
    """
    # The board is paginated via an offset query parameter.
    url = 'https://maoyan.com/board/4?offset=%s' % ((page - 1) * 10)
    html = download_page(url)
    if html is None:
        # download_page already logged the error; parsing None would crash
        print(Fore.RED + '[-] 采集[%s]页数据失败' % (page))
        return
    items = parse_html(html)
    print(Fore.GREEN + '[+] 采集[%s]页数据' % (page))
    for item in items:
        print(item)
        save_to_json(item, 'maoyan.json')
def no_use_thread():
    """Scrape board pages 1-10 sequentially, pausing 1s between pages."""
    page = 1
    while page <= 10:
        get_one_page(page)
        print(Fore.GREEN + '[+] 采集[%s]页数据' % (page))
        # brief pause so the requests are not fired back-to-back
        time.sleep(1)
        page += 1
def use_multi_thread():
    """Scrape board pages 1-10 concurrently, one Thread per page."""
    from threading import Thread
    threads = []
    for page in range(1, 11):
        thread = Thread(target=get_one_page, args=(page,))
        thread.start()
        print(Fore.GREEN + '[+] 采集第[%s]页数据' % (page))
        threads.append(thread)
    # Join in a plain loop: the original side-effect comprehension
    # built a throwaway list of Nones.
    for thread in threads:
        thread.join()
    print(Fore.GREEN + '采集数据完成')
if __name__ == '__main__':
    # Thread-pool variant of the multi-threaded crawler.
    from concurrent.futures import ThreadPoolExecutor
    # 10 workers is enough for 10 pages (the original 100 wasted threads);
    # the with-statement shuts the pool down and waits for all tasks.
    with ThreadPoolExecutor(max_workers=10) as pool:
        # pool.map is lazy — consume it so worker exceptions surface here
        # instead of being silently dropped.
        list(pool.map(get_one_page, range(1, 11)))