要求:爬取https://ssr1.scrape.center/ 网站中所有电影标题、类型、地区、电影时长、上映日期、评分、简介;
分析:网站由两种页面组成——电影列表页和详情页,我们所需要的内容在详情页面里面都可以找到。
列表页面共10页,根据分析可得出,列表页面地址:https://ssr1.scrape.center/page/{页码};
详情页面就利用正则表达式把一个个内容解析出来即可。
直接上代码了:
# coding:utf-8
import requests
import json
import re
from os import makedirs
from os.path import exists
import time
import multiprocessing
BASE_URL = 'https://ssr1.scrape.center'
# Directory where per-movie JSON files and poster images are written.
RESULTS_DIR = 'movies'
# exist_ok=True avoids the check-then-create race of `exists(...) or makedirs(...)`.
makedirs(RESULTS_DIR, exist_ok=True)
# The site paginates its movie list over 10 pages.
TOTAL_PAGE = 10
# Fetch the HTML of one listing page.
def grap_html(page):
    """Return the HTML text of listing page *page* (1-based).

    Sets the response encoding to UTF-8 explicitly, consistent with
    grap_detail, so the Chinese page text is decoded correctly.
    """
    page_url = f'{BASE_URL}/page/{page}'
    res = requests.get(page_url)
    res.encoding = 'utf-8'  # consistency fix: grap_detail already does this
    return res.text
# Parse a listing page with a regex and extract the movie-detail URL paths.
def parse_index(html):
    """Return the list of detail-page paths (e.g. '/detail/1') found in *html*.

    re.findall already returns [] when nothing matches, so no extra
    empty-list fallback is needed.
    """
    # Each movie card carries the 'el-card' class; the href that follows is its detail link.
    return re.findall('.*?el-card.*?href="(.*?)"', html, re.S)
# Fetch the HTML of one movie detail page.
def grap_detail(path):
    """Return the UTF-8 decoded HTML for the detail page at *path*.

    *path* is a site-relative path such as '/detail/1', as produced by
    parse_index; it is joined onto BASE_URL before fetching.
    """
    res = requests.get(f'{BASE_URL}{path}')
    res.encoding = 'utf-8'
    return res.text
# Parse a movie detail page with regexes and return the extracted fields as a dict.
def parse_detail(html):
    """Extract movie fields from a detail-page HTML string.

    Returns a dict with keys: title, kind (list of category strings),
    area, usetime, onlinetime (YYYY-MM-DD), desc, score, logo.
    Any field whose section is missing from the page is None — the
    original version raised AttributeError (`.group()` on None) when the
    categories or info <div> was absent; this version degrades gracefully.

    All patterns are raw strings (the original '\\d{4}-...' without the
    r-prefix is an invalid-escape warning on modern Python).
    """
    title_match = re.search(r'<h2.*?>(.*?)</h2>', html)
    title = title_match.group(1) if title_match else None

    # Categories live in a <div class="categories"> as a run of <span> tags.
    kind = None
    categories = re.search(r'<div.*?categories">(.*?)</div>', html, re.S)
    if categories:
        spans = re.findall(r'<span>(.*?)</span>', categories.group(1), re.S)
        kind = spans if spans else None

    # The <div> right after the categories block holds three spans:
    # region / separator / running time.
    area = usetime = None
    info = re.search(r'<div.*?categories">.*?</div>.*?<div.*?>(.*?)</div>', html, re.S)
    if info:
        pair = re.search(
            r'<span.*?>(.*?)</span>.*?<span.*?>.*?</span>.*?<span.*?>(.*?)</span>',
            info.group(1), re.S)
        if pair:
            area = pair.group(1)
            usetime = pair.group(2)

    # Release date: first ISO-like date anywhere on the page.  Searched once,
    # not twice as in the original.  (Local renamed from `time`, which
    # shadowed the imported time module.)
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', html)
    onlinetime = date_match.group() if date_match else None

    remark = re.search(r'<div.*?drama.*?<p.*?>(.*?)</p>', html, re.S)
    desc = remark.group(1).strip() if remark else None

    score_match = re.search(r'<p.*?score.*?>(.*?)</p>', html, re.S)
    score = score_match.group(1).strip() if score_match else None

    # Poster URL; the part before '@' is the un-resized original image.
    logo_match = re.search(r'<div.*detail.*?<img.*?src="(.*?)@.*?".*?>', html, re.S)
    logo = logo_match.group(1) if logo_match else None

    # Renamed from `dict`, which shadowed the builtin.
    movie = {
        'title': title,
        'kind': kind,
        'area': area,
        'usetime': usetime,
        'onlinetime': onlinetime,
        'desc': desc,
        'score': score,
        'logo': logo
    }
    return movie
# Download the movie poster and save it as an image file.
def down_movie_poster(title, url):
    """Save the poster at *url* to RESULTS_DIR/<title>.jpg.

    Silently skips when *url* is falsy — parse_detail returns logo=None
    when no poster is found, and the original requests.get(None) crashed.
    """
    if not url:
        return
    path = f'{RESULTS_DIR}/{title}.jpg'
    res = requests.get(url)
    with open(path, "wb") as f:
        f.write(res.content)
# Persist one movie record to disk, using the movie title as the file name.
def savefile(movie):
    """Write *movie* as pretty-printed UTF-8 JSON and download its poster.

    The original passed a bare open() into json.dump, leaking the file
    handle; a with-block closes it deterministically.  The unused `text`
    template and the commented-out writer were dead code and are removed.
    """
    name = movie.get('title')
    data_path = f'{RESULTS_DIR}/{name}.json'
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(movie, f, ensure_ascii=False, indent=2)
    down_movie_poster(name, movie.get('logo'))
# Scrape every movie on one listing page and persist each to disk.
def grapone(page):
    """Process listing page *page*: fetch it, resolve each detail page,
    parse the movie fields, and save them via savefile.  Prints CPU time
    and wall-clock time for the page.
    """
    print("开始下载第%d页" % page)
    start = time.perf_counter()
    s = time.time()
    # Fetch the listing page and walk each detail path it links to.
    for detail_path in parse_index(grap_html(page)):
        movie = parse_detail(grap_detail(detail_path))
        savefile(movie)
    end = time.perf_counter()
    e = time.time()
    print("第%d页完成下载,CPU用时:%d,耗时:%d" % (page, end - start, e - s))
# Scrape the whole site sequentially in a single process.
def grapall():
    """Run grapone over pages 1..TOTAL_PAGE one after another and print
    the total CPU and wall-clock time (the single-process baseline)."""
    start = time.perf_counter()
    s = time.time()
    for page in range(1, TOTAL_PAGE + 1):
        grapone(page)
    end = time.perf_counter()
    e = time.time()
    print("单线程CPU共用时:%d,耗时%d" % (end - start, e - s))
# Scrape the site in parallel — one worker process per page.
def multigrap():
    """Run grapone over pages 1..TOTAL_PAGE on a multiprocessing.Pool.

    NOTE: despite the original "多线程" comment this uses processes, not
    threads.  The Pool is managed with a context manager so it is always
    shut down, even if a worker raises — the original leaked the pool in
    that case.  pool.map blocks until every page is done.
    """
    start = time.perf_counter()
    s = time.time()
    with multiprocessing.Pool() as pool:
        pool.map(grapone, range(1, TOTAL_PAGE + 1))
    end = time.perf_counter()
    e = time.time()
    print("多线程CPU共用时:%d,耗时%d" % (end - start, e - s))
if __name__ == '__main__':
    # Entry point: scrape in parallel by default; uncomment grapall() (and
    # comment multigrap()) for the sequential single-process run.
    multigrap()
    # grapall()
最终效果:运行后会在 movies 目录下为每部电影生成一个 JSON 数据文件和一张对应的海报图片。
其他不说了,直接看代码得了,正则表达式部分比较烂。。。
本文参考文献:https://cuiqingcai.com/202224.html ,想学习爬虫的可以移步。