要求:爬取https://ssr1.scrape.center/ 网站中所有电影标题、类型、地区、电影时长、上映日期、评分、简介;
分析:网站由两种页面组成——电影列表页和详情页,我们所需要的内容在详情页面里面都可以找到。
列表页面共10页,根据分析可得出,列表页面地址:https://ssr1.scrape.center/page/{页码};
详情页面就利用正则表达式把一个个内容解析出来即可。
直接上代码了:
# coding:utf-8
import requests
import json
import re
from os import makedirs
from os.path import exists
import time
import multiprocessing
BASE_URL = 'https://ssr1.scrape.center'
# Directory where per-movie JSON files and poster images are written.
RESULTS_DIR = 'movies'
# exist_ok=True avoids the check-then-create race of `exists(...) or makedirs(...)`.
makedirs(RESULTS_DIR, exist_ok=True)
# The site paginates its movie list over 10 pages.
TOTAL_PAGE = 10
# Fetch the HTML of one listing page.
def grap_html(page):
    """Return the HTML text of listing page *page* (1-based).

    Sets the response encoding to UTF-8 explicitly, consistent with
    grap_detail, so the Chinese page text is decoded correctly.
    """
    page_url = f'{BASE_URL}/page/{page}'
    res = requests.get(page_url)
    res.encoding = 'utf-8'  # consistency fix: grap_detail already does this
    return res.text
# Parse a listing page with a regex and extract the movie-detail URL paths.
def parse_index(html):
    """Return the list of detail-page paths (e.g. '/detail/1') found in *html*.

    re.findall already returns [] when nothing matches, so no extra
    empty-list fallback is needed.
    """
    # Each movie card carries the 'el-card' class; the href that follows is its detail link.
    return re.findall('.*?el-card.*?href="(.*?)"', html, re.S)
# Fetch the HTML of one movie detail page.
def grap_detail(path):
    """Return the UTF-8 decoded HTML for the detail page at *path*.

    *path* is a site-relative path such as '/detail/1', as produced by
    parse_index; it is joined onto BASE_URL before fetching.
    """
    res = requests.get(f'{BASE_URL}{path}')
    res.encoding = 'utf-8'
    return res.text
# Parse a movie detail page with regexes and return the extracted fields as a dict.
def parse_detail(html):
    """Extract movie fields from a detail-page HTML string.

    Returns a dict with keys: title, kind (list of category strings),
    area, usetime, onlinetime (YYYY-MM-DD), desc, score, logo.
    Any field whose section is missing from the page is None — the
    original version raised AttributeError (`.group()` on None) when the
    categories or info <div> was absent; this version degrades gracefully.

    All patterns are raw strings (the original '\\d{4}-...' without the
    r-prefix is an invalid-escape warning on modern Python).
    """
    title_match = re.search(r'<h2.*?>(.*?)</h2>', html)
    title = title_match.group(1) if title_match else None

    # Categories live in a <div class="categories"> as a run of <span> tags.
    kind = None
    categories = re.search(r'<div.*?categories">(.*?)</div>', html, re.S)
    if categories:
        spans = re.findall(r'<span>(.*?)</span>', categories.group(1), re.S)
        kind = spans if spans else None

    # The <div> right after the categories block holds three spans:
    # region / separator / running time.
    area = usetime = None
    info = re.search(r'<div.*?categories">.*?</div>.*?<div.*?>(.*?)</div>', html, re.S)
    if info:
        pair = re.search(
            r'<span.*?>(.*?)</span>.*?<span.*?>.*?</span>.*?<span.*?>(.*?)</span>',
            info.group(1), re.S)
        if pair:
            area = pair.group(1)
            usetime = pair.group(2)

    # Release date: first ISO-like date anywhere on the page.  Searched once,
    # not twice as in the original.  (Local renamed from `time`, which
    # shadowed the imported time module.)
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', html)
    onlinetime = date_match.group() if date_match else None

    remark = re.search(r'<div.*?drama.*?<p.*?>(.*?)</p>', html, re.S)
    desc = remark.group(1).strip() if remark else None

    score_match = re.search(r'<p.*?score.*?>(.*?)</p>', html, re.S)
    score = score_match.group(1).strip() if score_match else None

    # Poster URL; the part before '@' is the un-resized original image.
    logo_match = re.search(r'<div.*detail.*?<img.*?src="(.*?)@.*?".*?>', html, re.S)
    logo = logo_match.group(1) if logo_match else None

    # Renamed from `dict`, which shadowed the builtin.
    movie = {
        'title': title,
        'kind': kind,
        'area': area,
        'usetime': usetime,
        'onlinetime': onlinetime,
        'desc': desc,
        'score': score,
        'logo': logo
    }
    return movie
# Download the movie poster and save it as an image file.
def down_movie_poster(title, url):
    """Save the poster at *url* to RESULTS_DIR/<title>.jpg.

    Silently skips when *url* is falsy — parse_detail returns logo=None
    when no poster is found, and the original requests.get(None) crashed.
    """
    if not url:
        return
    path = f'{RESULTS_DIR}/{title}.jpg'
    res = requests.get(url)
    with open(path, "wb") as f:
        f.write(res.content)
# Persist one movie record to disk, using the movie title as the file name.
def savefile(movie):
    """Write *movie* as pretty-printed UTF-8 JSON and download its poster.

    The original passed a bare open() into json.dump, leaking the file
    handle; a with-block closes it deterministically.  The unused `text`
    template and the commented-out writer were dead code and are removed.
    """
    name = movie.get('title')
    data_path = f'{RESULTS_DIR}/{name}.json'
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(movie, f, ensure_ascii=False, indent=2)
    down_movie_poster(name, movie.get('logo'))
# Scrape every movie on one listing page and persist each to disk.
def grapone(page):
    """Process listing page *page*: fetch it, resolve each detail page,
    parse the movie fields, and save them via savefile.  Prints CPU time
    and wall-clock time for the page.
    """
    print("开始下载第%d页" % page)
    start = time.perf_counter()
    s = time.time()
    # Fetch the listing page and walk each detail path it links to.
    for detail_path in parse_index(grap_html(page)):
        movie = parse_detail(grap_detail(detail_path))
        savefile(movie)
    end = time.perf_counter()
    e = time.time()
    print("第%d页完成下载,CPU用时:%d,耗时:%d" % (page, end - start, e - s))
# Scrape the whole site sequentially in a single process.
def grapall():
    """Run grapone over pages 1..TOTAL_PAGE one after another and print
    the total CPU and wall-clock time (the single-process baseline)."""
    start = time.perf_counter()
    s = time.time()
    for page in range(1, TOTAL_PAGE + 1):
        grapone(page)
    end = time.perf_counter()
    e = time.time()
    print("单线程CPU共用时:%d,耗时%d" % (end - start, e - s))
# Scrape the site in parallel — one worker process per page.
def multigrap():
    """Run grapone over pages 1..TOTAL_PAGE on a multiprocessing.Pool.

    NOTE: despite the original "多线程" comment this uses processes, not
    threads.  The Pool is managed with a context manager so it is always
    shut down, even if a worker raises — the original leaked the pool in
    that case.  pool.map blocks until every page is done.
    """
    start = time.perf_counter()
    s = time.time()
    with multiprocessing.Pool() as pool:
        pool.map(grapone, range(1, TOTAL_PAGE + 1))
    end = time.perf_counter()
    e = time.time()
    print("多线程CPU共用时:%d,耗时%d" % (end - start, e - s))
if __name__ == '__main__':
    # Entry point: scrape in parallel by default; uncomment grapall() (and
    # comment multigrap()) for the sequential single-process run.
    multigrap()
    # grapall()
最终效果:运行后会在 movies 目录下为每部电影生成一个 JSON 数据文件和一张对应的海报图片。
其他不说了,直接看代码得了,正则表达式部分比较烂。。。
本文参考文献:https://cuiqingcai.com/202224.html ,想学习爬虫的可以移步。