#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import json
# Browser-like request headers passed to every requests.get call in this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/'
        '537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    ),
}
def get_movie_info(start):
    """Download one listing page of the Douban Top 250 movie chart.

    Args:
        start: Value placed in the ``start`` query parameter of the
            listing URL.

    Returns:
        The response body as text (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection failures or when the
            10 second timeout expires.
    """
    url = f'https://movie.douban.com/top250?start={start}'
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error status instead of handing an error page to
    # the parser as if it were a listing page.
    response.raise_for_status()
    return response.text
def parse_movie_info(html):
    """Parse one Top 250 listing page into a list of movie records.

    For every ``<li>`` found under ``<ol class="grid_view">`` a dict with
    the keys ``img``, ``title``, ``background``, ``star`` and ``desc`` is
    built, and the poster image is downloaded through ``save_image``.

    Args:
        html: HTML text of a listing page.

    Returns:
        A list of dicts, one per parsed entry. Entries that lack a
        required tag are reported with ``print`` and skipped. ``desc`` is
        an empty string when the entry has no ``inq`` quote. An empty
        list is returned when the page contains no ``grid_view`` list.
    """
    soup = BeautifulSoup(html, 'lxml')
    # The movie entries live inside the <ol> tag whose class is "grid_view".
    ol = soup.find('ol', class_='grid_view')
    if ol is None:
        # find() returns None when nothing matches; report it instead of
        # crashing on ol.find_all below.
        print('未找到电影列表(grid_view)')
        return []

    res = []
    for li in ol.find_all('li'):
        try:
            info = li.find('div', class_='info')
            # Movie title.
            title = info.find('span', class_='title').text.strip()
            # Chained lookup: the poster URL is the src attribute of the
            # <img> tag inside <div class="pic">.
            img_url = li.find('div', class_='pic').find('img').get('src')
            background = info.find('div', class_='bd').text.strip()
            star = info.find('span', class_='rating_num').text.strip()
        except AttributeError as e:
            # A required tag is missing (find() returned None): skip the entry.
            print(e)
            continue

        # The one-line quote is optional; a missing quote must not drop
        # the whole movie.
        inq = info.find('span', class_='inq')
        desc = inq.text.strip() if inq is not None else ''

        try:
            save_image(title, img_url)
        except Exception as e:
            # Best effort: a failed image download is reported but must
            # not discard the movie's record.
            print(e)

        res.append({
            'img': img_url,
            'title': title,
            'background': background,
            'star': star,
            'desc': desc,
        })
        print(f'{title}获取完成')
    return res
def save_image(name, url):
    """Download an image and store it as ``./images/<name>.png``.

    The ``.png`` extension is always used, whatever format the server
    actually returns. Characters that are reserved in file names
    (``\\ / : * ? " < > |``) are replaced with ``_`` so that a title
    containing them cannot break or redirect the output path.

    Args:
        name: Base name of the image file (the movie title).
        url: URL of the image to download.

    Raises:
        requests.RequestException: For connection failures or when the
            10 second timeout expires.
        OSError: If the directory or the file cannot be written.
    """
    # Request the image.
    r = requests.get(url, headers=headers, timeout=10)
    if not r.ok:
        # Do not store an HTTP error page on disk as if it were an image.
        print(f'{name}图片下载失败: HTTP {r.status_code}')
        return

    dirpath = Path('./images')
    # exist_ok avoids the check-then-create race of a separate is_dir() test.
    dirpath.mkdir(parents=True, exist_ok=True)

    reserved = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    img = dirpath.joinpath(f'{name}.png'.translate(reserved))
    # Write the image data to the file as raw bytes.
    img.write_bytes(r.content)
if __name__ == '__main__':
    # Walk the listing pages (start = 0, 25, ..., 225) and collect every
    # parsed movie record into one list.
    movies = []
    for offset in range(0, 250, 25):
        movies.extend(parse_movie_info(get_movie_info(offset)))

    # Dump all records to movies.json as indented JSON, keeping non-ASCII
    # characters unescaped.
    with open('movies.json', mode='w', encoding='utf-8') as out:
        out.write(json.dumps(movies, ensure_ascii=False, indent=4))
# Douban movie page analysis
# First published on 2022-09-19 07:11:50