1.从豆瓣爬取top250的电影数据,并保存为csv文件
# 导入所需的库
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
# Switch to the working directory; the relative output paths used later in
# this script resolve against it.
os.chdir('D:/桌面/实验/python实验/python-class/')

# Request headers that present the script as a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.69'
}

# Column order of the final movie table.
MOVIE_COLUMNS = ['电影名', '年份', '国家', '类型', '评分']

# Captures (year, countries, genres) from a line whose fields are separated
# by "\xa0/\xa0" (non-breaking spaces around a slash). Compiled once here
# instead of being rebuilt for every movie.
DETAIL_PATTERN = re.compile(r'.*(\d{4}).*\xa0/\xa0(.*)\xa0/\xa0(.*)')

# Seconds to wait for a page before giving up, so a stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 10

# Rows from every page are gathered in a plain list and turned into a
# DataFrame once at the end; this avoids repeatedly concatenating onto an
# initially empty DataFrame.
all_rows = []

# The listing is paged 25 entries at a time via the `start` query parameter.
for page in range(0, 250, 25):
    url = f'https://movie.douban.com/top250?start={page}&filter='
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Network-level failures are reported the same way as a bad status
        # code, and the remaining pages are still attempted.
        print("检索页面失败")
        continue

    # Only a 200 response is parsed.
    if response.status_code != 200:
        print("检索页面失败")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    # Each movie is described by one <div class="info"> element.
    items = soup.find_all('div', class_='info')
    movies = []
    for item in items:
        title = item.find('span', class_='title').text
        details = item.find('div', class_='bd').text.strip().split('\n')
        # The second text line is expected to hold "year / country / genre"
        # (depends on the page markup). Guard against a missing line or a
        # line that does not match, instead of crashing on `None.group`.
        matches = DETAIL_PATTERN.match(details[1]) if len(details) > 1 else None
        if matches is None:
            print(f"解析失败,已跳过: {title}")
            continue
        year, country, genre = matches.groups()
        rating = item.find('span', class_='rating_num').text
        movies.append({'电影名': title, '年份': year, '国家': country, '类型': genre, '评分': rating})
    all_rows.extend(movies)

# Build the complete table in one step; passing `columns` keeps the column
# layout even if nothing was scraped.
all_movies = pd.DataFrame(all_rows, columns=MOVIE_COLUMNS)

# Print the first 10 rows as a quick sanity check.
print(all_movies[:10])
## 电影名 年份 国家 类型 评分
## 0 肖申克的救赎 1994 美国 犯罪 剧情 9.7
## 1 霸王别姬 1993 中国大陆 中国香港 剧情 爱情 同性 9.6
## 2 阿甘正传 1994 美国 剧情 爱情 9.5
## 3 泰坦尼克号 1997 美国 墨西哥 剧情 爱情 灾难 9.5
## 4 千与千寻 2001 日本 剧情 动画 奇幻 9.4
## 5 这个杀手不太冷 1994 法国 美国 剧情 动作 犯罪 9.4
## 6 美丽人生 1997 意大利 剧情 喜剧 爱情 战争 9.5
## 7 星际穿越 2014 美国 英国 加拿大 剧情 科幻 冒险 9.4
## 8 盗梦空间 2010 美国 英国 剧情 科幻 悬疑 冒险 9.4
## 9 楚门的世界 1998 美国 剧情 科幻 9.4
将所有电影信息保存到CSV文件中
# Write the movie table to a CSV file without the index column. The
# 'utf-8-sig' codec prepends a UTF-8 byte-order mark to the file.
all_movies.to_csv('douban_top250_movies.csv', index=False, encoding='utf-8-sig')
2.绘制每个年份的电影数量的折线图
import matplotlib.pyplot as plt
# Number of movies for each release year (index: year, value: count).
movies_per_year = all_movies.groupby('年份').size()

# Open a fresh figure and work on its axes object directly.
year_axes = plt.figure().gca()

# Line chart of the yearly counts, with a round marker on every data point.
year_labels = movies_per_year.index
year_counts = movies_per_year.values
year_axes.plot(year_labels, year_counts, marker='o', linestyle='-')

# Title and axis captions.
year_axes.set_title('Number of Movies per Year')
year_axes.set_xlabel('Year')
year_axes.set_ylabel('Number of Movies')

# Rotate the x tick labels by 30 degrees and set their font family and size.
plt.xticks(rotation=30, fontproperties='Times New Roman', size=5)
## ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56], [Text(0, 0, '1931'), Text(1, 0, '1936'), Text(2, 0, '1939'), Text(3, 0, '1940'), Text(4, 0, '1950'), Text(5, 0, '1952'), Text(6, 0, '1953'), Text(7, 0, '1954'), Text(8, 0, '1957'), Text(9, 0, '1960'), Text(10, 0, '1965'), Text(11, 0, '1966'), Text(12, 0, '1968'), Text(13, 0, '1972'), Text(14, 0, '1974'), Text(15, 0, '1975'), Text(16, 0, '1978'), Text(17, 0, '1979'), Text(18, 0, '1982'), Text(19, 0, '1984'), Text(20, 0, '1985'), Text(21, 0, '1986'), Text(22, 0, '1987'), Text(23, 0, '1988'), Text(24, 0, '1989'), Text(25, 0, '1990'), Text(26, 0, '1991'), Text(27, 0, '1992'), Text(28, 0, '1993'), Text(29, 0, '1994'), Text(30, 0, '1995'), Text(31, 0, '1996'), Text(32, 0, '1997'), Text(33, 0, '1998'), Text(34, 0, '1999'), Text(35, 0, '2000'), Text(36, 0, '2001'), Text(37, 0, '2002'), Text(38, 0, '2003'), Text(39, 0, '2004'), Text(40, 0, '2005'), Text(41, 0, '2006'), Text(42, 0, '2007'), Text(43, 0, '2008'), Text(44, 0, '2009'), Text(45, 0, '2010'), Text(46, 0, '2011'), Text(47, 0, '2012'), Text(48, 0, '2013'), Text(49, 0, '2014'), Text(50, 0, '2015'), Text(51, 0, '2016'), Text(52, 0, '2017'), Text(53, 0, '2018'), Text(54, 0, '2019'), Text(55, 0, '2020'), Text(56, 0, '2021')])
# Work on the figure that is currently active.
year_figure = plt.gcf()
# Let matplotlib adjust the subplot margins automatically so tick labels and
# the title are not cut off.
year_figure.tight_layout()
# Save the figure as a PDF file, requesting 300 dpi.
year_figure.savefig('Number of Movies per Year.pdf', dpi=300)
# Display the figure.
plt.show()
3.绘制每个国家或地区的电影数量的柱状图
import matplotlib.pyplot as plt
# A movie may list several countries/regions separated by whitespace:
# split each entry, put every country on its own row, then count them.
country_lists = all_movies['国家'].str.split()
single_countries = country_lists.explode()
movies_per_country = single_countries.value_counts()

# Open a fresh figure and work on its axes object directly.
country_axes = plt.figure().gca()

# One bar per country/region; the bar height is its movie count.
country_axes.bar(movies_per_country.index, movies_per_country.values)

# Title and axis captions.
country_axes.set_title('Number of Movies per Country(Region)')
country_axes.set_xlabel('Country/Region')
country_axes.set_ylabel('Number of Movies')

# Rotate the x tick labels by 30 degrees and set their font family and size.
plt.xticks(rotation=30, fontproperties='Microsoft YaHei', size=5)
## ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [Text(0, 0, '美国'), Text(1, 0, '英国'), Text(2, 0, '日本'), Text(3, 0, '中国香港'), Text(4, 0, '中国大陆'), Text(5, 0, '法国'), Text(6, 0, '德国'), Text(7, 0, '韩国'), Text(8, 0, '中国台湾'), Text(9, 0, '加拿大'), Text(10, 0, '意大利'), Text(11, 0, '澳大利亚'), Text(12, 0, '新西兰'), Text(13, 0, '瑞士'), Text(14, 0, '印度'), Text(15, 0, '西班牙'), Text(16, 0, '瑞典'), Text(17, 0, '巴西'), Text(18, 0, '爱尔兰'), Text(19, 0, '约旦'), Text(20, 0, '匈牙利'), Text(21, 0, '泰国'), Text(22, 0, '阿根廷'), Text(23, 0, '南非'), Text(24, 0, '希腊'), Text(25, 0, '丹麦'), Text(26, 0, '奥地利'), Text(27, 0, '伊朗'), Text(28, 0, '波兰'), Text(29, 0, '墨西哥'), Text(30, 0, '卡塔尔'), Text(31, 0, '塞浦路斯'), Text(32, 0, '黎巴嫩'), Text(33, 0, '捷克')])
# Work on the figure that is currently active.
country_figure = plt.gcf()
# Let matplotlib adjust the subplot margins automatically so tick labels and
# the title are not cut off.
country_figure.tight_layout()
# Save the figure as a PDF file, requesting 300 dpi.
country_figure.savefig('Number of Movies per Country(Region).pdf', dpi=300)
# Display the figure.
plt.show()
4.绘制每种类型的电影数量占比的饼图
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Build a table with one row per (movie, genre) pair. The genre column holds
# whitespace-separated names; it is split on a copy produced by assign(), so
# `all_movies` keeps its original string column and this step can be re-run
# without the second split operating on lists.
genres_df = all_movies.assign(**{'类型': all_movies['类型'].str.split()}).explode('类型')

# Share of each genre among all (movie, genre) pairs, in percent.
genre_counts = genres_df['类型'].value_counts(normalize=True) * 100

# Genres whose share is below 1% and their combined share.
other_genres = genre_counts[genre_counts < 1]
other_percentage = other_genres.sum()

# Replace the small genres with a single combined entry. The entry is only
# added when there is something to combine, so the chart never gets an
# empty 0% wedge.
pie_data = genre_counts.drop(other_genres.index)
if not other_genres.empty:
    pie_data['其他'] = other_percentage
# Open a fresh figure and work on its axes object directly.
genre_axes = plt.figure().gca()

# Text styling shared by the wedge labels and the percentage annotations.
label_style = {'fontproperties': 'Microsoft YaHei', 'fontsize': 5}

# One wedge per entry of pie_data, labelled with its index value and
# annotated with its percentage to one decimal place.
genre_axes.pie(pie_data, labels=pie_data.index, autopct='%1.1f%%', textprops=label_style)
## ([<matplotlib.patches.Wedge object at 0x000001D9D8FB0E50>, <matplotlib.patches.Wedge object at 0x000001D9D8F9DCD0>, <matplotlib.patches.Wedge object at 0x000001D9D8F9F4D0>, <matplotlib.patches.Wedge object at 0x000001D9D8F9E5D0>, <matplotlib.patches.Wedge object at 0x000001D9D8F8A310>, <matplotlib.patches.Wedge object at 0x000001D9D8F8BC50>, <matplotlib.patches.Wedge object at 0x000001D9D8F79510>, <matplotlib.patches.Wedge object at 0x000001D9D8F7ACD0>, <matplotlib.patches.Wedge object at 0x000001D9D94F8410>, <matplotlib.patches.Wedge object at 0x000001D9D8EF3810>, <matplotlib.patches.Wedge object at 0x000001D9D8F8AFD0>, <matplotlib.patches.Wedge object at 0x000001D9D94E89D0>, <matplotlib.patches.Wedge object at 0x000001D9D8F89ED0>, <matplotlib.patches.Wedge object at 0x000001D9D94EB9D0>, <matplotlib.patches.Wedge object at 0x000001D9D94DD2D0>, <matplotlib.patches.Wedge object at 0x000001D9D94DEB50>, <matplotlib.patches.Wedge object at 0x000001D9D94D0390>, <matplotlib.patches.Wedge object at 0x000001D9D94D1AD0>, <matplotlib.patches.Wedge object at 0x000001D9D8EF2D90>], [Text(0.732583305578164, 0.8205618199673749, '剧情'), Text(-0.39900420793784797, 1.025083236643684, '爱情'), Text(-0.8369315422785263, 0.7138246237972513, '喜剧'), Text(-1.067298556940656, 0.26622131836574187, '冒险'), Text(-1.083765455520508, -0.18828817652849522, '奇幻'), Text(-0.9390729630830821, -0.5728367743139057, '犯罪'), Text(-0.6830543049427025, -0.8622278216917161, '动画'), Text(-0.3803782404384725, -1.0321397164148522, '惊悚'), Text(-0.06966579681297874, -1.0977917274029774, '动作'), Text(0.22740396730457743, -1.0762376297333869, '悬疑'), Text(0.4721385960812427, -0.9935215881350709, '科幻'), Text(0.655394303870138, -0.8834355134669295, '家庭'), Text(0.790042953635913, -0.7653967150505957, '传记'), Text(0.8893330227256006, -0.6473691178066395, '战争'), Text(0.9567504074341425, -0.5427970687785653, '古装'), Text(0.9998416878109717, -0.4586028775698072, '历史'), Text(1.0338510198084145, -0.375702101193354, '音乐'), 
Text(1.0596750159124046, -0.2951082185420204, '同性'), Text(1.092389939951236, -0.12916740724089482, '其他')], [Text(0.3995908939517258, 0.447579174527659, '26.8%'), Text(-0.21763865887518977, 0.5591363108965548, '8.2%'), Text(-0.4565081139701052, 0.3893588857075916, '7.5%'), Text(-0.5821628492403578, 0.14521162819949554, '7.2%'), Text(-0.5911447939202771, -0.10270264174281556, '6.1%'), Text(-0.5122216162271357, -0.3124564223530394, '5.9%'), Text(-0.3725750754232922, -0.4703060845591178, '5.3%'), Text(-0.20747904023916683, -0.5629852998626467, '4.8%'), Text(-0.03799952553435204, -0.5987954876743512, '4.5%'), Text(0.12403852762067859, -0.5870387071273019, '4.2%'), Text(0.25753014331704144, -0.5419208662554931, '3.3%'), Text(0.35748780211098435, -0.48187391643650695, '2.9%'), Text(0.4309325201650434, -0.4174891173003249, '2.3%'), Text(0.48509073966850935, -0.3531104278945306, '2.2%'), Text(0.5218638586004413, -0.2960711284246719, '1.4%'), Text(0.545368193351439, -0.25014702412898576, '1.3%'), Text(0.563918738077317, -0.20492841883273852, '1.3%'), Text(0.5780045541340388, -0.16096811920473839, '1.2%'), Text(0.5958490581552196, -0.07045494940412445, '3.7%')])
# Work on the axes that are currently active.
pie_axes = plt.gca()
# Chart title.
pie_axes.set_title('Percentage of Movies by Genre')
# Equal axis scaling keeps the pie circular.
pie_axes.axis('equal')
## (-1.0999999997522056, 1.0999999999882002, -1.0999999824824653, 1.0999996520819912)
# Save the currently active figure as a PDF file, requesting 300 dpi.
genre_figure = plt.gcf()
genre_figure.savefig('Percentage of Movies by Genre.pdf', dpi=300)
# Display the figure.
plt.show()