最近在学习数据分析,看到了一个小作业:
通过requests,lxml爬取华语电影的影片时长,然后画出一个直方图分析
1.先列出我用到的库:
matplotlib (画图)
numpy(数组处理)
pandas(数据处理)
requests(爬虫请求)
lxml(解析html)
logging(日志打印)
re(正则提取)
ThreadPoolExecutor(线程池)
2.程序一共分为三个文件:
3.爬虫程序库,用类实现(具体实现见注释)
# coding: utf-8
# @Author: Ruan
# coding:utf-8
import re
import requests
from lxml import etree
from urllib3 import disable_warnings
import logging
class ReadMoivesData:
    """
    Spider class for fetching Douban movie information.

    NOTE: the "moive" spelling in method/parameter names is kept as-is
    because external callers (e.g. the collection script) depend on it.
    """
    def __init__(self, headers=None, cookies=None):
        """
        Initialize the spider.

        :param headers: request headers; defaults to a simple desktop
                        Edge/Chrome User-Agent plus a Douban Referer
        :param cookies: cookies dict, default None; passing cookies from a
                        logged-in session helps avoid Douban's anti-bot block
        """
        disable_warnings()  # silence SSL warnings (requests use verify=False)
        self.cookies = cookies
        # Maximum retry count per request; the item is skipped once exceeded.
        self.max_error_num = 5
        if headers:
            self.HEADERS = headers
        else:
            self.HEADERS = {
                "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50',
                "Referer": "https://movie.douban.com/explore"
            }

    def logging_settings(self, level=10, filename='.\\logging.log', encoding='utf-8', *args, **kwargs):
        """
        Configure the logging module.

        Bug fix: the original ignored every argument and always used the
        hard-coded defaults; the parameters are now actually forwarded.

        :param level: log level, default 10 (DEBUG) so everything is emitted
        :param filename: log file path; default '.\\logging.log'
        :param encoding: log file encoding (the ``encoding`` keyword of
                         ``logging.basicConfig`` requires Python 3.9+)
        """
        logging.basicConfig(
            level=level,
            filename=filename,
            encoding=encoding,
            *args, **kwargs
        )

    def get_moives_id(self, start, addr):
        """
        Fetch movie ids via Douban's recommendation API.

        :param start: page offset — e.g. 0 fetches items 0-20; pass 20 next
        :param addr: movie region filter (this project uses "华语")
        :return: list of (movie_id, movie_title) tuples on success,
                 False when the IP is blocked or retries are exhausted
        """
        error = 0
        while True:
            try:
                url = 'https://m.douban.com/rexxar/api/v2/movie/recommend?refresh=0&start={}&count=20&selected_categories={}&uncollect=false&tags={}&ck=dSK4'.format(
                    start, f'%7B"地区":"{addr}"%7D', addr)
                res = requests.get(url, headers=self.HEADERS, verify=False, cookies=self.cookies)
                if res.status_code == 200:
                    return [(moive_datas["id"], moive_datas["title"]) for moive_datas in res.json()["items"]]
                elif '有异常请求从你的 IP 发出,请' in res.text:
                    # Douban flagged this IP; retrying without cookies is futile.
                    logging.error(f'爬取出错,IP被封,请添加Cookies!')
                    return False
                else:
                    logging.warning(f'爬取出错,当前页面开始号:{start},网站状态码:{res.status_code}')
                    error += 1
                    if error >= self.max_error_num:
                        return False
            except Exception as e:
                error += 1
                logging.warning(f'爬取出错,当前页面开始号:{start},错误次数{error},原因:{e}')
                if error >= self.max_error_num:
                    return False

    def get_moive_html(self, moive_id):
        """
        Fetch a movie's detail page; logs and retries on failure.

        :param moive_id: Douban movie id
        :return: html text on success, False when blocked or retries exhausted
        """
        error = 0
        while True:
            try:
                url = f'https://movie.douban.com/subject/{moive_id}/'
                res = requests.get(url, headers=self.HEADERS, verify=False, cookies=self.cookies)
                if res.status_code == 200:
                    return res.text
                elif '有异常请求从你的 IP 发出,请' in res.text:
                    logging.error(f'爬取出错,IP被封,请添加Cookies!')
                    return False
                else:
                    logging.warning(f'爬取出错,当前Moive_Id:{moive_id},网站状态码:{res.status_code}')
                    error += 1
                    if error >= self.max_error_num:
                        return False
            except Exception as e:
                error += 1
                logging.warning(f'爬取出错,当前Moive_Id:{moive_id},错误次数{error},原因:{e}')
                if error >= self.max_error_num:
                    return False

    @classmethod
    def parse_moive_data(cls, html_str):
        """
        Parse one movie's detail page html.

        ``get_moives_data`` below combines ``get_moive_html`` and this method,
        so callers can go straight from an id to parsed values.

        :param html_str: html source of a movie detail page
        :return: tuple (name, genre, runtime_minutes, average_rating,
                 rating_count), or False when any field is missing
        """
        try:
            ele = etree.HTML(html_str)
            movie_name = ele.xpath(r'//*[@id="content"]/h1/span[1]/text()')[0]
            moive_types = ele.xpath(r'.//span[@property="v:genre"]/text()')[0]
            moive_time = int(
                re.search(r'\d+', ele.xpath(r'//*[@id="info"]/span[@property="v:runtime"]/text()')[0]).group())
            moive_avg_grade = float(
                ele.xpath(r'//*[@id="interest_sectl"]//strong[@property="v:average"]/text()')[0].strip())
            moive_grade_num = int(ele.xpath(
                r'//*[@id="interest_sectl"]/div//div[@class="rating_self clearfix"]/div[@class="rating_right "]/div[@class="rating_sum"]/a/span/text()')[
                0])
            return (movie_name, moive_types, moive_time, moive_avg_grade, moive_grade_num)
        except Exception:
            # Narrowed from a bare except; a page missing any field (e.g. no
            # rating yet) lands here and is reported as a failed id.
            return False

    def get_moives_data(self, moives_id):
        """
        Fetch and parse movie data by id; accepts one id or a list of ids.

        :param moives_id: a single id (int/str) or an iterable of ids
        :return: dict {"datas": [(name, genre, runtime, rating, count), ...],
                 "bad_movies_id": [ids that failed to fetch or parse]}
        """
        datas = []
        bad_movies_id = []
        if isinstance(moives_id, (int, str)):
            moives_id = [moives_id, ]
        for movie_id in moives_id:
            html_ = self.get_moive_html(movie_id)
            if html_:
                data = self.parse_moive_data(html_)
                # Statement form instead of the original conditional-expression
                # used purely for its side effect.
                if data:
                    datas.append(data)
                else:
                    bad_movies_id.append(movie_id)
            else:
                bad_movies_id.append(movie_id)
        return {"datas": datas, "bad_movies_id": bad_movies_id}
if __name__ == '__main__':
    # NOTE(security): a real session cookie is hard-coded below — rotate or
    # remove it before publishing/sharing this script.
    cookies = \
        {
            "Cookie": 'll="11237"; bid=thh1-98Ocxk; __gads=ID=d2e0d94df32fa413-22fa0087f4d600ce:T=1665455276:RT=1665455276:S=ALNI_MaHXZUiV7LL5HrqjdAPkXtfpJu3Yg; _vwo_uuid_v2=D82CFC1A2611E97431321E4389D921C85|ddf0a270d2aac49a3f718ce7fa035332; douban-fav-remind=1; Hm_lvt_16a14f3002af32bf3a75dfe352478639=1665455277,1667806451; __yadk_uid=5HsdwHYd5OG6RgF2S3z0sLlcIV97BAFu; __utmc=30149280; __utmc=223695111; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1677560938%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.865805491.1665455273.1677159210.1677560938.13; __utmb=30149280.0.10.1677560938; __utmz=30149280.1677560938.13.7.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.1009194368.1665455273.1677159210.1677560938.12; __utmb=223695111.0.10.1677560938; __utmz=223695111.1677560938.12.6.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __gpi=UID=00000a2bbcd85db7:T=1665455276:RT=1677560963:S=ALNI_Mb3g8I1x_0t32XdxtelpZdtxmr15A; dbcl2="179289673:hH3bKCGNZ7w"; ck=dSK4; frodotk_db="048218eaecdd8fdc0d864ac0c698af94"; push_noty_num=0; push_doumail_num=0; _pk_id.100001.4cf6=a61b0a7d7f6abeaf.1665455273.12.1677564849.1677159210.'
        }
    Spider = ReadMoivesData(cookies=cookies)
    # Smoke test: fetch the first page (offset 0) of Chinese-language movies.
    # (Removed the unused `moive_id = 35914259` local from the original.)
    print(Spider.get_moives_id(0, "华语"))
4.获取数据并通过pandas写入到csv/excel
# coding: utf-8
# @Author: Ruan
# coding:utf-8
import re
import time
import numpy as np # 导入numpy处理数组
import pandas as pd # 导入pd,用于处理数据
from concurrent.futures import ThreadPoolExecutor # 导入线程池模块
from DouBanSpider import ReadMoivesData
def write_moives_id_to_csv(start, end, csv_path,):
    """
    Crawl movie (id, title) pairs and write them to a CSV file, one per line.

    Bug fixes vs. original:
    * ``get_moives_id`` returns False when the IP is blocked or retries are
      exhausted; the original then crashed on ``movies_ids += False``.
      We now stop early and still write what was collected.
    * The CSV is written without pandas' index column and numeric header, so
      each line is ``id,title`` — the first run of digits on a line is the
      movie id, which is exactly what ``read_loc_movies_id`` extracts.
      (Previously the index column made it extract the row index instead.)

    :param start: first offset, e.g. 0
    :param end: end offset (exclusive); start=0, end=2000 crawls 2000 items
    :param csv_path: output CSV file path
    :return: True after the file is written
    """
    Spider = ReadMoivesData()
    movies_ids = []
    for start_num in np.arange(start, end, 20):
        page = Spider.get_moives_id(start=start_num, addr='华语')
        if not page:  # False => blocked or retries exhausted; stop early
            print(f'{start_num}爬取失败,提前停止!')
            break
        movies_ids += page
        print(f'{start_num}爬取完成!')
    pd.DataFrame(movies_ids).to_csv(csv_path, index=False, header=False)
    print(f'Movies_id写入完成,路径:{csv_path}')
    return True
def read_loc_movies_id(file_path):
    """
    Read movie ids back from a local CSV file.

    The first run of digits on each line is taken as the movie id. Lines
    containing no digits (header rows, blank lines) are skipped — the
    original called ``.group()`` on ``re.search``'s None result and raised
    AttributeError on such lines.

    :param file_path: path of the CSV holding movie ids
    :return: list of movie-id strings (titles are discarded)
    """
    movies_id = []
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            match = re.search(r'\d+', line.strip())
            if match:  # skip digit-free header/blank lines
                movies_id.append(match.group())
    return movies_id
if __name__ == '__main__':
    # Cookies from a logged-in session, to avoid the anti-crawl block.
    # NOTE(security): real session cookie hard-coded — rotate before sharing.
    COOKIES = \
        {
            "Cookie": 'll="118237"; bid=th1-98Ocxk; __gads=ID=d2e0d94df32fa413-22fa0087f4d600ce:T=1665455276:RT=1665455276:S=ALNI_MaHXZUiV7LL5HrqjdAPkXtfpJu3Yg; _vwo_uuid_v2=D82CFC1A2611E97431321E4389D921C85|ddf0a270d2aac49a3f718ce7fa035332; douban-fav-remind=1; Hm_lvt_16a14f3002af32bf3a75dfe352478639=1665455277,1667806451; __yadk_uid=5HsdwHYd5OG6RgF2S3z0sLlcIV97BAFu; __utmc=30149280; __utmc=223695111; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1677560938%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.865805491.1665455273.1677159210.1677560938.13; __utmb=30149280.0.10.1677560938; __utmz=30149280.1677560938.13.7.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.1009194368.1665455273.1677159210.1677560938.12; __utmb=223695111.0.10.1677560938; __utmz=223695111.1677560938.12.6.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __gpi=UID=00000a2bbcd85db7:T=1665455276:RT=1677560963:S=ALNI_Mb3g8I1x_0t32XdxtelpZdtxmr15A; dbcl2="179289673:hH3bKCGNZ7w"; ck=dSK4; frodotk_db="048218eaecdd8fdc0d864ac0c698af94"; push_noty_num=0; push_doumail_num=0; _pk_id.100001.4cf6=a61b0a7d7f6abeaf.1665455273.12.1677564849.1677159210.'
        }
    MOVIES_ID_FILE_PATH = './/movies_ids.csv'
    SUC_FILE_PATH = './/movies_data_suc.csv'
    BAD_FILE_PATH = './/movies_data_bad.csv'
    SUC_DATA = []  # successfully parsed movie rows
    BAD_DATA = []  # movie ids that failed to fetch or parse
    TIMESLEEP = 1  # delay between submit batches, to avoid getting banned
    MAX_WORKERS = 10  # thread-pool size
    # movies_ids.csv is assumed to exist already; if not, call
    # write_moives_id_to_csv first to create it.
    movies_id = read_loc_movies_id(MOVIES_ID_FILE_PATH)  # load local movie ids
    Spider = ReadMoivesData(cookies=COOKIES)  # spider instance
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as Pool:  # thread pool
        threads = []  # submitted futures, in submission order
        num = 0
        for movie_id in movies_id:
            # Submit one fetch-and-parse task per movie id.
            threads.append(Pool.submit(Spider.get_moives_data, movie_id))
            num += 1
            print(f'第{num}条数据提交中!!!')
            if num % 10 == 0:  # throttle: pause after every 10 submissions
                time.sleep(TIMESLEEP)
        # enumerate replaces the original O(n) threads.index(thread) lookup,
        # and result() is fetched once instead of twice per future.
        for idx, thread in enumerate(threads, start=1):
            result = thread.result()
            suc_data = result["datas"]
            bad_data = result["bad_movies_id"]
            SUC_DATA.extend(suc_data)
            BAD_DATA.extend(bad_data)
            print(f'第{idx}条数据完成,成功数据:{suc_data},失败数据:{bad_data}')
    # Write once with the proper column names.  The original wrote a numeric
    # "0,1,2,3,4" header row, then re-read the file with header=None — which
    # turned that header row into a bogus first data row.
    pd.DataFrame(SUC_DATA, columns=['名称', '类型', '时长', '评分', '评价数量']).to_csv(SUC_FILE_PATH, index=False)
    pd.DataFrame(BAD_DATA, columns=['MOVIE_ID']).to_csv(BAD_FILE_PATH, index=False)
5.分析数据并画图
我这里分析的是华语电影的电影时长;数据里还有评分、评价数量等字段,也可以自行分析
不一定非要画直方图,也可以借此数据练习绘制其他类型的统计图
# coding: utf-8
# @Author: Ruan
# coding:utf-8
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import rc
rc('font', family="MicroSoft YaHei")  # font capable of rendering Chinese labels
csv_path = 'movies_data_suc.csv'  # CSV produced by the crawler script
datas = np.array(pd.read_csv(csv_path))  # load all rows as a 2-D array
movies_time = datas[:, 2]  # runtime column as a 1-D array (replaces [:, [2]] + reshape)
# Sturges' rule K = 1 + log2(n) suggests a bin count; kept for reference only,
# since a fixed 5-minute bin width is used below.  (The original computed it
# into `bins` and then immediately overwrote it; an unused `group_space` was
# also removed.)
suggested_bins = int(np.around(1 + np.log(len(datas)) / np.log(2)))
bins = 5  # bin width in minutes, also used as the x-tick step
plt.figure(figsize=(20, 8), dpi=100)
# Explicit bin edges: one edge every `bins` minutes, padded one bin each side.
x_ticks = range(np.min(movies_time) - bins, np.max(movies_time) + bins, bins)
plt.xticks(x_ticks)
plt.hist(movies_time, bins=x_ticks)  # a sequence for `bins` is used as the edges
# density=True would instead draw a frequency-density histogram
for x in x_ticks:  # annotate each bar with its count
    y = ((movies_time < x + bins) & (movies_time >= x)).sum()
    # count of runtimes falling in [x, x + bins)
    if y:  # skip empty bins
        plt.text(x + bins / 2, y + 1, s=y)
        plt.scatter(x + bins / 2, y + 1)  # dot marker above the bar
plt.title("华语电影时长分布直方图", fontdict={"size": 15})  # chart title
plt.savefig('.//华语电影时长分布直方图')  # save the figure to disk
plt.show()  # display the figure