数据的抓取:将豆瓣电视剧数据抓取后存入 MongoDB,再用 pandas 读取分析:
# coding=utf-8
import requests,pymongo,time
import json
class DoubanSpider:
    """Scrape Douban's mobile TV-collection API per country and store items in MongoDB."""

    def __init__(self):
        # NOTE(review): 'douabn' looks like a typo for 'douban', but the export
        # script below reads the same database name — kept as is.
        client = pymongo.MongoClient('localhost', port=27017)
        self.collection = client['douabn']['tv']
        # One paginated endpoint per country; "{}" receives the `start` offset
        # (pages of 18 items).
        self.url_temp_list = [
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?start={}&count=18&loc_id=108288",
             "country": "american"},
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_english_hot/items?start={}&count=18&loc_id=108288",
             "country": "english"},
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?start={}&count=18&loc_id=108288",
             "country": "china"},
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_korean_drama_hot/items?start={}&count=18&loc_id=108288",
             "country": "korean"},
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_hongkong_hot/items?start={}&count=18&loc_id=108288&_=0",
             "country": "hongkong"},
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_japanese_hot/items?start={}&count=18&loc_id=108288&_=0",
             "country": "japanese"},
        ]
        # Mobile UA + Referer — presumably needed so the mobile API accepts
        # the request; TODO confirm against current API behavior.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36",
            "Referer": "https://m.douban.com/tv/"}

    def parse_url(self, url):
        """GET `url` with the mobile headers and return the decoded body (JSON text)."""
        print(url)
        # timeout added: the original call could hang forever on a dead connection
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content.decode()

    def get_content_list(self, json_str, country):
        """Parse one API page, enrich each item with tag/date, insert into MongoDB.

        Returns (results, total): the list of stored dicts and the API's
        reported total item count (used by run() for pagination).
        """
        dict_ret = json.loads(json_str)
        content_list = dict_ret["subject_collection_items"]
        total = dict_ret["total"]
        results = []
        for result in content_list:
            result["country"] = country
            # "info" looks like "<genre> / <region> / ..."; the 2nd segment is the tag.
            parts = [p.strip() for p in result['info'].split("/")]
            # Guard: the original indexed parts[1] unconditionally and could
            # raise IndexError on a single-segment info string.
            result['tag'] = parts[1] if len(parts) > 1 else parts[0]
            try:
                release_date = result['release_date']
                # dates arrive as 'MM.DD'; fall back to '06-20' when absent
                release_date = release_date.replace('.', '-') if release_date is not None else '06-20'
                data = result['year'] + '-' + release_date
            except Exception as e:
                # Original re-assigned a possibly-undefined `data` here, which
                # raised NameError on the first item and silently reused the
                # previous item's date otherwise; use an empty marker instead.
                print(e)
                data = ''
            result['data'] = data
            info = {
                'title': result['title'],
                "actors": '-'.join(result["actors"]),
                'tag_list': result['tag'],
                'value': result['rating']['value'],
                'county': result["country"],  # key spelled 'county' on purpose: the analysis CSV reads it
                'data': result['data'],
            }
            results.append(info)
            # insert_one() replaces the long-deprecated Collection.insert()
            self.collection.insert_one(info)
        print(results)
        return results, total

    def save_content_list(self, file):
        """Append one page of results to douban.txt as a single JSON line."""
        with open("douban.txt", "a", encoding="utf-8") as f:
            f.write(json.dumps(file, ensure_ascii=False))
            f.write("\n")
        print("保存成功")

    def run(self):
        """Crawl every country endpoint page by page until the reported total is reached."""
        for url_temp in self.url_temp_list:
            num = 0
            total = 100  # assume at least one page; the real total comes from the first response
            while num < total + 18:
                # 1. build the page URL with the current start offset
                url = url_temp['url_temp'].format(num)
                # 2. fetch the page
                json_str = self.parse_url(url)
                # 3. parse and store; pick up the real `total`
                file, total = self.get_content_list(json_str, url_temp['country'])
                # 4. optional file backup (disabled in the original):
                # self.save_content_list(file)
                # 5. advance to the next page
                num += 18


if __name__ == '__main__':
    douban_spider = DoubanSpider()
    douban_spider.run()
数据的展示:从 MongoDB 导出 CSV 后,用 pandas 与 matplotlib 进行可视化分析:
import pandas as pd
from pymongo import MongoClient
from matplotlib import pyplot as plt
import numpy as np
#读取mongodb数据并导出csv
# client=MongoClient()
# collection=client['douabn']['tv']
# data=pd.DataFrame(list(collection.find()))
# del data["_id"]
# #data=data['value'] #只展示一列数据
# data.to_csv("douban_tv.csv",encoding='utf-8') #导出CSV文件
# print(data)
file_path = "data/douban_tv.csv"
df = pd.read_csv(file_path)
df = df.dropna(axis=0)  # discard rows with any missing field
# --- 1. Average rating per country ----------------------------------------
# NOTE(review): the column is spelled 'county' in the exported CSV — keep as is.
grouped_rating = df[["county", "value"]].groupby("county").mean()
print(grouped_rating)
bar_heights = grouped_rating["value"]
bar_positions = np.arange(len(grouped_rating.index))
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(bar_positions, bar_heights, width=0.5, align="center")
plt.xticks(bar_positions, grouped_rating.index)
plt.xlabel("国家")
plt.ylabel("平均分")
plt.title("豆瓣电视剧平均分统计")
plt.show()
# --- 2. Number of shows per genre tag --------------------------------------
tag_rows = df["tag_list"].str.split(" ").tolist()
all_tags = list({tag for row in tag_rows for tag in row})
# one-hot matrix: one row per show, one column per tag
onehot = pd.DataFrame(np.zeros((df.shape[0], len(all_tags))), columns=all_tags)
# mark a 1 in every tag column the show belongs to
for row_idx, tags in enumerate(tag_rows):
    onehot.loc[row_idx, tags] = 1
# column sums = number of shows carrying each tag, largest first
genre_count = onehot.sum(axis=0).sort_values(ascending=False)
print(genre_count)
positions = range(len(genre_count.index))
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(positions, genre_count.values)
plt.xticks(positions, genre_count.index)
plt.xlabel("分类")
plt.ylabel("数量")
plt.title("不同分类电视剧的数量统计")
plt.show()
# --- 3. Scatter: when shows rated >= 7 were released -----------------------
# .copy() added: assigning the timeStamp column on a boolean-mask slice
# triggers pandas' SettingWithCopyWarning without it.
df = df[df["value"] >= 7].copy()
df["timeStamp"] = pd.to_datetime(df["data"])  # parse 'YYYY-MM-DD' strings
df.set_index('timeStamp', inplace=True)
print(df.head())
# resample into 5-month buckets and count shows per bucket
count_by_month = df.resample('5M').count()['value']
print(count_by_month.sort_values(ascending=False))
_x = count_by_month.index
_y = count_by_month.values
xticklables = [i.strftime("%Y-%m") for i in _x]
plt.figure(figsize=(20, 8), dpi=80)
plt.scatter(_x, _y)
# thin out the x labels (every 6th) so they stay readable
plt.xticks(_x[::6], xticklables[::6], rotation=45)
plt.xlabel("时间")
plt.ylabel("时间段内的数量合计")
plt.title("7分以上的电视剧时间的分布散点图")
plt.show()
# --- 4. Shows rated >= 7 per country over time -----------------------------
# NOTE(review): df is already filtered to value >= 7 and indexed by timeStamp
# in section 3. The original re-filtered into `df_county` but never used it
# (the groupby below ran on df), and redundantly re-parsed/re-set the same
# index — both removed; the plotted data is unchanged.
plt.figure(figsize=(20, 8), dpi=80)
for group_name, group_data in df.groupby(by='county'):
    # resample each country into 4-month buckets and count shows per bucket
    county_by_month = group_data.resample('4M').count()['title']
    _x = county_by_month.index
    _y = county_by_month.values
    plt.plot(range(len(_x)), _y, label=group_name)
plt.legend(loc="best")
# tick labels come from the LAST group's index (original behavior kept);
# thinned to every 4th so they stay readable
xticklables = [i.strftime("%Y%m%d") for i in _x]
plt.xticks(range(0, len(_x), 4), xticklables[::4], rotation=45)
plt.xlabel("时间")
plt.ylabel("时间段内的数量合计")
plt.title("不同国家7分以上的电视剧随时间的变化情况")
plt.show()