# 数据处理实战

### 911问题

import pandas as pd
import numpy as np
# Count 911 calls per category. Each title looks like "<category>: <detail>",
# so the text before the first ": " is the category.
# NOTE(review): `df` is not created in this snippet; it is assumed to be the
# 911 DataFrame loaded earlier (TODO confirm the loading step).
temp_list = df["title"].str.split(": ").tolist()
# Category of every row, kept in row order (reused by method two below).
row_categories = [parts[0] for parts in temp_list]
# set() removes duplicates, leaving the distinct categories.
cate_list = list(set(row_categories))
print(cate_list)

"""方法一"""
# Method one: build an all-zero indicator table with one column per category,
# set 1 where the title mentions the category, then sum each column.
zeros_df = pd.DataFrame(
    np.zeros((df.shape[0], len(cate_list))),
    columns=cate_list,
    index=df.index,
)
for cate in cate_list:
    # .loc writes into zeros_df itself; chained indexing
    # (zeros_df[cate][mask] = 1) may assign to a temporary copy.
    zeros_df.loc[df["title"].str.contains(cate), cate] = 1
sum_ret = zeros_df.sum(axis=0)
print(sum_ret)
"""方法一结束"""

"""方法二"""
# Method two: store each row's category in a new column and group by it.
# The column needs one value per row, so it must come from row_categories,
# not from the de-duplicated cate_list.
df["cate"] = row_categories
print(df.groupby(by="cate").count()["title"])
"""方法二结束"""

""""""
# This snippet plots, so pyplot has to be in scope.
import matplotlib.pyplot as plt

# Convert the timestamp column to datetimes and make it the index;
# resample() needs a datetime-like index.
df["timeStamp"] = pd.to_datetime(df["timeStamp"])
df.set_index("timeStamp", inplace=True)
# Count the calls in each month. count() must be *called*; selecting one
# column gives a single series instead of one count per column.
count_by_month = df.resample("M").count()["title"]
print(count_by_month)
# Year-month labels keep the x axis readable (no time-of-day part).
_x = [stamp.strftime("%Y-%m") for stamp in count_by_month.index]
_y = count_by_month.values
# figsize must be a (width, height) tuple.
plt.figure(figsize=(20, 8), dpi=80)
plt.plot(range(len(_x)), _y)
plt.xticks(range(len(_x)), _x, rotation=45)
plt.show()

### 时间序列问题

pd.date_range(start=None, end=None, periods=None, freq='D')
#start和end以及freq配合能够生成start和end范围内以频率freq的一组时间索引
#start和periods以及freq配合能够生成从start开始的频率为freq的periods个时间索引

### pandas重采样

重采样是指将时间序列从一个频率转换为另一个频率的过程:高频转低频称为降采样,低频转高频称为升采样。pandas 提供了 `resample` 方法来帮助我们实现这种频率转换。

### 实例——爬取豆瓣网站电影信息并进行处理

1. 爬取数据
# __init__
# coding=utf-8
#douban_spider
# coding=utf-8
from spider.parse import parse_url
import json
from spider.save import mongo_client

class DoubanSpider:
    """Crawl Douban's mobile "hot TV" collections page by page and store the items."""

    def __init__(self):
        # URL template: first slot is the collection name, second is the
        # start offset. The page size (count=50) is fixed in the template.
        self.url_temp = "https://m.douban.com/rexxar/api/v2/subject_collection/{}/items?os=ios&for_mobile=1&start={}&count=50&_=0"
        # One entry per category. "total" is filled in from the first
        # response; every entry uses the same key so get_content_list can
        # read and write it uniformly.
        self.start_urls_temp = [
            {
                "tv_category": "chinese",
                "tv_url_parameter": "filter_tv_domestic_hot",
                "total": None,
            },
            {
                "tv_category": "american",
                "tv_url_parameter": "filter_tv_american_hot",
                "total": None,
            },
            {
                "tv_category": "english",
                "tv_url_parameter": "filter_tv_english_hot",
                "total": None,
            },
            {
                "tv_category": "korean",
                "tv_url_parameter": "filter_tv_korean_drama_hot",
                "total": None,
            },
            {
                "tv_category": "japanese",
                "tv_url_parameter": "filter_tv_japanese_hot",
                "total": None,
            },
        ]

    def get_start_urls(self):
        """Return one item dict per category with its first page URL in ``parse_url``.

        The templates are copied so that a run does not leave state (such as
        a discovered ``total``) behind in ``self.start_urls_temp``.
        """
        items = []
        for template in self.start_urls_temp:
            item = dict(template)
            item["parse_url"] = self.url_temp.format(item["tv_url_parameter"], 0)
            items.append(item)
        return items

    def get_content_list(self, html_str, item):
        """Extract the records of one page and work out the next page URL.

        :param html_str: JSON text of one response, or None when the download failed
        :param item: category dict; its ``total`` is filled in on first use
        :return: ``(content_list, next_page_url)``; ``next_page_url`` is None
            when there is nothing more to fetch
        """
        if html_str is None:
            # The download failed; stop paging this category instead of
            # crashing inside json.loads.
            return [], None
        data = json.loads(html_str)
        if item.get("total") is None:
            item["total"] = data["total"]
        content_list = []
        for item_temp in data["subject_collection_items"]:
            print(item_temp)
            # Attach the category information to every record.
            item_temp.update(item)
            content_list.append(item_temp)
        # data["start"] is the offset this page was requested with; only ask
        # for another page if its offset is still below the total.
        next_page_start = data["start"] + 50
        if next_page_start < item["total"]:
            next_page_url = self.url_temp.format(item["tv_url_parameter"], next_page_start)
        else:
            next_page_url = None
        return content_list, next_page_url

    def run(self):
        """Main loop: walk every category page by page and save each page."""
        items = self.get_start_urls()
        print(items)
        for item in items:
            next_page_url = item["parse_url"]
            while next_page_url is not None:
                html_str = parse_url(next_page_url)
                content_list, next_page_url = self.get_content_list(html_str, item)
                mongo_client.save_to_db(content_list)


if __name__ == '__main__':
    douban_spider = DoubanSpider()
    douban_spider.run()
#parse
# coding=utf-8
import time
from retrying import retry
import requests

@retry(stop_max_attempt_number=3)
def _parse_url(url):
    """Download ``url`` and return the decoded response body.

    Any exception raised here makes ``retry`` try again, up to three
    attempts in total; the last exception then propagates to the caller.

    :param url: address to request
    :return: response body decoded to ``str``
    :raises requests.HTTPError: when the status code is not 200
    """
    # The timeout keeps a stalled connection from blocking the crawl forever.
    response = requests.get(url, timeout=5)
    # Raise explicitly instead of using assert, which is stripped under -O.
    if response.status_code != 200:
        raise requests.HTTPError(
            "unexpected status code {} for {}".format(response.status_code, url)
        )
    return response.content.decode()

def parse_url(url):
    """Send the request for ``url`` and return the response text.

    Waits 0.4 seconds before each download. If the download raises, the
    error is printed and None is returned.
    """
    print("now parseing", url)
    try:
        time.sleep(0.4)
        return _parse_url(url)
    except Exception as err:
        print(err)
        return None
#save
# coding=utf-8
from pymongo import MongoClient
from config import MONGO_PORT,MONGO_HOST,MONGO_DB,MONGO_COLLECTION

class SaveClient:
    """Store crawled records in the configured MongoDB collection."""

    def __init__(self):
        client = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
        self.collection = client[MONGO_DB][MONGO_COLLECTION]

    def save_to_db(self, content_list):
        """Insert one record (dict) or several records (list of dicts).

        :param content_list: a list of documents or a single document
        """
        if isinstance(content_list, list):
            # insert_many rejects an empty list, and the last page of a
            # crawl can legitimately be empty.
            if content_list:
                self.collection.insert_many(content_list)
        elif isinstance(content_list, dict):
            # Insert the argument itself; the loop variable `content` only
            # exists in the list branch.
            self.collection.insert_one(content_list)
        print("save suceesss")

# One SaveClient is created when this module is imported and exposed under
# both names; the spider imports it as `mongo_client`.
mongo_client = _mongo_client = SaveClient()

2. 对数据进行处理
#data_format
# coding=utf-8
from pymongo import MongoClient
from config import MONGO_PORT,MONGO_HOST,MONGO_DB,MONGO_COLLECTION
import re

def _split_info(info, people):
    """Split an ``info`` string into ``(release_date, tags)``.

    ``info`` looks like "刘进/张嘉译/秦海璐/何冰/剧情/历史/2017-04-16(中国大陆)":
    names, then category tags, then an optional date, separated by "/".

    :param info: raw info string of one record
    :param people: director and actor names, which must not count as tags
    :return: ``(release_date or None, list of tags)``
    """
    # Drop everything from the first "(" onwards (the part after the date).
    temp_info = info.split("(")[0]
    # The date is taken as the text from the first digit to the end.
    found = re.findall(r"\d+.*", temp_info)
    if not found:
        release_date = None
        parts = temp_info.split("/")
    else:
        release_date = found[0]
        if "/" in release_date:
            release_date = release_date.split("/")[-1]
        # The last "/"-separated piece is the date, not a tag.
        parts = temp_info.split("/")[:-1]
    # Tags are the pieces of one or two characters that are not a person.
    stripped = (part.strip() for part in parts)
    tags = [part for part in stripped if 0 < len(part) <= 2 and part not in people]
    return release_date, tags


def choose_data(documents=None):
    """Extract the useful fields from the crawled records.

    :param documents: iterable of raw records; when None (the default) all
        records are read from the configured MongoDB collection
    :return: list of dicts with keys country, title, directors, actors,
        release_date, tag, rating_count and rating_value
    """
    if documents is None:
        client = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
        try:
            # Read everything before closing so the cursor is not used
            # after the connection is gone.
            documents = list(client[MONGO_DB][MONGO_COLLECTION].find())
        finally:
            client.close()

    data_list = []
    for data in documents:
        item = {}
        # Country
        item["country"] = data["tv_category"]
        # Title of the TV series
        item["title"] = data["title"]
        # Directors
        item["directors"] = "_".join(data["directors"])
        # Actors
        item["actors"] = "_".join(data["actors"])
        # Build a new collection; extending data["directors"] in place would
        # silently add the actors to the source record.
        people = set(data["directors"]) | set(data["actors"])
        release_date, tag_list = _split_info(data["info"], people)
        # Release date (None when the info string has no digits)
        item["release_date"] = release_date
        # Categories, i.e. tags
        item["tag"] = "_".join(tag_list)
        # NOTE(review): a record without a rating is assumed to carry
        # rating=None; such records get None for both fields. TODO confirm.
        rating = data.get("rating") or {}
        # Number of people who rated
        item["rating_count"] = rating.get("count")
        # Score
        item["rating_value"] = rating.get("value")
        data_list.append(item)
    return data_list


if __name__ == '__main__':
    data_list = choose_data()
    for i in data_list:
        print(i["tag"], "***", i["release_date"])
    print(data_list)
#show_data
# coding=utf-8
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from data_visualization.data_format import choose_data
import matplotlib.font_manager as fm

'''

1.四个国家电视剧的平均分
2.8分以上的不同分类电视剧的数量统计
2.1 8分以上的不同分类电视剧的用户评价次数
5.不同国家7分以上的电视剧随时间的变化情况
'''

# Font passed as `fontproperties` to the Chinese axis labels and titles below.
# NOTE(review): the path looks like a macOS system font location; confirm it
# exists on the machine that runs this.
myfont = fm.FontProperties(fname='/System/Library/Fonts/PingFang.ttc')

def get_data_frame():
    """Load the records from ``choose_data()`` into a DataFrame.

    The ``release_date`` column is converted to datetimes before returning.
    """
    frame = pd.DataFrame(choose_data())
    parsed_dates = pd.to_datetime(frame["release_date"])
    frame["release_date"] = parsed_dates
    return frame

def plot_four_country_ave_rating_value():
    """Draw a bar chart of the average rating per country and show it."""
    df = get_data_frame()
    df_country_rating = df[["country", "rating_value"]]
    # Group by country and take the mean rating of each group.
    grouped_rating = df_country_rating.groupby("country").mean()
    print(type(grouped_rating))
    y = grouped_rating["rating_value"]
    x = np.arange(len(grouped_rating.index))

    _, ax = plt.subplots()
    ax.bar(x, y, width=0.5, align="center")
    # One tick per country, labelled with the country name.
    plt.xticks(x, grouped_rating.index)
    # x-axis label
    plt.xlabel("国家", fontproperties=myfont)
    # y-axis label
    plt.ylabel("平均分", fontproperties=myfont)
    # Chart title
    plt.title("豆瓣电视剧平均分统计", fontproperties=myfont)
    plt.show()


if __name__ == '__main__':
    # show_tag_count is not defined in this module, so calling it here
    # raised NameError; run this module's own plot instead.
    plot_four_country_ave_rating_value()
#tag_count
# coding=utf-8
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from data_visualization.show_data import get_data_frame
import matplotlib.font_manager as fm
'''

'''

# Font passed as `fontproperties` to the Chinese axis labels and title below.
# NOTE(review): the path looks like a macOS system font location; confirm it
# exists on the machine that runs this.
myfont = fm.FontProperties(fname='/System/Library/Fonts/PingFang.ttc')
def show_tag_count():
    """Count how many TV series carry each tag and draw a horizontal bar chart.

    The chart is saved as "不同分类电视剧的数量统计.jpg" and then shown.
    """
    df = get_data_frame()
    df_tag = df[["country", "rating_value", "tag"]]

    # Each "tag" value is a "_"-joined string. get_dummies splits it and
    # returns one 0/1 column per distinct tag, replacing the hand-built
    # zero matrix that was filled through the removed `.ix` indexer.
    dummies = df_tag["tag"].str.get_dummies(sep="_")
    # Rows without tags must not produce an empty-named column; tolerate
    # the column being absent.
    dummies = dummies.drop(columns="", errors="ignore")
    df_new = df_tag.join(dummies)
    print(df_new.columns)

    # Take the tag names from the dummy columns themselves rather than from
    # a positional slice of df_new, which skipped the first tag.
    tag_count = [[tag, dummies[tag].sum()] for tag in dummies.columns]
    # Sort so the bars appear in order of size.
    tag_count.sort(key=lambda x: x[1], reverse=True)

    _, ax = plt.subplots(figsize=(10, 8))
    # Horizontal bars: one per tag.
    ax.barh(range(len(tag_count)), [i[1] for i in tag_count], align="center", color='#EE7600', ecolor='black')
    plt.yticks(range(len(tag_count)), [i[0] for i in tag_count], fontproperties=myfont)

    # y-axis label
    plt.ylabel("分类", fontproperties=myfont)
    # x-axis label
    plt.xlabel("数量", fontproperties=myfont)
    # Chart title
    plt.title("不同分类电视剧的数量统计", fontproperties=myfont)
    plt.savefig("不同分类电视剧的数量统计.jpg")
    plt.show()


if __name__ == '__main__':
    show_tag_count()
# coding=utf-8
#tv_date_distribute
# coding=utf-8
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# from data_visualization.data_format import choose_data
from data_visualization.show_data import get_data_frame
import matplotlib.font_manager as fm
# Font passed as `fontproperties` to the Chinese axis labels and title below.
# NOTE(review): the path looks like a macOS system font location; confirm it
# exists on the machine that runs this.
myfont = fm.FontProperties(fname='/System/Library/Fonts/PingFang.ttc')

'''

'''

def _show_tv_date_distribute(rate):
    """Return the number of series rated at least ``rate`` per 5-month bin.

    The result is a Series named "count", indexed by the resampled
    release date.
    """
    frame = get_data_frame()
    # Give every row a count of 1 so that summing a group counts its rows.
    ones = pd.DataFrame(np.ones(shape=(len(frame), 1)), columns=["count"])
    frame = frame.join(ones)
    # Keep only the series that have a release date ...
    dated = frame[pd.notnull(frame["release_date"])]
    # ... and a rating of at least `rate`.
    rated = dated[dated["rating_value"] >= rate]
    # Index by date and keep just the count column.
    counts = rated.set_index("release_date")["count"]
    # Re-bin the dates into 5-month periods and total each bin.
    return counts.resample("5M").sum()

def show_tv_date_distribute(rate=7):
    """Scatter-plot how many series rated at least ``rate`` fall in each time bin.

    The chart is saved as a PNG named after its title and then shown.

    :param rate: minimum rating a series needs to be counted (default 7)
    """
    counts = _show_tv_date_distribute(rate)
    _x = range(len(counts.index))

    _, ax = plt.subplots(figsize=(16, 8))
    ax.scatter(_x, counts, c="green", alpha=0.7, edgecolors='none')

    # Show only year-month on the tick labels (no time-of-day part).
    xticklables = [i.strftime('%Y-%m') for i in counts.index]
    # Label every sixth tick so the axis is not overcrowded.
    plt.xticks(range(0, len(counts.index), 6), xticklables[::6], rotation=45)
    plt.xlabel("时间", fontproperties=myfont)
    plt.ylabel("时间段内的数量合计", fontproperties=myfont)
    # Build the title from `rate` so it matches the data actually plotted;
    # with the default rate it is identical to the previous fixed title.
    title = "{}分以上的电视剧时间的分布散点图".format(rate)
    plt.title(title, fontproperties=myfont)
    plt.savefig(title + ".png")

    plt.show()


if __name__ == '__main__':
    show_tv_date_distribute()
#tv_date_distribute_by_country
# coding=utf-8
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# from data_visualization.data_format import choose_data
from data_visualization.show_data import get_data_frame
import matplotlib.font_manager as fm

'''

'''

# Font passed as `fontproperties` to the Chinese axis labels and title below.
# NOTE(review): the path looks like a macOS system font location; confirm it
# exists on the machine that runs this.
myfont = fm.FontProperties(fname='/System/Library/Fonts/PingFang.ttc')

def show_tv_date_distribute(rate=7):
    """Plot, per country, how many series rated at least ``rate`` appear over time.

    Only series released after 1999-12-31 are counted, in 3-month bins.
    The chart is saved as a PNG named after its title and then shown.

    :param rate: minimum rating a series needs to be counted (default 7)
    :raises ValueError: when no series matches the filters
    """
    df = get_data_frame()
    # Give every row a count of 1 so that summing a group counts its rows.
    count_df = pd.DataFrame(np.ones(shape=(len(df), 1)), columns=["count"])
    df = df.join(count_df)
    # Drop the series without a release date.
    new_df = df[pd.notnull(df["release_date"])]
    # Keep the series released in 2000 or later.
    new_df = new_df[new_df["release_date"] > "19991231"]
    # Keep the series rated at least `rate`.
    new_df = new_df[new_df["rating_value"] >= rate]
    if new_df.empty:
        # Without any row there is no date range to build and nothing to plot.
        raise ValueError("no TV series match the date and rating filters")

    # Countries do not share the same release dates, so every group is merged
    # with one common daily range; the missing days are filled with 0 below.
    date_start = new_df["release_date"].min()
    date_end = new_df["release_date"].max()
    date_period = pd.DataFrame(pd.date_range(date_start, date_end, freq="D"), columns=["release_date"])
    # Line colours, picked by the country's position in country_list.
    colors = ['red', 'green', 'blue', "cyan", "orange"]
    country_list = new_df["country"].unique().tolist()

    _, ax = plt.subplots(figsize=(16, 8))
    # Group by the plain column name: grouping by the list ["country"] makes
    # recent pandas yield 1-tuples as keys, which are not in country_list.
    for country, grouped in new_df.groupby("country"):
        # Add the common date range to this country and index by date.
        temp_grouped = grouped.merge(date_period, how="outer", on="release_date")
        temp_grouped = temp_grouped[["release_date", "count"]].set_index("release_date")
        # Days without a release count as 0.
        temp_grouped = temp_grouped.fillna(0)
        temp_grouped = temp_grouped.resample("3M").sum()
        _x = range(len(temp_grouped.index))
        _y = temp_grouped["count"]
        # Line chart; the modulo keeps the colour lookup valid if there are
        # more countries than colours.
        ax.plot(
            _x,
            _y,
            c=colors[country_list.index(country) % len(colors)],
            alpha=0.5,
            label=country,
        )
    # Add the legend.
    plt.legend()
    # Every group spans the same date range, so the last group's index
    # labels the shared x axis; show only year-month.
    xticklables = [i.strftime('%Y-%m') for i in temp_grouped.index]
    # Label every fourth tick so the axis is not overcrowded.
    plt.xticks(range(0, len(temp_grouped.index), 4), xticklables[::4], rotation=45)
    plt.xlabel("时间", fontproperties=myfont)
    plt.ylabel("时间段内的数量合计", fontproperties=myfont)
    # Build the title from `rate` so it matches the data actually plotted;
    # with the default rate it is identical to the previous fixed title.
    title = "不同国家{}分以上的电视剧随时间的变化情况".format(rate)
    plt.title(title, fontproperties=myfont)
    plt.savefig(title + ".png")
    plt.show()


if __name__ == '__main__':
    show_tv_date_distribute()

12-09