1. 安装python包
- numpy
- pandas
2. 数据预处理
1. 由于表格非常奇怪,我们对蔬菜和肉类分别处理,并合并
2. 对有丢失值的列直接drop
3. 编码分析
# coding: utf-8
# In[1]:
# coding = utf-8
import numpy as np
import pandas as pd
data = pd.read_excel("data.xls",encoding='utf-8')#.astype('float')
data.drop(["肉食禽蛋","批发价格"],axis=1,inplace=True)
data=data.drop_duplicates().dropna()
data=data.dropna(axis=1)
index2drop = data["蔬菜名"]!="蔬菜类"
data=data[index2drop]
data = data.pivot(index='日期',columns='蔬菜名',values='价格')
data2 = pd.read_excel("data.xls")#.astype('float')
data2.drop(["蔬菜名","价格"],axis=1,inplace=True)
data2=data2.drop_duplicates().dropna()
index2drop = data2["肉食禽蛋"]!="肉食禽蛋类"
data2=data2[index2drop]
data2 = data2.pivot(index='日期',columns='肉食禽蛋',values='批发价格')
data2.head(6)
data = pd.merge(data,data2,on="日期")
data.head(10)
data.drop(["冻芋头","北瓜"],axis=1,inplace = True)
data.dropna(thresh=10,axis=1,inplace=True)
data.head()
data.dtypes
# data.to_csv("data1.csv",encoding='utf-8')
#
# # help(pd.concat)
# data=pd.concat([data,data2],keys="日期")
# data.head(100)
# data = pd.merge(data,data2,left_on="日期")
# # 在excel中打开data1.csv
# ## 将乱码的缺失值替换成空 (ctrl + F)