一、生成DataFrame
1. 用字典生成
import pandas as pd

# Column-oriented input: every key becomes a column, every list its values.
# Fixes relative to the earlier draft: a stray closing brace made this a
# syntax error, and two labels were misspelled ("Madraid", "English").
cities = {
    "name": ["London", "Berlin", "Madrid"],
    "population": [8615247, 3562166, 3165235],
    "country": ["England", "Germany", "Spain"],
}
city_frame = pd.DataFrame(cities)
print(city_frame)
展示前后几行
city_frame.head(n=3)  # first three rows
city_frame.tail(n=2)  # last two rows
2. 修改标签
# Custom row labels, passed as the index when the frame is rebuilt.
ordinals = ["first", "second", "third"]
city_frame = pd.DataFrame(data=cities, index=ordinals)
city_frame
3. 调换列的位置
# `columns` fixes the column order explicitly; `ordinals` supplies the row
# labels. (The call was previously closed with "]" instead of ")".)
city_frame = pd.DataFrame(
    cities,
    columns=["name", "country", "population"],
    index=ordinals,
)
city_frame
4. 将现存的列当作标签
# Row labels come from the "country" list of `cities`; only the other two
# entries are kept as columns.
city_frame = pd.DataFrame(
    data=cities,
    index=cities["country"],
    columns=["name", "population"],
)
city_frame
二、数据处理
1. 获取行、列
# Rebuild the frame with countries as row labels, then select rows.
city_frame = pd.DataFrame(
    data=cities,
    index=cities["country"],
    columns=("name", "population"),
)
city_frame.loc["Germany"]  # a single row, by label
city_frame.loc[["Germany", "Spain"], :]  # several rows, by label
city_frame.loc[city_frame.population > 2000000]  # rows matching a boolean mask
city_frame.iloc[1:3]  # positional slice: the 2nd and 3rd rows
2. 求和与累计和
city_frame.sum()  # total of every column
city_frame["population"].sum()  # total of one column
# -> 15342648 (recorded output of the line above; it used to sit here as a
#    bare literal statement)
city_frame["population"].cumsum()  # running total down the column
3. 为新增的列赋值
# "cum_population" is not a key of `cities`, so the column is requested here
# first and only receives its values in the assignment further down.
city_frame = pd.DataFrame(
    data=cities,
    index=cities["name"],
    columns=["country", "population", "cum_population"],
)
city_frame
# Fill the new column with the running total of the population column.
city_frame["cum_population"] = city_frame["population"].cumsum()
city_frame
三、数据分析
1. 排序:sort_values
# Sort rows in descending order of population.
# NOTE(review): this used to sort by "area", but no "area" column is ever
# created in these notes, so the call raised KeyError. "population" is a
# column the frame actually has; switch back if an area column is added.
city_frame = city_frame.sort_values(by="population", ascending=False)
city_frame
2. 分组计数与求和
# Group the rows by country, then aggregate each group.
city_frame.groupby("country").count()  # number of entries per column, per group
city_frame.groupby("country").sum()  # per-group totals (was misspelled "grupby")
四、字典与DataFrame
1. 嵌套字典生成DataFrame
字典外层作为列指标,内层作为行指标。
# Nested dict: outer keys become columns, inner keys become row labels.
# (The outer dict was previously missing its closing brace.)
growth = {
    "Switzerland": {"2010": 3, "2011": 4, "2012": 6},
    "Germany": {"2010": 4, "2011": 5, "2012": 5},
    "China": {"2010": 5, "2011": 6, "2012": 7},
}
growth_frame = pd.DataFrame(growth)
growth_frame
2. 对行重新排序
# Swap rows and columns so that countries become the row labels.
growth_frame = growth_frame.transpose()
# Reorder the rows to the requested labels; any label the frame does not
# contain yields a row of missing values.
growth_frame2 = growth_frame.reindex(
    index=["Switzerland", "Italy", "Germany", "Greece"]
)
3. 随机数填满数据框
import numpy as np  # first use of numpy in these notes; it was never imported

names = ["Frank", "Eve", "Stella"]
index = ["January", "February", "March", "April", "May"]
# 5x3 array of random draws scaled by 1000: one row per month, one column
# per person.
df = pd.DataFrame(np.random.randn(5, 3) * 1000, columns=names, index=index)
df
五、csv文件的读取与保存
1. 保存
# `to_csv` is a DataFrame method; the pandas module has no `pd.to_csv`.
# Here the random frame `df` built above is the one written out.
df.to_csv('xxx.csv')
2. 读取
# Read back the file written by the save step (the name now matches it;
# it used to read 'xx.csv').
rv = pd.read_csv('xxx.csv', index_col=0)  # use the first column as the row index
rv = pd.read_csv('xxx.csv', header=None)  # do not treat the first row as a header
六、数据合并
1. pd.concat()
(1) pd.concat 不生成新的索引
# Two small frames that deliberately share the same row labels (1 and 2).
one = pd.DataFrame(
    data={
        "Name": ["Alex", "Amy"],
        "subject_id": ["sub1", "sub2"],
        "scores": [98, 90],
    },
    index=[1, 2],
)
two = pd.DataFrame(
    data={
        "Name": ["Billy", "Brian"],
        "subject_id": ["sub2", "sub4"],
        "scores": [89, 80],
    },
    index=[1, 2],
)
# Stack the frames vertically; the original row labels are kept, so no new
# index is generated.
pd.concat(objs=[one, two])
(2)添加参数ignore_index = True 重新生成新索引
# Discard the incoming row labels and number the stacked rows afresh.
pd.concat(objs=[one, two], ignore_index=True)
(3)添加参数axis = 1, 横向合并
# Place the frames side by side (column-wise) instead of stacking rows.
pd.concat(objs=[one, two], axis=1)
2. pd.merge()
# Two frames sharing a "key" column, each with its own value columns.
left = pd.DataFrame(
    data={
        "key": ["K0", "K1", "K2"],
        "A": ["A0", "A1", "A2"],
        "B": ["B0", "B1", "B2"],
    }
)
right = pd.DataFrame(
    data={
        "key": ["K0", "K1", "K2"],
        "C": ["C0", "C1", "C2"],
        "D": ["D0", "D1", "D2"],
    }
)
# how="left": the result has as many rows as `left`;
# on="key": the column the two frames are matched on.
pd.merge(left, right, on="key", how="left")
七、简易数据处理实例
1. 生成数据
# One list of eleven readings per region; `data` is a plain dict.
data = {
    "NorthEast": [5.9, 5.6, 4.4, 3.8, 5.8, 4.9, 4.3, 7.1, 8.3, 7.9, 5.7],
    "MidWest": [4.5, 4.3, 3.6, 4.0, 5.7, 5.7, 4.9, 8.1, 8.7, 7.4, 5.1],
    "South": [5.3, 5.2, 4.2, 4.0, 5.7, 5.2, 4.3, 7.6, 9.1, 7.4, 5.5],
    "West": [6.6, 6.0, 5.2, 4.6, 6.5, 5.5, 4.5, 8.6, 10.7, 8.5, 6.1],
    "National": [5.6, 5.3, 4.3, 4.2, 5.8, 5.3, 4.6, 7.8, 9.1, 8.0, 5.7],
}
# Every second year from 1995 up to and including 2015, used as row labels.
years = [*range(1995, 2017, 2)]
unemp_region = pd.DataFrame(data=data, index=years)
unemp_region
检索行索引与数据值
unemp_region.index  # the row labels of the frame
unemp_region.values  # the frame's data without its row/column labels
2. 查看数据并绘图
unemp_region.head(n=5)  # first five rows
unemp_region.tail(n=5)  # last five rows
unemp_region.plot()  # plot the frame's columns using pandas' default plot
3. Indexing
# .loc takes [row label(s), column label(s)].
unemp_region.loc[1995, "NorthEast"]  # one cell
unemp_region.loc[[1995, 2005], "South"]  # two rows of one column
unemp_region.loc[[1995, 2005], ["NorthEast", "National"]]  # two rows, two columns
unemp_region.loc[:, "NorthEast"]  # every row of one column
unemp_region["MidWest"]  # shorthand for pulling out a whole column
4. Computations with Columns
# Divide by 100 to move from percent units to a rate
unemp_region["West"] / 100
# Find maximum
unemp_region["West"].max()
# Find the difference between two columns
unemp_region["West"] - unemp_region["MidWest"]
# Find correlation between two columns
# (the method call previously used a full-width "。" instead of ".")
unemp_region["West"].corr(unemp_region["MidWest"])
# Find correlation between all column pairs
unemp_region.corr()
5. Changing DataFrames
Creating new columns
# Add the four regional columns, then divide by four to get a plain
# (unweighted) average; "National" is not part of the sum.
region_sum = (
    unemp_region["NorthEast"]
    + unemp_region["MidWest"]
    + unemp_region["South"]
    + unemp_region["West"]
)
unemp_region["UnweightedMean"] = region_sum / 4
unemp_region.head()
Changing values
# Overwrite a single cell, addressed by row label and column name.
unemp_region.loc[1995, "UnweightedMean"] = 0.0
Renaming Columns
# Mapping from current column name to its abbreviation.
names = dict(NorthEast="NE", MidWest="MW", South="S", West="W")
# The renamed frame is the value of this expression; it is not assigned
# back to `unemp_region` here.
unemp_region.rename(columns=names)