- 建立pandas的DataFrame对象
# 按行建,使用columns参数说明列名
sales = [('Jones LLC', 150, 200, 50),
('Alpha Co', 200, 210, 90),
('Blue Inc', 140, 215, 95)]
labels = ['account', 'Jan', 'Feb', 'Mar']
df = pd.DataFrame.from_records(sales, columns=labels)
# 按列建,字典key为列名,value为列的列表
df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
'mask': ['red', 'purple'],
'weapon': ['sai', 'bo staff']})
# 按行建,同时说明列名,较繁琐
df = pd.DataFrame({'name': 'Raphael', 'mask': 'red','weapon': 'sai'},
{'name': 'Donatello', 'mask': 'purple','weapon': 'bo staff'})
- 重新命名列名
df = df.rename(columns={0:"new_feature_name",1:"weight"})
# 0,1为原来的列名
- 读取excel文件
data = pd.read_excel(file, sheetname='tumor', encoding='utf8') # sheetname默认为0,也就是第一个sheet
- 保存excel文件
pd_data.to_excel(to_file, sheet_name='tumor', encoding='utf8')
- 读取csv文件
data = pd.read_csv(file, engine="python") # 如果是中文文件名,需要用engine='python'
- 保存csv文件
data.to_csv(file, index=None, header=['id', 'y']) # index=None不保留行名,header=None不保留列名
注意:index=0时不保存index,columns会保存为第一行,符合大部分习惯
- 取多列或多行,赋值
df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
index=['cobra', 'viper', 'sidewinder'],
columns=['max_speed', 'shield'])
df_sub = df.loc[["cobra"],:]
注意:1.df.loc['cobra', 'max_speed']返回的是一个数;df.loc[:, 'max_speed']返回的是Series,name为 'max_speed';df.loc[:, [ 'max_speed']]返回的是DataFrame; df.loc['cobra']和df.loc['cobra',:]效果一样
2.取值方法还有
--- 使用slice,如df.loc['cobra':'viper']
--- 使用bool列表,如 df.loc[[True, True], :]
--- 使用bool的Series,如df.loc[df['max_speed']<4]
--- 使用Callable that returns a boolean Series,如df.loc[lambda df: df['shield'] == 8]
3.对多行或多列赋值方法,类似上面的取值方法
df.loc["cobra"] = 0
df.loc[["cobra","viper"]] = 0
df.loc[:, 'max_speed'] = 0
df.loc[df["max_speed"]==4] = 0
df.loc[lambda df: df["max_speed"]<4] = 0
4.对multiindex的处理,和上面的是类似的
tuples = [
('cobra', 'mark i'), ('cobra', 'mark ii'),
('sidewinder', 'mark i'), ('sidewinder', 'mark ii'),
('viper', 'mark ii'), ('viper', 'mark iii')]
index = pd.MultiIndex.from_tuples(tuples)
values = [[12, 2], [0, 4], [10, 20],
[1, 4], [7, 1], [16, 36]]
df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index)
print(df)
print("#" * 20)
sub_df = df.loc[:, 'max_speed']
sub_df = df.loc[("viper", "mark ii")]
- 末尾增加一列
subdata['w'] = [2,3,4]
- 修改列的排列顺序
df=pd.DataFrame(df, columns=['weight', 'name', 'height', 'is_fat']) #数据跟随列名的顺序重排
df.insert(1, "age", list) # 在"weight"后面插入"age",数据列为list
进一步参考修改列顺序的几种方法整理
- 修改列名
import pandas as pd
df = pd.DataFrame({'name': [0.23456, 6.78902],
'mask': [0.31415, 3.65432890],
'weapon': ["gun", 3.65432890]})
print(df)
df.columns = ['number', 'mask', 'weapon'] #将'name'改为'number'
df.index = list('ab') #将index改为['a','b']
print(df)
df.rename(columns={'mask':'字符'},inplace=True) #将'mask'改为'字符'
df.rename(index={'a': 'One'}, inplace=True) #将'a'改为'One'
df.rename(mapper=str.upper, axis='index', inplace=True) #对index中每个label做str.upper操作
df.rename(mapper=lambda x:ord(x[0]), axis='index', inplace=True) #对index做函数操作
print(df)
- 改变某列或者几列的数据类型
df['y'] = df['y'].astype(int) df[['y', 'x0']] = df[['y','x0']].astype(int)
- 按某列的值顺序/逆序重排
df_test_gt.sort_values(by=0, ascending=True, inplace=True) #按照第0列(如果有列名,写列名字符串)升序排列
- 删除一行或多行
df_test_gt = df_test_gt.drop([0], axis=0) #删除第0行(如果有行名,写行名字符串)
df_test_gt = df_test_gt.drop(drop_list, axis=0) #删除drop_list中的所有行
- 判断是否有缺失
w = data.iloc[i,:].isnull().sum() #第i行缺失的总数 df = data.isnull().sum() #统计每一列的缺失的数目
- 筛选
df_filter = df[df['age']>30] #得到年龄大于30岁的行
df_filter = df[df['age'].isin([20, 30, 40])] #得到年龄为20,30,40岁的行
df_filter = df[df['age'].str.contains('2|0')] #得到年龄中有数字2或0的行
df_filter = df[~df['age'].isin([20, 30, 40])] #得到年龄不为20,30,40岁的行
进一步参考:Pandas系列5-DataFrame之过滤 - 简书(条件过滤方法)和4、pandas的数据筛选之isin和str.contains函数 - 简书(isin和str.contains)
- 按列合并多个DataFrame
import pandas as pd
df1 = pd.DataFrame([['a', 1], ['b', 2]],
columns=['letter', 'number'])
df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
columns=['letter', 'number', 'animal'])
print("df1:")
print(df1)
print("df3:")
print(df3)
df3.index = [1, 0]
print("nw_df3:")
print(df3)
df = pd.concat([df1, df3], axis=1)
print("df:")
print(df)
注意:会按照index匹配行
- 去重
data.drop_duplicates(subset=['A','B'],keep='first',inplace=True)
- merge两个表
import pandas as pd
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
result = pd.merge(left, right, on='key')
# on参数传递的key作为连接键
result
Out[4]:
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
- 转成字典
df = pd.DataFrame({"a":[1,1,1,2,2,2,2,3], "b": ["q","q","q","q","q","q","q","w"], "c":[0,0,0,0,0,0,0,0], "d": [1,1,1,1,1,1,1,1]})
df[["a", "b"]].set_index("a").to_dict()["b"]
# 结果
{1: 'q', 2: 'q', 3: 'w'}
- 添加行
ap_dicts = [{"featureName":"account_id","weight":405},
{"featureName":"campaign_id","weight":226},
{"featureName":"strategy_id","weight":166}]
for item in ap_dicts:
df = df.append(item,ignore_index=True)
- 按某列排序
pd.sort_values("xxx",inplace=True,ascending=False)