1. 生成series
# Create a Series from a dict: the keys become the index, the values the data.
mapping = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
s3 = pd.Series(mapping)
print(s3)
2. 创建、查看DataFrame
# Build a small DataFrame with labeled rows, then demo inspection, sorting,
# and NaN filling. Fixes vs. the original notes: lines that referenced an
# undefined `df` now use `df2`, and `df3` is actually created as a copy
# (the original assigned `df2 = df2.copy()` and then used an undefined df3).
data = {
    'num': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'info': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df2 = pd.DataFrame(data, index=labels)
df2.index       # row labels
df2.values      # underlying numpy array
df2.describe()  # summary statistics of the numeric columns
df2.sort_values(by='info')  # sort rows by the values in 'info'
df2.sort_index(axis=1, ascending=False)  # sort column names in reverse order (rows work the same way)
df2.sort_values(by='info', ascending=False)  # sort rows by value, descending
df3 = df2.copy()       # independent copy of df2
df3.fillna(value=3)    # returns a new frame with NaN replaced by 3 (df3 itself unchanged)
2.1 以一列为标准对应的其他值进行压缩
# Collapse to one value per key: mean of 'continue injection' for each patient ID.
data1 = submit_stage2_res.groupby(by='patient ID')['continue injection'].mean()
2.2 常用数据分析
train_data["employer_type"].value_counts() # distribution of values in one column
train_data.dtypes # dtype of every column
train_data.shape,train_internet.shape # shapes of the datasets
work_year_dict = { # map raw string values of a column to numeric codes
'< 1 year': 0,
'1 year': 1,
}
train_data['work_year'] = train_data['work_year'].map(work_year_dict)
import seaborn as sns # visualize the relationship between features and the label
train_data.groupby("class")["isDefault"].mean()
sns.boxplot(x="class",y="isDefault",data = train_data)
# check whether month and label are strongly correlated
train_data.groupby("issue_date_month")["isDefault"].mean()
train_data.groupby("issue_date_month")["isDefault"].mean().plot()
train_data.groupby("issue_date_month")["isDefault"].mean().plot(kind="bar")
col_to_drop = ['issue_date', 'earlies_credit_mon'] # drop selected columns
train_data = train_data.drop(col_to_drop, axis=1)
cat_cols = ['employer_type', 'industry'] # label-encode categorical columns (same idea as the dict mapping above)
from sklearn.preprocessing import LabelEncoder
for col in cat_cols:
    # fit the encoder on train, then apply the same mapping to train and test
    lbl = LabelEncoder().fit(train_data[col])
    train_data[col] = lbl.transform(train_data[col])
    test_public[col] = lbl.transform(test_public[col])
from matplotlib import pyplot as plt
fig, axes = plt.subplots(4,4, figsize=(10,10), sharex=True)
axes = axes.flatten()
object_bol = train_data.dtypes == 'int64' # boolean mask selecting columns of a single dtype
for ax, catplot in zip(axes, train_data.dtypes[object_bol].index):
    # one count plot per int64 column, placed on its own subplot axis
    sns.countplot(y=catplot, data=train_data, ax=ax)
plt.tight_layout()
plt.show()
2.3 取出两列映射为字典
import pandas as pd

# Sample records; note that 'zhangsan' appears twice in the name column.
data = {
    'name': ['zhangsan', 'lisi', 'wangwu', 'zhaoliu', 'qianqi', 'zhangsan'],
    'age': [18, 19, 20, 21, 22, 23],
    'city': ['beijing', 'shanghai', 'guangzhou', 'nanjing', 'changsha', 'wuhan'],
    'sex': ['man', 'women', 'man', 'women', 'man', 'women']
}
df = pd.DataFrame(data)
print(df)

# Route 1: select the two columns, dedupe on the key (keep the FIRST
# occurrence so later duplicates do not overwrite earlier ones), make the
# key the index, transpose to a single row, and export that row as a dict.
dff = (
    df[['name', 'age']]
    .drop_duplicates(subset=['name'], keep='first')
    .set_index('name')
    .T
)
dic = dff.to_dict(orient='records')[0]
print(dic)

# Route 2: one-liner via a name-indexed Series. With duplicate keys the
# LAST value wins here — the opposite of route 1.
d = pd.Series(df.age.values, index=df.name).to_dict()
print(d)
2.4 取前一千行数据写入文件
# Take a small sample of a large CSV and write it back out.
train_df = pd.read_csv('./train.csv',nrows=1000)  # read only the first 1000 rows
train_df[:100] # likewise, slicing takes only the first 100 rows (result discarded here)
train_df.to_csv('simple.csv',index=False) # write without the row index
2.5 取值
# Indexing / selection / mutation demos. Fixes vs. the original notes:
# the demo frame is built BEFORE it is used (it was defined mid-snippet),
# `print(df2.A` was missing its closing parenthesis, and the chained
# assignment `df2.A[...] = 0` is replaced by .loc (chained assignment does
# not write through under pandas copy-on-write).
dates = pd.date_range('20200101', periods=6)
df2 = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])

print(df2.iloc[:, 1])              # every value of the column at position 1
print(df2['A'])                    # column A by name
print(df2.A)                       # column A via attribute access
print(df2[0:3])                    # first three rows
print(df2['20200102':'20200104'])  # rows by index-label slice (inclusive)
print(df2.loc['20200102'])         # a single row by label
print(df2.loc[:, ['A', 'B']])      # label-based selection of columns A and B
print(df2.iloc[3, 1])              # scalar by integer row/column position
print(df2.iloc[2:5, 1:4])          # positional slice of rows and columns
print(df2.iloc[[1, 3, 5], 1:4])    # explicit row positions plus a column slice
print(df2[df2.A > 8])              # boolean filter: rows where A > 8

df2[df2.A > 4] = 0                 # zero out EVERY column of rows where A > 4
df2.loc[df2.A > 4, 'A'] = 0        # zero out only column A where A > 4 (works for other columns too)
df2['F'] = np.nan                  # add a column of NaN
df2['E'] = pd.Series(np.arange(1, 7), index=dates)  # new column, aligned on the original index
df2.iloc[0, 1] = np.nan            # set individual cells to NaN
df2.iloc[1, 2] = np.nan
2.6对数据某一列值进行显示
# Plot the distribution of one column's values, sorted by value.
# NOTE(review): the second argument re-supplies the same keys as the index —
# it appears redundant, since value_counts().sort_index() already carries them.
data=pd.Series(other_info["preVA"].value_counts().sort_index(),other_info["preVA"].value_counts().sort_index().keys())
data.plot()
plt.show()
2.7 取出、删除特定行列
# Selecting and dropping specific rows / columns. Fixes vs. the original
# notes: three statements were fused onto one line (a syntax error), and
# drop's positional axis argument (removed in pandas 2.0) is now a keyword.
train[~(train["preCST"] > 1000)]  # ~ negates the boolean mask
train_folds.loc[[1524]]           # select the row labeled 1524 (as a one-row DataFrame)
train = train.drop([1625, 1626, 1627, 1628, 1629, 1630, 1631, 1632, 1633, 1634])  # drop rows by label
train.drop('image_path', axis=1)  # drop a specific column (result discarded here)
2.8一列插入某些值
# Fill the "floders" column row by row from "patient ID".
# NOTE(review): "floders" looks like a typo for "folders", but it is a runtime
# column name used elsewhere in these notes, so it is kept as-is.
for data_path in train_ori["patient ID"].items():
    # data_path is an (index, value) pair from Series.items()
    if data_path[1] in base_path:
        train_ori.loc[data_path[0],"floders"] = data_path[1]
    else:
        # value not found in base_path: store it with the last character stripped
        train_ori.loc[data_path[0],"floders"] = data_path[1][:-1]
2.9筛选出特定的行添加标签
# Tag rows whose "floders" value is in v_data with fold number i.
train.loc[train[train["floders"].isin(v_data)].index, 'fold'] = int(i)
2.10 丢弃nan值的方式
df2.dropna(axis=0,how='all') # drop a row only when ALL its values are NaN; how='any' drops rows containing any NaN
df2.fillna(value=0) # replace every NaN with 0
df2.isnull() # elementwise NaN test: True where NaN, False where a value is present
np.any(df2.isnull()) == True # True if the frame contains any NaN at all
2.11 对pd数据进行堆叠
# Stacking DataFrames with concat. df0/df2 share columns A-D; df1 uses B-E.
df0 = pd.DataFrame(np.zeros((3, 4)), columns=['A', 'B', 'C', 'D'])
df1 = pd.DataFrame(np.full((3, 4), 1.0), columns=['B', 'C', 'D', 'E'])
df2 = pd.DataFrame(np.full((3, 4), 2.0), columns=['A', 'B', 'C', 'D'])
# Stack vertically, renumbering the row index from 0.
pd.concat([df0, df1, df2], axis=0, ignore_index=True)
# join='inner' keeps only the shared columns; 'outer' would pad missing ones with NaN.
pd.concat([df1, df2], join='inner', ignore_index=True)
# Stack side by side (along columns), keeping only shared row labels.
pd.concat([df1, df2], axis=1, join='inner')
# DataFrame.append was removed in pandas 2.0 — pd.concat replaces it.
res = pd.concat([df3, df4, df4], ignore_index=True)
print(res)
s1 = pd.Series([1,2,3,4],index=['A','B','C','D'])
# Appending a Series as one row: wrap it in a one-row DataFrame first.
result = pd.concat([df3, pd.DataFrame([s1])], ignore_index=True)
# merge joins on column B (on=['B','C'] would require both to match); how: inner/outer/left/right.
pd.merge(df3,df4, on='B',how='outer')
一个表的数据映射到另一个表
import pandas as pd

# Map names from df2 onto df1 by id; ids absent from df2 keep their df1 name.
df1 = pd.DataFrame({'id': [1, 2, 3], 'name': ['Andy1', 'Jacky1', 'Bruce1']})
df2 = pd.DataFrame({'id': [1, 2], 'name': ['Andy2', 'Jacky2']})
lookup = df2.set_index('id')['name']  # id -> name Series used as the mapping
df1['name'] = df1['id'].map(lookup).fillna(df1['name']).astype(str)
print(df1)
# Fixed: pandas raises MergeError when `on` is combined with right_index=True
# ("Can only pass argument 'on' OR 'left_index' and 'right_index'"), so the
# join is on the shared "name" column only.
new_merge = pd.merge(dedup_df, oof_data_true, how='inner', on=["name"])
2.12 循环读取数据
# Iterate rows: iterrows yields (index, row-as-Series) pairs.
for index, data in df.iterrows():
    print(data['images1'],data["images2"])
    break  # demo only the first row
2.13 某一列的值转为列表
# Convert one column to a plain Python list (here just to count its elements).
len(val_df['image_name'].tolist())
3.错误解决
因编码导致的不能打开
# Work around files that fail to open due to their encoding:
# detect the encoding with chardet, then pass it to read_csv.
with open(f'./input/Test/{csv_name}', 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
train_df = pd.read_csv(f'./input/Test/{csv_name}', encoding=encoding)
print(csv_name,encoding)
参考资料
pandas的中文网