首先建立两个表
df = pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006],
"date":pd.date_range('20130102', periods=6),
"city":['Beijing', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
"age":[23,44,54,32,34,32],
"category":['100-A','100-B','110-A','110-C','210-A','130-F'],
"price":[1200,np.nan,2133,5433,np.nan,4432]},index = [1001,1002,1003,1004,1005,1006])
df1=pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006],
"gender":['male','female','male','female','male','female'],
"pay":['Y','N','Y','Y','N','Y'],
"m-point":[10,12,20,40,40,41]},index=[1001,1002,1003,1004,1005,1006])
df=DataFrame({"a":[1,2,3,1,1,5],"b":[11,np.NAN,1,1,np.NAN,10],"c":[8,90,2,np.NAN,10,88]
,"d":["a-c","b-d","c-d","e-q","d-s","e-q"]})
df1=DataFrame({"a":["1",'2','3',"1",'1',np.NAN],"b":[1,np.NAN,1,1,np.NAN,10],"c":[8,90,2,np.NAN,10,88]
,"d":["a","b","c",np.NAN,"d","e"]})
查看数据表信息
df.shape#看维度
print(df.info())#看每行有无空值.每一列默认类型,看总数 #
print(df["dishes_id"].dtypes)#看某一列的类型
print(df.values)#查看数据表的值 #
print(df.columns)#查看列名
df.head() #默认前10行数据 # df.tail() #默认后10 行数据 #
print(df.order_id.describe())#查看数据信息
缺失值处理
df[‘prince’].fillna(df[‘prince’].mean())#用数字特征进行填充
df.fillna(value=0)#用数字填充
print(df.dropna(axis=1,how="all")) df.fillna(method="bfill")#ffill
df['taixin'] = df['taixin'].interpolate() #前后非缺失值的均值填充 #
for f in df: # 插值法填充
df[f] = df[f].interpolate()
异常值处理
df["a"]=df["a"].astype(str)#改变数据类型 注意object就是str类型
df["a"]=df["a"].apply(str)#改变数据类型
df["a"]=df["a"].map(str.strip) #去除空值可以给左右
df["a"]=df["a"].map(str.split(","))
df["a"]=df["a"].str.strip(",") #去除特殊符号
df[‘city’]=df[‘city’].str.lower() #改变大小写
df["a"]=df["a"].drop_duplicates(keep='last')#删除重复的值
df["a"].drop_duplicates(inplace=True)#不行
df[‘city’].replace(‘sh’, ‘shanghai’)#数据替换
df["b"]=df["b"].where(df["b"]<2,1)#符合条件的不变其他的变 np的where和pd的有所不同np.where(arr>0,2,-2)
数据预处理
df_inner=pd.merge(df,df1,how='inner') #交集并集
df_right=pd.merge(df,df1,how='right')
result = df1.append(df,ignore_index=True,verify_integrity=True)#追加
result=pd.concat([df,df1],)
透视表
df1=df[df["姓名"]=="张三"]#获取满足条件的行
uni=df1["活动分类"].unique()
table = pd.pivot_table(df1,index="活动分类",values=["数量"],aggfunc="sum")#按活动分类做做透视表,活动分类是列,数量是要观察的列
排序
df.set_index("a",append=True,inplace=True)#设置索引
df3.sort_values(by=[0],axis=1,inplace=True) #按值排序,可以横着也可以竖着 df_inner.sort_index()#按照索引排序
df["a"] = np.where(df["a"] > 2,"h","l")#大于2的h否则l
df_inner.loc[(df_inner[‘city’] == ‘beijing’) & (df_inner[‘price’] >= 4000), ‘sign’]=1 #符合条件求
拆分
df3=pd.DataFrame((x.split("-") for x in df["d"]),index=df.index,columns=["d","f"])#一定要注意空值他是浮点类型,什么都不能改变它
df_inner=pd.merge(df,df3,right_index=True, left_index=True)#将拆分来的按照索引连接起来
数据提取
print(df.loc[3])
print(df.loc[:,:"c"])
df.reset_index(inplace=True)#将索引变成列
x=df1["a"]
df.set_index(x,inplace=True)#好像不能自定义索引
print(df.loc[:3])#输出第四行之前
print(df["a"]==1)#判断
df=df[df["a"]==1] #过滤得到要求的值
print(df.loc[[1,3],["a","d"]])#提取想要的行和列
print(df[df["a"].isin([1,5])])#过滤出存在1或者5的值
df2=pd.DataFrame(df["category"].str[:3])#提取数据的前三个字符
数据筛选
print(df.loc[(df['age'] < 30) & (df['city'] == 'Beijing'), ['id','city','age','category','gender']])#根据df.loc[[],[]]分别设定条件&|!=与或非竖直的或
print(df.loc[(df['city'] != 'Beijing'), ['id','city','age','category','gender']].city.count())#计算单独一列
print(df.query('city == ["Beijing", "shanghai"]'))#筛选注意两个引号时‘‘’’不能同用
print(df.query('city == ["Beijing", "shanghai"]').price.sum())#计算
数据汇总
print(df.groupby("city").count())#安city分类之后进行汇总
print(df.groupby("city")["id"].count())
print(df.groupby(["city","date"]).count())#对多个字段分类汇总
print(df.groupby('city')['price'].agg([len,np.sum, np.mean]))#分类汇总并进行计算
部分参照博客https://blog.csdn.net/yiyele/article/details/80605909