# Imports
# coding: utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import time
import os
import datetime
# Show all rows and columns when printing DataFrames (no truncation)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Pandas data import / export
# Data import / export
pwd = os.getcwd()  # remember the current working directory
os.chdir(r"文件路径")  # switch to the data directory (placeholder path)
df = pd.read_csv('file_name.csv')
# encoding='gbk' for files containing Chinese text
df = pd.read_csv('file_name.csv',encoding='gbk',index_col=None,sep=',')
df.to_csv('name_of_df.csv')
# Ways to build a new DataFrame
# fixed: original `{"a":[1,2,3];b":[5,6,7]}` had a ';' instead of ',' and a missing quote (SyntaxError)
dict_1 = {"a": [1, 2, 3], "b": [5, 6, 7]}
df1 = pd.DataFrame(dict_1, index=[1, 3, 2])
npdata = np.random.randn(4, 5)
df2 = pd.DataFrame(npdata, columns=['a', 'b', 'c', 'd', 'e'])
# Data preview
# Quick preview of a DataFrame
df
df.shape       # fixed: shape is a property, not a method
df.info()
df.ndim        # fixed: ndim is a property, not a method
df.head()
df.tail()
df.columns     # fixed: columns is a property, not a method
df.index       # fixed: DataFrame has no .rows(); row labels live in .index
df.describe()  # summary statistics
df['col_name'].quantile(0.01)  # inspect the 1% quantile
# Show all rows and columns when printing
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# DataFrame basics
# Rename a column
df = df.rename(columns = {'col':'new_col'})
# Drop a column (returns a new DataFrame; df itself is unchanged)
df.drop('a',axis=1)
# Reset the index (returns a new DataFrame)
df.reset_index()
# Data cleaning: replacement, deduplication, dummy variables
data["col_name"]=data["col_name"].fillna(5) # fill NaN values with 5
camp['dup'] = camp.duplicated() # flag duplicated rows
camp_dup = camp[camp['dup'] == True] # keep the duplicated rows for later inspection
camp_nodup = camp[camp['dup'] == False] # note the difference vs camp.drop_duplicates()
df.drop_duplicates()
df.score.replace(999,np.nan)
df.replace({'score':{999:np.nan},'name':{'Bob':np.nan}})
# Impute missing values with the column mean
vmean = df['Age'].mean(axis=0, skipna=True)
df['Age_empflag'] = df['Age'].isnull()  # flag which rows were originally missing
df['Age']= df['Age'].fillna(vmean)
# Create new computed columns
df['col1'] =df['a'] - df['b']
ndf=df.assign(col2 = df['a'] - df['b'],col3 = df['a'] + df['b'])
# Create a new column with a custom function
def transform(row):
    """Map a row's 'column_name' code to a class label.

    Returns 'class1' for code 1, 'class2' for code 2, and None otherwise.
    (Fixed: the original body was not indented, which is a SyntaxError.)
    """
    if row['column_name'] == 1:
        return 'class1'
    elif row['column_name'] == 2:
        return 'class2'
    return None
# Apply transform row-wise (axis=1 passes each row to the function)
sample.apply(transform,axis=1)
sample.assign(new_column_name = sample.apply(transform,axis=1))
# Equivalent conditional assignment with .loc
sample.loc[sample.column_name==1,'new_column_name']='class1'
sample.loc[sample.column_name==2,'new_column_name']='class2'
# Capping ("winsorization"): clamp extreme values into a fixed interval
def blk(floor, root):
    """Return a function that clamps a value into the interval [floor, root].

    Values below ``floor`` become ``floor``; values above ``root`` become
    ``root``; everything else passes through unchanged.
    (Fixed: the original bodies were not indented, which is a SyntaxError.)
    """
    def f(x):
        if x < floor:
            return floor
        if x > root:
            return root
        return x
    return f
q1 = camp['Age'].quantile(0.01)   # 1% quantile
q99 = camp['Age'].quantile(0.99)  # 99% quantile
blk_tot = blk(floor=q1, root=q99) # 'blk_tot' is a function
camp['Age']= camp['Age'].map(blk_tot)  # cap Age at the 1%/99% quantiles
# Recode values via a mapping dict (create a new label column)
district_dict = {'fengtai':'丰台区','haidian':'海淀区','chaoyang':'朝阳区','dongcheng':'东城区','xicheng':'西城区','shijingshan':'石景山区'}
df['district'] = df.dist.map(district_dict)
# Binning — equal-frequency (quantile) bins
camp['Age_group1'] = pd.qcut( camp['Age'], 4) # NOTE: qcut yields equal-frequency bins (original comment said equal-width, which is wrong)
# Binning — equal-width bins
camp['Age_group2'] = pd.cut( camp['Age'], 4) # NOTE: cut yields equal-width bins (original comment said equal-depth, which is wrong)
# Aggregation and summarization
# Single categorical variable
snd['district'].value_counts()
snd['dist'].value_counts().plot(kind = 'bar')
snd['dist'].value_counts().plot(kind = 'pie')
sample.sort_values('score',ascending=False,na_position='last')  # NaNs sorted last
sample.sort_values(['group','score'])
# Single numeric variable
snd.price.hist(bins=40)
# fixed: 'normed' was removed from matplotlib; density=True is the replacement
plt.hist(snd['price'], bins=20, density=True, range=(df.price.min(), df.price.max()))  # restrict the plotted value range
snd.price.quantile([0.01,0.5,0.99])
snd.price.mean()
snd.price.agg(['mean','median','sum','std','skew'])
# Two categorical variables
df = pd.crosstab(snd.dist,snd.school)
pd.crosstab(snd.dist,snd.school).plot(kind='bar')
pd.crosstab(snd.dist,snd.school).plot(kind='bar',stacked=True)
pd.melt(table1,id_vars='cust_id',value_vars=['Normal','Special_offer'],value_name='Monetary',var_name='TYPE')
# melt: unpivot a wide table. id_vars = key column(s); value_vars = columns to stack;
# value_name = name for the stacked values; var_name = name for the variable-label column
sub_df=pd.crosstab(snd.dist,snd.school)
sub_df['sum1']=sub_df.sum(1)   # row totals
sub_df = sub_df.div(sub_df.sum1,axis=0)  # normalize each row by its total
sub_df.drop('sum1',axis=1)  # NOTE(review): result is discarded — likely meant sub_df = sub_df.drop(...); confirm intent
sub_df[[0,1]].plot(kind='bar',stacked=True) # normalized stacked bar chart; presumably school is 0/1-coded — verify
from stack2dim import *
stack2dim(snd,i='dist',j='school') # mosaic-style stacked plot (bar widths vary)
# Categorical + numeric variable
snd.price.groupby(snd.dist).mean()
snd.price.groupby(snd.dist).mean().sort_values(ascending = True).plot(kind='barh')
sns.boxplot(x = 'dist',y='price',data=snd)
sample.groupby(['grade','class'])[['math']].mean()
# fixed: selecting groupby columns with a bare tuple ['math','chinese'] was
# deprecated and removed from pandas; pass a list instead
sample.groupby(['grade'])[['math','chinese']].mean()
sample.groupby('class')['math'].agg(['mean','min','max'])
# Pivot-table analysis
snd.pivot_table(values='price',index='district',columns='subway',aggfunc=np.mean).plot(kind='bar')
# Slicing and selection
sample['a']            # single column as a Series
sample.loc[:, 'a']     # fixed: .ix was removed from pandas; use .loc for label-based access
sample[['a']]          # single column as a DataFrame
sample.iloc[0:2, 0:2]  # fixed: .ix was removed; use .iloc for positional slicing
sample[sample.score > 70]
sample[(sample.score > 70) & (sample.group ==1)]
sample.query('(group ==2) |(group == 1)')
# fixed: boolean inclusive was removed in pandas 2.0; use 'both'/'neither'/'left'/'right'
sample[sample['score'].between(70,80,inclusive='both')]
sample[sample['name'].isin(['Bob','Lindy'])]
sample[sample['name'].str.contains('[M]+')]
# fixed: chained boolean indexing df[m1][m2] can misalign; combine the masks instead
ndf = df[(df.A > df.B) & (df.A < df.B + datetime.timedelta(days=365))]
# Datetime handling
# Parse strings to datetime, then extract components via the .dt accessor
data['time'] = pd.to_datetime(data['time'],format='%Y-%m-%d %H:%M:%S')
data["weekday"]=data["time"].dt.weekday  # Monday=0 .. Sunday=6
data["year"]=data["time"].dt.year
data["quarter"]=data["time"].dt.quarter
data["hour"]=data["time"].dt.hour
data["month"]=data["time"].dt.month
# Element-wise transformations
# Element-wise transformation with map; the lambdas below are example mappers
data["new_col"]=data["ori_col"].map(fun_name)
lambda x:3 if x==-8 else x  # recode -8 to 3
lambda x:x-1
lambda x:0 if pd.isnull(x) else 1 # 0 if missing, 1 otherwise
# Correct conditional assignment (use .loc, not chained indexing)
# Assign via .loc with a boolean mask — avoids SettingWithCopyWarning
data.loc[(data['A']==5),'C']='是'
data.loc[(data['A'].str.contains('团队')) & (data['B']>=40),'C']='是'