1 import语句
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import re
2 文件读取
df = pd.read_csv('file.csv')
参数:
header=None 用默认列名,0,1,2,3...
names=['A', 'B', 'C'...]
index_col='A'|['A', 'B'...]
skiprows=[0,1,2]
nrows=N
chunksize=M
sep=':'
skip_blank_lines=False
converters={'col1': func}
dfjs = pd.read_json('file.json')
dfex = pd.read_excel('file.xls', sheet_name=[0, 1, ...])
3 数据预处理
df.duplicated()
df.drop_duplicates()
df.fillna(0)
df.dropna()
del df['col1']
df.drop(['col1',...], axis=1)
df.columns = col_lst
df.rename(index={'row1':'A'},
columns={'col1':'A1'})
df.replace(dict)
def get_digits(str):
    """Extract the leading number of a text value as a float.

    The value must *start* with digits (``re.match`` anchors at the
    beginning); an optional fractional part such as ``.5`` is included.

    Args:
        str: Text to parse. May be ``str`` or UTF-8 encoded ``bytes``.
            The parameter name shadows the builtin but is kept so that
            existing keyword callers keep working.

    Returns:
        The leading number as a ``float``, or ``0.0`` when the value does
        not start with a digit.
    """
    # Only bytes need decoding; calling .decode() on a Python 3 str raises
    # AttributeError, which is what the previous version did for every str.
    if isinstance(str, (bytes, bytearray)):
        text = str.decode('utf-8')
    else:
        text = str
    m = re.match(r'(\d+(\.\d+)?)', text)
    if m is None:
        # Return a float here too so the result type is uniform.
        return 0.0
    return float(m.group(1))
df.apply(get_digits) DataFrame.apply
df['col1'].map(func) Series.map
pd.merge(df1, df2, on='col1',how='inner',sort=True)
pd.merge(df1, df2, left_on='col1',right_on='col2')
pd.concat([sr1, sr2, sr3,...], axis=0)
pd.concat([sr1, sr2, sr3,...], axis=1)
df1.combine_first(df2)
df.stack()
df.unstack()
df.pivot()
pd.get_dummies(df['col1'], prefix='key')
4 数据筛选
df.columns
df.index
df.shape
df.head(n=N)
df.tail(n=M)
df.values
df.index
df.reindex(index=['row1', 'row2',...],
columns=['col1', 'col2',...])
df[m:n]
df[df['col1'] > 1]
df.query('col1 > 1')
df.query('col1==[v1,v2,...]')
df.loc[:,'col1']
df.loc['row1', 'col2']
df.loc[:,:'col2']
df.loc[m:n]
df.iloc[m:n]
df.loc[m:n-1,'col1':'coln']
sr=df['col']
sr.values
sr.index
5 数据运算与排序
df.T
df1 + df2
df1.add(df2, fill_value=0)
df1.add/sub/mul/div
df - sr
df * N
df.add(sr, axis=0)
sr.sort_values()
df.sort_index(axis=0, ascending=True)
df.sort_values(by=['col1', 'col2', ...])
df.rank()
6 数学统计
sr.unique()
sr.value_counts()
sr.describe()
df.describe()
df.count()
df.max()
df.min()
df.sum(axis=0)
df.mean()
df.median()
df.var()
df.std()
df.mad()
df.cumsum()
sr1.corr(sr2)
df.cov()
df1.corrwith(df2)
pd.cut(array1, bins)
pd.qcut(array1, 4)
df['col1'].groupby(df['col2'])
df.groupby('col1')
grouped.aggregate(func)
grouped.aggregate([f1, f2,...])
grouped.aggregate([('f1_name', f1), ('f2_name', f2)])
grouped.aggregate({'col1':f1, 'col2':f2,...})
df.pivot_table(['col1', 'col2'], index=['row1', 'row2'], aggfunc=[np.mean, np.sum], fill_value=0, margins=True)
pd.crosstab(df['col1'], df['col2'])