1 测量代码的运行时间
# Time a section of code with timeit.default_timer().
import timeit
start = timeit.default_timer()
#my code
# NOTE: despite its name, `end` holds the elapsed time (the difference between
# the two timer readings), not an end timestamp.
end = timeit.default_timer() - start
2 Ignore warnings
import warnings
# Install an 'ignore' filter with no category or module restriction, so it
# applies to every warning raised after this point.
warnings.filterwarnings('ignore')
3 行列数据显示不全
# Show every column when a DataFrame is printed (None removes the limit).
pd.set_option('display.max_columns', None)
# Show every row when a DataFrame is printed (None removes the limit).
pd.set_option('display.max_rows', None)
# Allow up to 100 characters per cell (the pandas default is 50). The key is
# fully qualified, like the two options above, instead of relying on pattern
# matching of the bare 'max_colwidth'.
pd.set_option('display.max_colwidth', 100)
4 导入CSV或者xlsx文件
# read_csv / read_excel already return a DataFrame, so wrapping the result in
# pd.DataFrame(...) is unnecessary.
# header=1 takes the column names from the second line of the file.
f = pd.read_csv('name.csv', header=1)
df = pd.read_excel('name.xlsx')
5 写入xlsx或CSV文件
# Write df to temp.xlsx on a sheet named 'bluewhale_cc'; index=False leaves the
# row index out of the file.
df.to_excel('temp.xlsx', sheet_name='bluewhale_cc',index=False)
# Write df_inner to CSV, again without the row index.
df_inner.to_csv('excel_to_python.csv',index=False)
6 使用pandas创建数据
# Build a DataFrame from a mapping of column name -> column values.
df = pd.DataFrame(
    dict(
        id=[1001, 1002, 1003, 1004, 1005, 1006],
        date=pd.date_range(start='20130102', periods=6),
        city=['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
        age=[23, 44, 54, 32, 34, 32],
        category=['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
        price=[1200, np.nan, 2133, 5433, np.nan, 4432],
    )
)
# Build a DataFrame from a list of rows plus explicit column names.
# NOTE: this rebinds `df`, replacing the frame created above.
data = [
    ['Apple', 6],
    ['Grape', 30],
    ['Banana', 5],
]
df = pd.DataFrame(data, columns=['Fruit', 'Price'])
7 数据信息查看
# Quick inspection calls; each line is an independent expression.
df.shape            # (rows, columns)
df.head()           # first rows of the frame
df.info()           # column names, non-null counts, dtypes, memory usage
df.dtypes           # dtype of every column
df[col].dtype       # dtype of a single column
df.isnull()         # boolean mask of missing values
# unique() is a Series method; DataFrame has no unique(), so calling it on the
# frame raises AttributeError.
df[col].unique()
df.values           # underlying data as a NumPy array
df.columns          # column labels
# Per-column frequencies; the Series form is available in every pandas version,
# whereas DataFrame.value_counts only exists from pandas 1.1 on.
df[col].value_counts()
8 数据清洗
# Returns a copy with every NaN replaced by 0; df itself is left unchanged.
df.fillna(value=0)
# Fill only `col` with its own mean. A frame-wide fillna with df[col].mean()
# would also write that single column's mean into the NaNs of every other column.
df[col] = df[col].fillna(df[col].mean())
9 字符空格处理
# Strip leading/trailing whitespace from every string in the column.
# The .str accessor passes NaN through, whereas map(str.strip) raises
# TypeError on any missing or non-string value.
df[col].str.strip()
10 大小写转换
# Lower-case / upper-case every string in the column via the .str accessor.
# Each call returns a new Series; assign it back to keep the result.
df[col].str.lower()
df[col].str.upper()
11 更改数据格式
# Cast the column to an integer dtype; returns a new Series, so assign it back
# to keep the result.
df[col].astype('int')
12 更改列名
# Rename column 'category' to 'category-size'. The result is not assigned here,
# so df keeps its original column name unless the return value is stored.
df.rename(columns={'category':'category-size'})
13 删除重复值
# Drop repeated values in the column, keeping the first occurrence (the default).
df[col].drop_duplicates()
# Same, but keep the last occurrence of each repeated value instead.
df[col].drop_duplicates(keep='last')
14 数据替换
# Replace values equal to 'sh' with 'shanghai'. Returns a new Series; assign it
# back to keep the result.
df[col].replace('sh','shanghai')
15 数据表合并
# With no `on` argument, pd.merge joins on the columns the two frames share.
df_inner = pd.merge(df, df1, how='inner')  # keys present in both frames (intersection)
df_left = pd.merge(df, df1, how='left')    # every row of df, matched where possible
df_right = pd.merge(df, df1, how='right')  # every row of df1, matched where possible
df_outer = pd.merge(df, df1, how='outer')  # keys from either frame (union)
# Stack df2 underneath df1. pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; concat works on every version.
result = pd.concat([df1, df2])
# Join `right` onto `left`, matching left['key'] against right's index.
result = left.join(right, on='key')
# Reference call listing pd.concat's keyword defaults. `join_axes` is left out:
# it was removed in pandas 1.0 and passing it raises TypeError.
pd.concat(objs, axis=0, join='outer', ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)
16 设置索引列
# Use the 'date' column as the row index. Returns a new frame; df_inner itself
# is unchanged unless the result is assigned.
df_inner.set_index('date')
17 使用column对dataframe排序
# Sort df by the 'hour' column in descending order, modifying df in place.
df.sort_values(by = ['hour'], ascending = False, inplace = True)
18 按照索引列排序
# Sort rows by index label; returns a new frame.
df_inner.sort_index()
19 np.where()条件填充
# Label each row 'high' when price > 3000 and 'low' otherwise. Rows whose price
# is NaN compare False and therefore also get 'low'.
df_inner['group'] = np.where(df_inner['price'] > 3000,'high','low')
20 对复合条件的数据进行分组标记
# Set 'sign' to 1 on rows where city is exactly 'beijing' AND price >= 4000.
# If 'sign' does not exist yet it is created, and the other rows are left as NaN.
# The city comparison is exact, so values must already be trimmed/lower-cased.
df_inner.loc[(df_inner['city'] == 'beijing') & (df_inner['price'] >= 4000), 'sign']=1
21 多条件过滤数据
df.loc # label-based selection (row/column labels or a boolean mask)
df.iloc # integer-position-based selection
# When filtering on several conditions:
可以使用 &(与)和 |(或)组合多个条件进行过滤,每个条件必须用括号括起来,例如 df[(df[col] > 1) & (df[col] < 5)]
22 多类别过滤数据
# Keep the rows whose `col` value is one of the listed categories.
# (Plain ASCII quotes: typographic quotes are a SyntaxError in Python.)
df[df[col].isin(['Rain', 'Sunny'])]
23 透视表
# Reshape long data to wide: one row per city, one column per day, cells filled
# with the temperature. (Plain ASCII quotes: typographic quotes are a SyntaxError.)
df.pivot(index='city', columns='day', values='temperature')
24 数据表维度转变
# Two-level column header: every (year, subject) combination, 2 x 3 = 6 columns.
# String literals use plain ASCII quotes; the typographic and mismatched quotes
# in the earlier version were a SyntaxError.
header = pd.MultiIndex.from_product([['2018', '2019'], ['Physics', 'Chemistry', 'Maths']])
# One row of six scores per student, in the same order as `header`.
data = [[31, 45, 65, 43, 32, 65], [76, 56, 78, 65, 78, 65], [44, 56, 73, 76, 87, 56]]
df = pd.DataFrame(data,
                  index=['John', 'Gil', 'Gina'],
                  columns=header)
使用stack函数将上述数据重新格式化成行,以便进一步分析
# Move the innermost column level into the row index, turning the wide table
# into a longer one. Returns a new object; df is unchanged.
df.stack()
25 多个list拼接成一个
##### Standard version
# Flatten several lists into a single list with itertools.chain.
import itertools

l1 = [1, 2, 3]
l2 = [4, 5, 6]
l3 = [7, 8, 9]
# from_iterable walks the tuple of lists and yields their items in order.
result = list(itertools.chain.from_iterable((l1, l2, l3)))
Output:
# [1, 2, 3, 4, 5, 6, 7, 8, 9]
26 模型保存与加载
import pickle
model = SVM() # put your model here
model.fit(X_train, Y_train)
# Save the fitted model. The `with` block closes the file even if dump() raises.
with open(filename, 'wb') as fh:
    pickle.dump(model, fh)
# Load the model back from disk under the name used on the scoring line below
# (it was previously never defined, which raised NameError).
# NOTE: unpickling executes arbitrary code; only load files you trust.
with open(filename, 'rb') as fh:
    loaded_model = pickle.load(fh)
result = loaded_model.score(X_test, Y_test)
print(result)
27 将数据集拆分为训练集和测试集
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=100)
28 检查pandas版本
import pandas as pd
# Version string of the installed pandas.
pd.__version__
# To install a different pandas version:
import sys
# NOTE: the leading "!" is IPython/Jupyter shell-escape syntax, not Python, so
# this line only works in a notebook/IPython cell. {sys.executable} targets the
# interpreter that is running the notebook.
!{sys.executable} -m pip install pandas==0.24.2
29 拼接dataframe
# Both files are ';'-separated, with column names on the first line.
df1 = pd.read_csv("comments.csv", header=0, sep=";")
df2 = pd.read_csv("comments2.csv", header=0, sep=";")
# Put df1's rows after df2's and renumber the index 0..n-1. pd.concat is used
# because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
df2 = pd.concat([df2, df1], ignore_index=True)
30 将数据集拆分为X和Y
# First column as the label, all remaining columns as features.
# Positional slicing keeps the feature columns in their original order;
# Index.difference sorts its result and would silently reorder them.
X = df.iloc[:, 1:]
y = df.iloc[:, 0]
# Last column as the label, all preceding columns as features.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
31 过滤dataframe数据
# Keep rows with 12 < hour < 18. Each comparison needs its own parentheses
# because & binds more tightly than > and <.
df_filtered = df[(df['hour'] > 12) & (df['hour'] < 18)]
32 对dataframe进行label encoding
from sklearn.preprocessing import LabelEncoder
# Cast every column to str so each column holds a single value type before encoding.
cat_df = cat_df.astype(str)
# apply() calls fit_transform once per column. The same LabelEncoder instance is
# refit each time, so afterwards it only remembers the last column's classes.
df = cat_df.apply(LabelEncoder().fit_transform)
pandas 二
1生成数据表
# read_csv / read_excel already return a DataFrame, so wrapping the result in
# pd.DataFrame(...) is unnecessary.
# header=1 takes the column names from the second line of the file.
df = pd.read_csv('name.csv', header=1)
# NOTE: this rebinds `df`; keep only the reader that matches your file type.
df = pd.read_excel('name.xlsx')
或
import pandas as pd
from collections import namedtuple
# One record per input line: the first two tab-separated fields.
Item = namedtuple('Item', 'reply pv')
items = []
# The built-in open() decodes UTF-8 itself, so the `codecs` module (which was
# used here without being imported) is not needed.
with open('reply.pv.07', 'r', encoding='utf-8') as f:
    for line in f:
        line_split = line.strip().split('\t')
        # A line with fewer than two fields raises IndexError here.
        items.append(Item(line_split[0].strip(), line_split[1].strip()))
df = pd.DataFrame.from_records(items, columns=['reply', 'pv'])
3、用pandas创建数据表:
# Build the sample table; `columns` fixes the column order of the result.
df = pd.DataFrame(
    data={
        "id": [1001, 1002, 1003, 1004, 1005, 1006],
        "date": pd.date_range(start='20130102', periods=6),
        "city": ['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
        "age": [23, 44, 54, 32, 34, 32],
        "category": ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
        "price": [1200, np.nan, 2133, 5433, np.nan, 4432],
    },
    columns=['id', 'date', 'city', 'category', 'age', 'price'],
)
4、维度查看
# (number of rows, number of columns)
df.shape
5、数据表基本信息(维度、列名称、数据格式、所占空间等):
# Print a summary: index range, column names, non-null counts, dtypes, memory usage.
df.info()
6、每一列数据的格式:
# Series mapping each column name to its dtype.
df.dtypes
更多操作查看pandas用法