官方文档
http://pandas.pydata.org/pandas-docs/stable/
导入包pandas
import pandas as pd
获取文件夹下文件名称
import os
filenames=[]
for file in os.listdir(path):
filenames.append(file)
读前几行文件(大文件)
# -*- coding: utf-8 -*-
##读前几行文件
f= open("/Users/yuyin/Downloads/Action_all.csv")
for i in range(5):
print(f.readline().strip())
读取文件
file = pd.read_csv("./非空记录/"+filenames[i],sep='--->',header=None)
写文件
file.to_csv("./非空all/alldata.csv",header=False,mode='a',index=False) #a追加
pickle读入与写入
import cPickle as pickle
output = file('./temp.pkl', 'wb')
pickle.dump(t, output, True)
output.close()
f2 = file('temp.pkl', 'rb')
a2 = pickle.load(f2)
f2.close()
字符串截取与类型转换
str(12+int(t2[6:8])
创建DF list to DF
add_all=pd.DataFrame({'shop_id':add_01,'time':add_02,'year':add_03,'mouth':add_04,'day':add_05,'pay_count':add_06,'pay_user_count':add_07})
多维数组np.array 转换 dataframe
pd.DataFrame(new_weight,columns=list(word_set))
创建空DF
alldata = pd.DataFrame(columns=['mail','title','send','receive','time','state'])
DF复制
re2=re.copy(deep=True)
DF类型转换astype
model_off_train_1[model_off_train_1.iloc[:,5].astype(int)<=20160401]
DF截取某一列
alldata.iloc[:,4]
test_pre_uidmid['u1']
DF选取多列
train.iloc[:,2:(train.shape[1]-1)]
#最后一列
train.iloc[:,-1]
#带名字的列
val[['user_id','sku_id']]
DF去掉某列
del DF['column-name']
DF= DF.drop('column_name', 1);
DF.drop('column_name',axis=1, inplace=True)
DF.drop([DF.columns[[0,1, 3]]], axis=1,inplace=True)
DF查看各列分布
df.describe()
抽样
re = train.sample(frac=0.25, random_state=66)
利用sql执行DF
from pandasql import sqldf
pysqldf=lambda q:sqldf(q,globals())
pysqldf("select * from test_sample limit 5")
DF列重命名
feature_1.columns=['c0','c1','c2','c3','c4','c5','c6']
DF左连接
train_sample_all=pd.merge(train_sample_all,feature_1,how='left',on=['c0','c1'])
查看缺失值及缺失值补为0
train_sample_all[pd.isnull(train_sample_all.iloc[:,7])].head() #查看缺失值
train_sample_all.iloc[:,7]=train_sample_all.iloc[:,7].fillna(0) #填充缺失值
dfTest["avg_cvr"].fillna(np.mean(dfTrain["label"]), inplace=True)
重新DF建立行索引
tmp=pd.DataFrame(model_off_train_1_tmp)
tmp.index=[i for i in range(tmp.shape[0])]
tmp=train_all[(train_all['time']<=20161031)&(train_all['time']>=20161018)].reset_index()
##.reset_index(drop = True) 不生成新列index
显示DF所有的行列
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
选择DF满足多条件的部分
train_sample_1=model_off_train_1[(model_off_train_1.iloc[:,6].astype(int)<=20160615)&(model_off_train_1.iloc[:,6].astype(int)>=20160201)]
tmp=train_all[(train_all['time']<=20161031)&(train_all['time']>=20161018)].reset_index()
抽样及打乱数据
pture.sample(int(pture.shape[0]/2))
df.sample(n)
#比例抽样
df1=d.sample(frac=0.2)
#打乱数据
df.sample(frac=1)
from sklearn.utils import shuffle
df = shuffle(df)
多个list转DF
pd.DataFrame({'c0':tmp2.index,'c1':tmp2.iloc[:,1]})
feature_1=pd.DataFrame({'u5':tmp})
添加新列及列之间运算
re1['diff'] = map(lambda x, y: abs(x-y) , re1['label'], re1['pro'])
DF添加DF类型列(合并列)
test_pre_uidmid=pd.concat([test_pre_uidmid,feature_1],axis=1)
相同列的DF行合并(合并行)
test_pre_allend_all=pd.concat([test_pre_allend,test_pre_allend_1])
lambda对某列进行操作
y=[diffdays(x) for x in d2["ALTDATE"]]
#or
y=map(lambda x: diffdays(x),d2["ALTDATE"])
DF排序
lc.sort(["loan_amnt"])
#默认升序True
lc.sort(["loan_amnt"],ascending=True)
#降序
lc.sort(["int_rate","loan_amnt"],ascending=False)
更改DF列排序
默认字典排序
columns = ['shop_id','time','year','mouth','day','pay_count','pay_user_count']
add_all=add_all.ix[:,columns]
DF去重
tmp=tmp.drop_duplicates(['c0','c1','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','u1','u2','u3','u4','u5','u6','u7','m1','m2','m3','m4','m5'])
shop_id = user_pay_all['shop_id']
shop_id = shop_id.drop_duplicates()
DF选择缺失值部分
test_pre_allend_1=test_pre_allend[pd.isnull(test_pre_allend.iloc[:,7])]
test_pre_allend=test_pre_allend[-pd.isnull(test_pre_allend.iloc[:,7])]
DF选取满足含有某字符串的部分=like匹配
alldata=alldata[(alldata['c2'].str.contains('@nuc',na=False))&(alldata['c3'].str.contains('@nuc',na=False))]
分组运算groupby count sum 自定义函数
mean(), sum(), max()
reset_index将聚合后的key作为列
参考https://my.oschina.net/lionets/blog/280332
train_all.groupby(['send','recieve'])['time'].count().reset_index()
df.groupby('key1').agg({'data1':'min','data2':'max'})
df.groupby(['key1','key2']).agg({'data1':'min','data2':'max'})
df.groupby('key1')['data1','data2'].agg(['min','max'])
df.groupby('key1')['data1','data2'].agg(lambda arr:arr.max()-arr.min())
dfTrain.groupby(key).apply(lambda df: np.mean(df["label"])).reset_index()
遍历多个数组
d2["year1"]=[diffdays2(x,y) for (x,y) in zip(d2["FBDATE"],d2["SXENDDATE"])]
计算时间差
import datetime
#计算时间差-天
(datetime.datetime(2016,2,11)-datetime.datetime(2016,2,01)).days
#时间差 秒
a=datetime.now()
b=datetime.now()
(b-a)
(b-a).seconds
时间转换-字符串格式更改-时间戳转换
#如a = "2013-10-10 23:40:00",想改为 a = "2013/10/10 23:40:00"方法:先转换为时间数组,然后转换为其他格式
import time
a = "2013-10-10 23:40:00"
timeArray = time.strptime(a, "%Y-%m-%d %H:%M:%S")
otherStyleTime = time.strftime("%Y/%m/%d %H:%M:%S", timeArray)
#转换为时间戳:
timeStamp = int(time.mktime(timeArray))
timeStamp == 1381419600
#时间戳转换为指定格式日期:
timeStamp = 1381419600
timeArray = time.localtime(timeStamp)
otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
otherStyletime == "2013-10-10 23:40:00"
#获得当前时间时间戳
now = int(time.time()) ->这是时间戳
##datetime
import datetime
#获得当前时间
now = datetime.datetime.now() ->这是时间数组格式
#string转datetime
str = '2012-11-19'
date_time = datetime.datetime.strptime(str,'%Y-%m-%d')
#datetime转string
date_time.strftime('%Y-%m-%d')
#datetime转时间戳
time_time = time.mktime(date_time.timetuple())
#时间戳转string
time.strftime('%Y-%m-%d',time.localtime(time_time))
#date转datetime
date = datetime.date.today()
datetime.datetime.strptime(str(date),'%Y-%m-%d') #将date转换为str,在由str转换为datetime
时间间隔增加/减少
import datetime
now_time = datetime.datetime.now()
yes_time = now_time + datetime.timedelta(days=1) #-n n
yes_time_nyr = yes_time.strftime('%Y%m%d') ##//格式化输出
logging使用
- 日志等级:DEBUG < INFO < WARNING < ERROR < CRITICAL
import logging
logging.debug("This is a debug log.")
logging.info("This is a info log.")
logging.warning("This is a warning log.")
logging.error("This is a error log.")
logging.critical("This is a critical log.")
## 输出到日志文件
log_format = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(filename='my.log', level=logging.DEBUG, format=log_format)
# 使用变量
logging.warning('%s before you %s', 'Look', 'leap!')
使用配置文件进行日志记录
# 读取日志配置文件内容
logging.config.fileConfig('logging.conf')
# 创建一个日志器logger
logger = logging.getLogger('simpleExample')
# 日志输出
logger.debug('debug message')
logger.info('info message')
logger.warn('warn message')
logger.error('error message')
logger.critical('critical message')
配置文件logging.conf内容如下:
[loggers]
keys=root,simpleExample
[handlers]
keys=fileHandler,consoleHandler
[formatters]
keys=simpleFormatter
[logger_root]
level=DEBUG
handlers=fileHandler
[logger_simpleExample]
level=DEBUG
handlers=consoleHandler
qualname=simpleExample
propagate=0
[handler_consoleHandler]
class=StreamHandler
args=(sys.stdout,)
level=DEBUG
formatter=simpleFormatter
[handler_fileHandler]
class=FileHandler
args=('logging.log', 'a')
level=ERROR
formatter=simpleFormatter
[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
datefmt=
分布查询
# 查看几条
df.head()
# 行列
df.shape
# 查看数据类型
df.dtypes
# 汇总信息 数值分布
df.describe()
# 非数值数据 count总数 unique去重后的个数 top出现频率最高的值 freq出现最高频率的值的频率
df.describe(exclude=[np.number])
# 非数值数据 出现分布比例
df.groupby('列名').size()
# 探索维度之间的相关性 默认计算皮尔逊相关系数 -1表示(线性)负相关,0表示(线性)无关,1表示(线性)正相关
df.corr()
np生成随机数
#生成100个0-100的等差数列
x = np.linspace(0, 100, 100)
x2=np.random.randint(0,50, 100) ##0-50范围100个随机数 np.random.random(100) 0-1范围 np.random.normal(0,0.1,100)均值0方差0.1正态分布
简单绘图
import matplotlib.pyplot as plt
import numpy as np
#生成100个0-100的等差数列
x = np.linspace(0, 100, 100)
#plt.plot(x,tmp1[0:100],'')#默认折线图
plt.plot(x,tmp1[0:100])
plt.show()
#直接把x按照折线图画出
plt.plot(x)
plt.ylabel('np_linspace')
plt.xlabel('1-100')
plt.title('test')
plt.show()
##颜色线的形状
##'g--'意思 颜色g 和--形状
plt.plot(x,'g--')
plt.show()
##'go-'意思 颜色g、点的形状o、线的形状-
x2=np.random.randint(0,50, 100)
plt.plot(x2,'go-')
##等价于
##plt.plot(x2,color='g',linestyle='-',marker='o')
plt.show()
##刻度
plt.xlim([0,100])
plt.ylim([0,100])
#散点图 o
plt.plot(x,tmp1[0:100],'o')
plt.show()
##柱状图
plt.hist(x2)
plt.show()
#一个图绘制多个线 o散点r颜色
plt.plot(x,tmp1[0:100],'or')
plt.plot(x,tmp1[0:100])
plt.show()
#汇制多个图
x = np.linspace(0, 100, 100)
x2=np.random.randint(0,50, 100)
fig=plt.figure()
#设置宽高
fig=plt.figure(figsize=(15,5)) ##宽x高
# fig.set_figwidth(15)
# fig.set_figheight(5)
##p1图是画布1x2(1行2列)里面第1个
p1=fig.add_subplot(1,2,1)
p2=fig.add_subplot(1,2,2)
p1.plot(x,'r--')
p1.set_ylabel('xx')#用set方法
p2.hist(x2)
plt.show()
生成时间序列
import datetime
def datelist(start, end):
start_date = datetime.date(*start)
end_date = datetime.date(*end)
result = []
curr_date = start_date
while curr_date != end_date:
result.append("%04d%02d%02d" % (curr_date.year, curr_date.month, curr_date.day))
curr_date += datetime.timedelta(1)
result.append("%04d%02d%02d" % (curr_date.year, curr_date.month, curr_date.day))
return result
if __name__ == "__main__":
print datelist((2014, 7, 28), (2014, 8, 3))
稀疏矩阵列合并
from scipy import sparse
sparse.hstack((X_train, x_train))
from scipy.sparse import coo_matrix, hstack
A = coo_matrix([[1, 2], [3, 4]])
B = coo_matrix([[5], [6]])
hstack([A,B]).toarray()
枚举
索引+值
feats = ["cID", "aID"]
for i,feat in enumerate(feats):
print i
print feat
改变数组大小reshape
#多行1列
dfTrain[feat].values.reshape(-1, 1)
import numpy as np
a = np.array([1, 2, 3, 4])
a.reshape((-1,1)).shape #4x1
a.reshape((1,-1)).shape #1x4
numpy
###返回numpy的array最大的topn
#topn
import heapq
heapq.nlargest(top_n, range(len(w)), w.take)
#top1
np.argmax(a)
返回array某个值的索引
ndex=word.index(w)
函数式编程
import numpy as np
#方法1
np.median([len(x) for x in train_nlp['sentence']])
#方法2
get_len=lambda x:len(x)
t=train_nlp['sentence'].apply(get_len)
np.median(t)