Pandas Usage Notes

Official documentation

http://pandas.pydata.org/pandas-docs/stable/

Import the pandas package

import pandas as pd

Get the file names in a directory

import os
filenames=[]
for file in os.listdir(path):
    filenames.append(file)
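
A minimal variant filtering by extension (assuming only .csv files are wanted; the extension is an assumption):

filenames = [f for f in os.listdir(path) if f.endswith('.csv')]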

Read the first few lines of a file (for large files)

# -*- coding: utf-8 -*-
## Read the first few lines of a file
f = open("/Users/yuyin/Downloads/Action_all.csv")
for i in range(5):
    print(f.readline().strip())
f.close()

Read a file

# a multi-character sep is treated as a regex and needs the python engine
file = pd.read_csv("./非空记录/"+filenames[i], sep='--->', header=None, engine='python')

Write a file

file.to_csv("./非空all/alldata.csv", header=False, mode='a', index=False)  # mode='a' appends

Read and write with pickle

import pickle  # cPickle on Python 2
output = open('./temp.pkl', 'wb')
pickle.dump(t, output, True)
output.close()
f2 = open('temp.pkl', 'rb')
a2 = pickle.load(f2)
f2.close()

String slicing and type conversion

str(12 + int(t2[6:8]))
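
A minimal worked example, assuming t2 is a compact YYYYMMDD date string like '20160415' (a hypothetical value):

t2 = '20160415'     # hypothetical date string
day = int(t2[6:8])  # slice the day part -> 15
str(12 + day)       # -> '27'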

Create a DF: list to DF

add_all=pd.DataFrame({'shop_id':add_01,'time':add_02,'year':add_03,'month':add_04,'day':add_05,'pay_count':add_06,'pay_user_count':add_07})

Convert a multi-dimensional np.array to a DataFrame

pd.DataFrame(new_weight,columns=list(word_set))

Create an empty DF

alldata = pd.DataFrame(columns=['mail','title','send','receive','time','state'])

Copy a DF

re2=re.copy(deep=True)

DF type conversion with astype

model_off_train_1[model_off_train_1.iloc[:,5].astype(int)<=20160401]

Select a single DF column

alldata.iloc[:,4]
test_pre_uidmid['u1']

Select multiple DF columns

train.iloc[:,2:(train.shape[1]-1)]
# the last column
train.iloc[:,-1]
# columns by name
val[['user_id','sku_id']]

Drop a DF column

del DF['column-name']
DF = DF.drop('column_name', axis=1)
DF.drop('column_name', axis=1, inplace=True)
DF.drop(DF.columns[[0, 1, 3]], axis=1, inplace=True)

View the distribution of DF columns

df.describe()

Sampling

re = train.sample(frac=0.25, random_state=66)

Run SQL against a DF

from pandasql import sqldf
pysqldf=lambda q:sqldf(q,globals())
pysqldf("select * from test_sample  limit 5")

Rename DF columns

feature_1.columns=['c0','c1','c2','c3','c4','c5','c6']

DF left join

train_sample_all=pd.merge(train_sample_all,feature_1,how='left',on=['c0','c1'])

Inspect missing values and fill them with 0

train_sample_all[pd.isnull(train_sample_all.iloc[:,7])].head()  # inspect missing values
train_sample_all.iloc[:,7]=train_sample_all.iloc[:,7].fillna(0)  # fill missing values with 0

dfTest["avg_cvr"] = dfTest["avg_cvr"].fillna(np.mean(dfTrain["label"]))  # assign rather than inplace on a column slice

Rebuild the DF row index

tmp=pd.DataFrame(model_off_train_1_tmp)
tmp.index=[i for i in range(tmp.shape[0])]

tmp=train_all[(train_all['time']<=20161031)&(train_all['time']>=20161018)].reset_index()

## .reset_index(drop=True) avoids adding the old index as a new column

Show all rows and columns of a DF

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
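
To restore the defaults afterwards:

pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')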

Select the part of a DF matching multiple conditions

train_sample_1=model_off_train_1[(model_off_train_1.iloc[:,6].astype(int)<=20160615)&(model_off_train_1.iloc[:,6].astype(int)>=20160201)]

tmp=train_all[(train_all['time']<=20161031)&(train_all['time']>=20161018)].reset_index()

Sampling and shuffling

pture.sample(int(pture.shape[0]/2))
df.sample(n)
# sample by fraction
df1 = df.sample(frac=0.2)
# shuffle the data
df.sample(frac=1)
from sklearn.utils import shuffle
df = shuffle(df)

Convert multiple lists to a DF

pd.DataFrame({'c0':tmp2.index,'c1':tmp2.iloc[:,1]})
feature_1=pd.DataFrame({'u5':tmp})

Add a new column computed from other columns

re1['diff'] = list(map(lambda x, y: abs(x - y), re1['label'], re1['pro']))  # list() needed on Python 3
re1['diff'] = (re1['label'] - re1['pro']).abs()  # vectorized equivalent

Append a DF-typed column to a DF (merge columns)

test_pre_uidmid=pd.concat([test_pre_uidmid,feature_1],axis=1)

Merge rows of DFs with the same columns (merge rows)

test_pre_allend_all=pd.concat([test_pre_allend,test_pre_allend_1])

Apply a lambda to a column

y = [diffdays(x) for x in d2["ALTDATE"]]
# or (wrap in list() on Python 3)
y = list(map(lambda x: diffdays(x), d2["ALTDATE"]))

Sort a DF

lc.sort_values(["loan_amnt"])  # DataFrame.sort was removed; use sort_values
# ascending=True is the default
lc.sort_values(["loan_amnt"], ascending=True)
# descending
lc.sort_values(["int_rate","loan_amnt"], ascending=False)

Change the DF column order

Columns default to dictionary (lexicographic) order

columns = ['shop_id','time','year','month','day','pay_count','pay_user_count']
add_all = add_all.loc[:, columns]  # .ix is removed; use .loc

DF deduplication

tmp=tmp.drop_duplicates(['c0','c1','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','u1','u2','u3','u4','u5','u6','u7','m1','m2','m3','m4','m5'])

shop_id = user_pay_all['shop_id']
shop_id = shop_id.drop_duplicates()

Select the missing-value part of a DF

test_pre_allend_1=test_pre_allend[pd.isnull(test_pre_allend.iloc[:,7])]
test_pre_allend=test_pre_allend[~pd.isnull(test_pre_allend.iloc[:,7])]  # negate with ~, not -

Select the part of a DF containing a given substring (LIKE matching)

alldata=alldata[(alldata['c2'].str.contains('@nuc',na=False))&(alldata['c3'].str.contains('@nuc',na=False))]

Grouped computation: groupby with count, sum, and custom functions

mean(), sum(), max()
reset_index turns the groupby keys back into ordinary columns
See https://my.oschina.net/lionets/blog/280332

train_all.groupby(['send','receive'])['time'].count().reset_index()

df.groupby('key1').agg({'data1':'min','data2':'max'})
df.groupby(['key1','key2']).agg({'data1':'min','data2':'max'})

df.groupby('key1')[['data1','data2']].agg(['min','max'])
df.groupby('key1')[['data1','data2']].agg(lambda arr: arr.max()-arr.min())

dfTrain.groupby(key).apply(lambda df: np.mean(df["label"])).reset_index()
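
A minimal runnable sketch (toy data; key1/data1/data2 are the hypothetical names from the snippets above) showing reset_index turning the group keys back into columns:

import pandas as pd
df = pd.DataFrame({'key1': ['a', 'a', 'b'],
                   'data1': [1, 2, 3],
                   'data2': [4, 5, 6]})
df.groupby('key1').agg({'data1': 'min', 'data2': 'max'}).reset_index()
#   key1  data1  data2
# 0    a      1      5
# 1    b      3      6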

Iterate over multiple arrays in parallel

d2["year1"]=[diffdays2(x,y) for (x,y) in zip(d2["FBDATE"],d2["SXENDDATE"])]

Compute time differences

import datetime
# difference in days
(datetime.datetime(2016,2,11)-datetime.datetime(2016,2,1)).days
# difference in seconds
a = datetime.datetime.now()
b = datetime.datetime.now()
(b-a)
(b-a).seconds

Time conversion: changing string formats and timestamps

# e.g. to turn a = "2013-10-10 23:40:00" into a = "2013/10/10 23:40:00": first parse into a time struct, then format it the other way
import time
a = "2013-10-10 23:40:00"
timeArray = time.strptime(a, "%Y-%m-%d %H:%M:%S")
otherStyleTime = time.strftime("%Y/%m/%d %H:%M:%S", timeArray)
# convert to a timestamp:
timeStamp = int(time.mktime(timeArray))
# timeStamp == 1381419600 (depends on the local timezone)
# convert a timestamp to a formatted date:
timeStamp = 1381419600
timeArray = time.localtime(timeStamp)
otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
# otherStyleTime == "2013-10-10 23:40:00"
# current time as a timestamp
now = int(time.time())  # -> timestamp
## datetime
import datetime
# current time
now = datetime.datetime.now()  # -> datetime object
# string to datetime
date_str = '2012-11-19'  # renamed from `str` so the builtin is not shadowed
date_time = datetime.datetime.strptime(date_str, '%Y-%m-%d')
# datetime to string
date_time.strftime('%Y-%m-%d')
# datetime to timestamp
time_time = time.mktime(date_time.timetuple())
# timestamp to string
time.strftime('%Y-%m-%d', time.localtime(time_time))
# date to datetime
date = datetime.date.today()
datetime.datetime.strptime(str(date), '%Y-%m-%d')  # convert date to str, then str to datetime

Add or subtract a time interval

import datetime
now_time = datetime.datetime.now()
yes_time = now_time + datetime.timedelta(days=1)  # use days=-n to go back n days
yes_time_nyr = yes_time.strftime('%Y%m%d')  ## formatted output

Using logging

  • Log levels: DEBUG < INFO < WARNING < ERROR < CRITICAL
import logging
logging.debug("This is a debug log.")
logging.info("This is an info log.")
logging.warning("This is a warning log.")
logging.error("This is an error log.")
logging.critical("This is a critical log.")
## write to a log file (call basicConfig before the first logging call)
log_format = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(filename='my.log', level=logging.DEBUG, format=log_format)
# with variables
logging.warning('%s before you %s', 'Look', 'leap!')

Logging via a configuration file

import logging
import logging.config

# read the logging configuration file
logging.config.fileConfig('logging.conf')

# create a logger
logger = logging.getLogger('simpleExample')

# log output
logger.debug('debug message')
logger.info('info message')
logger.warning('warn message')  # logger.warn is deprecated
logger.error('error message')
logger.critical('critical message')

Contents of the configuration file logging.conf:

[loggers]
keys=root,simpleExample

[handlers]
keys=fileHandler,consoleHandler

[formatters]
keys=simpleFormatter

[logger_root]
level=DEBUG
handlers=fileHandler

[logger_simpleExample]
level=DEBUG
handlers=consoleHandler
qualname=simpleExample
propagate=0

[handler_consoleHandler]
class=StreamHandler
args=(sys.stdout,)
level=DEBUG
formatter=simpleFormatter

[handler_fileHandler]
class=FileHandler
args=('logging.log', 'a')
level=ERROR
formatter=simpleFormatter

[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
datefmt=

Querying distributions

# peek at the first rows
df.head()
# rows and columns
df.shape
# data types
df.dtypes
# summary statistics / numeric distributions
df.describe()
# non-numeric data: count = total, unique = distinct values, top = most frequent value, freq = frequency of the top value
df.describe(exclude=[np.number])
# distribution of a non-numeric column
df.groupby('column_name').size()
# pairwise correlations, Pearson by default: -1 = (linear) negative, 0 = uncorrelated, 1 = (linear) positive
df.corr()

Generate random numbers with NumPy

# 100 evenly spaced points over 0-100
x = np.linspace(0, 100, 100)
# 100 random ints in [0, 50); np.random.random(100) gives uniform [0, 1);
# np.random.normal(0, 0.1, 100) samples a normal with mean 0 and std 0.1
x2 = np.random.randint(0, 50, 100)

Simple plotting

import matplotlib.pyplot as plt
import numpy as np
# 100 evenly spaced points over 0-100
x = np.linspace(0, 100, 100)
# plt.plot(x, tmp1[0:100], '')  # a line plot by default
plt.plot(x, tmp1[0:100])
plt.show()
# plot x itself as a line
plt.plot(x)
plt.ylabel('np_linspace')
plt.xlabel('1-100')
plt.title('test')
plt.show()
## color and line style
## 'g--' means color g (green) with a -- (dashed) line
plt.plot(x, 'g--')
plt.show()
## 'go-' means color g, marker o, line style -
x2 = np.random.randint(0, 50, 100)
plt.plot(x2, 'go-')
## equivalent to
## plt.plot(x2, color='g', linestyle='-', marker='o')
plt.show()
## axis limits
plt.xlim([0, 100])
plt.ylim([0, 100])
# scatter plot with marker o
plt.plot(x, tmp1[0:100], 'o')
plt.show()
## histogram
plt.hist(x2)
plt.show()
# several lines in one figure: 'or' = marker o, color r
plt.plot(x, tmp1[0:100], 'or')
plt.plot(x, tmp1[0:100])
plt.show()
# multiple subplots
x = np.linspace(0, 100, 100)
x2 = np.random.randint(0, 50, 100)
fig = plt.figure()
# set width and height
fig = plt.figure(figsize=(15, 5))  ## width x height
# fig.set_figwidth(15)
# fig.set_figheight(5)
## p1 is the 1st plot on a 1x2 (1 row, 2 columns) canvas
p1 = fig.add_subplot(1, 2, 1)
p2 = fig.add_subplot(1, 2, 2)
p1.plot(x, 'r--')
p1.set_ylabel('xx')  # use the set_ methods on subplot axes
p2.hist(x2)
plt.show()

Generate a sequence of dates

import datetime

def datelist(start, end):
    start_date = datetime.date(*start)
    end_date = datetime.date(*end)

    result = []
    curr_date = start_date
    while curr_date != end_date:
        result.append("%04d%02d%02d" % (curr_date.year, curr_date.month, curr_date.day))
        curr_date += datetime.timedelta(1)
    result.append("%04d%02d%02d" % (curr_date.year, curr_date.month, curr_date.day))
    return result

if __name__ == "__main__":
    print(datelist((2014, 7, 28), (2014, 8, 3)))
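
For reference, the example call returns the inclusive range:

# ['20140728', '20140729', '20140730', '20140731', '20140801', '20140802', '20140803']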

Column-wise merge of sparse matrices

from scipy import sparse
sparse.hstack((X_train, x_train))

from scipy.sparse import coo_matrix, hstack
A = coo_matrix([[1, 2], [3, 4]])
B = coo_matrix([[5], [6]])
hstack([A,B]).toarray()
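
The toy example above yields:

# array([[1, 2, 5],
#        [3, 4, 6]])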

Enumerate

Index + value

feats = ["cID", "aID"]
for i, feat in enumerate(feats):
    print(i)
    print(feat)

Resize arrays with reshape

# many rows, one column
dfTrain[feat].values.reshape(-1, 1)

import numpy as np
a = np.array([1, 2, 3, 4])
a.reshape((-1,1)).shape  #4x1
a.reshape((1,-1)).shape  #1x4

numpy

### Return the top-n largest entries of a NumPy array

#topn
import heapq
heapq.nlargest(top_n, range(len(w)), w.take)
#top1
np.argmax(a)
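
A minimal sketch (w is a hypothetical score array) showing that nlargest with w.take as the key returns the indices of the largest values, not the values themselves:

w = np.array([0.1, 0.9, 0.4, 0.7])
heapq.nlargest(2, range(len(w)), w.take)  # -> [1, 3]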

Get the index of a given value in an array

index = word.index(w)

Functional programming

import numpy as np
# method 1
np.median([len(x) for x in train_nlp['sentence']])
# method 2
get_len = lambda x: len(x)
t = train_nlp['sentence'].apply(get_len)
np.median(t)