1、删除后需要重置索引
# Drop the first two rows, then renumber the index from 0.
data_y = data_y.drop(index=[0, 1]).reset_index(drop=True)
data_y
2、保留索引能整除4的行数据,for循环太慢
### for-loop version (slow: every .drop() copies the whole frame -> O(n^2);
### kept only for comparison with the vectorized version below)
for i in data_y.index:
    if i%4!=0:
        data_y = data_y.drop([i])
data_y = data_y.reset_index(drop=True)
data_y
改进:
# Vectorized version: keep only the rows whose index is a multiple of 4.
# BUG FIX (review): DataFrame.select(crit) was deprecated in pandas 0.21 and
# removed in pandas 1.0 -- boolean indexing on the index is the supported form.
data_y = data_y[data_y.index % 4 == 0]
data_y = data_y.reset_index(drop=True)
data_y
3、查看空值 info()
4、忽略警告
# Silence all warnings globally (use sparingly -- this hides deprecation
# warnings too)
import warnings
warnings.filterwarnings("ignore")
5、左右连接dataframe
data = data_x.join(data_y)
6、上下连接
data_all = pd.concat([data_all,data1], axis=0, ignore_index=True)
7、jupyter notebook完全显示dataframe的列、行
# Let Jupyter show up to 100 columns / 1000 rows of a DataFrame
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
8、读取csv:只加载部分行并解析日期列
### nrows: load only the first 500k rows; parse_dates: parse 'click_time' into datetime64
X_train = pd.read_csv('../data/train.csv',nrows = 500000,parse_dates=['click_time'])
9、string 移除指定字符串(修改列名用)
# Bulk-rename columns by stripping the character 'c' from both ends of
# every column name.
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)
df
# 'col1' -> 'ol1', 'col2' -> 'ol2'
df.columns = [col.strip('c') for col in df.columns]
# alternative: chop off the first character instead
# df.columns = df.columns.map(lambda x: x[1:])
df
10、Pandas中很多操作都是“传址”的,也就是说赋值只复制引用,不复制数据
# e.g. given a dataframe df1,
# assigning df1 to df2 only copies the reference, not the data
df2 = df1
# so mutating elements through df2 ...
# ... changes df1 as well
解决
df2 = df1.copy()
11、dataframe每个数加高斯随机噪声,for循环太慢
import random
# Add Gaussian noise to every value (slow per-row loop; see vectorized
# version below)
# mean and std of the Gaussian noise
mu = 0
sigma = 0.5
datat_noise = datat.copy()
for i in datat.index:
    # NOTE(review): attribute + [i] chained assignment can trigger
    # SettingWithCopyWarning on some pandas versions -- the applymap
    # version below avoids it
    datat_noise.t_bottom[i] += random.gauss(mu,sigma)
    datat_noise.t_top[i] += random.gauss(mu,sigma)
datat_noise.columns = ['bottom','top']
datat_noise.head()
解决:
datat_noise = datat_noise.applymap(lambda x: x+random.gauss(mu, sigma))
12、python2.7 print()兼容python3
# Python 2.7 / 3 compatible print().
# BUG FIX (review): the old trick `from distutils.log import warn as printf`
# is broken -- distutils was removed in Python 3.12, and log.warn writes to
# *stderr*, not stdout. The supported way is the __future__ import (which on
# Py2.7 must sit at the top of the file).
from __future__ import print_function
printf = print
printf('Hello World!')
13、dataframe min_max 归一化(注意:test用的是transform)
# Min-max normalization with sklearn: fit on the training data, then use
# transform (NOT fit_transform) on the test data so test is scaled with the
# min/max learned from train.
import pandas as pd
from sklearn import preprocessing
import numpy as np
d = {'col1': [1, 2,0], 'col2': [2,0,-1]}
d_train = pd.DataFrame(data=d)
d_train
d = {'col1': [3], 'col2': [0]}
d_test = pd.DataFrame(data=d)
d_test
x = d_train.values # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# rebuild a DataFrame, restoring the original index and column names
d_train = pd.DataFrame(x_scaled, index=d_train.index.values, columns=d_train.columns)
d_train
y = d_test.values # returns a numpy array
# transform only: reuse the train statistics (values may fall outside [0, 1])
y_scaled = min_max_scaler.transform(y)
# rebuild a DataFrame, restoring the original index and column names
d_test = pd.DataFrame(y_scaled, index=d_test.index.values, columns=d_test.columns)
d_test
14、dataframe 显示多列
pd.options.display.max_columns =100
15、pandas series只能是一维,想把它里面存放的多个np.array合并成一个多维np.array
做法如下
# A Series is always 1-D; to stack the np.arrays stored in its cells into one
# multi-dimensional ndarray, go through a Python list first
y_train_list = y_train.tolist()
y_train_np =np.array(y_train_list)
16、统计dataframe某一列的某个值的个数
user[user.age == 2].count()
17、统计缺失值-1
"""
统计一下缺失值-1
"""
#missing data
total = commodity[commodity ==-1].count().sort_values(ascending=False)
percent = (commodity[commodity ==-1].count()/commodity.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data
18、缺失值-1用随机数3和4按照2:1概率替换,lambda if else语句
# Replace the missing-value marker -1 with 3 or 4 at 2:1 odds
# (the sample pool [3, 3, 4] makes 3 twice as likely as 4).
d = {'col1': [1, -1,2,-1], 'col2': [-1,6, 7,-1]}
df = pd.DataFrame(data=d)
df = df.applymap(lambda v: np.random.choice([3, 3, 4]) if v == -1 else v)
df
19、去除重复数据,保留第一次出现的数据
"""
取4月份作为线下验证集,并且是4月份第一次购买
"""
y_train = order[(order.o_date>'2017-03-31')&(order.o_date<'2017-05-01')]
y_train = y_train.sort_values('o_date')
y_train = y_train.drop_duplicates(subset = ['user_id','sku_id'],keep = 'first')
y_train
20、日期增减,比如增加一个月
# Shift each order date forward by one calendar month.
# BUG FIX (review): `import dateutil` does NOT import the relativedelta
# submodule, so `dateutil.relativedelta.relativedelta` raises AttributeError
# unless something else happened to import it; import it explicitly.
from dateutil.relativedelta import relativedelta
y_train['pred_date'] = y_train.o_date.apply(lambda x: x + relativedelta(months=1))
# NOTE: days=30 is NOT equivalent -- months=1 respects actual month lengths
21、将一行的数据重复n行
# np.tile(row, (n, 1)) stacks n vertical copies of one row
eg = np.tile(eg,(1460,1))
eg = pd.DataFrame(eg, columns=['eg1', 'eg2', 'eg3', 'eg4', 'eg5', 'eg6', 'eg7', 'eg8', 'eg9', 'eg10'])
# append the repeated rows below the existing frame
x_eigenvectors = pd.concat([x_eigenvectors,eg],axis = 0 ,ignore_index = True)
22、np数组的保存和读取
np.save("../data/data.npy",data_np)
test = np.load("../data/data.npy")
23、python在指定范围内产生不重复的n个数
# Draw 8 distinct integers from [0, 10) -- sampling without replacement.
import random
random.sample(range(10), 8)
24、np类似df.apply操作
# numpy analogue of df.apply: evaluate the condition over the whole array at
# once; np.where(cond) returns the indices where cond holds.
x = np.array(range(20))
np.where(x % 6 < 4)
25、np索引操作太慢
import numpy as np
import pandas as pd
# 60 samples in groups of 10: within each group the first 5 rows go to train,
# the next 2 to val, the last 3 to test
a=np.random.randint(0,10,size=[60,3,3])
X_train = a[:5]
X_val = a[5:7]
X_test = a[7:10]
# slow: np.concatenate copies the whole accumulated array on every iteration
for i in range(5):
    i = i+1  # shift to groups 2..6 (group 1 was taken above)
    train = a[i*10:i*10+5]
    X_train = np.concatenate((X_train, train), axis = 0)
    val = a[i*10+5:i*10+7]
    X_val = np.concatenate((X_val, val), axis = 0)
    test = a[i*10+7:i*10+10]
    X_test = np.concatenate((X_test, test), axis = 0)
改进
# Vectorized replacement: build the index lists once, then fancy-index `a`.
# BUG FIX (review): the original three list comprehensions were copy-pasted
# with the *same* condition (5 < idx%10 < 8), and the last line indexed the
# loop leftover `train` instead of `a`. The conditions below reproduce the
# loop's split: per group of 10 -> first 5 train, next 2 val, last 3 test.
train_idx = [idx for idx in range(len(a)) if idx % 10 < 5]
val_idx = [idx for idx in range(len(a)) if 5 <= idx % 10 < 7]
test_idx = [idx for idx in range(len(a)) if idx % 10 >= 7]
X_train = a[train_idx]
X_val = a[val_idx]
X_test = a[test_idx]
26、读写json
# BUG FIX (review): the import must come before first use -- in the original
# notes `import json` appeared only after json.dump/json.load were called.
import json

# Writing JSON data
with open('data.json', 'w') as f:
    json.dump(data, f)
# Reading data back
with open('data.json', 'r') as f:
    data = json.load(f)

# Reading a JSON-lines file: one JSON object per line
data = []
with open('../data/raw/trainset/search.train.json') as f:
    for line in f:
        data.append(json.loads(line))
27、python读取mat文件
import scipy.io
# Load a MATLAB .mat file (v7 or older) as a dict of numpy arrays
fre_1 = scipy.io.loadmat('../data/FRE_noise_sigma_0p5.mat')
from time import time
start = time()
cols = ['f1','f2','f3','f4','f5','location','level']
# seed row of zeros; dropped again after the loop
y = np.zeros(shape=(1,7))
for i in range(120):
    loc = i+1
    for j in range(9):
        lev = (j+1)*10
        fre = fre_1['FRE_noise_sigma_0p5'][i,j]
        # BUG FIX (review): bare `transpose` is undefined -- use np.transpose
        y1 = np.transpose(fre)
        # append the (location, level) labels to the flattened row
        y1 = np.append(y1,[[loc,lev]])
        y = np.vstack((y,y1))
end = time()
print('time',(end-start)/60)
data = pd.DataFrame(y,columns=cols)
# drop the all-zero seed row and renumber the index
data = data.drop([0]).reset_index(drop = True)
data.to_csv('../csv/FRE_noise_sigma_0p5.csv',index = False)
mat文件中cell嵌套cell(多层cell)时,用h5py按hdf5格式读取数据
import h5py
# .mat files saved with -v7.3 are HDF5 containers; read them with h5py
# instead of scipy.io.loadmat
f = h5py.File('../data/EV1_NOnoise.mat', 'r')
list(f.keys())
# nested cell arrays are stored as object references -- dereference through
# the file handle once per nesting level
ev = f[f[f['EV1'][8,12]][0,0]]
ev
type(ev[:])
28、np上下拼接、左右拼接,转置
# numpy vertical/horizontal concatenation and transpose.
# seed row of zeros; real rows are stacked below it
y = np.zeros(shape=(1,12))
sn = f[f['SN'][0,0]][:]
# BUG FIX (review): bare `transpose` is undefined -- use np.transpose (or sn.T)
y1 = np.transpose(sn)
#y1.shape (5840, 10)
label = np.tile([0,0],(5840,1))
#label.shape (5840, 2)
# left-right join
y1 = np.concatenate([y1,label],axis=1)
#y1.shape (5840, 12)
# top-bottom join
y = np.vstack((y,y1))
#y.shape (5841, 12)
29、统计缺失值
# Count missing values (encoded as "\N" in the raw data) by recoding them to -1
for i in test.columns:
    test[i] = test[i].replace("\\N",-1)
    train[i] = train[i].replace("\\N",-1)
for i in test.columns:
    print(i,(train[[i]] ==-1).sum())
    print(i,(test[[i]] ==-1).sum())
# Variant: recode "\N" as NaN so isnull() can be used directly
for i in test.columns:
    test[i] = test[i].replace("\\N",float('nan'))
    train[i] = train[i].replace("\\N",float('nan'))
print(train.isnull().sum())
print(test.isnull().sum())
# missing data summary: absolute count and share per column, sorted descending
total = train.isnull().sum().sort_values(ascending=False)
percent = (train.isnull().sum()/train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
30、判断np数组是否含有其他类型的元素
for i,element in enumerate(arr):
    # NOTE(review): np.issubdtype expects a dtype-like, not a scalar instance;
    # on modern numpy this raises TypeError for plain Python scalars --
    # isinstance(element, float) is the robust check (np.float64 subclasses
    # float). Confirm against the numpy version in use.
    if(np.issubdtype(element,float)):
        print(i,element)
31、python列表for循环
# base unix timestamp (seconds)
t = 1539216000
# build a per-row timestamp by offsetting the base time with the row index
train['time'] = [t + i for i in train.index]
train