Problems encountered in Python

1、After dropping rows, the index needs to be reset

data_y = data_y.drop([0,1]).reset_index(drop=True)
data_y

2、Keep only the rows whose index is divisible by 4 (a for loop is too slow)

### for loop (slow)
for i in data_y.index:
    if i%4!=0:
        data_y = data_y.drop([i])
data_y = data_y.reset_index(drop=True)
data_y

Improved:

# DataFrame.select() has been removed in recent pandas; boolean indexing on the index does the same thing
data_y = data_y[data_y.index % 4 == 0]
data_y = data_y.reset_index(drop=True)
data_y

3、Check for null values with info()
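
A minimal sketch (column names are made up): info() reports the non-null count per column, so any column with fewer non-null entries than rows contains missing values.

import pandas as pd

df = pd.DataFrame({'a': [1, 2, None], 'b': ['x', None, None]})
df.info()   # 'a': 2 non-null, 'b': 1 non-null, out of 3 rows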

4、Suppress warnings

import warnings
warnings.filterwarnings("ignore")

5、Join DataFrames side by side (by columns)

data = data_x.join(data_y)
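
join aligns rows by index; a minimal sketch with made-up columns:

import pandas as pd

data_x = pd.DataFrame({'x1': [1, 2, 3]})
data_y = pd.DataFrame({'y1': [4, 5, 6]})
data = data_x.join(data_y)   # columns x1 and y1 side by side, matched on the index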

6、Concatenate DataFrames vertically (stack rows)

data_all = pd.concat([data_all,data1], axis=0, ignore_index=True)

7、Show all DataFrame columns and rows in a Jupyter notebook

pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000

8、Read part of a CSV and parse dates while loading

### nrows: load a subset of the training data; parse_dates: convert the column to datetime
X_train = pd.read_csv('../data/train.csv',nrows = 500000,parse_dates=['click_time'])

9、Strip characters from strings (useful for renaming columns)

d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)
df
df.columns = df.columns.str.strip('c')  # strip removes leading/trailing 'c' characters (a character set, not a substring)
#df.columns = df.columns.map(lambda x: x[1:])  # alternative: drop just the first character
df

  

10、Many assignments in Pandas are "by reference", which means:

# suppose there is a DataFrame df1
# assign df1 to df2
df2 = df1

# modify some elements of df2
# and df1 changes as well

Solution:

df2 = df1.copy()
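
A minimal sketch of the difference (df1 is a made-up one-column frame):

import pandas as pd

df1 = pd.DataFrame({'a': [1, 2]})
df2 = df1                  # same object, no copy
df2.loc[0, 'a'] = 99
print(df1.loc[0, 'a'])     # 99 -- df1 changed too

df1 = pd.DataFrame({'a': [1, 2]})
df3 = df1.copy()           # independent copy
df3.loc[0, 'a'] = 99
print(df1.loc[0, 'a'])     # 1 -- df1 unchanged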

11、Add Gaussian random noise to every value in a DataFrame (a for loop is too slow)

import random
# add Gaussian noise to the input data
# define the mean and standard deviation of the noise
mu = 0
sigma = 0.5
datat_noise = datat.copy()
for i in datat.index:
    datat_noise.t_bottom[i] += random.gauss(mu,sigma)
    datat_noise.t_top[i] += random.gauss(mu,sigma)
datat_noise.columns = ['bottom','top']
datat_noise.head()

Solution:

datat_noise = datat_noise.applymap(lambda x: x+random.gauss(mu, sigma))
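
An alternative sketch that avoids a per-cell Python call, assuming datat is entirely numeric: draw the whole noise matrix at once with numpy.

import numpy as np

noise = np.random.normal(mu, sigma, size=datat.shape)   # one Gaussian sample per cell
datat_noise = datat + noise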

12、Make print() in Python 2.7 compatible with Python 3

from distutils.log import warn as printf
printf('Hello World!')
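
The more common approach is the __future__ import, which gives Python 2.7 the real print() function:

from __future__ import print_function
print('Hello World!')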

13、DataFrame min-max normalization (note: the test set uses transform, not fit_transform)

import pandas as pd
from sklearn import preprocessing
import numpy as np
d = {'col1': [1, 2,0], 'col2': [2,0,-1]}
d_train = pd.DataFrame(data=d)
d_train

d = {'col1': [3], 'col2': [0]}
d_test = pd.DataFrame(data=d)
d_test

x = d_train.values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

# rebuild the DataFrame with the original index and column names
d_train = pd.DataFrame(x_scaled, index=d_train.index.values, columns=d_train.columns)
d_train

y = d_test.values  # returns a numpy array
y_scaled = min_max_scaler.transform(y)

# rebuild the DataFrame with the original index and column names
d_test = pd.DataFrame(y_scaled, index=d_test.index.values, columns=d_test.columns)
d_test 

14、Display more DataFrame columns

pd.options.display.max_columns =100

15、A pandas Series is always one-dimensional; to turn a Series whose elements are multi-dimensional np.array objects into a single multi-dimensional np.array:

y_train_list = y_train.tolist()
y_train_np = np.array(y_train_list)
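
np.stack does the same in one call (a sketch, assuming every element of y_train has the same shape):

import numpy as np

y_train_np = np.stack(y_train.tolist())   # stack the per-row arrays along a new leading axis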

16、Count occurrences of a specific value in a DataFrame column

user[user.age == 2].count()
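
Two alternatives that skip building the filtered frame (a sketch reusing the user/age names above):

(user.age == 2).sum()      # number of rows where age equals 2
user.age.value_counts()    # counts of every distinct age value at once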

17、Count missing values encoded as -1

"""
统计一下缺失值-1
"""
#missing data
total = commodity[commodity ==-1].count().sort_values(ascending=False)
percent = (commodity[commodity ==-1].count()/commodity.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data

18、Replace missing values (-1) with 3 or 4 at a 2:1 ratio (lambda with if/else)

 

d = {'col1': [1, -1,2,-1], 'col2': [-1,6, 7,-1]}
df = pd.DataFrame(data=d)
df = df.applymap(lambda x:np.random.choice([3,3,4]) if x==-1 else x)
df
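
The 2:1 ratio can also be written with explicit probabilities instead of repeating the value 3 (a sketch):

import numpy as np

df = df.applymap(lambda x: np.random.choice([3, 4], p=[2/3, 1/3]) if x == -1 else x)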

 

19、Drop duplicate rows, keeping the first occurrence

"""
取4月份作为线下验证集,并且是4月份第一次购买
"""
y_train = order[(order.o_date>'2017-03-31')&(order.o_date<'2017-05-01')]
y_train = y_train.sort_values('o_date')
y_train = y_train.drop_duplicates(subset = ['user_id','sku_id'],keep = 'first')
y_train

20、Date arithmetic, e.g. adding one month

from dateutil.relativedelta import relativedelta
# note: months=1 is not the same as days=30 for 28-, 29- or 31-day months
y_train['pred_date'] = y_train.o_date.apply(lambda x: x + relativedelta(months=1))
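
A quick check that months=1 and days=30 differ at month boundaries (a sketch with a fixed date):

import datetime
from dateutil.relativedelta import relativedelta

d = datetime.date(2017, 1, 31)
print(d + relativedelta(months=1))       # 2017-02-28 (clamped to the end of February)
print(d + datetime.timedelta(days=30))   # 2017-03-02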

21、Repeat one row of data n times

eg = np.tile(eg,(1460,1))
eg = pd.DataFrame(eg, columns=['eg1', 'eg2', 'eg3', 'eg4', 'eg5', 'eg6', 'eg7', 'eg8', 'eg9', 'eg10'])
x_eigenvectors = pd.concat([x_eigenvectors,eg],axis = 0 ,ignore_index = True)

22、Saving and loading numpy arrays

np.save("../data/data.npy",data_np)
test = np.load("../data/data.npy")

23、Generate n distinct numbers from a given range in Python

import random
random.sample(range(0,10),8)
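
A numpy equivalent (a sketch):

import numpy as np

np.random.choice(10, size=8, replace=False)   # 8 distinct integers drawn from 0..9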

24、A numpy analogue of df.apply

x = np.arange(20)
np.where(x%6 < 4)
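
With a single condition np.where returns the matching indices; the element-wise, apply-like form takes three arguments (a sketch):

import numpy as np

x = np.arange(20)
np.where(x % 6 < 4, x, -1)   # keep x where the condition holds, use -1 elsewhere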

25、Indexing numpy arrays in a loop is too slow

import numpy as np
import pandas as pd
a=np.random.randint(0,10,size=[60,3,3])

X_train = a[:5]
X_val = a[5:7]
X_test = a[7:10]

for i in range(1, 6):
    train = a[i*10:i*10+5]
    X_train = np.concatenate((X_train, train), axis = 0)
    
    val = a[i*10+5:i*10+7]
    X_val = np.concatenate((X_val, val), axis = 0)

    test = a[i*10+7:i*10+10]
    X_test = np.concatenate((X_test, test), axis = 0)

Improved:

train_idx = [idx for (idx, val) in enumerate(a) if idx % 10 < 5]
val_idx = [idx for (idx, val) in enumerate(a) if 5 <= idx % 10 < 7]
test_idx = [idx for (idx, val) in enumerate(a) if idx % 10 >= 7]

X_train = a[train_idx]
X_val = a[val_idx]
X_test = a[test_idx]

26、Reading and writing JSON

import json

# Writing JSON data
with open('data.json', 'w') as f:
    json.dump(data, f)


# Reading data back
with open('data.json', 'r') as f:
    data = json.load(f)

# Reading a JSON-lines file (one JSON object per line)
data = []
with open('../data/raw/trainset/search.train.json') as f:
    for line in f:
        data.append(json.loads(line))
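
If the file has one JSON object per line, pandas can also read it straight into a DataFrame (a sketch):

import pandas as pd

df = pd.read_json('../data/raw/trainset/search.train.json', lines=True)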

27、Reading .mat files in Python

import numpy as np
import pandas as pd
import scipy.io

fre_1 = scipy.io.loadmat('../data/FRE_noise_sigma_0p5.mat')

from time import time
start = time()

cols = ['f1','f2','f3','f4','f5','location','level']
y = np.zeros(shape=(1,7))
for i in range(120):
    loc = i+1
    for j in range(9):
        lev = (j+1)*10
        fre = fre_1['FRE_noise_sigma_0p5'][i,j]
        y1 = np.transpose(fre)
        y1 = np.append(y1,[[loc,lev]])
        y = np.vstack((y,y1))

end = time()
print('time',(end-start)/60)
data = pd.DataFrame(y,columns=cols)
data = data.drop([0]).reset_index(drop=True)
data.to_csv('../csv/FRE_noise_sigma_0p5.csv',index=False)

For MATLAB cells nested inside cells, open the .mat file with h5py (v7.3 .mat files are stored as HDF5) to see the data:

import h5py
f = h5py.File('../data/EV1_NOnoise.mat', 'r')
list(f.keys())

# follow the nested HDF5 object references to reach the underlying dataset
ev = f[f[f['EV1'][8,12]][0,0]]
ev

type(ev[:])

28、numpy vertical concatenation, horizontal concatenation, and transpose

y = np.zeros(shape=(1,12))

sn = f[f['SN'][0,0]][:]
y1 = np.transpose(sn)
#y1.shape (5840, 10)
label = np.tile([0,0],(5840,1))
#label.shape (5840, 2)

# left-right join
y1 = np.concatenate([y1,label],axis=1)
#y1.shape (5840, 12)
# top-bottom join
y = np.vstack((y,y1))
#y.shape (5841, 12)

29、Counting missing values

# replace the "\\N" markers with -1, then count them
for i in test.columns:
    test[i] = test[i].replace("\\N",-1)
    train[i] = train[i].replace("\\N",-1) 
    
for i in test.columns:
    print(i,(train[[i]] ==-1).sum())
    print(i,(test[[i]] ==-1).sum())
# or replace "\\N" with NaN and use isnull()
for i in test.columns:
    test[i] = test[i].replace("\\N",float('nan'))
    train[i] = train[i].replace("\\N",float('nan')) 
    
print(train.isnull().sum())
print(test.isnull().sum())
#missing data
total = train.isnull().sum().sort_values(ascending=False)
percent = (train.isnull().sum()/train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

30、Check whether a numpy array contains elements of another type

for i, element in enumerate(arr):
    # np.float64 is a subclass of float, so this catches both Python and numpy floats
    if isinstance(element, float):
        print(i, element)

31、Python list comprehension (for loop inside a list)

t = 1539216000
train['time'] = [t + i for i in train.index]
train
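
Since the index already supplies the offsets, the comprehension can be replaced by vectorized addition (a sketch):

train['time'] = t + train.index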

 
