python pandas 数据探索

来源于: Kaggle Lending Club Loan Data数据可视化分析与不良贷款预测

对特征缺失值的处理

1.计算特征缺失值比例的函数:

def draw_missing_data_table(data):
    """Summarize missing values per feature, sorted descending.

    Parameters
    ----------
    data : pd.DataFrame
        The raw feature table.

    Returns
    -------
    pd.DataFrame with columns ``feature_name``, ``Total`` (count of
    missing entries) and ``Percent`` (fraction missing, 0.0-1.0).
    """
    # Compute the per-column null counts once instead of twice.
    null_counts = data.isnull().sum()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / data.shape[0]).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=["Total", "Percent"])
    missing_data.reset_index(inplace=True)
    missing_data.rename(columns={"index": "feature_name"}, inplace=True)
    return missing_data

保存所有特征缺失值比例的计算结果:

# save missing value percent of features
# NOTE(review): `loan_data` is defined earlier in the notebook — not visible here.
missing_data_count = draw_missing_data_table(loan_data)
missing_data_count.to_csv("missing_data_count.csv")
# Round-trip through CSV (column 0 holds the saved index), then keep only
# the features that actually have missing values.
missing_data_count = pd.read_csv("missing_data_count.csv", header=0, index_col=0)
missing_data_count = missing_data_count[missing_data_count["Percent"] > 0.0]
print(missing_data_count.head())

2.快速查看数据分布

查看分布图

pandas中某一列里面元素的分布情况

其中 aa 代表属性列。图中的横坐标表示该属性列中出现的各个取值,纵坐标表示每个取值出现的次数。

plt.figure(figsize=(15,10))
# Bar chart of the 15 most frequent values in column ' aa'.
# NOTE(review): the column name begins with a space — some Kaggle weather
# datasets really do have space-prefixed column names; confirm it matches df.
df[' aa'].value_counts().head(15).plot(kind='bar')

plt.title('15 most common weathers in Delhi')
plt.show()

3.查看每列的不同值

# DataFrame has no .unique() (that is a Series method) — the original line
# raises AttributeError. nunique() gives the distinct-value count per column;
# for the values themselves use df[col].unique() on a single column.
df.nunique()
在这里插入图片描述

4.按某列降序排列

Pandas学习之sort_values()

方法:

  • DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False, key=None)

# Fixed: the original used curly quotes (‘imp’), which is a SyntaxError.
df.sort_values(by=['imp'], ascending=False)

by 参数可以指定多个列名,放入列表中用逗号隔开即可。

5.自动推断 eval()

eval在python中的意思是什么

a="[1,2,3,4,5]"
# SECURITY: eval() executes arbitrary code — only use it on trusted input.
# For literals like this, ast.literal_eval(a) is the safe equivalent.
b=eval(a)
print(b) 
# a is a str; b is parsed into a list

结果: [1, 2, 3, 4, 5]

6.pandas 使用apply同时处理两列数据

pandas 使用apply同时处理两列数据

import numpy as np
import pandas as pd

# Demo frame: two random numeric columns plus a label column.
df = pd.DataFrame({'a': np.random.randn(6),
                   'b': ['foo', 'bar'] * 3,
                   'c': np.random.randn(6)})

def my_test(a, b):
    """Return the sum of the two inputs."""
    return a + b

# apply with axis=1 hands each row to the lambda, so two columns can be
# combined per row into a new 'Value' column.
df['Value'] = df.apply(lambda row: my_test(row['a'], row['c']), axis=1)
print(df)

# Sample output (values are random on each run):
#           a    b         c     Value
# 0 -0.276507  foo -3.122166 -3.398672
# 1 -0.589019  bar -1.150915 -1.739934
# ...

7. 两个 list 一一对应,将一个 list 排序,要求另一个 list 随之排序

两个 list 一一对应,将一个 list 排序,要求另一个 list 随之排序

1. 将A,B两个 list 变成 list of tuple,使用 zip() 函数
2. 排序
3. 拆分

# Fixed: A and B were only stated in a comment, so the snippet could not run.
A = [1, 2, 3]
B = [6, 5, 4]
zipped = zip(A, B)
# zipped yields (1, 6), (2, 5), (3, 4)

sort_zipped = sorted(zipped, key=lambda x: (x[1], x[0]))
# Sort by x[1] first; ties are broken by x[0].

result = zip(*sort_zipped)
# Unzip back into two tuples: (3, 2, 1) and (4, 5, 6)

x_axis, y_axis = [list(x) for x in result]
# Convert each tuple to a list: [3, 2, 1], [4, 5, 6]

python对两个关联list排序(sort,sorted)及自定义排序

list1 = ['three', 'two', 'four', 'one', 'zero']
list2 = [3, 2, 4, 1, 1]
# Sort the paired lists by numeric value, breaking ties by the string;
# reverse=True would sort descending (default False = ascending).
pairs = sorted(zip(list1, list2), key=lambda p: (p[1], p[0]))
l1, l2 = zip(*pairs)
# Turn the unzipped tuples back into lists.
l1 = list(l1)
l2 = list(l2)
print(l1)
print(l2)


['one', 'zero', 'two', 'three', 'four']
[1, 1, 2, 3, 4]


8.如果出现Nan怎么判断

❌ a == null(Python 中没有 null;并且 NaN 与任何值比较都是 False)
✔️ import numpy as np; np.isnan(a)

9.查看某列的特征出现次数,并合并回原表

# Build a small demo DataFrame of people.
frame = pd.DataFrame({
    'name': ['Jack', 'Tom', 'Mary'],
    'age': [18, 18, 21],
    'gender': ['m', 'm', 'w'],
})
frame


	name 	age 	gender
0 	Jack 	18 	m
1 	Tom 	18 	m
2 	Mary 	21 	w


fs = "age"
nfs = "cr_age"
# Count rows per unique value of `fs`; expose the count as column `nfs`.
temp = frame.groupby(fs).size().reset_index(name=nfs)
temp



	age 	cr_age
0 	18 	2
1 	21 	1

# Left-join the per-value counts back onto the original rows.
frame = frame.merge(temp, how='left', on=fs)

frame

	name 	age 	gender 	cr_age
0 	Jack 	18 	m 	2
1 	Tom 	18 	m 	2
2 	Mary 	21 	w 	1


10.value_counts结果转化为Dataframe

# Robust across pandas versions: older pandas names the reset column 'index',
# newer pandas names it after the Series — rename_axis pins it explicitly.
s = train["make"].value_counts().rename_axis('make').reset_index(name='count')
s.head()


	make 	count
0 	a5 	24226
1 	r9s 	12226
2 	r11 	10207
3 	a7x 	9140
4 	vivo y93 	7530

11.生成累计值

def addPercent(data, colname):
    """Append running-total columns to ``data`` in place and return it.

    Adds ``sum`` (cumulative sum of ``colname``) and ``percent``
    (cumulative sum divided by the column total).

    Replaces the original O(n) Python loop (which also assumed a 0..n-1
    RangeIndex) with vectorized cumsum — same values, any index.
    """
    total = data[colname].sum()
    data["sum"] = data[colname].cumsum()
    data["percent"] = data["sum"] / total
    return data
# Cumulative counts per phone model; `dmodelNums` is built earlier in the notebook.
dmodelNumsaddPercent = addPercent(dmodelNums, "nums")        
dmodelNumsaddPercent.head(7)


 	model 	nums 	percent 	sum
0 	OPPO 	166869 	0.417173 	166869
1 	VIVO 	98535 	0.663510 	265404
2 	华为 	66108 	0.828780 	331512
3 	荣耀 	22791 	0.885757 	354303
4 	小米 	12619 	0.917305 	366922
5 	魅族 	9721 	0.941608 	376643
6 	三星 	6398 	0.957603 	383041

12.类别特征粗暴组合

# Brute-force pairwise crossing of categorical features: for every unordered
# pair of columns, build a new string-concatenated cross column on `data`.
first_feature = ['gender', 'city', 'province', "age"]


cross_feature = []
num = len(first_feature)
for left in range(num):
    for right in range(left + 1, num):
        feat_1 = first_feature[left]
        feat_2 = first_feature[right]
        col_name = "cross_" + feat_1 + "_and_" + feat_2
        cross_feature.append(col_name)
        data[col_name] = data[feat_1].astype(str).values + '_' + data[feat_2].astype(str).values

# Count how often each combination of the cross features occurs.
def feature_count(data, features=None):
    """Left-join an occurrence-count column for ``features`` onto ``data``.

    The new column is named ``count_<f1>_<f2>...`` (any ``add_`` prefix is
    stripped from feature names). Returns ``data`` unchanged if ``features``
    contains duplicates.
    """
    features = [] if features is None else features  # avoid mutable default
    if len(set(features)) != len(features):
        print('equal feature !!!!')
        return data
    new_feature = '_'.join(['count'] + [f.replace('add_', '') for f in features])
    # Drop a stale result column (if any) so re-running stays idempotent,
    # without the bare except that used to swallow every error.
    if new_feature in data.columns:
        del data[new_feature]
    temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
    data = data.merge(temp, 'left', on=features)
    return data
# Add a count feature for every cross column with more than 5 distinct values;
# low-cardinality columns are only reported.
for feat in cross_feature:
    distinct = data[feat].nunique()
    if distinct > 5:
        print(feat)
        data = feature_count(data, [feat])
    else:
        print(feat, ':', distinct)

13.对两列使用函数返回一列

# tagid以time字段排序 后重新生成新的data

def tagidSort(list1, list2):
    """Return ``list1`` reordered so its elements follow ascending ``list2``
    (ties broken by the ``list1`` value itself).

    Fixed: the original ``zip(*sorted(...))`` unpack raised ValueError on
    empty input; this version returns [] instead.
    """
    pairs = sorted(zip(list1, list2), key=lambda p: (p[1], p[0]))
    return [first for first, _ in pairs]

def timeSort(list1, list2):
    """Return ``list2`` sorted ascending (ties broken by the paired ``list1``
    value), matching the order produced by ``tagidSort``.

    Fixed: the original ``zip(*sorted(...))`` unpack raised ValueError on
    empty input; this version returns [] instead.
    """
    pairs = sorted(zip(list1, list2), key=lambda p: (p[1], p[0]))
    return [second for _, second in pairs]

# Per row: tagid list reordered by its paired time values, and the time
# values sorted the same way (`data` is defined earlier in the notebook).
data['tagidSort'] = data.apply(lambda row: tagidSort(row['tagid'], row['time']), axis=1)
data["timeSort"] = data.apply(lambda row: timeSort(row['tagid'], row['time']), axis=1)

14.sklearn的tfidf模型存储

https://blog.csdn.net/u013083549/article/details/51262721

#document是处理好的文档

import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fit TF-IDF on the preprocessed corpus `document` (prepared elsewhere).
tv = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)  
vec = tv.fit_transform(document) 

# Persist the fitted vectorizer (vocabulary + idf weights) to disk.
joblib.dump(tv,"./model/tfidf_model.m")

# Reload and transform with the stored model — no refitting needed.
tv = joblib.load("./model/tfidf_model.m")
tfidfVec = tv.transform(document)



15.对某个特征onhot

# Fixed: the original used curly quotes (‘color’), which is a SyntaxError.
df = pd.get_dummies(df, columns=['color'])

16. tf1.x keras 自定义f1score

import tensorflow as tf

def f1( y_true, y_hat):
    """Column-wise F1 metric for TF1.x Keras.

    Args:
        y_true: ground-truth 0/1 labels.
        y_hat: sigmoid-activated predictions in [0, 1].

    Returns:
        Per-column F1 tensor; NaN entries are replaced with 0.
    """
    epsilon = 1e-7
    y_hat = tf.round(y_hat)  # round sigmoid outputs to hard 0/1 predictions
    
    tp = tf.reduce_sum(tf.cast(y_hat*y_true, 'float'), axis=0)
    #tn = tf.sum(tf.cast((1-y_hat)*(1-y_true), 'float'), axis=0)
    fp = tf.reduce_sum(tf.cast(y_hat*(1-y_true), 'float'), axis=0)
    fn = tf.reduce_sum(tf.cast((1-y_hat)*y_true, 'float'), axis=0)
    
    p = tp/(tp+fp+epsilon)  # epsilon keeps the denominator from being 0
    r = tp/(tp+fn+epsilon)
    
    f1 = 2*p*r/(p+r+epsilon)
    # tf.is_nan is the TF1.x spelling (tf.math.is_nan in TF2).
    f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)
    return f1

参考文档: 【机器学习】F1分数(F1 Score)详解及tensorflow、numpy实现

tf2.x 可以参考 F1-score参考

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值