Source: Kaggle Lending Club Loan Data visualization and bad-loan prediction
Handling missing values in features
1. A function that computes the missing-value ratio of every feature:
import pandas as pd

def draw_missing_data_table(data):
    total = data.isnull().sum().sort_values(ascending=False)
    percent = (data.isnull().sum() / data.shape[0]).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=["Total", "Percent"])
    missing_data.reset_index(inplace=True)
    missing_data.rename(columns={"index": "feature_name"}, inplace=True)
    return missing_data
Save the computed missing-value ratios of all features:
# save missing value percent of features
missing_data_count = draw_missing_data_table(loan_data)
missing_data_count.to_csv("missing_data_count.csv")
missing_data_count = pd.read_csv("missing_data_count.csv", header=0, index_col=0)
missing_data_count = missing_data_count[missing_data_count["Percent"] > 0.0]
print(missing_data_count.head())
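A common next step is to drop the features whose missing ratio is too high. A minimal sketch, assuming the loan_data and missing_data_count objects from above; the 0.5 cutoff is an example value of mine, not taken from the original analysis:
# drop features with more than 50% missing values (the threshold is an assumption)
high_missing = missing_data_count[missing_data_count["Percent"] > 0.5]["feature_name"].tolist()
loan_data = loan_data.drop(columns=high_missing)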
2. Quick look at a distribution
How the values of a single pandas column are distributed.
Here aa stands for the attribute column: each bar in the plot is one value of that attribute, and the y-axis is the number of times that value appears.
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 10))
df['aa'].value_counts().head(15).plot(kind='bar')
plt.title('15 most common values of aa')
plt.show()
3. Inspect the distinct values of each column
df['col'].unique()   # unique() is a Series method, so call it on a single column
df.nunique()         # number of distinct values in every column
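A small worked example on made-up data:
import pandas as pd

df = pd.DataFrame({'grade': ['A', 'B', 'A', 'C'], 'term': [36, 36, 60, 36]})  # hypothetical data
print(df['grade'].unique())   # ['A' 'B' 'C']
print(df.nunique())           # grade 3, term 2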
4. Sort by a column in descending order
Method:
- DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False, key=None)
df.sort_values(by=['imp'], ascending=False)
The by parameter accepts several columns; just list them separated by commas (see the sketch below).
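A minimal sketch of a multi-column sort; the columns imp and score are made up for illustration:
import pandas as pd

df = pd.DataFrame({'imp': [3, 1, 3, 2], 'score': [0.2, 0.9, 0.1, 0.5]})  # hypothetical data
# sort by imp descending, breaking ties with score ascending
print(df.sort_values(by=['imp', 'score'], ascending=[False, True]))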
5. Automatic type inference with eval()
a="[1,2,3,4,5]"
b=eval(a)
print(b)
# a is a string, b is a list
Result: [1, 2, 3, 4, 5]
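A side note of mine, not from the original post: if the string comes from an untrusted source, ast.literal_eval parses Python literals without executing arbitrary code.
import ast

a = "[1,2,3,4,5]"
b = ast.literal_eval(a)  # safe for literals such as lists, dicts, numbers and strings
print(b)  # [1, 2, 3, 4, 5]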
6. Using pandas apply on two columns at once
import numpy as np
import pandas as pd
df = pd.DataFrame({'a': np.random.randn(6),
                   'b': ['foo', 'bar'] * 3,
                   'c': np.random.randn(6)})

def my_test(a, b):
    return a + b
df['Value'] = df.apply(lambda row: my_test(row['a'], row['c']), axis=1)
print (df)
# sample output (the random values will differ):
# a b c Value
# 0 -0.276507 foo -3.122166 -3.398672
# 1 -0.589019 bar -1.150915 -1.739934
# 2 -0.485433 foo 1.296634 0.811200
# 3 0.469688 bar -0.554992 -0.085304
# 4 1.297845 foo 1.672957 2.970802
# 5 -0.702724 bar -1.609585 -2.312309
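A side note: for this toy case the row-wise apply is not needed, since the same column can be computed in vectorized form.
# equivalent and usually much faster than apply(axis=1)
df['Value'] = df['a'] + df['c']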
7. Two lists are paired element by element: sort one list and have the other follow
1. Combine the two lists A and B into a list of tuples with zip()
2. Sort
3. Unzip
zipped = zip(A, B)
# suppose A = [1, 2, 3] and B = [6, 5, 4]
# then zipped yields (1, 6), (2, 5), (3, 4)
sort_zipped = sorted(zipped, key=lambda x: (x[1], x[0]))
# sort by x[1] first; when x[1] ties, fall back to x[0]
result = zip(*sort_zipped)
# unzip sort_zipped into two tuples: (3, 2, 1) and (4, 5, 6)
x_axis, y_axis = [list(x) for x in result]
# turn each tuple into a list: [3, 2, 1] and [4, 5, 6]
Sorting two linked lists in Python (sort, sorted) with a custom key
list1 = ['three', 'two', 'four', 'one', 'zero']
list2 = [3,2,4,1,1]
l1, l2 = zip(*sorted(zip(list1, list2), key=lambda x: (x[1], x[0]), reverse=False))  # reverse=True sorts descending; reverse=False (the default) sorts ascending
# turn the two unzipped tuples back into lists
l1 = list(l1)
l2 = list(l2)
print(l1)
print(l2)
['one', 'zero', 'two', 'three', 'four']
[1, 1, 2, 3, 4]
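An alternative sketch of mine using numpy.argsort, for the case where both lists only need to follow the order of the second one (ties keep their original order instead of being broken by the first list):
import numpy as np

list1 = ['three', 'two', 'four', 'one', 'zero']
list2 = [3, 2, 4, 1, 1]
order = np.argsort(list2, kind='stable')  # indices that sort list2 ascending
l1 = [list1[i] for i in order]
l2 = [list2[i] for i in order]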
8. How to tell whether a value is NaN
❌ a == None (there is no null in Python, and an equality check never detects NaN)
✔️
import numpy as np
np.isnan(a)
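Note that np.isnan only accepts numeric input; pandas.isna is the more tolerant check (this comparison is my addition):
import numpy as np
import pandas as pd

print(np.isnan(float('nan')))  # True
print(pd.isna(None))           # True; pd.isna also handles None and non-numeric values
# np.isnan(None) raises a TypeError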
9. Count how often each value of a column occurs and merge the counts back into the original table
d = {'name':['Jack','Tom','Mary'],'age':[18,18,21],'gender':['m','m','w']}
frame = pd.DataFrame(d)
frame
name age gender
0 Jack 18 m
1 Tom 18 m
2 Mary 21 w
fs = "age"
nfs = "cr_age"
temp = frame.groupby(fs).size().reset_index().rename(columns={0: nfs})
temp
age cr_age
0 18 2
1 21 1
frame = frame.merge(temp, 'left', on=fs)
frame
name age gender cr_age
0 Jack 18 m 2
1 Tom 18 m 2
2 Mary 21 w 1
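The same count column can also be produced without the intermediate table and merge; a sketch of mine using groupby/transform:
# per-row size of the group that the row's age value belongs to
frame[nfs] = frame.groupby(fs)[fs].transform('size')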
10. Turning a value_counts result into a DataFrame
s = train["make"].value_counts().reset_index(name='count').rename(columns={'index':'make'})
s.head()
make count
0 a5 24226
1 r9s 12226
2 r11 10207
3 a7x 9140
4 vivo y93 7530
11. Generating cumulative totals
def addPercent(data, colname):
    ssum = data[colname].sum()
    data["sum"] = 0
    data["percent"] = 0.0
    add = 0
    for i in range(len(data)):
        add += data.loc[i, colname]
        data.loc[i, "sum"] = add
        data.loc[i, "percent"] = add / ssum
    return data
dmodelNumsaddPercent = addPercent(dmodelNums, "nums")
dmodelNumsaddPercent.head(7)
  model    nums     sum   percent
0  OPPO  166869  166869  0.417173
1  VIVO   98535  265404  0.663510
2  华为    66108  331512  0.828780
3  荣耀    22791  354303  0.885757
4  小米    12619  366922  0.917305
5  魅族     9721  376643  0.941608
6  三星     6398  383041  0.957603
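The loop can be replaced by pandas' built-in cumulative sum; a vectorized sketch of mine that produces the same columns:
# equivalent vectorized version of addPercent
dmodelNums["sum"] = dmodelNums["nums"].cumsum()
dmodelNums["percent"] = dmodelNums["sum"] / dmodelNums["nums"].sum()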
12. Brute-force combinations of categorical features
# pairwise crosses
first_feature = ['gender', 'city', 'province', "age"]
cross_feature = []
for i in range(len(first_feature)):
    for j in range(i + 1, len(first_feature)):
        feat_1 = first_feature[i]
        feat_2 = first_feature[j]
        col_name = "cross_" + feat_1 + "_and_" + feat_2
        cross_feature.append(col_name)
        data[col_name] = data[feat_1].astype(str).values + '_' + data[feat_2].astype(str).values
# count how often each value of every cross feature appears
def feature_count(data, features=[]):
    if len(set(features)) != len(features):
        print('equal feature !!!!')
        return data
    new_feature = 'count'
    for i in features:
        new_feature += '_' + i.replace('add_', '')
    try:
        del data[new_feature]
    except KeyError:
        pass
    temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
    data = data.merge(temp, 'left', on=features)
    return data
for i in cross_feature:
    n = data[i].nunique()
    if n > 5:
        print(i)
        data = feature_count(data, [i])
    else:
        print(i, ':', n)
13. Applying a function to two columns to build a new column
# re-sort each row's tagid list by its time field and store the results in new columns of data
def tagidSort(list1, list2):
    l1, l2 = zip(*sorted(zip(list1, list2), key=lambda x: (x[1], x[0]), reverse=False))
    return list(l1)

def timeSort(list1, list2):
    l1, l2 = zip(*sorted(zip(list1, list2), key=lambda x: (x[1], x[0]), reverse=False))
    return list(l2)
data['tagidSort'] = data.apply(lambda row: tagidSort(row['tagid'], row['time']), axis=1)
data["timeSort"] = data.apply(lambda row: timeSort(row['tagid'], row['time']), axis=1)
14. Saving a scikit-learn TF-IDF model
https://blog.csdn.net/u013083549/article/details/51262721
# document is the preprocessed corpus
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
tv = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
vec = tv.fit_transform(document)
joblib.dump(tv,"./model/tfidf_model.m")
tv = joblib.load("./model/tfidf_model.m")
tfidfVec = tv.transform(document)
15. One-hot encoding a feature
df = pd.get_dummies(df, columns=['color'])
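A minimal sketch with made-up data to show what the result looks like:
import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue', 'red']})  # hypothetical data
print(pd.get_dummies(df, columns=['color']))
# one indicator column per category: color_blue, color_red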
16. A custom F1-score metric for tf1.x Keras
import tensorflow as tf
def f1(y_true, y_hat):
    epsilon = 1e-7
    y_hat = tf.round(y_hat)  # round the sigmoid output to 0/1
    tp = tf.reduce_sum(tf.cast(y_hat * y_true, 'float'), axis=0)
    # tn = tf.reduce_sum(tf.cast((1 - y_hat) * (1 - y_true), 'float'), axis=0)
    fp = tf.reduce_sum(tf.cast(y_hat * (1 - y_true), 'float'), axis=0)
    fn = tf.reduce_sum(tf.cast((1 - y_hat) * y_true, 'float'), axis=0)
    p = tp / (tp + fp + epsilon)  # epsilon keeps the denominator from ever being zero
    r = tp / (tp + fn + epsilon)
    f1 = 2 * p * r / (p + r + epsilon)
    f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)  # tf.is_nan is the tf1.x name
    return f1
Reference: 【机器学习】F1分数(F1 Score)详解及tensorflow、numpy实现 (an article explaining the F1 score with TensorFlow and NumPy implementations)
For tf2.x, see the corresponding F1-score reference.
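A minimal usage sketch: pass the function above as a metric when compiling a Keras model. The model below is a made-up placeholder, not from the original post:
# hypothetical binary-classification model; f1 is the metric defined above
model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(10,))])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[f1])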