1.数据格式
数据存放于data.txt文件中,形式如下,这里使用了5个特征:f1,f2,f3,f4,f5,即5列,外加一列label。
f1 f2 f3 f4 f5 label
9976.17129226 0.966292134831 1883 0.837399047422 2 1
6295.26515736 0.723188405797 712 0.909911678116 0 1
5905.76249754 0.457142857143 1887 0.928461081402 0 1
5026.02661901 0.8 218 0.972602739726 0 1
2.读取样本
读取样本并对缺失值做处理。
# Read the raw sample data, clean it, and persist the engineered features.
def getSamples():
    """Load tab-separated samples, print label diagnostics, fill missing
    values, log-transform the heavy-tailed features f4/f5, and write the
    result to `total_raw_features` as a TSV file.
    """
    data = pd.read_table(samples_file_path)
    print(len(data))
    # Keep only the five features plus the label column.
    df = data[['f1', 'f2', 'f3', 'f4', 'f5', 'label']]
    print(df.columns)
    print("++++++++++++++++++++++++++++++++++++++")
    # Diagnostics: number of negative samples and the label distribution.
    negatives = df[df['label'].isin([0])]
    print(len(negatives))
    print(df['label'].value_counts())
    # BUG FIX: the original filtered on df['distance'], but 'distance' is
    # not among the selected columns above, so it raised KeyError.  The
    # diagnostic has been removed.
    df = missingValueProcessor(df)
    # Any NaN that survived the per-column fill gets a hard default of 0.
    df = df.fillna(value=0)
    # log2(x + 1) compresses the count-like features; the +1 keeps 0 valid.
    df["f4"] = np.log2(df["f4"] + 1)
    df["f5"] = np.log2(df["f5"] + 1)
    df.to_csv(total_raw_features, sep='\t', encoding='utf-8', index=False)
# Handle missing values and drop uninformative columns.
def missingValueProcessor(data):
    """Fill NaNs per column (object/string -> "PAD", float64 -> 0.0), then
    drop columns that are entirely NaN and columns that are constant.

    Parameters
    ----------
    data : pd.DataFrame
        Raw feature frame.

    Returns
    -------
    pd.DataFrame
        Cleaned frame; only columns carrying information survive.
    """
    for column in data.columns:
        # BUG FIX: the original called fillna() without assigning the
        # result, which made it a no-op.  Also switched the type probe
        # from `type(data[column][0])` (fails on empty frames; `np.str`
        # was removed from numpy) to a dtype check.
        if data[column].dtype == object:
            data[column] = data[column].fillna(value="PAD")
        elif data[column].dtype == np.float64:
            data[column] = data[column].fillna(value=0.0)
    data = data.dropna(how='all', axis=1)
    # `.ix` was removed from pandas; keep only columns where some value
    # differs from the first row (i.e. drop constant columns).
    data = data.loc[:, (data != data.iloc[0]).any()]
    return data
3.处理特征
def featureProcess():
    """Load the raw feature TSV, clip outliers per feature, split into
    train/test, and min-max scale using training statistics only.

    Returns
    -------
    tuple
        (X_total, y_total, X_train_std, X_test_std, y_train, y_test,
         data_min, data_max, feature_frame, X_train_raw, X_test_raw)
    """
    df = pd.read_csv(total_raw_features, sep='\t')
    feature = df.fillna(value=0).drop(['label'], axis=1)
    # Drop constant columns (`.ix` was removed from pandas; use loc/iloc).
    feature = feature.loc[:, (feature != feature.iloc[0]).any()]
    # Clipping policy: values < 1 are treated as quantiles, values >= 1
    # as absolute caps.
    # NOTE(review): capping f3 (a count feature) at the absolute value 1.0
    # clips almost every sample — confirm this is intended.
    param = {"f1": 0.9, "f2": 0.9, "f3": 1.0, "f4": 1.0, "f5": 0.85}
    for key, value in param.items():
        if value < 1:
            quantile = feature[key].quantile(value)
            feature.loc[feature[key] > quantile, key] = quantile
        else:
            feature.loc[feature[key] > value, key] = value
    X = feature.values
    y = df['label'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # Fit min-max scaling on the training split only to avoid leakage.
    mm = MinMaxScaler()
    mm.fit(X_train)
    X_train_std = mm.transform(X_train)
    X_test_std = mm.transform(X_test)
    print(list(feature.columns))
    # Unscaled train/test copies are returned last for callers that need
    # the raw values.
    return (X, y, X_train_std, X_test_std, y_train, y_test,
            mm.data_min_, mm.data_max_, feature, X_train, X_test)
4.逻辑回归模型
from config import *
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.externals import joblib
from sklearn.model_selection import *
class lr(object):
    """Thin wrapper around sklearn's LogisticRegression: grid-search C,
    train and persist the model, and evaluate ROC-AUC."""

    def __init__(self):
        print("lr model is using:")

    def GridSeach(self, x_train, y_train):
        """Grid-search the inverse regularisation strength C with 3-fold CV.

        (Method name kept as-is — sic 'GridSeach' — for backward
        compatibility with existing callers.)

        Returns the best C found.
        """
        param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1.0, 5, 10.0, 50.0]}
        clf = GridSearchCV(LogisticRegression(penalty='l2', max_iter=500),
                           param_grid, cv=3)
        clf.fit(x_train, y_train)
        # BUG FIX: the original `print("C is : %d ", value)` printed a
        # tuple instead of formatting; use %-interpolation.
        print("C is : %s" % clf.best_params_['C'])
        return clf.best_params_['C']

    def training(self, X, y):
        """Fit logistic regression on (X, y), persist it with joblib, and
        return (coefficients, intercept)."""
        # NOTE(review): the trailing '\n' suggests type_lr is read raw
        # from a config file — confirm.
        if type_lr == 'pack_lr\n':
            rho = 10
        else:
            rho = self.GridSeach(X, y)
        print(rho)
        model = LogisticRegression(C=rho, class_weight=None)
        model.fit(X, y)
        joblib.dump(model, model_file_path)
        return model.coef_, model.intercept_

    def predict(self, X_test, y_test):
        """Load the persisted model and print ROC-AUC on (X_test, y_test)."""
        model = joblib.load(model_file_path)
        y_predictions = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_predictions)
        # BUG FIX: same tuple-print defect as GridSeach.
        print("auc is : %f" % auc)
5.训练模型
if __name__ == '__main__':
    getSamples()
    (X, y, X_tr, X_te, y_tr, y_te, min_list, max_list,
     feature, X_std_train, X_std_test) = featureProcess()
    print(list(feature.columns))
    # Renamed the instance so it no longer shadows the class name `lr`.
    model = lr()
    w, b = model.training(X_tr, y_tr)
    model.predict(X_tr, y_tr)   # train-set AUC (sanity check)
    model.predict(X_te, y_te)   # held-out AUC
    selected_feature = list(feature.columns)
    # Pair the scaler's per-feature min/max for export.
    min_max = [[lo, max_list[i]] for i, lo in enumerate(min_list)]
    mapLRFeature(columns=selected_feature, weights=list(w[0]),
                 bias=list(b), min_max=min_max)
6.解析模型
from config import *
def loadFeatureConfig(path=None):
    """Parse a feature-config file of "id:name" lines into {id: name}.

    Parameters
    ----------
    path : str, optional
        File to read; defaults to the global `feature_config_path`
        (backward compatible with the original zero-argument signature).

    Returns
    -------
    dict[int, str]
        Mapping of feature id to registered feature name.
    """
    if path is None:
        path = feature_config_path
    features = {}
    # Context manager closes the handle even if a line fails to parse;
    # blank lines are skipped (the original crashed on them in int()).
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue
            id_feature = line.split(":")
            features[int(id_feature[0])] = id_feature[1].strip('\n')
    return features
def loadFeatureConfig_newmodel(path=None):
    """Parse the new-model feature-config file ("id:name" lines) into
    {id: name}.

    Parameters
    ----------
    path : str, optional
        File to read; defaults to the global `feature_config_newmodel_path`
        (backward compatible with the original zero-argument signature).

    Returns
    -------
    dict[int, str]
        Mapping of feature id to registered feature name.
    """
    if path is None:
        path = feature_config_newmodel_path
    features = {}
    # Context manager closes the handle even if a line fails to parse;
    # blank lines are skipped (the original crashed on them in int()).
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue
            id_feature = line.split(":")
            features[int(id_feature[0])] = id_feature[1].strip('\n')
    return features
def mapLRFeature(columns, weights, bias, min_max):
    """Map trained LR weights back onto registered feature ids and write
    two model-config files (new dict format and the legacy text format
    shown in section 7).

    Parameters
    ----------
    columns : list[str]
        Feature names in training order.
    weights : list[float]
        LR coefficients, aligned with `columns`.
    bias : list[float]
        LR intercept (single element).
    min_max : list[list[float]]
        Per-feature [min, max] scaler bounds, aligned with `columns`.
    """
    registered_features = loadFeatureConfig()
    registered_features_newmodel = loadFeatureConfig_newmodel()
    # Feature ids whose registered name appears in `columns`, in column
    # order (dict.iteritems was removed in Python 3 — use .items).
    idlist = [k for column in columns
              for k, v in registered_features.items() if v == column]
    print(idlist)
    min_max = {k: min_max[i] for i, k in enumerate(idlist)}
    print(min_max)
    output_weights = {}
    ids = ""
    for id, feature in registered_features.items():
        if feature in columns:
            ids = ids + str(id) + ","
            output_weights[int(id)] = weights[columns.index(feature)]
    print(output_weights)
    ids = ids[:-1] + "\n"
    # New-format config: one dict with per-feature weight/min/max.
    res_dict = {"featureWeights": [], "bias": ""}
    for k in min_max.keys():
        print(k)
        res_dict["featureWeights"].append({
            "feature": registered_features_newmodel[k],
            "weight": output_weights[k],
            "max": min_max[k][1],
            "min": min_max[k][0],
        })
    # BUG FIX: bias was assigned inside the loop (and left as "" when
    # min_max was empty); assign it once, unconditionally.
    res_dict["bias"] = bias[0]
    # BUG FIX: the original never closed fout_new; use context managers
    # for both output files.
    with open(model_config_file_new, 'w') as fout_new:
        fout_new.write(str(res_dict) + '\n')
    # Legacy text format: model type, id list, MIN_MAX marker, bias,
    # per-id weights, then per-id min/max bounds.
    with open(model_config_file, 'w') as fout:
        fout.write(type_lr)
        fout.write(ids)
        fout.write("MIN_MAX\n")
        fout.write("b:" + str(bias[0]) + '\n\n')
        for k, v in output_weights.items():
            fout.write(str(k) + ":" + str(v) + "\n")
        fout.write("\n")
        for k, v in min_max.items():
            fout.write('%d:%s\t%s\n' % (k, v[0], v[1]))
if __name__ == '__main__':
    # Load both feature-config mappings when run as a standalone script.
    registered_features_newmodel = loadFeatureConfig_newmodel()
    registered_features = loadFeatureConfig()
7.模型格式
lrmodel
1,11,13,16,49
MIN_MAX
b:2.03692592895
16:2.17873424402
1:-3.17638312941
11:0.249115751716
13:-1.3409294835
49:-5.42113571484
16:0.0 264.0
49:0.0 10593.4328739
11:0.0 1.0
13:0.0 846.0
1:0.00334448160535 1.0