from sklearn import svm, metrics
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pylab as plt
def do_metrics(y_test, y_pred):
    """Print a set of classification metrics, then plot the ROC curve.

    Each metric is printed as a heading line followed by its value, in this
    order: accuracy, confusion matrix, precision, recall, F1 and the full
    classification report. Every metric is called with its default
    arguments. Finally ``plot_auc`` is invoked with the same inputs.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_test``.
    """
    # (heading, scorer) pairs, evaluated lazily in this order so that each
    # heading is printed before its metric is computed.
    reports = (
        ("metrics.accuracy_score:", metrics.accuracy_score),
        ("metrics.confusion_matrix:", metrics.confusion_matrix),
        ("metrics.precision_score:", metrics.precision_score),
        ("metrics.recall_score:", metrics.recall_score),
        ("metrics.f1_score:", metrics.f1_score),
        ("classification_report:", classification_report),
    )
    for heading, scorer in reports:
        print(heading)
        print(scorer(y_test, y_pred))

    plot_auc(y_test, y_pred)
def plot_auc(y_test, y_pred, model_name="SVM"):
    """Compute the ROC AUC, print it, and display the ROC curve.

    Args:
        y_test: Ground-truth binary labels.
        y_pred: Values handed to ``roc_curve`` as the score argument
            (second positional parameter).
        model_name: Model name shown in the plot title. Defaults to
            ``"SVM"``, the only model trained in this file; the title was
            previously hard-coded to ``'lightgbm'``.
    """
    print("auc:")
    # roc_curve also returns the decision thresholds; they are not needed.
    fpr, tpr, _thresholds = metrics.roc_curve(
        np.asarray(y_test), np.asarray(y_pred)
    )
    auc_value = metrics.auc(fpr, tpr)
    print(auc_value)

    plt.title("ROC curve of %s (AUC = %.4f)" % (model_name, auc_value))
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.plot(fpr, tpr)
    plt.show()  # opens the figure window
# Column names of the header-less CSV exports, in file order (column 0..38).
_COLUMN_NAMES = (
    'userId', 'vreN', 'lireN', 'repN', 'ot', 'appV', 'age', 'bage',
    'keyword1', 'keyword2', 'tag1', 'tag2', 'tag3', 'ks1', 'ks2',
    'historySubT', 'ttp1', 'ttp2', 'ttp3', 'ktw1', 'ktw2', 'ktw3', 'apn',
    'city_id', 'mode', 'phone_price', 'city_level', 'itemId', 'itemType',
    'label', 'texL', 'titL', 'twN', 'subT', 'sour',
    'imgs_count', 'categordId', 'status', 'author_type',
)

# Columns removed from the feature matrix (the target plus identifiers).
_NON_FEATURE_COLUMNS = ['label', 'itemId', 'userId', 'itemType']


def _read_raw_csv(path, tolerate_bad_lines=False):
    """Read a header-less CSV; optionally warn about and skip malformed rows."""
    if not tolerate_bad_lines:
        return pd.read_csv(path, header=None, low_memory=False)
    try:
        # pandas >= 1.3 spelling of "error_bad_lines=False" (warn and skip).
        return pd.read_csv(path, header=None, low_memory=False,
                           on_bad_lines="warn")
    except TypeError:
        # Older pandas does not know on_bad_lines; use the legacy flag.
        return pd.read_csv(path, header=None, low_memory=False,
                           error_bad_lines=False)


def _load_dataset(path, tolerate_bad_lines=False):
    """Load one CSV export and return ``(features, labels)`` as numpy arrays."""
    frame = _read_raw_csv(path, tolerate_bad_lines).dropna()
    # With header=None the columns are the integers 0..n-1; map them to names.
    frame = frame.rename(columns=dict(enumerate(_COLUMN_NAMES)))

    # Rows whose city_id is the placeholder "-" are discarded, then the
    # column is converted to integers.
    frame = frame[frame["city_id"] != "-"].copy()
    frame["city_id"] = frame["city_id"].map(int)

    # Shift every label down by one: afterwards 0 means "impression" and
    # 1 means "click".
    labels = np.array(frame["label"] - 1)
    features = np.array(frame.drop(labels=_NON_FEATURE_COLUMNS, axis=1))
    return features, labels


def svmModel(train_path='5-22-train.csv', test_path='5-22-test.csv'):
    """Train an RBF-kernel SVM on the training CSV and evaluate it on the test CSV.

    The training file is read tolerantly (malformed rows are skipped with a
    warning); the test file is read strictly, as before. Only 10% of the
    training rows are used to fit the model. The training accuracy is
    printed, then the test-set predictions are passed to ``do_metrics``.

    Args:
        train_path: Path of the header-less training CSV.
        test_path: Path of the header-less test CSV.
    """
    features, labels = _load_dataset(train_path, tolerate_bad_lines=True)

    # No explicit shuffle is needed: train_test_split shuffles by default
    # (seeded by random_state). test_size=0.9 keeps only 10% of the rows for
    # fitting; the remaining 90% hold-out is not used.
    x_train, _x_valid, y_train, _y_valid = train_test_split(
        features, labels, test_size=0.9, random_state=1
    )

    # Row order of the test set does not affect the metrics, so it is
    # evaluated in file order.
    x_test, y_test = _load_dataset(test_path)

    model = svm.SVC(C=1, kernel='rbf', gamma=20)
    model.fit(x_train, y_train)

    print("train accuracy:")
    print(model.score(x_train, y_train))

    y_pred = model.predict(x_test)
    do_metrics(y_test, y_pred)
# Script entry point: train and evaluate the SVM when run directly.
if __name__ == "__main__":
    svmModel()