# ===== Part 1: 0/1 (binary) classification ("01分类") =====
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score , roc_auc_score , roc_curve
import matplotlib.pyplot as plt
def create_model(
    d_train,
    d_test,
    *,
    text_col: str = "title",
    label_col: str = "two_category",
    pred_col: str = "01category",
    output_path: str = "wr01_pre_1025.xlsx",
) -> None:
    """Train a linear SVM on TF-IDF text features and export predictions.

    The vectorizer is fitted on the training text only and then reused to
    transform the test text, so both matrices share one vocabulary.

    Args:
        d_train: DataFrame holding the training text (``text_col``) and the
            target labels (``label_col``).
        d_test: DataFrame holding the text to classify (``text_col``). It is
            modified in place: the predictions are stored in ``pred_col``.
            Pass a standalone frame (for example ``.copy()`` of a filtered
            slice) to avoid pandas' SettingWithCopyWarning.
        text_col: Name of the column containing the raw text.
        label_col: Name of the training column containing the labels.
        pred_col: Name of the column that receives the predicted labels.
        output_path: Excel file the annotated test frame is written to.

    Returns:
        None. The results are the new column on ``d_test`` and the Excel
        file at ``output_path`` (written without the index).
    """
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))

    # TF-IDF over unigrams and bigrams; min_df=2 drops terms that occur in
    # fewer than two training documents.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
    features = vectorizer.fit_transform(d_train[text_col])
    print("训练样本特征表长度为 " + str(features.shape))

    # Only transform here: the vocabulary must come from the training data.
    test_features = vectorizer.transform(d_test[text_col])
    print("测试样本特征长度为:" + str(test_features.shape))

    # Support vector machine. C is the penalty coefficient that trades margin
    # width against misclassified samples (library default is 1.0). Available
    # kernels include rbf (the default), linear, poly and sigmoid.
    svmmodel = SVC(C=1.0, kernel="linear")
    fitted = svmmodel.fit(features, d_train[label_col])
    print(fitted)  # fit() returns the estimator; this echoes its settings

    # If the test frame is labelled, accuracy can be checked with
    # svmmodel.score(test_features, d_test[label_col]).
    d_test[pred_col] = svmmodel.predict(test_features)
    d_test.to_excel(output_path, index=False)
# Alternative loader, currently disabled: d_train, d_test = data_prepare()
print("对新样本进行01预测")
# Training data; `df` stays bound to the same frame as `d_train`.
d_train = df = pd.read_excel("wr01_new_train1012.xlsx")
# Test data to be labelled by the 0/1 classifier.
d_test = pd.read_excel("wr_100样本1023.xlsx")
create_model(d_train, d_test)
# ===== Part 2: 60-class classification ("60分类") =====
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score , roc_auc_score , roc_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
def create_model(
    d_train,
    d_test,
    *,
    text_col: str = "title",
    label_col: str = "sku",
    pred_col: str = "pre_skuno",
    output_path: str = "wr60_svm_pre1012.xlsx",
) -> None:
    """Train a linear SVM on TF-IDF text features and export predictions.

    The vectorizer is fitted on the training text only and then reused to
    transform the test text, so both matrices share one vocabulary.

    Args:
        d_train: DataFrame holding the training text (``text_col``) and the
            target labels (``label_col``).
        d_test: DataFrame holding the text to classify (``text_col``). It is
            modified in place: the predictions are stored in ``pred_col``.
            Pass a standalone frame (for example ``.copy()`` of a filtered
            slice) to avoid pandas' SettingWithCopyWarning.
        text_col: Name of the column containing the raw text.
        label_col: Name of the training column containing the labels.
        pred_col: Name of the column that receives the predicted labels.
        output_path: Excel file the annotated test frame is written to.

    Returns:
        None. The results are the new column on ``d_test`` and the Excel
        file at ``output_path`` (written without the index).
    """
    print("训练样本 = %d" % len(d_train))
    print("测试样本 = %d" % len(d_test))

    # TF-IDF over unigrams and bigrams; min_df=2 drops terms that occur in
    # fewer than two training documents.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
    features = vectorizer.fit_transform(d_train[text_col])
    print("训练样本特征表长度为 " + str(features.shape))

    # Only transform here: the vocabulary must come from the training data.
    test_features = vectorizer.transform(d_test[text_col])
    print("测试样本特征表长度为 " + str(test_features.shape))

    # Support vector machine. C is the penalty coefficient that trades margin
    # width against misclassified samples (library default is 1.0). Available
    # kernels include rbf (the default), linear, poly and sigmoid.
    svmmodel = SVC(C=1.0, kernel="linear")
    fitted = svmmodel.fit(features, d_train[label_col])
    print(fitted)  # fit() returns the estimator; this echoes its settings

    # If the test frame is labelled, accuracy can be checked with
    # svmmodel.score(test_features, d_test[label_col]).
    d_test[pred_col] = svmmodel.predict(test_features)
    d_test.to_excel(output_path, index=False)
print("对新样本进行60个车型预测")
# Training data for the 60-class model.
d_train = pd.read_excel("wr60_train1012.xlsx")
# Test data comes from the "01预测" sheet of the analysis report.
# pandas renamed the `sheetname` keyword to `sheet_name` (0.21) and later
# removed the old spelling, so the old call raises TypeError on current pandas.
df = pd.read_excel("wr机器学习分析报告.xlsx", sheet_name="01预测")
# Keep only the rows whose pre_category equals 1. The explicit copy makes
# d_test an independent frame, so the prediction column that create_model
# adds does not trigger SettingWithCopyWarning.
d_test = df[df.pre_category == 1].copy()
create_model(d_train, d_test)
# 训练样本 = 75987
# 测试样本 = 32606
# 训练样本特征表长度为 (75987, 18040)
# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
# decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
# max_iter=-1, probability=False, random_state=None, shrinking=True,
# tol=0.001, verbose=False)
# 0.920137398025
#0.933329022245