import pandas as pd
# Load only the first 15,000 rows of the tab-separated training set so the
# experiments below stay quick.
train_df = pd.read_csv(
    filepath_or_buffer='1/train_set.csv',
    sep='\t',
    nrows=15000,
)
# Preview the first rows (rendered when this runs as a notebook cell).
train_df.head()
label text 0 2 2967 6758 339 2021 1854 3731 4109 3792 4149 15... 1 11 4464 486 6352 5619 2465 4802 1452 3137 5778 54... 2 3 7346 4068 5074 3747 5681 6093 1777 2226 7354 6... 3 2 7159 948 4866 2109 5520 2490 211 3956 5520 549... 4 3 3646 3055 3055 2490 4659 6065 3370 5814 2465 5...
Count Vectors + RidgeClassifier
import pandas as pd
from sklearn. feature_extraction. text import CountVectorizer
from sklearn. linear_model import RidgeClassifier
from sklearn. metrics import f1_score
# Baseline: raw term counts (vocabulary capped at 3000 entries) fed into a
# ridge classifier.
vectorizer = CountVectorizer(max_features=3000)
train_test = vectorizer.fit_transform(train_df['text'])

# Rows [0, 10000) are used for training; rows from 10000 onward are held out
# for validation.
clf = RidgeClassifier().fit(
    train_test[:10000],
    train_df['label'].values[:10000],
)
val_pred = clf.predict(train_test[10000:])

# Report macro-averaged F1 on the held-out rows.
print(
    f1_score(
        y_true=train_df['label'].values[10000:],
        y_pred=val_pred,
        average='macro',
    )
)
0.65441877581244
TF-IDF + RidgeClassifier
from sklearn. feature_extraction. text import TfidfVectorizer
# TF-IDF features over 1- to 3-grams (at most 3000 features), same ridge
# classifier and same 10,000-row train / remainder-validation split.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=3000,
)
train_test = tfidf.fit_transform(train_df['text'])

clf = RidgeClassifier()
clf.fit(
    X=train_test[:10000],
    y=train_df['label'].values[:10000],
)

val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))
0.8719098297954606
尝试改变TF-IDF的参数,并验证精度
# Maps each evaluated ngram_range tuple to its validation macro-F1 score.
f1 = {}


def idf(x):
    """Score a TF-IDF + RidgeClassifier pipeline for one n-gram range.

    Fits a ``TfidfVectorizer`` (at most 3000 features) on
    ``train_df['text']``, trains a ``RidgeClassifier`` on the first 10,000
    rows and evaluates macro-averaged F1 on the rows from 10,000 onward.

    Args:
        x: ``(min_n, max_n)`` tuple forwarded as ``ngram_range``. It is also
            used as the dictionary key, so it must be hashable.

    Returns:
        The macro-averaged F1 score on the held-out rows. The same value is
        stored in the module-level ``f1`` dict under key ``x``.
    """
    # Look the label array up once instead of once per slice.
    labels = train_df['label'].values

    tfidf = TfidfVectorizer(ngram_range=x, max_features=3000)
    train_test = tfidf.fit_transform(train_df['text'])

    clf = RidgeClassifier()
    clf.fit(train_test[:10000], labels[:10000])
    val_pred = clf.predict(train_test[10000:])

    f = f1_score(labels[10000:], val_pred, average='macro')
    # The global dict is looked up at call time, so rebinding ``f1 = {}``
    # between sweeps starts a fresh result table.
    f1[x] = f
    return f
# Evaluate every n-gram range (a, b) with 1 <= a <= 4, 1 <= b <= 5 and
# a <= b. Starting the inner range at ``a`` visits the same pairs, in the same
# order, as filtering with ``if a <= b``.
for a in range(1, 5):
    for b in range(a, 6):
        x = (a, b)
        idf(x)

# Show the collected scores (rendered when this runs as a notebook cell).
f1
{(1, 1): 0.8591560167309494,
(1, 2): 0.8720847996197074,
(1, 3): 0.8719372173702,
(1, 4): 0.8736955495856327,
(1, 5): 0.8753973492633325,
(2, 2): 0.8431077261676884,
(2, 3): 0.8247590162358751,
(2, 4): 0.8217618443321154,
(2, 5): 0.8254233297052576,
(3, 3): 0.7544838165811477,
(3, 4): 0.7579950226623512,
(3, 5): 0.7388010943827312,
(4, 4): 0.6651324921251808,
(4, 5): 0.6346451598446852}
# Start a fresh result table, then keep the lower n-gram bound at 1 and sweep
# the upper bound from 1 through 19.
f1 = dict()
for b in range(1, 20):
    x = 1, b
    idf(x)

# Show the collected scores (rendered when this runs as a notebook cell).
f1
尝试使用其他机器学习模型,完成训练和验证
from sklearn import svm
# TF-IDF features over 1- to 5-grams (at most 3000 features).
tfidf = TfidfVectorizer(ngram_range=(1, 5), max_features=3000)
train_test = tfidf.fit_transform(train_df['text'])

# Support-vector classifier with default settings, trained on the first
# 10,000 rows.
clf = svm.SVC().fit(
    train_test[:10000],
    train_df['label'].values[:10000],
)

# Macro-averaged F1 on the held-out rows.
val_pred = clf.predict(train_test[10000:])
print(
    f1_score(
        train_df['label'].values[10000:],
        val_pred,
        average='macro',
    )
)
import sklearn. model_selection as sk_model_selection
# 10-fold cross-validation of the current classifier on the full feature
# matrix. The original call referenced `model`, `iris_X` and `iris_y`, which
# are never defined in this notebook and raise NameError; use the notebook's
# own estimator, features and labels instead.
# scoring=None falls back to the estimator's default scorer.
accs = sk_model_selection.cross_val_score(
    clf,
    train_test,
    y=train_df['label'].values,
    scoring=None,
    cv=10,
    n_jobs=1,
)
print('交叉验证结果:', accs)