Method 1: Count Vectors + RidgeClassifier
# Count Vectors + RidgeClassifier
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score

# Load the first 15,000 rows of the training set (tab-separated: label, text)
train_df = pd.read_csv('../data/train_set.csv', sep='\t', nrows=15000)

# Bag-of-words features limited to the 3,000 most frequent tokens
vectorizer = CountVectorizer(max_features=3000)
train_test = vectorizer.fit_transform(train_df['text'])

# Train on the first 10,000 documents, validate on the remaining 5,000
clf = RidgeClassifier()
clf.fit(train_test[:10000], train_df['label'].values[:10000])
val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))
# 0.74
Result: macro F1 ≈ 0.74.
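The count features are just a sparse document-term matrix; a small sketch to inspect what CountVectorizer produced, reusing the vectorizer and train_test objects from the block above:

# Shape and vocabulary of the bag-of-words matrix
print(train_test.shape)                  # (15000, 3000): one row per document, one column per kept token
print(len(vectorizer.vocabulary_))       # 3000 tokens retained by max_features
print(train_test[0].toarray()[0, :10])   # raw counts of the first 10 vocabulary columns for document 0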
Method 2: TF-IDF + RidgeClassifier
# TF-IDF + RidgeClassifier
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score

train_df = pd.read_csv('../data/train_set.csv', sep='\t', nrows=15000)

# TF-IDF features over 1- to 3-gram token combinations, capped at 3,000 terms
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=3000)
train_test = tfidf.fit_transform(train_df['text'])

# Same 10,000/5,000 train/validation split as Method 1
clf = RidgeClassifier()
clf.fit(train_test[:10000], train_df['label'].values[:10000])
val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))
# 0.87
Result: macro F1 ≈ 0.87.
Homework:
Change the TF-IDF parameters and verify the accuracy (results are in the table below; a sweep sketch follows the table).
ngram_range | max_features | max_df | stop_words | f1_score |
---|---|---|---|---|
(1,3) | 3000 | default | default | 0.8721 |
(1,3) | 4000 | default | default | 0.8753 |
(1,3) | 2000 | default | default | 0.8603 |
(1,3) | 5000 | default | default | 0.8850 |
(1,4) | 5000 | default | default | 0.8849 |
(1,2) | 5000 | default | default | 0.8864 |
default | 5000 | default | default | 0.8605 |
(1,2) | 5000 | 0.8 | default | 0.8861 |
(1,2) | 5000 | 1.2 | default | 0.8864 |
(1,2) | 5000 | default | 900,3750,648 | 0.8895 |
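The rows above come from re-running the Method 2 script with different TfidfVectorizer arguments. A minimal sweep sketch, assuming the imports and train_df from Method 2 are still in scope (the parameter grid below is only a sample of the settings tried):

# Re-fit the same Ridge pipeline for each TF-IDF setting and report the validation macro F1
for params in [dict(ngram_range=(1, 3), max_features=3000),
               dict(ngram_range=(1, 2), max_features=5000),
               dict(ngram_range=(1, 2), max_features=5000, stop_words=["900", "3750", "648"])]:
    tfidf = TfidfVectorizer(**params)
    feats = tfidf.fit_transform(train_df['text'])
    clf = RidgeClassifier()
    clf.fit(feats[:10000], train_df['label'].values[:10000])
    val_pred = clf.predict(feats[10000:])
    print(params, f1_score(train_df['label'].values[10000:], val_pred, average='macro'))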
Use other machine learning models and complete training and validation:
- Logistic Regression
- Multinomial Naive Bayes
- Linear Support Vector Machine (validation macro F1 0.8932; see the table below)
- Random Forest
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

# Build TF-IDF features from raw counts (reuses train_df loaded above; the vectorizer settings are illustrative)
counts = CountVectorizer(max_features=5000).fit_transform(train_df['text'])
features = TfidfTransformer().fit_transform(counts)

# Train and validate each classifier on the same 10,000/5,000 split
for clf in [LinearSVC(), MultinomialNB()]:
    clf.fit(features[:10000], train_df['label'].values[:10000])
    val_pred = clf.predict(features[10000:])
    print(type(clf).__name__, f1_score(train_df['label'].values[10000:], val_pred, average='macro'))
Classifier | macro F1 |
---|---|
Linear SVM | 0.8932 |
Multinomial Naive Bayes | 0.6468 |
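Logistic Regression and Random Forest from the list above can be dropped into the same loop. A minimal sketch under the same split (max_iter and n_estimators are illustrative values; no scores were recorded for these two models here):

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Same training/validation protocol as the loop above, for the remaining two models
for clf in [LogisticRegression(max_iter=1000), RandomForestClassifier(n_estimators=100)]:
    clf.fit(features[:10000], train_df['label'].values[:10000])
    val_pred = clf.predict(features[10000:])
    print(type(clf).__name__, f1_score(train_df['label'].values[10000:], val_pred, average='macro'))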
# TF-IDF + LinearSVC
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

train_df = pd.read_csv('data/train_set.csv', sep='\t', nrows=15000)

# Single tokens alone are sometimes not enough as features; adding token combinations helps.
# ngram_range=(1, 2) lets the vocabulary use combinations of 1 to 2 tokens.
# max_features caps the vocabulary size; stop_words filters out the listed tokens.
# max_df (float in [0.0, 1.0], default 1.0): terms appearing in more than this fraction of
# documents are dropped (min_df is the corresponding lower bound); when given as a positive
# integer it is interpreted as an absolute document count instead.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, stop_words=["900", "3750", "648"])
train_test = tfidf.fit_transform(train_df['text'])

clf = LinearSVC()
clf.fit(train_test[:10000], train_df['label'].values[:10000])
val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))
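The stop_words list above removes the token IDs '900', '3750' and '648'. In this anonymized corpus these are among the most frequent tokens and are commonly assumed to act as punctuation, which is why filtering them helps. A quick frequency check, assuming train_df is loaded as above:

from collections import Counter

# Count token frequencies over the whole sample to see which IDs dominate
all_tokens = ' '.join(train_df['text']).split(' ')
print(Counter(all_tokens).most_common(5))  # the filtered IDs should appear near the top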