Preface
Quite a few CSDN blogs are stuffed with the blogger's own soft-sell ads: they either want you to join some group or follow their WeChat official account, and once you follow, the account pushes courses and sales pitches every single day. It is genuinely annoying, treating innocent programmers as leeks to be harvested. Today we are going to build something that can spot this kind of article.
I. Data
A sample from the technology folder:
Now this is positive energy: writing posts honestly, down to earth, and sharing your own experience. Respect to the author 👍👍
A sample from the advertisement folder:
Reposting someone else's article while slipping in their own ads; once you are pulled into the group, they either want you to sign up for a training class or push you to buy a course 😡😡
Enough talk. The data is ready, let's get going:
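Since (as the loading code below shows) each sample's label is simply the name of the folder the file sits in, the training data is assumed to be organized roughly as follows; the file names are placeholders, and the actual root directory comes from get_blog_cls_train_data_dir():

# Assumed training-data layout (inferred from how labels are extracted below):
#
#   <train_data_dir>/
#       technology/
#           some_post.txt
#           ...
#       advertisement/
#           some_post.txt
#           ...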
II. Training
1. Loading the data
def load_data(self):
    '''Load the file contents and their labels.'''
    files = get_files_path(self.train_data_dir, '.txt')
    contents = []
    labels = []
    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            data = f.read()
        # Segment the Chinese text with jieba and join the tokens with spaces
        # so that TfidfVectorizer can tokenize on whitespace later.
        data_cut = ' '.join(jieba.cut(data))
        contents.append(data_cut)
        # The parent folder name (technology / advertisement) is the label.
        label = file.split('/')[-2]
        labels.append(label)
    X_train, X_test, y_train, y_test = train_test_split(
        contents, labels, test_size=0.2, random_state=3114795823)
    return X_train, X_test, y_train, y_test
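get_files_path is imported from common.utils and its implementation is not shown in the original; judging from the call above, it recursively collects every .txt file under the training directory. A minimal sketch of such a helper, purely for illustration:

import os

def get_files_path(root_dir, suffix):
    '''Hypothetical stand-in: recursively collect files under root_dir that end with suffix.'''
    paths = []
    for dirpath, _, filenames in os.walk(root_dir):
        for name in filenames:
            if name.endswith(suffix):
                paths.append(os.path.join(dirpath, name))
    return paths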
2. Loading stop words
def load_stopwords(self):
    path = './data/pro/datasets/stopwords/cn_stopwords.txt'
    with open(path, 'r', encoding='utf-8') as f:
        stopwords = f.read().split('\n')
    return stopwords
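Since the loader just splits the file on newlines, cn_stopwords.txt is assumed to be a plain-text list with one stop word per line, roughly like the sample below; a trailing newline may leave an empty string in the list, which is harmless here.

的
了
在
是
...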
3. Training
def train(self):
    stopwords = self.load_stopwords()
    X_train, X_test, y_train, y_test = self.load_data()
    # Drop terms that appear in more than half of the documents.
    tfidf = TfidfVectorizer(stop_words=stopwords, max_df=0.5)
    train_data = tfidf.fit_transform(X_train)
    test_data = tfidf.transform(X_test)
    joblib.dump(tfidf, self.tfidf_path, compress=1)
    nb_model = MultinomialNB(alpha=0.001)
    nb_model.fit(train_data, y_train)
    predict_test = nb_model.predict(test_data)
    joblib.dump(nb_model, self.model_path, compress=1)
    print("Accuracy:", metrics.accuracy_score(y_test, predict_test))
Overall it is fairly simple: start with TF-IDF + Naive Bayes to get a baseline.
Test-set accuracy: 84.8%
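One detail worth calling out here: TfidfVectorizer's default tokenizer splits on word boundaries and knows nothing about Chinese, which is exactly why load_data joins the jieba tokens with spaces before vectorizing. A quick illustration (the exact segmentation depends on jieba's dictionary):

import jieba

text = '分享自己的经验'
print(' '.join(jieba.cut(text)))
# roughly: 分享 自己 的 经验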
4. Prediction
Let's grab a random piece of data and see how it does (the excerpt below is the tail end of a Shiro tutorial that finishes with a WeChat official-account plug):
test_data = """
<ref bean="myRealm01"/>
<ref bean="myRealm02"/>
</list>
</property>
</bean>
</property>
</bean>
3. 小结
好啦,这就是松哥和大家分享的 Shiro 多 Realm 情况,感兴趣的小伙伴可以去试试哦~
公众号后台回复 shiro,获取 Shiro 相关资料。
"""
predict = train_blog_cls_tfidf_nb.test(test_data)
print(predict)
Output:
['advertisement']
It did not disappoint: even an ad dressed up as a technical article gets picked out.
III. Complete Code
import os
import joblib
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from common.utils import get_files_path
from common.path.dataset.blog import get_blog_cls_train_data_dir
from common.path.model.blog import get_nb_model_path, get_tfidf_path
class TrainBlogClsTfidfNb:
    def __init__(self):
        self.train_data_dir = get_blog_cls_train_data_dir()
        self.tfidf_path = get_tfidf_path()
        self.model_path = get_nb_model_path()

    def load_data(self):
        '''Load the file contents and their labels.'''
        files = get_files_path(self.train_data_dir, '.txt')
        contents = []
        labels = []
        for file in files:
            with open(file, 'r', encoding='utf-8') as f:
                data = f.read()
            # Segment the Chinese text with jieba and join the tokens with spaces
            # so that TfidfVectorizer can tokenize on whitespace later.
            data_cut = ' '.join(jieba.cut(data))
            contents.append(data_cut)
            # The parent folder name (technology / advertisement) is the label.
            label = file.split('/')[-2]
            labels.append(label)
        X_train, X_test, y_train, y_test = train_test_split(
            contents, labels, test_size=0.2, random_state=3114795823)
        return X_train, X_test, y_train, y_test

    def load_stopwords(self):
        path = './data/pro/datasets/stopwords/cn_stopwords.txt'
        with open(path, 'r', encoding='utf-8') as f:
            stopwords = f.read().split('\n')
        return stopwords

    def train(self):
        stopwords = self.load_stopwords()
        X_train, X_test, y_train, y_test = self.load_data()
        # Drop terms that appear in more than half of the documents.
        tfidf = TfidfVectorizer(stop_words=stopwords, max_df=0.5)
        train_data = tfidf.fit_transform(X_train)
        test_data = tfidf.transform(X_test)
        joblib.dump(tfidf, self.tfidf_path, compress=1)
        nb_model = MultinomialNB(alpha=0.001)
        nb_model.fit(train_data, y_train)
        predict_test = nb_model.predict(test_data)
        joblib.dump(nb_model, self.model_path, compress=1)
        print("Accuracy:", metrics.accuracy_score(y_test, predict_test))

    def test(self, test_data):
        model = joblib.load(self.model_path)
        tf_idf = joblib.load(self.tfidf_path)
        # Segment the input with jieba, the same way the training data was prepared.
        test_cut = ' '.join(jieba.cut(test_data))
        test_vec = tf_idf.transform([test_cut])
        res = model.predict(test_vec)
        return res
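A minimal way to wire it all together, assuming the script is run from the project root so the relative stop-word path resolves (the original does not show the entry point):

if __name__ == '__main__':
    train_blog_cls_tfidf_nb = TrainBlogClsTfidfNb()
    # Train, print the test-set accuracy, and save both the TF-IDF vectorizer and the NB model.
    train_blog_cls_tfidf_nb.train()
    # Classify a raw blog snippet; ad-style text should come back as ['advertisement'], as in the demo above.
    predict = train_blog_cls_tfidf_nb.test('公众号后台回复 shiro,获取 Shiro 相关资料。')
    print(predict)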
Summary
1. I sincerely hope there will be fewer and fewer of these advertisers on CSDN.
2. The accuracy is only 84.8% for now; I will keep optimizing it in follow-ups.