任务背景
现有一批视频,每个视频带有多个标签(tag),如"预告"、"电影"、"加勒比海盗"、"TED"等,需要根据这些标签对视频进行自动分类(已有一些历史数据可用作训练集和测试集)。这个模型也可以拓展到文章分类,只需要把标签换成按词频统计得到的词组即可。
选用方案
由于以前用过sklearn,所以直接用sklearn作为分类工具。用法很简单:先定义一个由特征提取器和分类器组合成的Pipeline,然后用训练数据拟合模型,在验证数据上测试正确率,最后用训练好的模型去预测目标数据即可。
Pipeline
# TF-IDF features over unigrams and bigrams, classified by one LinearSVC
# per class (one-vs-rest).
SVC_pipeline = Pipeline(
    steps=[
        # min_df=3 drops terms seen in fewer than 3 documents;
        # max_df=0.9 drops terms seen in more than 90% of documents.
        (
            'tfidf',
            TfidfVectorizer(min_df=3, max_df=0.9, ngram_range=(1, 2)),
        ),
        ('clf', OneVsRestClassifier(estimator=LinearSVC(), n_jobs=1)),
    ]
)
参考代码
# -*- coding: utf-8 -*-
"""
File Name: demo
Description : Classify videos by their tags with a scikit-learn text pipeline (TF-IDF + one-vs-rest LinearSVC).
Author : meng_zhihao
mail : 312141830@qq.com
date: 2020/3/17
"""
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from ast import literal_eval
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse as sp_sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
import pandas as pd
# Maps a numeric class id to its display name. The ids are not contiguous
# (e.g. 15, 17 and 18 are absent), so look names up with .get().
video_class_dict = {
    0: '其他',
    1: '电影',
    2: '电视剧',
    3: '音乐',
    4: '创意',
    5: '游戏',
    6: '科技',
    7: '新闻',
    8: '教育',
    9: '生活',
    10: '演讲',
    11: '少儿动画',
    12: '旅游',
    13: '综艺娱乐',
    14: '汽车',
    16: '军事',
    19: '文化历史',
    20: '体育',
    21: '财经',
    25: '美食',
    26: '脱口秀',
    27: '纪录片',
    28: '搞笑',
    29: '原创',
    30: '时尚',
    33: '科学',
    34: '动漫',
    35: '数码',
    37: 'TED',
    38: '新冠病毒',
    39: 'www',
    40: '少儿绘本',
    41: '萌宠',
    42: '少儿教育',
    43: '少儿歌曲',
}
def print_evaluation_scores(y_val, predicted):
    """Print accuracy and F1 scores for a set of predictions.

    Args:
        y_val: Ground-truth class labels.
        predicted: Predicted class labels, in the same order as ``y_val``.

    Prints accuracy followed by the macro-, micro- and weighted-averaged
    F1 scores. Returns None.
    """
    accuracy = accuracy_score(y_val, predicted)
    f1_score_macro = f1_score(y_val, predicted, average='macro')
    f1_score_micro = f1_score(y_val, predicted, average='micro')
    f1_score_weighted = f1_score(y_val, predicted, average='weighted')
    print("accuracy:", accuracy)
    print("f1_score_macro:", f1_score_macro)
    print("f1_score_micro:", f1_score_micro)
    # The weighted score was previously computed but never reported.
    print("f1_score_weighted:", f1_score_weighted)
# Sample output recorded by the author from an earlier run:
# accuracy: 0.3305439330543933
def text_prepare(context):
    """Clean one raw tag string before it is vectorized.

    Present values are returned unchanged. Missing values are mapped to an
    empty string: pandas represents an empty CSV cell as float NaN, and a
    text vectorizer expects every document to be a string.

    Args:
        context: Tag text for a single video; may be None or NaN when the
            source cell was empty.

    Returns:
        ``context`` itself, or ``""`` when ``context`` is None or NaN.
    """
    if context is None:
        return ""
    # NaN is the only float value that is not equal to itself.
    if isinstance(context, float) and context != context:
        return ""
    return context
def read_data(filename):
    """Load a CSV file into a pandas DataFrame.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv(filename)``.
    """
    # Disabled step kept from the original for reference:
    #     data['tags'] = data['tags'].apply(literal_eval)
    # ast.literal_eval is the safe alternative to eval(): it only evaluates
    # text that is a valid Python literal and refuses anything else.
    return pd.read_csv(filename)
# TF-IDF features over unigrams and bigrams, classified by one LinearSVC
# per class (one-vs-rest).
SVC_pipeline = Pipeline(
    steps=[
        # min_df=3 drops terms seen in fewer than 3 documents;
        # max_df=0.9 drops terms seen in more than 90% of documents.
        (
            'tfidf',
            TfidfVectorizer(min_df=3, max_df=0.9, ngram_range=(1, 2)),
        ),
        ('clf', OneVsRestClassifier(estimator=LinearSVC(), n_jobs=1)),
    ]
)
# Raw term counts over unigrams and bigrams, classified by one multinomial
# naive Bayes model per class (one-vs-rest).
NB_pipeline = Pipeline(
    steps=[
        # No tokenizer is passed, so the vectorizer's default regex applies:
        # runs of word characters (\w); scikit-learn documents the default
        # token_pattern as r"(?u)\b\w\w+\b", i.e. tokens of 2+ characters.
        (
            'cv',
            CountVectorizer(min_df=3, max_df=0.9, ngram_range=(1, 2)),
        ),
        (
            'clf',
            OneVsRestClassifier(
                estimator=MultinomialNB(fit_prior=True, class_prior=None),
            ),
        ),
    ]
)
# Load the training and validation splits.
train = read_data(r'D:\py3\nlp\video_class_fier\video_class_tags.csv')
validation = read_data(r'D:\py3\nlp\video_class_fier\video_class_tags_valid.csv')

# Targets are the numeric class ids.
y_train = train.class_id
y_val = validation.class_id

# Data cleaning: run every tag string through text_prepare.
X_train = list(map(text_prepare, train.video_tag))
X_val = list(map(text_prepare, validation.video_tag))

# Fit the SVC pipeline, predict the validation set and report the metrics.
SVC_pipeline.fit(X_train, y_train)
predicted = SVC_pipeline.predict(X_val)
print_evaluation_scores(y_val, predicted)

# Collect one row per validation sample, with class ids resolved to names
# (ids missing from video_class_dict become an empty string).
class_tags = []
for video_tag, predicted_id, true_id in zip(X_val, predicted, y_val):
    print(video_tag, predicted_id)
    row = {
        "origin_class": video_class_dict.get(true_id, ''),
        "predict_class": video_class_dict.get(predicted_id, ''),
        "video_tag": video_tag,
    }
    class_tags.append(row)

df = pd.DataFrame(class_tags)
# to_csv writes the column names as a header row by default.
df.to_csv('test_result.csv')