# python借助sklearn库对视频自动分类

1 篇文章 0 订阅

### 任务背景

由于以前用过sklearn，所以直接用sklearn作为分类工具。用法很简单：定义一个由特征提取器和分类器组合成的Pipeline，先用训练数据进行训练，再在验证集上测试正确率，最后用训练好的模型去预测目标数据即可。

Pipeline

# TF-IDF features over unigrams and bigrams, fed into a one-vs-rest linear SVM.
SVC_pipeline = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(min_df=3, max_df=0.9, ngram_range=(1, 2)),
        ),
        (
            'clf',
            OneVsRestClassifier(LinearSVC(), n_jobs=1),
        ),
    ]
)


# -*- coding: utf-8 -*-
"""
File Name：     demo
Description :
Author :       meng_zhihao
mail :       312141830@qq.com
date：          2020/3/17
"""
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

from ast import literal_eval
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

from scipy import sparse as sp_sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
import pandas as pd
# Maps a numeric class id to its display name. The ids are not contiguous
# (for example 15, 17 and 18 are absent), so look entries up with .get().
video_class_dict = {
    0: '其他',
    1: '电影',
    2: '电视剧',
    3: '音乐',
    4: '创意',
    5: '游戏',
    6: '科技',
    7: '新闻',
    8: '教育',
    9: '生活',
    10: '演讲',
    11: '少儿动画',
    12: '旅游',
    13: '综艺娱乐',
    14: '汽车',
    16: '军事',
    19: '文化历史',
    20: '体育',
    21: '财经',
    25: '美食',
    26: '脱口秀',
    27: '纪录片',
    28: '搞笑',
    29: '原创',
    30: '时尚',
    33: '科学',
    34: '动漫',
    35: '数码',
    37: 'TED',
    38: '新冠病毒',
    39: 'www',
    40: '少儿绘本',
    41: '萌宠',
    42: '少儿教育',
    43: '少儿歌曲',
}

def print_evaluation_scores(y_val, predicted):
    """Print accuracy and macro/micro/weighted F1 scores for predictions.

    Args:
        y_val: Ground-truth labels for the validation samples.
        predicted: Predicted labels for the same samples, in the same order.

    Returns:
        None. The scores are written to stdout, one per line.
    """
    accuracy = accuracy_score(y_val, predicted)
    f1_score_macro = f1_score(y_val, predicted, average='macro')
    f1_score_micro = f1_score(y_val, predicted, average='micro')
    f1_score_weighted = f1_score(y_val, predicted, average='weighted')
    print("accuracy:", accuracy)
    print("f1_score_macro:", f1_score_macro)
    print("f1_score_micro:", f1_score_micro)
    # Report the weighted score as well; it is computed above and would
    # otherwise be silently discarded.
    print("f1_score_weighted:", f1_score_weighted)
    # Value recorded by the author from one run:
    # accuracy: 0.3305439330543933

def text_prepare(context):
    """Return the input text unchanged.

    This is the hook applied to every sample before vectorization; it is
    currently an identity function.

    Args:
        context: The raw text of one sample.

    Returns:
        The same object that was passed in.
    """
    return context

# data['tags'] = data['tags'].apply(literal_eval)
# ast.literal_eval 是 Python 针对 eval 存在的安全隐患而提供的一种安全替代方式。
#
# 简单来说，ast 模块用于帮助 Python 程序处理抽象语法树；该模块下的 literal_eval() 函数只会对由 Python 字面量（字符串、字节串、数字、元组、列表、字典、集合、布尔值和 None）构成的表达式求值，遇到其他内容则抛出异常而不会执行。
#
return data

# TF-IDF features over unigrams and bigrams, fed into a one-vs-rest linear SVM.
SVC_pipeline = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(min_df=3, max_df=0.9, ngram_range=(1, 2)),
        ),
        (
            'clf',
            OneVsRestClassifier(LinearSVC(), n_jobs=1),
        ),
    ]
)

# Raw term counts over unigrams and bigrams, fed into a one-vs-rest
# multinomial naive Bayes classifier.
NB_pipeline = Pipeline(
    steps=[
        (
            'cv',
            # The vectorizer's default tokenizer splits on runs of word
            # characters. NOTE(review): per the scikit-learn docs the default
            # token_pattern keeps only tokens of 2+ characters, so
            # single-character tags are dropped -- confirm this is intended.
            CountVectorizer(min_df=3, max_df=0.9, ngram_range=(1, 2)),
        ),
        (
            'clf',
            OneVsRestClassifier(
                MultinomialNB(fit_prior=True, class_prior=None)
            ),
        ),
    ]
)

# NOTE(review): `train` and `validation` are not defined anywhere in the
# visible code; they are presumably DataFrames with `video_tag` and
# `class_id` columns produced by a data-loading step -- confirm.
X_train, y_train = train.video_tag, train.class_id
X_val, y_val = validation.video_tag, validation.class_id

# Clean the raw tag text before vectorization.
X_train = [text_prepare(x) for x in X_train]
X_val = [text_prepare(x) for x in X_val]

# Fit the SVM pipeline and score it on the validation split.
SVC_pipeline.fit(X_train, y_train)
predicted = SVC_pipeline.predict(X_val)
print_evaluation_scores(y_val, predicted)

# Collect one row per validation sample, pairing the true and predicted
# class names with the input text; ids missing from the table map to ''.
class_tags = []
for tag, class_id, origin_class_id in zip(X_val, predicted, y_val):
    print(tag, class_id)
    class_tags.append({
        "origin_class": video_class_dict.get(origin_class_id, ''),
        "predict_class": video_class_dict.get(class_id, ''),
        "video_tag": tag,
    })
df = pd.DataFrame(class_tags)
df.to_csv('test_result.csv')  # the header row (column names) is written by default

• 0
点赞
• 3
收藏
觉得还不错? 一键收藏
• 1
评论
11-14 427
07-11 2122
09-16 6400
12-21 2152
10-18 3185
05-29 7205
01-31 4479
12-05 428
09-14 3635

### “相关推荐”对你有帮助么？

• 非常没帮助
• 没帮助
• 一般
• 有帮助
• 非常有帮助

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。