论文种类分类Task4

Task4:论文种类分类

# 导入所需的package
import seaborn as sns #用于画图
from bs4 import BeautifulSoup #用于爬取arxiv的数据
import re #用于正则表达式,匹配字符串的模式
import requests #用于网络连接,发送网络请求,使用域名获取对应信息
import json #读取数据,我们的数据为json格式的
import pandas as pd #数据处理,数据分析
import matplotlib.pyplot as plt #画图工具
TF-IDF+机器学习分类器

直接使用TF-IDF对文本提取特征,使用分类器进行分类,分类器的选择上可以使用SVM、LR、XGboost等

data  = [] #初始化
#使用with语句优势:1.自动关闭文件句柄;2.自动显示(处理)文件读取数据异常
with open("/home/aistudio/data/data67990/arxiv-metadata-oai-2019.json", 'r') as f: 
    for idx, line in enumerate(f): 
        d = json.loads(line)
        d = {'title': d['title'], 'categories': d['categories'], 'abstract': d['abstract']}
        data.append(d)
        
        # 选择部分数据
        if idx > 200000:
            break
        
data = pd.DataFrame(data) #将list变为dataframe格式,方便使用pandas进行分析
data['text'] = data['title'] + data['abstract']

data['text'] = data['text'].apply(lambda x: x.replace('\n',' '))
data['text'] = data['text'].apply(lambda x: x.lower())
data = data.drop(['abstract', 'title'], axis=1)
# 多个类别,包含子分类
data['categories'] = data['categories'].apply(lambda x : x.split(' '))

# 单个类别,不包含子分类
data['categories_big'] = data['categories'].apply(lambda x : [xx.split('.')[0] for xx in x])
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
data_label = mlb.fit_transform(data['categories_big'].iloc[:])
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=4000)
data_tfidf = vectorizer.fit_transform(data['text'].iloc[:])
# 划分训练集和验证集
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_tfidf, data_label,
                                                 test_size = 0.2,random_state = 1)

# 构建多标签分类模型
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
clf = MultiOutputClassifier(MultinomialNB()).fit(x_train, y_train)
from sklearn.metrics import classification_report
print(classification_report(y_test, clf.predict(x_test)))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         0
           3       0.91      0.85      0.88      3625
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         0
           8       0.77      0.76      0.77      3801
           9       0.84      0.89      0.86     10715
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00       186
          12       0.44      0.41      0.42      1621
          13       0.00      0.00      0.00         1
          14       0.75      0.59      0.66      1096
          15       0.61      0.80      0.69      1078
          16       0.90      0.19      0.32       242
          17       0.53      0.67      0.59      1451
          18       0.71      0.54      0.62      1400
          19       0.88      0.84      0.86     10243
          20       0.40      0.09      0.15       934
          21       0.00      0.00      0.00         1
          22       0.87      0.03      0.06       414
          23       0.48      0.65      0.55       517
          24       0.37      0.33      0.35       539
          25       0.00      0.00      0.00         1
          26       0.60      0.42      0.49      3891
          27       0.00      0.00      0.00         0
          28       0.82      0.08      0.15       676
          29       0.86      0.12      0.21       297
          30       0.80      0.40      0.53      1714
          31       0.00      0.00      0.00         4
          32       0.56      0.65      0.60      3398
          33       0.00      0.00      0.00         0

   micro avg       0.76      0.70      0.72     47851
   macro avg       0.39      0.27      0.29     47851
weighted avg       0.75      0.70      0.71     47851
 samples avg       0.74      0.76      0.72     47851



/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值