- 学习主题:论文分类(数据建模任务),利用已有数据建模,对新论文进行类别分类;
- 学习内容:使用论文标题和摘要完成类别分类;
- 学习成果:学会文本分类的基本方法、TF-IDF 等;
- 对论文标题和摘要进行处理;
- 对论文类别进行处理;
- 构建文本分类模型;
- 思路1:TF-IDF+机器学习分类器
- 思路2:FastText
- 思路3:Word2Vec+深度学习分类器
- 思路4:BERT词向量
# 导入所需的package
import seaborn as sns #用于画图
from bs4 import BeautifulSoup #用于爬取arxiv的数据
import re #用于正则表达式,匹配字符串的模式
import requests #用于网络连接,发送网络请求,使用域名获取对应信息
import json #读取数据,我们的数据为json格式的
import pandas as pd #数据处理,数据分析
import matplotlib.pyplot as plt #画图工具
def readArxivFile(path, columns=['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
                                 'report-no', 'categories', 'license', 'abstract', 'versions',
                                 'update_date', 'authors_parsed'], count=None):
    """Load selected fields from a JSON-lines metadata file into a DataFrame.

    Every line of the file is parsed as one JSON object and reduced to the
    keys listed in ``columns``.

    Args:
        path: Path of the file to read (one JSON object per line).
        columns: Keys to keep from each record. The default list is only
            read here, never mutated, so sharing it between calls is safe.
        count: Maximum number of lines to read; ``None`` reads the whole file.

    Returns:
        A ``pandas.DataFrame`` with one row per line read and, when at least
        one line was read, one column per entry of ``columns``.

    Raises:
        KeyError: If a record lacks one of the requested columns.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    data = []
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            # Stop once `count` lines were consumed (never true when count is None).
            if idx == count:
                break
            d = json.loads(line)
            # Keep only the requested fields to limit memory usage.
            data.append({col: d[col] for col in columns})
    return pd.DataFrame(data)
# Load the whole snapshot, keeping only the four fields used below.
data = readArxivFile(
    path='arxiv-metadata-oai-snapshot.json',
    columns=['id', 'title', 'categories', 'abstract'],
)
id | title | categories | abstract | |
0 | 0704.0001 | Calculation of prompt diphoton production cros... | hep-ph | A fully differential calculation in perturba... |
1 | 0704.0002 | Sparsity-certifying Graph Decompositions | math.CO cs.CG | We describe a new algorithm, the $(k,\ell)$-... |
2 | 0704.0003 | The evolution of the Earth-Moon system based o... | physics.gen-ph | The evolution of Earth-Moon system is descri... |
3 | 0704.0004 | A determinant of Stirling cycle numbers counts... | math.CO | We show that a determinant of Stirling cycle... |
4 | 0704.0005 | From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a... | math.CA math.FA | In this paper we show how to compute the $\L... |
... | ... | ... | ... | ... |
1796906 | supr-con/9608008 | On the origin of the irreversibility line in t... | supr-con cond-mat.supr-con | We report on measurements of the angular dep... |
1796907 | supr-con/9609001 | Nonlinear Response of HTSC Thin Film Microwave... | supr-con cond-mat.supr-con | The non-linear microwave surface impedance o... |
1796908 | supr-con/9609002 | Critical State Flux Penetration and Linear Mic... | supr-con cond-mat.supr-con | The vortex contribution to the dc field (H) ... |
1796909 | supr-con/9609003 | Density of States and NMR Relaxation Rate in A... | supr-con cond-mat.supr-con | We show that the density of states in an ani... |
1796910 | supr-con/9609004 | Ginzburg Landau theory for d-wave pairing and ... | supr-con cond-mat.supr-con | The Ginzburg Landau theory for d_{x^2-y^2}-w... |
1796911 rows × 4 columns
# Merge title and abstract into a single text field per paper.
# The explicit space keeps the last title word from fusing with the first
# abstract word when the abstract does not start with whitespace.
data['text'] = data['title'] + ' ' + data['abstract']
# Flatten embedded line breaks and lower-case in a single pass over the column.
data['text'] = data['text'].apply(lambda x: x.replace('\n', ' ').lower())
# The raw columns are redundant once merged into `text`.
data = data.drop(['abstract', 'title'], axis=1)
id | categories | text | |
0 | 0704.0001 | hep-ph | calculation of prompt diphoton production cros... |
1 | 0704.0002 | math.CO cs.CG | sparsity-certifying graph decompositions we d... |
2 | 0704.0003 | physics.gen-ph | the evolution of the earth-moon system based o... |
3 | 0704.0004 | math.CO | a determinant of stirling cycle numbers counts... |
4 | 0704.0005 | math.CA math.FA | from dyadic $\lambda_{\alpha}$ to $\lambda_{\a... |
... | ... | ... | ... |
1796906 | supr-con/9608008 | supr-con cond-mat.supr-con | on the origin of the irreversibility line in t... |
1796907 | supr-con/9609001 | supr-con cond-mat.supr-con | nonlinear response of htsc thin film microwave... |
1796908 | supr-con/9609002 | supr-con cond-mat.supr-con | critical state flux penetration and linear mic... |
1796909 | supr-con/9609003 | supr-con cond-mat.supr-con | density of states and nmr relaxation rate in a... |
1796910 | supr-con/9609004 | supr-con cond-mat.supr-con | ginzburg landau theory for d-wave pairing and ... |
1796911 rows × 3 columns
# Split the space-separated category string into a list of full category
# codes (sub-category suffix kept).
data['categories'] = data['categories'].map(lambda cats: cats.split(' '))
# For every code keep only the part before the first '.', i.e. the top-level
# category; one entry per listed code, duplicates are not removed here.
data['categories_big'] = data['categories'].map(
    lambda codes: [code.partition('.')[0] for code in codes]
)
from sklearn.preprocessing import MultiLabelBinarizer

# Encode each paper's list of top-level categories as one binary indicator row.
top_level_labels = data['categories_big']
mlb = MultiLabelBinarizer()
data_label = mlb.fit_transform(top_level_labels)
array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 1],
[0, 0, 0, ..., 0, 0, 1],
[0, 0, 0, ..., 0, 0, 1]])
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the merged title+abstract text, default settings.
corpus = data['text']
vectorizer = TfidfVectorizer()
data_tfidf = vectorizer.fit_transform(corpus)
<1796911x598592 sparse matrix of type '<class 'numpy.float64'>'
with 151272947 stored elements in Compressed Sparse Row format>
# Split into training and validation sets: 20% held out, fixed seed.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    data_tfidf, data_label, test_size=0.2, random_state=1)

# Multi-label model: one MultinomialNB is fitted per label column.
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
clf = MultiOutputClassifier(MultinomialNB()).fit(x_train, y_train)

from sklearn.metrics import classification_report
y_pred = clf.predict(x_test)
# Label the report rows with the category names learned by `mlb` instead of
# anonymous column indices, so per-category scores are readable.
print(classification_report(y_test, y_pred,
                            target_names=[str(c) for c in mlb.classes_]))
precision recall f1-score support
0 0.00 0.00 0.00 13
1 0.00 0.00 0.00 106
2 0.00 0.00 0.00 275
3 0.00 0.00 0.00 5
4 0.97 0.82 0.89 56225
5 0.00 0.00 0.00 32
6 0.00 0.00 0.00 2
7 0.00 0.00 0.00 482
8 0.00 0.00 0.00 66
9 0.00 0.00 0.00 175
10 0.00 0.00 0.00 40
11 0.91 0.73 0.81 59797
12 0.86 0.84 0.85 62959
13 0.00 0.00 0.00 157
14 0.00 0.00 0.00 623
15 0.16 0.00 0.00 5696
16 0.00 0.00 0.00 83
17 0.87 0.09 0.16 16911
18 0.93 0.10 0.18 8578
19 0.26 0.00 0.00 4726
20 0.82 0.52 0.64 30796
21 0.91 0.34 0.50 27934
22 0.90 0.81 0.85 95914
23 0.02 0.00 0.00 12695
24 0.00 0.00 0.00 63
25 0.12 0.00 0.00 7035
26 0.11 0.00 0.00 4207
27 0.56 0.01 0.01 9741
28 0.00 0.00 0.00 151
29 0.82 0.14 0.25 36026
30 0.00 0.00 0.00 8
31 0.00 0.00 0.00 312
32 0.85 0.04 0.08 6604
33 0.14 0.00 0.00 2408
34 0.96 0.13 0.23 21601
35 0.00 0.00 0.00 304
36 0.84 0.06 0.11 16406
37 0.00 0.00 0.00 40
micro avg 0.90 0.53 0.66 489196
macro avg 0.32 0.12 0.15 489196
weighted avg 0.82 0.53 0.59 489196
samples avg 0.69 0.62 0.64 489196
from sklearn.model_selection import train_test_split
# Texts and their label rows must both be passed: with a single array
# train_test_split returns only two pieces and the 4-way unpacking fails.
# NOTE(review): test_size=0.95 leaves just 5% for training — confirm intended.
x_train, x_test, y_train, y_test = train_test_split(
    data['text'].iloc[:100000], data_label[:100000],
    test_size=0.95, random_state=1)
# Hyper-parameters
max_features = 500  # passed to Tokenizer as num_words (vocabulary cap)
max_len = 150       # every sequence is padded/truncated to this length
batch_size = 128    # mini-batch size intended for training
epochs = 5          # number of training epochs intended for training

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

# Work on the first 100000 papers only.
subset_text = data['text'].iloc[:100000]
tokens = Tokenizer(num_words=max_features)
# The tokenizer has to learn its vocabulary first; unfitted, it maps every
# text to an empty sequence and the padded input would be all zeros.
tokens.fit_on_texts(subset_text)
# Labels for exactly the same rows as the texts above.
y_train = data_label[:100000]
x_sub_train = tokens.texts_to_sequences(subset_text)
x_sub_train = sequence.pad_sequences(x_sub_train, maxlen=max_len)
# Recurrent text model (the recurrent layer used below is a bidirectional GRU).
# Keras layers:
from keras.layers import Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU
from keras.layers import Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
# Keras callback functions:
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.models import Model
from keras.optimizers import Adam

# Embedding width; it was referenced below without ever being defined.
# NOTE(review): 100 is a chosen value — confirm against the original run.
embed_size = 100

sequence_input = Input(shape=(max_len, ))
x = Embedding(max_features, embed_size, trainable=True)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
# Summarise the convolved sequence with both average and max pooling.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
# One sigmoid unit per label column, so the width follows the label matrix
# instead of a hard-coded 38.
preds = Dense(y_train.shape[1], activation="sigmoid")(x)
model = Model(sequence_input, preds)
# A model must be compiled before fit(); binary cross-entropy scores each
# sigmoid output as an independent yes/no decision (multi-label setup).
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
# Hold out the last 20% of the samples for validation during training.
model.fit(x_sub_train, y_train,
          batch_size=batch_size, validation_split=0.2, epochs=epochs)
WARNING:tensorflow:From D:\softwares\DevTool\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.
Train on 80000 samples, validate on 20000 samples
Epoch 1/5
80000/80000 [==============================] - 570s 7ms/step - loss: 0.1266 - accuracy: 0.9677 - val_loss: 0.0764 - val_accuracy: 0.9757
Epoch 2/5
80000/80000 [==============================] - 622s 8ms/step - loss: 0.0694 - accuracy: 0.9768 - val_loss: 0.0636 - val_accuracy: 0.9786
Epoch 3/5
80000/80000 [==============================] - 621s 8ms/step - loss: 0.0609 - accuracy: 0.9789 - val_loss: 0.0576 - val_accuracy: 0.9800
Epoch 4/5
80000/80000 [==============================] - 620s 8ms/step - loss: 0.0572 - accuracy: 0.9799 - val_loss: 0.0557 - val_accuracy: 0.9805
Epoch 5/5
80000/80000 [==============================] - 618s 8ms/step - loss: 0.0553 - accuracy: 0.9804 - val_loss: 0.0541 - val_accuracy: 0.9808
<keras.callbacks.callbacks.History at 0x1b6b7ca1bc8>