主题:论文种类分类
利用已有数据建模,对新论文进行类别分类
使用论文标题和摘要
完成类别分类
import seaborn as sns
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
def readArxivFile(path, columns=('id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
                                 'report-no', 'categories', 'license', 'abstract', 'versions',
                                 'update_date', 'authors_parsed'), count=None):
    """Read a JSON-lines metadata file into a pandas DataFrame.

    Each line of the file is parsed as one JSON object and reduced to the
    requested fields.

    Args:
        path: Path of the JSON-lines file.
        columns: Names of the fields to keep from every record.
        count: Maximum number of lines to read; ``None`` reads the whole file.

    Returns:
        A DataFrame with one row per line read and one column per requested
        field, in the order given by ``columns``.

    Raises:
        KeyError: If a record lacks one of the requested fields.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    # De-duplicate while preserving order; also accepts any iterable of names.
    columns = list(dict.fromkeys(columns))
    records = []
    # JSON text is UTF-8 by specification; be explicit so the platform's
    # default codec (e.g. a legacy code page on Windows) is never used.
    with open(path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if idx == count:
                break
            record = json.loads(line)
            records.append({col: record[col] for col in columns})
    # Passing `columns` keeps the expected columns even when nothing was read.
    return pd.DataFrame(records, columns=columns)
# Load the first 200,000 records, keeping only the fields used below.
data = readArxivFile(
    path='arxiv-metadata-oai-snapshot.json',
    columns=['id', 'title', 'categories', 'abstract'],
    count=200000,
)
data.head(2)
id | title | categories | abstract | |
---|---|---|---|---|
0 | 0704.0001 | Calculation of prompt diphoton production cros... | hep-ph | A fully differential calculation in perturba... |
1 | 0704.0002 | Sparsity-certifying Graph Decompositions | math.CO cs.CG | We describe a new algorithm, the $(k,\ell)$-... |
1. 数据处理
# Work on a copy so the preprocessing steps below do not modify `data`.
data1 = data.copy()
将标题和摘要拼接一起完成分类
# Join title and abstract with an explicit space so the last word of the
# title can never fuse with the first word of the abstract.
data1['text'] = data['title'] + ' ' + data['abstract']
# Flatten hard line breaks and lowercase everything with vectorized string
# methods (literal replacement, no regex interpretation).
data1['text'] = data1['text'].str.replace('\n', ' ', regex=False).str.lower()
# The raw columns are now folded into `text`.
data1 = data1.drop(['abstract', 'title'], axis=1)
data1.head(2)
id | categories | text | |
---|---|---|---|
0 | 0704.0001 | hep-ph | calculation of prompt diphoton production cros... |
1 | 0704.0002 | math.CO cs.CG | sparsity-certifying graph decompositions we d... |
# Full category list per paper, sub-categories included (e.g. "math.CO").
data1['categories'] = data1['categories'].apply(lambda raw: raw.split(' '))
# Top-level categories only: keep the part before the first dot.
data1['categories_big'] = data1['categories'].apply(
    lambda cats: [cat.partition('.')[0] for cat in cats]
)
data1.head(2)
id | categories | text | categories_big | |
---|---|---|---|---|
0 | 0704.0001 | [hep-ph] | calculation of prompt diphoton production cros... | [hep-ph] |
1 | 0704.0002 | [math.CO, cs.CG] | sparsity-certifying graph decompositions we d... | [math, cs] |
将类别进行编码:每篇论文可能同时属于多个类别,所以需要使用多标签(multi-hot)编码
from sklearn.preprocessing import MultiLabelBinarizer
# Multi-hot encode the top-level categories of every paper.
mlb = MultiLabelBinarizer()
data_label = mlb.fit_transform(data1['categories_big'])
2. 分类
2.1 TFIDF
如果某个词或短语在一篇文章中出现的频率TF高,并且在其他文章中很少出现,则认为此词或者短语具有很好的类别区分能力,适合用来分类
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features over the combined text, vocabulary capped at 4000 terms.
vectorizer = TfidfVectorizer(max_features=4000)
data_tfidf = vectorizer.fit_transform(data1['text'])
data_tfidf
<200000x4000 sparse matrix of type '<class 'numpy.float64'>'
with 13654199 stored elements in Compressed Sparse Row format>
# Split into training and validation sets (20% held out, fixed seed).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    data_tfidf,
    data_label,
    test_size=0.2,
    random_state=1,
)
# Build the multi-label model: MultinomialNB wrapped in MultiOutputClassifier.
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
clf = MultiOutputClassifier(MultinomialNB())
clf.fit(x_train, y_train)
clf
MultiOutputClassifier(estimator=MultinomialNB())
from sklearn.metrics import classification_report
# Some labels/samples receive no positive prediction, which makes precision
# and F-score undefined. zero_division=0 reports them as 0.0 (the same value
# the default falls back to) without emitting UndefinedMetricWarning.
print(classification_report(y_test, clf.predict(x_test), zero_division=0))
precision recall f1-score support
0 0.95 0.85 0.89 7925
1 0.85 0.79 0.82 7339
2 0.77 0.72 0.74 2944
3 0.00 0.00 0.00 4
4 0.72 0.48 0.58 2123
5 0.51 0.66 0.58 987
6 0.86 0.38 0.52 544
7 0.71 0.69 0.70 3649
8 0.76 0.61 0.68 3388
9 0.85 0.88 0.87 10745
10 0.46 0.13 0.20 1757
11 0.79 0.04 0.07 729
12 0.45 0.35 0.39 507
13 0.54 0.36 0.43 1083
14 0.69 0.14 0.24 3441
15 0.84 0.20 0.33 655
16 0.93 0.16 0.27 268
17 0.87 0.43 0.58 2484
18 0.82 0.38 0.52 692
micro avg 0.81 0.65 0.72 51264
macro avg 0.70 0.43 0.50 51264
weighted avg 0.80 0.65 0.69 51264
samples avg 0.72 0.72 0.70 51264
D:\C_Anaconda\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
D:\C_Anaconda\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
2.2 深度学习模型
先对单词进行词嵌入,然后训练模型。这里先将文本编码为整数序列,并截断/填充到固定长度
from sklearn.model_selection import train_test_split
# Split the first 100,000 texts together with their label rows.
# NOTE(review): test_size=0.95 sends 95% of the rows to the test side, and
# x_test / y_test are rebound here — confirm both are intended.
x_trian, x_test, y_trian, y_test = train_test_split(
    data1['text'].head(100000),
    data_label[:100000],
    test_size=0.95,
    random_state=1,
)
x_test[:2]
43660 personal recommendation via modified collabora...
87278 noise driven translocation of short polymers i...
Name: text, dtype: object
# Display the label matrix of the test split (slice of the encoded labels).
y_test
array([[0, 0, 0, ..., 0, 0, 0],
[0, 1, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 1, ..., 0, 0, 1],
[0, 0, 0, ..., 0, 0, 0],
[1, 0, 0, ..., 0, 0, 0]])
# Hyper-parameters
max_features = 500  # passed to the tokenizer as num_words
max_len = 150       # length every sequence is padded / truncated to
embed_size = 100
batch_size = 128
epochs = 5
from keras_preprocessing.text import Tokenizer
from keras_preprocessing import sequence
# Fit the tokenizer on the first 100,000 texts.
tokens = Tokenizer(num_words=max_features)
tokens.fit_on_texts(data1['text'].head(100000).tolist())
# Labels for the same 100,000 rows.
y_train = data_label[:100000]
# Encode the texts as integer sequences and pad them to a common length.
x_sub_trian = sequence.pad_sequences(
    tokens.texts_to_sequences(data1['text'].head(100000)),
    maxlen=max_len,
)
定义模型并完成训练
# Bidirectional GRU + Conv1D model (no LSTM layer is used below)
#keras layers:
from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.layers import Dropout,Embedding,GlobalMaxPool1D,MaxPool1D,Add,Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPool1D,concatenate,SpatialDropout1D
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.models import Model
from keras.optimizers import Adam
# Embedding -> spatial dropout -> bidirectional GRU -> 1-D convolution, then
# average- and max-pooling over the time axis feed a sigmoid output layer.
sequence_input = Input(shape=(max_len,))
x = Embedding(max_features, embed_size, trainable=True)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPool1D()(x)
x = concatenate([avg_pool, max_pool])
# One sigmoid unit per label column. The width comes from the label matrix
# instead of a hard-coded 19, so it stays correct if the label set changes.
preds = Dense(y_train.shape[1], activation="sigmoid")(x)
model = Model(sequence_input, preds)
# NOTE(review): newer Keras releases name this optimizer argument
# `learning_rate`; `lr` is kept for the installed version — confirm on upgrade.
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])
# The last 20% of the training rows are held out by Keras for validation.
model.fit(x_sub_trian, y_train,
          batch_size=batch_size,
          validation_split=0.2,
          epochs=epochs)