参加了天池的 学术前沿趋势分析比赛,这个比赛主要对arXiv论文完成对应的数据分析,这里整理下完成task的思路及踩过的坑:
下载数据集后的表结构如下:
字段名 | 字段说明 | 例子 |
---|---|---|
abstract | 摘要 | We systematically explore the evolution of the merger of two carbon-oxygen\n(CO) white dwarfs. The dynamical evolution of a 0.9 Msun + 0.6 Msun CO white… |
authors | 作者 | Sung-Chul Yoon, Philipp Podsiadlowski and Stephan Rosswog |
authors_parsed | 作者信息 | [[‘Yoon’, ‘Sung-Chul’, ‘’], [‘Podsiadlowski’, ‘Philipp’, ‘’],[‘Rosswog’, ‘Stephan’, ‘’]] |
categories | 论文在 arXiv 系统的所属类别或标签 | astro-ph |
comments | 论文页数和图表等其他信息 | 15 pages, 15 figures, 3 tables, submitted to MNRAS (Low resolution\n version; a high resolution version can be found at:\n http://www.astro.uva.nl/~scyoon/papers/wdmerger.pdf) |
doi | 数字对象标识符 | 10.1111/j.1365-2966.2007.12161.x |
id | arXiv ID,可用于访问论文 | 0704.0297 |
journal-ref | 论文发表的期刊的信息 | Chin.Phys.Lett.24:355-358,2007 |
license | 文章的许可证 | http://arxiv.org/licenses/nonexclusive-distrib/1.0/ |
report-no | 报告编号 | BELLE-CONF-0702 |
submitter | 提交者 | Liming Zhang |
title | 标题 | Measurement of D0-D0bar mixing in D0->Ks pi+ pi- decays |
update_date | 更新日期 | 2019-08-12 |
versions | 版本 | [{‘version’: ‘v1’, ‘created’: ‘Sat, 7 Apr 2007 20:23:54 GMT’},{‘version’: ‘v2’, ‘created’: ‘Thu, 21 Jun 2007 14:27:55 GMT’}] |
1 文献分类统计
论文数量统计(数据统计任务):统计2019年全年,计算机各个方向论文数量
此任务主要涉及爬取分类信息及饼图的绘制
1.1 数据导入
先导包
import seaborn as sns #用于画图
from bs4 import BeautifulSoup
import re
import requests #用于网络连接,发送网络请求,使用域名获取对应信息
import json #读取数据,我们的数据为json格式的
import pandas as pd #数据处理,数据分析
import matplotlib.pyplot as plt #画图工具
pd.set_option('display.max_rows', 200)
导数据
先读100个熟悉数据
# Read the first 100 records to get a feel for the data.
data = []
# encoding='utf-8' is required: the arXiv dump is UTF-8, and without it the
# platform default encoding (e.g. GBK on Chinese Windows) raises UnicodeDecodeError.
with open(r"E:\laptop\数分\天池_学术前沿\arxiv-metadata-oai-2019.json", 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f):
        if idx >= 100:
            break
        data.append(json.loads(line))
data = pd.DataFrame(data)  # list of dicts -> DataFrame for pandas analysis
data.shape  # show (rows, columns)
查看数据头
data.head()
检查没有发现问题,选取需要使用的id、分类、更新日期3列
#读取所有数据,筛选出id、分类、更新日期3列
def readArxivFile(path, columns=['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
                                 'report-no', 'categories', 'license', 'abstract', 'versions',
                                 'update_date', 'authors_parsed'], count=None):
    '''
    Read an arXiv metadata file (one JSON object per line) into a DataFrame.

    path: path of the JSON-lines file
    columns: which keys of each record to keep as DataFrame columns
    count: number of lines to read; None reads the whole file
    '''
    data = []
    # Explicit UTF-8: the metadata dump is UTF-8 and the platform default
    # encoding (e.g. GBK on Chinese Windows) would fail on it.
    with open(path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if idx == count:
                break
            record = json.loads(line)
            data.append({col: record[col] for col in columns})
    return pd.DataFrame(data)
data = readArxivFile(r'E:\laptop\数分\天池_学术前沿\arxiv-metadata-oai-2019.json', ['id', 'categories', 'update_date'])
1.2查看文章分类
describe()查看分类个数、种类、高频元素
#2. inspect the categories column
# describe() on an object (string) column reports:
data['categories'].describe()
#count  number of non-null entries
#unique number of distinct category strings
#top    the most frequent category string
#freq   how many times the top value appears
部分论文分类不止一种,将它们拆开后查看总的种类
# Some papers carry several space-separated categories; split each string
# and pool every individual tag into one set of distinct categories.
uni_cate = set()
for cat_str in data['categories']:
    uni_cate.update(cat_str.split(' '))
len(uni_cate)
1.3 筛选
筛选2019后论文
data['year']=pd.to_datetime(data['update_date']).dt.year# parse update_date and keep only the year
del data['update_date']  # no longer needed once the year is extracted
data = data[data['year']>=2019]  # keep papers updated in 2019 or later
data.reset_index(drop=True,inplace=True)  # renumber rows after filtering
data
1.4 筛选计算机领域所有文章
用bs4爬取具体分类表,这里level_3_code一定要将最后的空格去除,否则后面无法合并
#4.1 fetch the arXiv category taxonomy page
website_url = requests.get('https://arxiv.org/category_taxonomy').text
# Name the parser explicitly: with no argument bs4 picks whatever is installed
# locally (lxml vs html.parser), which can change the parse tree, and it warns.
soup = BeautifulSoup(website_url, 'html.parser')
root = soup.find('div', {'id': 'category_taxonomy_list'})  # entry tag of the taxonomy list
tags = root.find_all(['h2', 'h3', 'h4', 'p'], recursive=True)
#4.2 initialise accumulators
level_1_name = ""
level_2_name = ""
level_2_code = ""
level_1_names = []
level_2_codes = []
level_2_names = []
level_3_codes = []
level_3_names = []
level_3_notes = []
#4.3 walk the tags: heading levels carry group / archive / category
for t in tags:
    if t.name == 'h2':
        # h2 = top-level group, e.g. "Computer Science"
        level_1_name = t.text
        level_2_code = t.text
        level_2_name = t.text
    elif t.name == 'h3':
        # h3 text looks like "Archive Name(archive.id)"; split name and code
        raw = t.text
        level_2_code = re.sub(r"(.*)\((.*)\)", r"\2", raw)
        level_2_name = re.sub(r"(.*)\((.*)\)", r"\1", raw)
    elif t.name == 'h4':
        # h4 text looks like "cs.AI (Artificial Intelligence)"
        raw = t.text
        level_3_code = re.sub(r"(.*)\((.*)\)", r"\1", raw)
        level_3_name = re.sub(r"(.*)\((.*)\)", r"\2", raw)
    elif t.name == 'p':
        # the <p> that follows an h4 is the category description; emit one row
        notes = t.text
        level_1_names.append(level_1_name)
        level_2_names.append(level_2_name)
        level_2_codes.append(level_2_code)
        level_3_names.append(level_3_name)
        # strip() instead of [:-1]: [:-1] blindly drops the last character and
        # corrupts the code if the trailing space ever disappears from the page.
        # The trailing whitespace must go or the later merge on 'categories' fails.
        level_3_codes.append(level_3_code.strip())
        level_3_notes.append(notes)
df_taxonomy = pd.DataFrame({
    'group_name': level_1_names,
    'archive_name': level_2_names,
    'archive_id': level_2_codes,
    'category_name': level_3_names,
    'categories': level_3_codes,
    'categrory_description': level_3_notes
})
df_taxonomy
1.5饼图可视化
合并爬下来的数据和具体类别并统计每个类别paper数
# Attach each paper's taxonomy row, then count distinct papers per top-level group.
merged = data.merge(df_taxonomy, on='categories', how='left')
deduped = merged.drop_duplicates(["id", "group_name"])
counts = deduped.groupby("group_name").agg({"id": "count"})
_df = counts.sort_values(by='id', ascending=False).reset_index()
_df
matplotlib画饼图
fig = plt.figure(figsize=(15,12))
explode = (0,0,0,0.2,0.3,0.3,0.2,0.1)
#.pie() autopct formats the percentage printed inside each wedge
#startangle: angle where drawing starts; default draws counter-clockwise from the positive x-axis, 90 starts from the positive y-axis
#explode: per-wedge offset from the centre (pulls small wedges out for readability)
plt.pie(_df['id'], labels=_df['group_name'], autopct='%1.2f%%', startangle=160, explode=explode)
plt.tight_layout()
plt.show()
1.6统计计算机子类的论文数
group_name = 'Computer Science'
# inner merge with the taxonomy, then keep only the CS group
cats = data.merge(df_taxonomy,on='categories').query("group_name == @group_name")
#pivot() reshapes rows into columns: one row per sub-category, one column per year
cats.groupby(['year','category_name']).count().reset_index().pivot(index='category_name',columns='year',values='id')
2 作者统计
这个任务主要学习字符串操作、画直方图
2.1读文件
导包
import seaborn as sns
from bs4 import BeautifulSoup
import re
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
读取json文件,只取包含作者的列
def readArxivFile(path, columns=['id','submitter','authors','title','comments','journal-ref','doi','report-no','categories','license',
'abstract','versions','update_date','authors_parsed'],count=None):
data = []
with open(path,'r') as f:
for idx,line in enumerate(f):
if idx == count:
break
d = json.loads(line)
d = {col:d[col] for col in columns}
data.append(d)
data = pd.DataFrame(data)
return data
data = readArxivFile(r"E:\laptop\数分\天池_学术前沿\arxiv-metadata-oai-2019.json",['id','authors','categories','authors_parsed'],100000)
2.2 绘制作者出现频率直方图
获得名字
#2.1 collect the parsed author lists of all cs.CV papers
import itertools  # for the linear-time flatten below
cv_data = data[data['categories'].apply(lambda x:'cs.CV' in x)]
# Flatten the per-paper author lists. sum(lists, []) re-copies the accumulator
# on every paper (quadratic); chain.from_iterable is linear in the total
# number of authors and yields the identical concatenated list.
all_authors = list(itertools.chain.from_iterable(cv_data['authors_parsed']))
all_authors
拼接名字
#2.2 join the parsed name parts ("Last", "First", "") into one display string
authors_names = [' '.join(x) for x in all_authors]
authors_names = pd.DataFrame(authors_names)
根据出现频率统计并画图
plt.figure(figsize=(10,6))
# horizontal bars for the 10 most frequent author names
authors_names[0].value_counts().head(10).plot(kind='barh')
names = authors_names[0].value_counts().index.values[:10]# the tick labels for the y-axis
_ = plt.yticks(range(0,len(names)),names)
plt.ylabel('author')
plt.xlabel('count')
2.3 统计姓频率
# the last name is the first element of each parsed author entry
authors_lastnames = [x[0] for x in all_authors]
authors_lastnames = pd.DataFrame(authors_lastnames)
plt.figure(figsize=(10,6))
# horizontal bars for the 10 most frequent last names
authors_lastnames[0].value_counts().head(10).plot(kind='barh')
names = authors_lastnames[0].value_counts().index.values[:10]
_ = plt.yticks(range(0, len(names)),names)
plt.ylabel('author')
plt.xlabel('count')
2.4 统计姓首字母频率
# x[0][0] = first letter of the last name
authors_lastnames = [x[0][0] for x in all_authors]
authors_lastnames = pd.DataFrame(authors_lastnames)
plt.figure(figsize=(10,6))
# horizontal bars for the 10 most frequent last-name initials
authors_lastnames[0].value_counts().head(10).plot(kind='barh')
names = authors_lastnames[0].value_counts().index.values[:10]
_ = plt.yticks(range(0, len(names)),names)
plt.ylabel('author')
plt.xlabel('count')
3 文本分类
此任务的主要目的在于使用分类器解决文本多分类问题,主要利用paper的标题和摘要,用多项式朴素贝叶斯算法及BiLSTM两种方法完成对文本的分类。
3.1数据处理
导入InteractiveShell让jupyter可以显示多个结果
from IPython.core.interactiveshell import InteractiveShell
# display the value of every expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'
读取文章编号、标题、摘要、分类四列
def readArxivFile(path, columns=['id','submitter','authors','title','comments','journal-ref','doi','report-no','categories','license',
                                 'abstract','versions','update_date','authors_parsed'], count=None):
    '''
    Read an arXiv metadata file (one JSON object per line) into a DataFrame.

    path: path of the JSON-lines file
    columns: which keys of each record to keep as DataFrame columns
    count: number of lines to read; None reads the whole file
    '''
    data = []
    # Explicit UTF-8: the dump is UTF-8 and the platform default encoding
    # (e.g. GBK on Chinese Windows) would fail on it.
    with open(path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if idx == count:
                break
            record = json.loads(line)
            data.append({col: record[col] for col in columns})
    return pd.DataFrame(data)
data = readArxivFile(r"E:\laptop\数分\天池_学术前沿\arxiv-metadata-oai-2019.json",['id','title','categories','abstract'])
将标题和摘要拼接
# merge title and abstract into a single text field
data['text'] = data['title']+data['abstract']
data['text'] = data['text'].apply(lambda x : x.replace('\n',' '))  # strip embedded newlines
data['text'] = data['text'].apply(lambda x : x.lower())  # lowercase before vectorizing
data = data.drop(['abstract','title'],axis=1)  # raw columns no longer needed
处理多分类文本,将其划分为上位类
data['categories'] = data['categories'].apply(lambda x : x.split(' '))# split multi-category strings into a list of tags
data['upper_categories'] = data['categories'].apply(lambda x : [xx.split('.')[0] for xx in x])# map each tag (e.g. cs.AI) to its parent archive (cs)
处理过后数据如下:
将类别编码成相应标签,主要为了处理一个文本可能有多个分类问题,导入
多标签分类格式
from sklearn.preprocessing import MultiLabelBinarizer
# one-hot encode the label lists; handles papers that carry several categories
mlb = MultiLabelBinarizer()
data_label = mlb.fit_transform(data['upper_categories'].iloc[:])
data_label.shape
mlb.classes_# all label classes, in the column order of data_label
3.2 多项式贝叶斯进行分类
3.2.1TF-IDF提取特征词
先将文本向量化,获取最多4000个特征
from sklearn.feature_extraction.text import TfidfVectorizer
# keep at most the 4000 terms with the highest corpus frequency
vectorizer = TfidfVectorizer(max_features=4000)
data_tfidf = vectorizer.fit_transform(data['text'].iloc[:])
repr(data_tfidf)
提取TF-IDF最大的词作为特征值
# numpy is used below but was never imported anywhere earlier in this script;
# without this line, np.array raises NameError
import numpy as np
max_value = data_tfidf.max(axis=0).toarray().ravel()  # each term's highest tf-idf, flattened to 1-D
sorted_by_tfidf = max_value.argsort()  # indices that would sort the scores ascending
# NOTE(review): get_feature_names() was removed in scikit-learn >= 1.2;
# use get_feature_names_out() on recent versions.
feature_names = np.array(vectorizer.get_feature_names())
print('feature with high tfidf:\n{}'.format(feature_names[sorted_by_tfidf[-20:]]))
3.2.2 划分数据集
20%80%划分训练集和测试集
from sklearn.model_selection import train_test_split
# 80% train / 20% test, fixed seed for reproducibility
x_train,x_test,y_train,y_test = train_test_split(data_tfidf, data_label, test_size=0.2, random_state=1)
3.2.3 构建多项式贝叶斯模型
导入模型并训练
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB# multinomial naive Bayes
# fit one binary classifier per label column (multi-label classification)
clf = MultiOutputClassifier(MultinomialNB()).fit(x_train,y_train)
3.2.4 结果显示
classification_report显示结果
from sklearn.metrics import classification_report
# per-label precision / recall / f1; "support" = occurrences of each label
print(classification_report(y_test, clf.predict(x_test)))
support为标签出现次数
最后精度在75%左右
3.3 BiLSTM进行分类
3.3.1 重新划分数据集
只取前100000个数据
from sklearn.model_selection import train_test_split
# NOTE(review): test_size=0.95 keeps only 5% of the 100k rows for training —
# presumably to keep the LSTM demo fast; confirm this is intentional.
x_train, x_test, y_train, y_test = train_test_split(data['text'].iloc[:100000],
data_label[:100000],
test_size = 0.95,random_state = 1)
3.3.2 文本序列化
# hyper-parameters
max_features= 500  # vocabulary size kept by the tokenizer
max_len= 150  # pad/truncate every sequence to this length
embed_size=100  # embedding dimension
batch_size = 128
epochs = 5
from keras.preprocessing.text import Tokenizer # word-level tokenizer
from keras.preprocessing import sequence
tokens = Tokenizer(num_words = max_features)
tokens.fit_on_texts(list(data['text'].iloc[:100000]))
y_train = data_label[:100000]
x_sub_train = tokens.texts_to_sequences(data['text'].iloc[:100000])
x_sub_train = sequence.pad_sequences(x_sub_train, maxlen=max_len)
pad_sequences()设置了最大序列长度来保证输入长度相等
3.3.3 构建BiLSTM并训练
这一步进行词嵌入并构建BiLSTM来训练,注意这里输出大小要改为34的Dense层
from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.layers import Dropout,Embedding,GlobalMaxPool1D,MaxPooling1D,Add,Flatten
from keras.layers import GlobalAveragePooling1D,GlobalMaxPool1D,concatenate,SpatialDropout1D
from keras.callbacks import Callback,EarlyStopping,ModelCheckpoint
from keras import initializers, regularizers,constraints,optimizers,layers,callbacks
from keras.models import Model
from keras.optimizers import Adam
# NOTE(review): despite the section title, the recurrent layer below is a
# bidirectional GRU, not an LSTM; also GlobalMaxPool1D is imported twice above.
sequence_input = Input(shape=(max_len,))
x = Embedding(max_features, embed_size, trainable=True)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128,return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
x = Conv1D(64,kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPool1D()(x)
x = concatenate([avg_pool,max_pool])
preds = Dense(34,activation='sigmoid')(x)# output size = number of label classes (34)
model = Model(sequence_input,preds)
# sigmoid + binary_crossentropy gives independent per-label probabilities (multi-label)
# NOTE(review): `lr` is deprecated in modern Keras/TF2 — use `learning_rate` there.
model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-3),metrics=['accuracy'])
model.fit(x_sub_train,y_train,
batch_size=batch_size,
validation_split=0.2,
epochs=epochs)
最后精度在71%左右
4 作者信息关联
这个task主要对作者进行关联,绘制作者关系图,进行社交网络分析。
利用了networkx这个包对论文作者进行合著分析。
4.1 读数据
只要id和作者名两列
def readArxivFile(path, columns=['id','submitter','authors','title','comments','journal-ref','doi','report-no','categories','license',
                                 'abstract','versions','update_date','authors_parsed'], count=None):
    '''
    Read an arXiv metadata file (one JSON object per line) into a DataFrame.

    path: path of the JSON-lines file
    columns: which keys of each record to keep as DataFrame columns
    count: number of lines to read; None reads the whole file
    '''
    data = []
    # Explicit UTF-8: the dump is UTF-8 and the platform default encoding
    # (e.g. GBK on Chinese Windows) would fail on it.
    with open(path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            if idx == count:
                break
            record = json.loads(line)
            data.append({col: record[col] for col in columns})
    return pd.DataFrame(data)
data = readArxivFile(r"E:\laptop\数分\天池_学术前沿\arxiv-metadata-oai-2019.json",['id','authors_parsed'],200000)
4.2 构建作者论文无向图
这里先绘制6篇论文作者关系图,将论文第一作者和后面作者链接
import networkx as nx
G = nx.Graph()  # undirected co-authorship graph
# build the graph from the first 6 papers
for row in data.iloc[:6].itertuples():
    # itertuples() yields (Index, id, authors_parsed): row[1] is the 'id'
    # string, NOT the author list, so indexing by position iterated the id's
    # characters. Attribute access picks the right column regardless of order.
    authors = row.authors_parsed
    # each entry is ['Last', 'First', '']; drop the trailing '' and join
    authors = [' '.join(x[:-1]) for x in authors]
    # link the first author to every co-author
    for author in authors[1:]:
        G.add_edge(authors[0], author)
nx.draw(G, with_labels=True)
4.3 构建最大连通子图与子图节点度值
这里绘制前500篇论文的作者
4.3.1 构建论文作者无向图
# same construction over the first 500 papers
for row in data.iloc[:500].itertuples():
    # row[1] would be the 'id' column — the parsed author list is the
    # 'authors_parsed' attribute (positionally row[2]), not row[1]
    authors = row.authors_parsed
    authors = [' '.join(x[:-1]) for x in authors]  # ['Last','First',''] -> "Last First"
    # link the first author to every co-author
    for author in authors[1:]:
        G.add_edge(authors[0], author)
4.3.2 构建子图节点度量值折线图
# degree of every node, largest first
degree_sequence = sorted([d for n,d in G.degree()], reverse=True)
dmax = max(degree_sequence)
plt.loglog(degree_sequence,'b-',marker='o')# log-log rank vs degree plot
plt.title('degree rank plot')
plt.ylabel('degree')
plt.xlabel('rank')
4.3.3 构建最大连通图
# draw the largest connected component as an inset axes
plt.axes([0.45, 0.45, 0.45, 0.45])
Gcc = G.subgraph(sorted(nx.connected_components(G), key=len, reverse=True)[0])# largest component by node count
pos = nx.spring_layout(Gcc)# force-directed layout
plt.axis('off')
nx.draw_networkx_nodes(Gcc, pos, node_size=20)
nx.draw_networkx_edges(Gcc, pos, alpha=0.4)
plt.show()