import seaborn as sns #⽤于画图
from bs4 import BeautifulSoup #⽤于爬取arxiv的数据
import re #⽤于正则表达式,匹配字符串的模式
import requests #⽤于⽹络连接,发送⽹络请求,使⽤域名获取对应信息
import json #读取数据,我们的数据为json格式的
import pandas as pd #数据处理,数据分析
import matplotlib.pyplot as plt #画图⼯具
# Collect the title, categories and abstract of each record in the dump.
data = []
# A `with` block closes the file handle automatically, even if reading or
# parsing raises. The encoding is passed explicitly so that decoding does not
# depend on the platform's locale default (JSON text is expected to be UTF-8).
with open("arxiv-metadata-oai-2019.json", 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f):  # each line is parsed as one JSON object
        d = json.loads(line)
        # Keep only the three fields used later on.
        d = {
            'title': d['title'],
            'categories': d['categories'],
            'abstract': d['abstract'],
        }
        data.append(d)
        # Work on a subset only: the check runs after the append, so reading
        # stops once the record at index 200001 has been stored.
        if idx > 200000:
            break
data = pd.DataFrame(data)  # list of dicts -> DataFrame, convenient for pandas analysis
data
index | title | categories | abstract |
---|---|---|---|
0 | Remnant evolution after a carbon-oxygen white ... | astro-ph | We systematically explore the evolution of t... |
1 | Cofibrations in the Category of Frolicher Spac... | math.AT | Cofibrations are defined in the category of ... |
2 | Torsional oscillations of longitudinally inhom... | astro-ph | We explore the effect of an inhomogeneous ma... |
3 | On the Energy-Momentum Problem in Static Einst... | gr-qc | This paper has been removed by arXiv adminis... |
4 | The Formation of Globular Cluster Systems in M... | astro-ph | The most massive elliptical galaxies show a ... |
... | ... | ... | ... |
170613 | Enhancement of Magneto-Optic Effects via Large... | quant-ph | We utilize the generation of large atomic co... |
170614 | Explicit and Exact Solutions to a Kolmogorov-P... | solv-int nlin.SI | Some explicit traveling wave solutions to a ... |
170615 | Linear r-Matrix Algebra for a Hierarchy of One... | solv-int nlin.SI | We consider a hierarchy of many-particle sys... |
170616 | Pfaff tau-functions | solv-int adap-org hep-th nlin.AO nlin.SI | Consider the evolution $$ \frac{\pl m_\iy}{\... |
170617 | The General Solution of the Complex Monge-Amp\... | solv-int nlin.SI | A general solution to the Complex Monge-Amp\... |
170618 rows × 3 columns
# Build one text field per paper by appending the abstract directly to the
# title; no separator is inserted between the two.
data['text'] = data['title'].add(data['abstract'])
data['text']  # notebook display of the new column
0 Remnant evolution after a carbon-oxygen white ...
1 Cofibrations in the Category of Frolicher Spac...
2 Torsional oscillations of longitudinally inhom...
3 On the Energy-Momentum Problem in Static Einst...
4 The Formation of Globular Cluster Systems in M...
...
170613 Enhancement of Magneto-Optic Effects via Large...
170614 Explicit and Exact Solutions to a Kolmogorov-P...
170615 Linear r-Matrix Algebra for a Hierarchy of One...
170616 Pfaff tau-functions Consider the evolution $$...
170617 The General Solution of the Complex Monge-Amp\...
Name: text, Length: 170618, dtype: object
# Replace every newline in the combined text with a single space.
# The vectorised string accessor is used instead of a per-row lambda;
# regex=False makes '\n' a literal match, and missing values are passed
# through as NaN rather than raising AttributeError.
data['text'] = data['text'].str.replace('\n', ' ', regex=False)
data['text']  # notebook display of the cleaned column
0 Remnant evolution after a carbon-oxygen white ...
1 Cofibrations in the Category of Frolicher Spac...
2 Torsional oscillations of longitudinally inhom