数据分析之学术前沿---任务4

import seaborn as sns #⽤于画图
from bs4 import BeautifulSoup #⽤于爬取arxiv的数据
import re #⽤于正则表达式,匹配字符串的模式
import requests #⽤于⽹络连接,发送⽹络请求,使⽤域名获取对应信息
import json #读取数据,我们的数据为json格式的
import pandas as pd #数据处理,数据分析
import matplotlib.pyplot as plt #画图⼯具
data = []  # collects one trimmed-down dict per paper
# A `with` block closes the file handle automatically, even if reading raises.
# The file is parsed one JSON document per line. The encoding is passed
# explicitly so decoding does not depend on the platform default
# (assumes the metadata file is UTF-8 encoded -- TODO confirm).
with open("arxiv-metadata-oai-2019.json", 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f):
        d = json.loads(line)
        # Keep only the three fields used by the rest of the analysis.
        d = {'title': d['title'], 'categories': d['categories'], 'abstract': d['abstract']}
        data.append(d)

        # Use only part of the data: stop after the record whose line index
        # first exceeds 200000 has been appended.
        if idx > 200000:
            break
data = pd.DataFrame(data)  # turn the list into a DataFrame for analysis with pandas
data  # bare expression: shown as the cell output when run in a notebook
title categories abstract
0 Remnant evolution after a carbon-oxygen white ... astro-ph We systematically explore the evolution of t...
1 Cofibrations in the Category of Frolicher Spac... math.AT Cofibrations are defined in the category of ...
2 Torsional oscillations of longitudinally inhom... astro-ph We explore the effect of an inhomogeneous ma...
3 On the Energy-Momentum Problem in Static Einst... gr-qc This paper has been removed by arXiv adminis...
4 The Formation of Globular Cluster Systems in M... astro-ph The most massive elliptical galaxies show a ...
... ... ... ...
170613 Enhancement of Magneto-Optic Effects via Large... quant-ph We utilize the generation of large atomic co...
170614 Explicit and Exact Solutions to a Kolmogorov-P... solv-int nlin.SI Some explicit traveling wave solutions to a ...
170615 Linear r-Matrix Algebra for a Hierarchy of One... solv-int nlin.SI We consider a hierarchy of many-particle sys...
170616 Pfaff tau-functions solv-int adap-org hep-th nlin.AO nlin.SI Consider the evolution $$ \frac{\pl m_\iy}{\...
170617 The General Solution of the Complex Monge-Amp\... solv-int nlin.SI A general solution to the Complex Monge-Amp\...

170618 rows × 3 columns

# Build the combined text column: each title immediately followed by its
# abstract (element-wise string concatenation, no separator inserted).
data['text'] = data['title'].add(data['abstract'])
data['text']  # bare expression: shown as the cell output when run in a notebook
0         Remnant evolution after a carbon-oxygen white ...
1         Cofibrations in the Category of Frolicher Spac...
2         Torsional oscillations of longitudinally inhom...
3         On the Energy-Momentum Problem in Static Einst...
4         The Formation of Globular Cluster Systems in M...
                                ...                        
170613    Enhancement of Magneto-Optic Effects via Large...
170614    Explicit and Exact Solutions to a Kolmogorov-P...
170615    Linear r-Matrix Algebra for a Hierarchy of One...
170616    Pfaff tau-functions  Consider the evolution $$...
170617    The General Solution of the Complex Monge-Amp\...
Name: text, Length: 170618, dtype: object
# Replace every newline in the combined text with a single space.
# The vectorized string accessor with regex=False performs a literal
# replacement and passes missing values through instead of raising on them.
data['text'] = data['text'].str.replace('\n', ' ', regex=False)
data['text']  # bare expression: shown as the cell output when run in a notebook
0         Remnant evolution after a carbon-oxygen white ...
1         Cofibrations in the Category of Frolicher Spac...
2         Torsional oscillations of longitudinally inhom
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值