Background: I want to read papers but never quite know where to start, and I have not yet built up a large enough collection in my field. Every time I download papers from arxiv I find myself thinking: it would be great if papers I am interested in could be found and pushed to me automatically, so I could skim them on my phone and then read the interesting ones closely. That should make things a lot more efficient.
Method: parse the arxiv listing page to get each paper's information (title, ID, authors, and link), match the titles against a set of keywords to pick out the papers of interest, and finally send the collected papers, nicely arranged, to my phone's mailbox on a schedule through a third-party email account.
The steps in detail:
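All of the snippets below share one set of imports (standard library modules plus requests, BeautifulSoup, and pandas); they are listed here once instead of being repeated in every step.

# shared imports for every snippet below
import os
import time
import random
import smtplib
from collections import Counter
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

import requests
import pandas as pd
from bs4 import BeautifulSoup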
step1: fetch and parse the paper listing from the arxiv site
# get all results
def get_all(url='https://arxiv.org/list/cs/pastweek?show=1000', save_path='arxiv/daily/'):
    html = get_one_page(url)
    soup = BeautifulSoup(html, features='html.parser')
    content = soup.dl
    date = soup.find('h3')
    list_ids = content.find_all('a', title='Abstract')
    list_title = content.find_all('div', class_='list-title mathjax')
    list_authors = content.find_all('div', class_='list-authors')
    list_subjects = content.find_all('div', class_='list-subjects')
    list_subject_split = []
    for subjects in list_subjects:
        subjects = subjects.text.split(': ', maxsplit=1)[1]
        subjects = subjects.replace('\n\n', '')
        subjects = subjects.replace('\n', '')
        subject_split = subjects.split('; ')
        list_subject_split.append(subject_split)
    items = []
    for i, paper in enumerate(zip(list_ids, list_title, list_authors, list_subjects, list_subject_split)):
        items.append([paper[0].text, paper[1].text, paper[2].text, paper[3].text, paper[4]])
    name = ['id', 'title', 'authors', 'subjects', 'subject_split']
    paper = pd.DataFrame(columns=name, data=items)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    paper.to_csv(save_path + time.strftime("%Y-%m-%d") + '_' + str(len(items)) + '.csv')
    subject_all = []
    for subject_split in list_subject_split:
        for subject in subject_split:
            subject_all.append(subject)
    subject_cnt = Counter(subject_all)
    return list_title, subject_cnt, items, paper
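As a quick sanity check (a minimal sketch, simply reusing the return values of get_all()), you can print the head of the table and the most frequent subject areas:

list_title, subject_cnt, items, paper = get_all()
print(paper[['id', 'title']].head())
print(subject_cnt.most_common(5))  # five most frequent subject areas in the listing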
step2: pick out the papers in the specific areas you care about
# filter with keywords (multiple keywords supported)
def split_keywords(paper, key_words=['Detection'], save_path='arxiv/selected/'):
    # key_words2 = ['quantization', 'compress', 'prun']
    # keep every paper whose title contains any of the keywords (case-insensitive)
    selected_papers = paper[paper['title'].str.contains(key_words[0], case=False)]
    for key_word in key_words[1:]:
        selected_paper1 = paper[paper['title'].str.contains(key_word, case=False)]
        selected_papers = pd.concat([selected_papers, selected_paper1], axis=0)
    # a title may match several keywords, so drop duplicate rows by paper id
    selected_papers = selected_papers.drop_duplicates(subset='id')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    selected_papers.to_csv(save_path + time.strftime("%Y-%m-%d") + '_' + str(len(selected_papers)) + '.csv')
    return selected_papers
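One thing to keep in mind (not handled in the function above): pandas' str.contains() treats the pattern as a regular expression by default, so a keyword containing regex metacharacters such as 'C++' should either be escaped or matched with regex=False. A small sketch:

selected_papers = split_keywords(paper, key_words=['Object Detection', 'Detector'])
literal = paper[paper['title'].str.contains('C++', case=False, regex=False)]  # literal, non-regex match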
step3: download the PDFs locally (optional)
# download the selected papers
def download_paper(save_path, selected_papers):
    '''download the keyword-selected papers as PDFs'''
    if not os.path.exists(save_path + time.strftime("%Y-%m-%d")):
        os.makedirs(save_path + time.strftime("%Y-%m-%d"))
    for selected_paper_id, selected_paper_title in zip(selected_papers['id'], selected_papers['title']):
        selected_paper_id = selected_paper_id.split(':', maxsplit=1)[1]
        selected_paper_title = selected_paper_title.split(':', maxsplit=1)[1]
        r = requests.get('https://arxiv.org/pdf/' + selected_paper_id)
        while r.status_code == 403:
            # back off and retry when arxiv refuses the request
            time.sleep(500 + random.uniform(0, 500))
            r = requests.get('https://arxiv.org/pdf/' + selected_paper_id)
        selected_paper_id = selected_paper_id.replace(".", "_")
        pdfname = selected_paper_title.replace("/", "_")  # characters like / and : are not allowed in file names
        pdfname = pdfname.replace("?", "_")
        pdfname = pdfname.replace("\"", "_")
        pdfname = pdfname.replace("*", "_")
        pdfname = pdfname.replace(":", "_")
        pdfname = pdfname.replace("\n", "")
        pdfname = pdfname.replace("\r", "")
        print(save_path + time.strftime("%Y-%m-%d") + '/%s %s.pdf' % (selected_paper_id, pdfname))
        with open(save_path + time.strftime("%Y-%m-%d") + '/%s %s.pdf' % (selected_paper_id, pdfname), "wb") as code:
            code.write(r.content)
step4: send the selected papers to your own mailbox
def send_paper(list_title, subject_cnt, items, selected_papers):
    '''compose the email subject and body'''
    #selected_papers.to_html('email.html')
    content = 'Today arxiv has {} new papers in the CS area, {} of them are about CV, and {} of them contain your keywords.\n\n'.format(len(list_title), subject_cnt['Computer Vision and Pattern Recognition (cs.CV)'], len(selected_papers))
    # content += 'Your keywords are ' + str(key_words) + ' and ' + str(Key_words) + ' (case-sensitive). \n\n'
    content += 'This is your paper list. Enjoy! \n\n'
    for i, selected_paper in enumerate(zip(selected_papers['id'], selected_papers['title'], selected_papers['authors'], selected_papers['subject_split'])):
        content1, content2, content3, content4 = selected_paper
        content += '------------' + str(i+1) + '------------\n' + content1 + content2 + str(content4) + '\n'
        content1 = content1.split(':', maxsplit=1)[1]
        content += 'https://arxiv.org/abs/' + content1 + '\n\n'
    content += 'Here is the research direction distribution report. \n\n'
    subject_items = []
    for subject_name, times in subject_cnt.items():
        subject_items.append([subject_name, times])
    subject_items = sorted(subject_items, key=lambda item: item[1], reverse=True)
    name = ['name', 'times']
    subject_file = pd.DataFrame(columns=name, data=subject_items)
    sub_path = 'arxiv/sub_cnt/'
    if not os.path.exists(sub_path):
        os.makedirs(sub_path)
    subject_file.to_csv(sub_path + time.strftime("%Y-%m-%d") + '_' + str(len(items)) + '.csv')
    for subject_name, times in subject_items:
        content += subject_name + ' ' + str(times) + '\n'
    title = time.strftime("%Y-%m-%d") + ' you have {} papers'.format(len(selected_papers))
    return title, content
step5: write a log for later reference
# write the log
def write_report(save_path, content):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    # date-stamped log file, e.g. arxiv/report/2023-01-01.txt
    with open(save_path + time.strftime("%Y-%m-%d") + '.txt', 'w') as freport:
        freport.write(content)
Helper 1: get_one_page(), fetches a page and backs off before retrying, so that frequent requests do not get the crawler blocked
def get_one_page(url):
    response = requests.get(url)
    print(response.status_code)
    while response.status_code == 403:
        # wait a random while and retry when the server refuses the request
        time.sleep(500 + random.uniform(0, 500))
        response = requests.get(url)
        print(response.status_code)
    if response.status_code == 200:
        return response.text
    return None
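An optional tweak, not part of the original function: sending a browser-like User-Agent header sometimes helps the request look less like a bot; the header string below is only an example.

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}  # example UA string, adjust as needed
response = requests.get(url, headers=headers)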
Helper 2: send_email(), sends the mail through the NetEase (163) SMTP service using an authorization code, to a QQ mailbox or any other address (multiple recipients are possible)
def send_email(title, content):
    # sender's address
    sender = 'xxxxxx@163.com'
    # sender's login user name and authorization code
    user = 'xxxxxxx@163.com'
    password = 'xxxxxxx'  # NetEase mailbox authorization code
    # SMTP server of the sender's mailbox
    smtpserver = 'smtp.163.com'
    # recipient's address
    receiver = 'xxxxxx@qq.com'  # receiver can also be a list
    msg = MIMEMultipart('alternative')
    part1 = MIMEText(content, 'plain', 'utf-8')
    msg.attach(part1)
    # sender address
    msg['From'] = sender
    # recipient address
    msg['To'] = receiver
    # subject
    msg['Subject'] = title
    smtp = smtplib.SMTP()                              # create the SMTP client
    smtp.connect(smtpserver)                           # default port is 25; set another port if the server requires it
    smtp.login(user, password)                         # log in to the SMTP server
    smtp.sendmail(sender, receiver, msg.as_string())   # send the mail
    smtp.quit()
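If the provider rejects plain port-25 connections, the SSL port can be used instead. A minimal sketch, assuming the provider exposes SMTP over SSL on the usual port 465 (check the provider's documentation):

smtp = smtplib.SMTP_SSL(smtpserver, 465)  # SMTP over SSL instead of plain port 25
smtp.login(user, password)
smtp.sendmail(sender, receiver, msg.as_string())
smtp.quit()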
And finally, the main() function:
def main():
    url = 'https://arxiv.org/list/cs/pastweek?show=1000'
    save_all_path = 'arxiv/daily/'
    list_title, subject_cnt, items, paper = get_all(url, save_all_path)
    print(paper.head())
    print("*" * 20)
    key_words = ['Object Detection', 'Detector']
    save_split_path = 'arxiv/selected/'
    selected_papers = split_keywords(paper, key_words, save_split_path)
    print(selected_papers.head())
    #download_paper(save_split_path, selected_papers)
    title, content = send_paper(list_title, subject_cnt, items, selected_papers)
    send_email(title, content)
    save_log_path = 'arxiv/report/'
    write_report(save_log_path, content)
    time.sleep(5)

if __name__ == '__main__':
    main()
From then on, just run it whenever you turn on the computer (out of interest or out of boredom); on Linux you can write a shell script and schedule it to send on time (fully automated, as long as the mail does not get treated as spam), as in the sketch below.
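For example, a crontab entry like the following (the script path and Python binary are assumptions, adjust them to your setup) runs the job every morning at 9:00:

0 9 * * * /usr/bin/python3 /home/you/arxiv_daily.py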
PS: applying for the NetEase (163) mailbox SMTP service is easy; a quick web search will turn up the instructions.