Download arXiv papers

12 篇文章 0 订阅

1. Code

#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
###########
Usage:
python download.py site.txt(containing https://...)

'''

from selenium import webdriver
import time
from pymouse import PyMouse

# Module-level mouse controller; download() calls m.click(...) on it to issue
# a mouse click at a computed (x, y) position.
m = PyMouse()

def pause(length=1):
    """Sleep for ``length`` seconds (one second when called without arguments)."""
    seconds = length
    time.sleep(seconds)

def download(url):
    """Open a listing page in Firefox and trigger a PDF download per matching entry.

    The page at ``url`` is expected to contain parallel ``<dt>``/``<dd>``
    elements. For each pair (starting at index 4) whose ``primary-subject``
    text contains "Computer Vision", the entry's 'pdf' link is clicked, the
    element with id ``download`` is clicked once the page has had time to
    load, and a mouse click is issued at a fixed offset from the window
    centre. The browser then navigates back to the listing.

    Args:
        url: Address of the listing page to process.

    Raises:
        AssertionError: If the page has a different number of ``<dt>`` and
            ``<dd>`` elements.
    """
    b = webdriver.Firefox()
    try:
        # NOTE: the original author tried b.set_page_load_timeout(60) and
        # marked it "useless"; fixed sleeps are used instead.
        b.maximize_window()
        pause(1)

        b.get(url)
        pause(2)

        # Seconds to wait after clicking a 'pdf' link before interacting
        # with the loaded page.
        loading_time = 60

        dt = b.find_elements_by_tag_name('dt')
        dd = b.find_elements_by_tag_name('dd')
        assert(len(dt) == len(dd))
        dst_type = "Computer Vision"

        window_size = b.get_window_size()
        print(window_size)
        # Offset (x, y) from the window centre of the point to click after
        # pressing the download button.
        # NOTE(review): presumably the confirm button of the save dialog on
        # the author's screen layout -- verify on your machine.
        bias = [254, 171]
        screenIsVertical = False
        if screenIsVertical:
            print("No implement when screen is vertical")
            return
        # Floor division keeps the coordinates integral on Python 3 as well.
        pos = [window_size['width'] // 2 + bias[0],
               window_size['height'] // 2 + bias[1]]

        # NOTE(review): the first four dt/dd pairs are skipped; the reason is
        # not visible in this file -- confirm against the page layout.
        for i in range(4, len(dt)):
            # The element lists are re-fetched after every b.back(); stop if
            # the reloaded page exposes fewer entries than the initial one.
            if i >= len(dt) or i >= len(dd):
                break

            # no Computer Vision paper
            if dst_type not in dd[i].find_element_by_class_name('primary-subject').text:
                continue

            # no 'pdf' button (best effort: any failure skips this entry)
            try:
                dt[i].find_element_by_link_text('pdf').click()
            except Exception:
                continue

            pause(loading_time)

            b.find_element_by_id('download').click()
            pause(2)

            # NOTE(review): trailing arguments are presumably button=1 and
            # n=1 clicks per PyMouse's click(x, y, button, n) -- confirm.
            m.click(pos[0], pos[1], 1, 1)
            time.sleep(1)

            b.back()
            time.sleep(1)
            # References from before the navigation are no longer reliable,
            # so look the elements up again on the restored listing page.
            dt = b.find_elements_by_tag_name('dt')
            dd = b.find_elements_by_tag_name('dd')
    finally:
        # Always release the browser window, including on errors and on the
        # early return above.
        b.close()

def main():
    """Read listing URLs from the file named on the command line and process each.

    Expects exactly one command-line argument: the path of a text file with
    one URL per line. Blank lines and lines whose first non-blank character
    is ``#`` are skipped. With any other argument count the module docstring
    is printed as a usage message and nothing is downloaded.
    """
    import sys
    if len(sys.argv) != 2:
        print(__doc__)
        return

    with open(sys.argv[1], 'r') as fid:
        # strip() removes the newline, a stray '\r' from CRLF files and any
        # surrounding spaces, so the later checks see the bare URL.
        urls = [line.strip() for line in fid]

    for url in urls:
        # Skip empty lines and entries disabled with a leading '#'.
        if not url or url.startswith('#'):
            continue
        download(url)


# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

2. Usage

python download.py site.txt

site.txt (example)

https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1
https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1?skip=25&query_id=a6b6ed358647ff57
#https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1?skip=50&query_id=a6b6ed358647ff57
https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1?skip=75&query_id=a6b6ed358647ff57

You can prefix a line with # to skip that URL.

Refer to this post for instructions on installing the requirements.

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值