1. Code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
###########
Usage:
python download.py site.txt(containing https://...)
'''
from selenium import webdriver
import time
from pymouse import PyMouse
# Module-level PyMouse instance, created once when the module is loaded.
# download() calls m.click(...) on it to issue a mouse click at a position
# computed from the browser window size.
m = PyMouse()
def pause(length=1):
    """Block the calling thread for ``length`` seconds (default: 1).

    Thin wrapper around ``time.sleep`` used to give the browser time to
    react between automation steps.
    """
    time.sleep(length)
def download(url, dst_type="Computer Vision", loading_time=60,
             bias=(254, 171)):
    """Open a listing page in Firefox and trigger a PDF download per entry.

    The page is expected to pair every ``<dt>`` element with a ``<dd>``
    element. For each pair (starting at index 4) whose ``primary-subject``
    text contains ``dst_type``, the entry's ``pdf`` link is clicked, the
    element with id ``download`` is clicked, and a mouse click is issued
    through the module-level PyMouse instance ``m``.

    Args:
        url: Address of the listing page to open.
        dst_type: Substring that must appear in an entry's
            ``primary-subject`` text for the entry to be processed.
        loading_time: Seconds to wait after clicking the ``pdf`` link
            before clicking the ``download`` element.
        bias: ``(x, y)`` offset added to half the window width/height to
            obtain the position of the mouse click.

    Raises:
        AssertionError: If the page has a different number of ``<dt>`` and
            ``<dd>`` elements.
    """
    b = webdriver.Firefox()
    try:
        b.maximize_window()
        pause(1)
        b.get(url)
        pause(2)

        # NOTE(review): find_element(s)_by_* is the Selenium 3 API; newer
        # Selenium releases dropped it -- confirm the installed version.
        dt = b.find_elements_by_tag_name('dt')
        dd = b.find_elements_by_tag_name('dd')
        assert len(dt) == len(dd), "expected one <dd> for every <dt>"

        window = b.get_window_size()
        print(window)

        # Manual toggle: only the non-vertical screen layout is implemented.
        screenIsVertical = False
        if screenIsVertical:
            print("No implement when screen is vertical")
            return

        # Floor division keeps the coordinates integral on Python 2 and 3.
        # Presumably this is where the save dialog's confirm button shows up
        # on the author's screen -- verify ``bias`` for your own setup.
        pos = [window['width'] // 2 + bias[0],
               window['height'] // 2 + bias[1]]

        # Entries before index 4 are skipped (presumably not search results;
        # TODO confirm against the page layout).
        for i in range(4, len(dt)):
            # The lists are re-queried after every navigation and may have
            # shrunk; stop instead of failing with an IndexError.
            if i >= len(dt) or i >= len(dd):
                break

            # Not a paper of the requested subject.
            subject = dd[i].find_element_by_class_name('primary-subject').text
            if dst_type not in subject:
                continue

            # No usable 'pdf' button: deliberately best-effort, skip entry.
            try:
                dt[i].find_element_by_link_text('pdf').click()
            except Exception:
                continue

            pause(loading_time)
            b.find_element_by_id('download').click()
            pause(2)
            m.click(pos[0], pos[1], 1, 1)
            pause(1)
            b.back()
            pause(1)

            # Elements found before the navigation are stale; look them up
            # again on the restored listing page.
            dt = b.find_elements_by_tag_name('dt')
            dd = b.find_elements_by_tag_name('dd')
    finally:
        # quit() ends the whole driver session (close() only closes the
        # window), and running it in ``finally`` releases the browser even
        # when a step above raises or returns early.
        b.quit()
def main():
    """Read URLs from the file named on the command line and download each.

    Expects exactly one command-line argument: the path of a text file with
    one URL per line. Blank lines and lines starting with ``#`` are skipped.
    With any other argument count the module docstring is printed as a
    usage message and nothing is downloaded.
    """
    import sys

    if len(sys.argv) != 2:
        print(__doc__)
        return

    # Read everything first so the file is closed before the (slow)
    # browser automation starts. strip() also removes '\r' and stray
    # whitespace that would otherwise end up inside the URL.
    with open(sys.argv[1], 'r') as fid:
        urls = [line.strip() for line in fid]

    for url in urls:
        # Skip empty lines and entries disabled with a leading '#'.
        if not url or url.startswith('#'):
            continue
        download(url)
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
2. Usage
python download.py site.txt
site.txt (example)
https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1
https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1?skip=25&query_id=a6b6ed358647ff57
#https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1?skip=50&query_id=a6b6ed358647ff57
https://arxiv.org/find/all/1/ti:+AND+object+detection/0/1/0/all/0/1?skip=75&query_id=a6b6ed358647ff57
You can put # at the start of a line to skip that URL.
Refer to this post for how to install the requirements.