总的来说,就是把这个网页上我想要的文件先从 HTML 里过滤出来,再下载。
其中访问需要 cookie,不能通过 GET 直接访问;
脚本如下:
#!/usr/bin/python3
import sys, io, re, os
from urllib import request
def get_download_url(sub_url):
    """Extract the first ``/redmine...`` href from an HTML fragment and
    return it as an absolute download URL.

    Parameters:
        sub_url: an HTML line/fragment (anything str()-able) expected to
            contain ``href="/redmine..."``.

    Returns:
        The absolute URL ``http://redmine.springgroup.cn`` + path.

    Raises:
        IndexError: if no matching href is found (same failure mode as
            the original ``url_key[0]`` lookup on an empty findall).
    """
    # One non-greedy capture replaces the original's two greedy findall
    # passes + split: greedy `.*"` would run to the LAST quote on a line
    # that contains several quoted attributes or links.
    match = re.search(r'href="(/redmine[^"]*)"', str(sub_url))
    if match is None:
        # Fail loudly, preserving the original's exception type.
        raise IndexError("no /redmine href found in: %r" % (sub_url,))
    return "http://redmine.springgroup.cn" + match.group(1)
def get_file_name(sub_url):
    """Return the link text (the file name) of the first
    ``<a href="...">name</a>`` anchor in an HTML fragment.

    The pasted original is truncated/garbled (the L22 regex literal and
    the L25 ``strip`` argument are cut off); this reconstructs the evident
    intent: take the text between the ``>`` closing the anchor tag and
    the following ``<``.

    Parameters:
        sub_url: an HTML line/fragment (anything str()-able).

    Returns:
        The anchor text with surrounding whitespace stripped.

    Raises:
        IndexError: if no anchor with an href is found, matching the
            original's ``[0]``-on-empty-findall failure mode.
    """
    # Capture the (non-empty, tag-free) text between the anchor's `>` and
    # the next `<`; \s* trims the stray whitespace the original stripped.
    match = re.search(r'href="[^"]*"[^>]*>\s*([^<]+?)\s*<', str(sub_url))
    if match is None:
        raise IndexError("no anchor text found in: %r" % (sub_url,))
    return match.group(1)
def pre_fun(save_dir, redmine_number):
    """Create/enter the per-issue download directory and build the issue URL.

    Side effect: the process working directory is changed to
    ``save_dir/redmine_number`` (downloads are later written to the cwd).

    Parameters:
        save_dir: base directory under which issue folders are created.
        redmine_number: Redmine issue number as a string; used both as the
            folder name and as the URL path component.

    Returns:
        Tuple of (issue page URL, cookie header value string).
    """
    # Session cookie copied from a logged-in browser (the page cannot be
    # fetched with a plain GET). NOTE(review): this value expires — it must
    # be refreshed manually when the server invalidates the session.
    # cookie_str = r'JSESSIONID=xxxxxxxxxxxxxxxxxxxxxx; iPlanetDirectoryPro=xxxxxxxxxxxxxxxxxx'
    cookie_str = r'_redmine_session=ajhuOC9xbG9NaWlyUjJ4RTBzcDF4cjl1SVVzUlF4V1dURitCQ2x1U0FpQ1kva1ZrM1ppZ3FDTjVXbnNkdlNHSld3WCt4UjVIYlFBcFhMd29mTVdTc290ZGk5WGRERzl0RmR6V3VubFMxQkF1VGQvQlVGcHdEZWhkMTJFMzNGbVdQSlhYcnJldG8'
    # exist_ok replaces the racy isdir-then-makedirs check, and joining the
    # path up front collapses the original's two chdir calls into one.
    target = os.path.join(save_dir, redmine_number)
    os.makedirs(target, exist_ok=True)
    os.chdir(target)
    # Issue page that is only reachable with the session cookie above.
    redmine_url = 'http://redmine.springgroup.cn/redmine/issues/' + redmine_number
    return redmine_url, cookie_str
def start_download(redmine_url, cookie_str):
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf8') #改变标准输出的默认编码
req = request.Request(redmine_url)
#设置cookie
req.add_header('cookie', cookie_str)
#设置请求头
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
resp = request.urlopen(req)
#print(resp.read().decode('utf-8'))
#this_text = resp.read().decode('utf-8')
while resp:
li = resp.readline().decode('utf-8')
if "