背景
参考：迅雷API批量下载巨潮年报（无敌的前任的博客，CSDN博客）
代码块
from win32com.client import Dispatch
#pip install win32compat
#pip install pywin32
import os
import re
import openpyxl
import requests
import urllib.request
import time
def download(url, downpath, filename, i, timeout=60):
    """Fetch ``url`` and store the response body as ``filename`` in ``downpath``.

    The download is skipped when the target file already exists, so a batch
    run can be restarted without fetching the same file twice.

    Args:
        url: Address of the file to fetch.
        downpath: Directory the file is written to.
        filename: Name of the file to create inside ``downpath``.
        i: Sequence number; used only in the progress messages.
        timeout: Seconds to wait on the server before ``requests`` gives up,
            so a single stalled connection cannot hang the whole batch.

    Returns:
        None. Progress is reported on standard output.
    """
    file_path = os.path.join(downpath, filename)
    # Test for the one file directly instead of listing the whole directory
    # on every call.
    if os.path.exists(file_path):
        print(str(i)+"th already there")
        return
    user_agent = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
    response = requests.get(url, headers=user_agent, timeout=timeout)
    if not response.ok:
        # Never save an error page under the target name: the "already there"
        # check above would then skip this file on every later run.
        print(str(i)+'th failed with HTTP status '+str(response.status_code))
        return
    # The context manager closes the file even if the write raises.
    with open(file_path, 'wb') as f:
        f.write(response.content)
    print(str(i)+'th is done')
def code_revise(code_cell):
    """Return the cell's value as text, left-padded with '0' to six characters.

    Args:
        code_cell: Object exposing the raw cell content as ``.value``.

    Returns:
        ``str(code_cell.value)`` with zeros prepended until it is six
        characters long; values already six characters or longer are
        returned unchanged.
    """
    # Read ``.value`` (the stored number itself); ``.text`` cannot be used.
    return str(code_cell.value).rjust(6, '0')
def url_revise(url):
    """Rewrite a ``disclosure/detail`` page URL into its download form.

    Two substitutions are applied in order:

    1. ``disclosure/detail?stockCode=<digits>&announcementId`` becomes
       ``announcement/download?bulletinId``.
    2. ``orgId=<word chars ending in a digit>&announcementTime`` becomes
       ``announceTime``.

    Text that matches neither pattern is returned unchanged.

    Args:
        url: The URL text to rewrite.

    Returns:
        The rewritten URL.
    """
    # re.sub(pattern, repl, string, count=0) replaces every match by default,
    # unlike a plain str.replace which only handles literal substrings.
    detail_part = re.compile(r'disclosure/detail\?stockCode=\d+&announcementId')
    org_part = re.compile(r'orgId=\w+\d+&announcementTime')
    rewritten = detail_part.sub('announcement/download?bulletinId', url)
    return org_part.sub('announceTime', rewritten)
# Driver: read the spreadsheet of reports and download each one as a PDF.
#
# Alternative source folder: r'E:\huang\Documents'
# Named ``input_dir`` rather than ``input`` so the builtin ``input()`` is not
# shadowed for the rest of the module.
input_dir = r'E:\huang\Documents\其他行业'
os.chdir(input_dir)
downpath = r'E:\Alark\Users\Desktop\年报\2015\其他'
downlist = '2015-2016年其他行业.xlsx'
wb = openpyxl.load_workbook(downlist)
ws = wb.active

# Columns used per row: 1st = code, 3rd = text used in the file name,
# 5th = detail-page URL. Numbering starts at 2, matching the progress
# numbers the script has always printed.
for i, row in enumerate(ws.rows, start=2):
    # An empty first cell marks the end of the list.
    if row[0].value is None:
        break
    filename = code_revise(row[0]) + '_' + row[2].value + '.pdf'
    url = url_revise(row[4].value)
    download(url, downpath, filename, i)

wb.save("cache.xlsx")