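"""Save the web pages linked from a workbook as MHTML files.

1. Extract the links from every sheet.
2. Download each linked page and save it into the folder for its sheet.

Written by a beginner; accuracy is not guaranteed.
"""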
import xlrd
import re
from jieba import xrange
from selenium import webdriver
import os
# Matches "https:/", then lazily any characters up to and including the first
# run of digits that follows.
obj = re.compile(r"https:/.*?\d+")


def extract_links(cell):
    """Return every link found in the text form of *cell*, in order.

    *cell* may be any spreadsheet value; it is converted with ``str()``
    before matching, so numbers and empty cells simply yield ``[]``.
    """
    return [match.group() for match in obj.finditer(str(cell))]


def snapshot_path(sheet_dir, url):
    """Return the ``.mhtml`` path for *url* inside *sheet_dir*.

    The file is named after the first run of digits in *url*. Links produced
    by ``extract_links`` always contain one; a *url* without digits raises
    ``AttributeError``.
    """
    page_id = re.search(r"\d+", url).group()
    return os.path.join(sheet_dir, page_id + '.mhtml')


def save_snapshot(url, filename):
    """Open *url* in a fresh Chrome session and write its MHTML to *filename*."""
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # 1. Run the Chrome DevTools command that yields the page as MHTML.
        res = driver.execute_cdp_cmd('Page.captureSnapshot', {})
        # 2. Write it out. newline='' keeps the line endings exactly as
        #    returned; the explicit encoding avoids depending on the
        #    platform's locale encoding.
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            f.write(res['data'])
    finally:
        # Always close the browser, even when loading or saving fails.
        driver.quit()


def main():
    """Download every link found in 'xxxx.xlsx' into one folder per sheet.

    Folders are created next to this script and named after the sheet (with
    surrounding whitespace removed). Pages whose snapshot file already exists
    are skipped.
    """
    # NOTE(review): recent xlrd releases reportedly read only .xls files;
    # confirm the installed version can open this .xlsx workbook.
    wb = xlrd.open_workbook('xxxx.xlsx')

    # abspath() guards against a relative __file__, for which dirname() would
    # return '' and the sheet folders would not be anchored to the script's
    # directory.
    base_dir = os.path.dirname(os.path.abspath(__file__))

    for y in wb.sheet_names():
        print(y)
        # Use the same (stripped) name for creating the folder and for
        # building file paths, so files never target a missing directory.
        sheet_dir = os.path.join(base_dir, y.strip())
        if os.path.exists(sheet_dir):
            print('exist')
        else:
            os.mkdir(sheet_dir)

        sh = wb.sheet_by_name(y)
        for rownum in range(sh.nrows):
            for cell in sh.row_values(rownum):
                for url in extract_links(cell):
                    filename = snapshot_path(sheet_dir, url)
                    if os.path.exists(filename):
                        print('Given file path is exist.')
                    else:
                        save_snapshot(url, filename)
    print("over")


if __name__ == "__main__":
    main()