# Reference: https://www.cnblogs.com/wolfstark/p/16895823.html
import requests
from lxml import etree
import openpyxl
import xlrd
import pandas as pd
import os
'''
实例:http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/t20200525_12170746.html
.xlsx .xls
'''
# Browser-like request headers sent when fetching the listing page.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203"
}
url = "http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/t20200525_12170746.html"
# requests never times out by default; bound every call so an unresponsive
# server cannot hang the script forever.
request_timeout = 30

# Fetch the listing page and collect the href of every link found inside a
# <p> of the main content container.
response = requests.get(url, headers=headers, timeout=request_timeout)
response.raise_for_status()
response.encoding = 'utf-8'
element = etree.HTML(response.text)
lis = element.xpath('//div[@class="main_con_zw"]//p/a/@href')
if len(lis) < 2:
    # Fail with a descriptive message instead of a bare IndexError below.
    raise RuntimeError(
        'expected at least 2 attachment links on %s, found %d' % (url, len(lis))
    )

# The hrefs are appended to the directory that contains the listing page.
base_url = 'http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/'
one = base_url + lis[0].strip()
two = base_url + lis[1].strip()
print(one, '\n', two)

res1 = requests.get(one, timeout=request_timeout)
res1.raise_for_status()  # do not save an HTTP error page as a workbook
res2 = requests.get(two, timeout=request_timeout)
res2.raise_for_status()

# The first link is saved as xxx.xlsx and the second as yyy.xls.
with open('xxx.xlsx', 'wb') as f, open('yyy.xls', 'wb') as p:
    f.write(res1.content)
    p.write(res2.content)
## 方案一:碰到有特殊设置的单元格,可能无法读取到真实的数值
def _clean_cell_text(value):
    """Return *value* stripped of spaces and newlines, or None if it is not usable text.

    Only ``str`` values longer than one character are accepted; numbers,
    dates, ``None`` and single-character strings yield ``None``.
    """
    if not isinstance(value, str) or len(value) <= 1:
        return None
    return value.strip().replace(' ', '').replace('\n', '')


def readExcel(filename):
    """Read the text values of the second column of the first sheet, then delete the file.

    Approach 1: cells are read directly with openpyxl (``.xlsx``) or xlrd
    (``.xls``). Caveat from the original author: cells with special settings
    may not yield their real value this way.

    Args:
        filename: Path of the workbook. The (case-insensitive) file extension
            selects the reader; any other extension yields an empty list.

    Returns:
        A list of strings taken from the second column, starting at the third
        row, with surrounding whitespace, inner spaces and newlines removed.
        Non-string cells and strings of length <= 1 are skipped.

    Side effects:
        ``filename`` is deleted from disk once it has been read.
    """
    company_excel_list = []
    # Compare the real extension rather than searching the whole path, so a
    # directory or stem that merely contains ".xls" is not misclassified.
    extension = os.path.splitext(filename)[1].lower()
    if extension == '.xlsx':
        workbook = openpyxl.load_workbook(filename)
        try:
            # sheetnames / item access replace the deprecated
            # get_sheet_names() / get_sheet_by_name() helpers.
            ws = workbook[workbook.sheetnames[0]]
            # openpyxl rows and columns are 1-based: start at row 3, column 2.
            for r in range(3, ws.max_row + 1):
                cleaned = _clean_cell_text(ws.cell(r, 2).value)
                if cleaned is not None:
                    company_excel_list.append(cleaned)
        finally:
            # Release the file handle even if reading a cell fails.
            workbook.close()
    elif extension == '.xls':
        workbook = xlrd.open_workbook(filename)
        sheet = workbook.sheet_by_index(0)
        # xlrd indices are 0-based: row index 2 / column index 1 address the
        # same cells as row 3 / column 2 in the .xlsx branch.
        for row_index in range(2, sheet.nrows):
            cleaned = _clean_cell_text(sheet.cell_value(row_index, 1))
            if cleaned is not None:
                company_excel_list.append(cleaned)
    os.remove(filename)
    return company_excel_list
## 方案二
def readExcel2(filename):
    """Read the text values of the second column via pandas, then delete the file.

    Approach 2: the workbook is loaded with ``pandas.read_excel`` using the
    second row as the header (``header=1``), so data starts on the third row.

    Args:
        filename: Path of the workbook to read.

    Returns:
        A list of strings taken from the second column with surrounding
        whitespace, inner spaces and newlines removed. Non-string values
        (including missing cells) and strings of length <= 1 are skipped.

    Side effects:
        ``filename`` is deleted from disk once it has been read.
    """
    df = pd.read_excel(filename, header=1)
    # Keep only real strings: calling len() on a numeric cell would raise
    # TypeError, and missing cells (NaN) are dropped by the same check, so no
    # fillna('') pass is needed.
    company_excel_list = [
        old.strip().replace(' ', '').replace('\n', '')
        for old in df.iloc[:, 1].values
        if isinstance(old, str) and len(old) > 1
    ]
    os.remove(filename)
    return company_excel_list
# Workbooks written by the download step; each one is parsed with the pandas
# based reader and every extracted value is printed on its own line.
filename = ['xxx.xlsx', 'yyy.xls']
for workbook_path in filename:
    for company in readExcel2(workbook_path):
        print(company)