python读取excel文件

 参考:https://www.cnblogs.com/wolfstark/p/16895823.html

import requests
from lxml import etree
import openpyxl
import xlrd
import pandas as pd
import os


'''
实例:http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/t20200525_12170746.html
.xlsx .xls
'''


headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203"
}

url = "http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/t20200525_12170746.html"
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
element = etree.HTML(response.text)
lis = element.xpath('//div[@class="main_con_zw"]//p/a/@href')
one = 'http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/' + lis[0].strip()
two = 'http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/' + lis[1].strip()
print(one, '\n', two)

res1 = requests.get(one)
res2 = requests.get(two)
with open('xxx.xlsx', 'wb') as f, open('yyy.xls', 'wb') as p:
    f.write(res1.content)
    p.write(res2.content)


## 方案一:碰到有特殊设置的单元格,可能无法读取到真实的数值
def readExcel(filename):
    company_excel_list = []
    if '.xlsx' in filename:
        workbook = openpyxl.load_workbook(filename)  # 加载文件
        sheetnames = workbook.get_sheet_names()  # 获取所有sheet名称,通过名字的方式
        ws = workbook.get_sheet_by_name(sheetnames[0])  # 获取第一个sheet内容
        # 读取数据
        for r in range(3, ws.max_row + 1):
            old = ws.cell(r, 2).value
            if old and len(old) > 1:
                company_excel_list.append(old.strip().replace(' ', '').replace('\n', ''))
        workbook.close()

    elif '.xls' in filename:
        workbook = xlrd.open_workbook(filename)  # 加载文件
        sheet = workbook.sheet_by_index(0)  # 获取第一个sheet内容
        # 读取数据
        for row_index in range(2, sheet.nrows):
            old = sheet.cell_value(row_index, 1)
            if old and len(old) > 1:
                company_excel_list.append(old.strip().replace(' ', '').replace('\n', ''))
    os.remove(filename)
    return company_excel_list

## 方案二
def readExcel2(filename):
    company_excel_list = []
    df = pd.read_excel(filename, header=1)
    df.fillna('', inplace=True)
    old_list = df.iloc[:, 1].values
    for old in old_list:
        if old and len(old) > 1:
            company_excel_list.append(old.strip().replace(' ', '').replace('\n', ''))
    os.remove(filename)
    return company_excel_list

filename = ['xxx.xlsx', 'yyy.xls']
for i in filename:
    company_excel_list = readExcel2(i)
    for company in company_excel_list:
        print(company)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值