python读取excel文件

world_in_world

已于 2023-10-12 09:22:01 修改

阅读量77

点赞数

分类专栏： python基础　文章标签： python excel

于 2023-10-09 14:36:41 首次发布

本文链接：https://blog.csdn.net/world_in_world/article/details/133701630

版权

python基础专栏收录该内容

11 篇文章 0 订阅

订阅专栏

参考：https://www.cnblogs.com/wolfstark/p/16895823.html

import requests
from lxml import etree
import openpyxl
import xlrd
import pandas as pd
import os


# Example page:
#   http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/t20200525_12170746.html
# The first two attachment links found on it are saved locally as an .xlsx
# and an .xls file respectively.


# Browser-like request headers sent with the page request.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203"
}

url = "http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/t20200525_12170746.html"
# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
response.encoding = 'utf-8'
element = etree.HTML(response.text)
lis = element.xpath('//div[@class="main_con_zw"]//p/a/@href')
if len(lis) < 2:
    raise IndexError(
        'expected at least 2 attachment links on {}, found {}'.format(url, len(lis))
    )

# The hrefs are joined onto the directory part of the page URL.
base_url = url.rsplit('/', 1)[0] + '/'
one = base_url + lis[0].strip()
two = base_url + lis[1].strip()
print(one, '\n', two)

# Fetch both attachments and verify them before creating any local file, so
# a failed download never leaves an HTML error page saved as a workbook.
res1 = requests.get(one, timeout=30)
res1.raise_for_status()
res2 = requests.get(two, timeout=30)
res2.raise_for_status()
with open('xxx.xlsx', 'wb') as f, open('yyy.xls', 'wb') as p:
    f.write(res1.content)
    p.write(res2.content)


## Approach 1: read the workbook with openpyxl (.xlsx) or xlrd (.xls).
## For cells with special settings this may fail to return the real value.
def readExcel(filename):
    """Return the cleaned text values from the second column of the first sheet.

    The first two rows are skipped. A cell is kept only when it holds a string
    longer than one character; surrounding whitespace, spaces and newlines are
    then removed from it. Non-string cells (numbers, dates, empty cells) are
    ignored instead of raising ``TypeError`` on ``len()``.

    The file format is chosen from the filename extension (case-insensitive):
    ``.xlsx`` is read with openpyxl, ``.xls`` with xlrd. Any other extension
    yields an empty list.

    Side effect: ``filename`` is deleted from disk once it has been read
    (this also happens for an unsupported extension).

    :param filename: path to the workbook file.
    :return: list of cleaned strings, in row order.
    """
    company_excel_list = []
    suffix = os.path.splitext(filename)[1].lower()

    if suffix == '.xlsx':
        workbook = openpyxl.load_workbook(filename)
        try:
            # `sheetnames` / item access replace the deprecated
            # get_sheet_names() / get_sheet_by_name() methods.
            ws = workbook[workbook.sheetnames[0]]
            # openpyxl rows and columns are 1-based: start at row 3, column 2.
            for r in range(3, ws.max_row + 1):
                old = ws.cell(r, 2).value
                if isinstance(old, str) and len(old) > 1:
                    company_excel_list.append(
                        old.strip().replace(' ', '').replace('\n', '')
                    )
        finally:
            # Close the workbook even when reading a cell fails.
            workbook.close()

    elif suffix == '.xls':
        workbook = xlrd.open_workbook(filename)
        sheet = workbook.sheet_by_index(0)
        # xlrd indices are 0-based: row index 2 / column index 1 address the
        # same cells as row 3 / column 2 in the openpyxl branch.
        for row_index in range(2, sheet.nrows):
            old = sheet.cell_value(row_index, 1)
            if isinstance(old, str) and len(old) > 1:
                company_excel_list.append(
                    old.strip().replace(' ', '').replace('\n', '')
                )

    os.remove(filename)
    return company_excel_list

## Approach 2: read the workbook with pandas.
def readExcel2(filename):
    """Return the cleaned text values from the second column of the first sheet.

    The workbook is loaded with ``pandas.read_excel(..., header=1)``, so the
    second row is used as the header and data starts on the third row. A value
    is kept only when it is a string longer than one character; surrounding
    whitespace, spaces and newlines are then removed from it. Missing values
    (NaN) and other non-string values such as numbers are skipped instead of
    raising ``TypeError`` on ``len()``.

    Side effect: ``filename`` is deleted from disk once it has been read.

    :param filename: path to the workbook file.
    :return: list of cleaned strings, in row order.
    """
    df = pd.read_excel(filename, header=1)
    # Filtering on str makes a prior fillna('') unnecessary: NaN is a float
    # and is dropped by the isinstance check.
    company_excel_list = [
        value.strip().replace(' ', '').replace('\n', '')
        for value in df.iloc[:, 1]
        if isinstance(value, str) and len(value) > 1
    ]
    os.remove(filename)
    return company_excel_list

# Read each downloaded workbook and print one extracted name per line.
filename = ['xxx.xlsx', 'yyy.xls']
for workbook_path in filename:
    company_excel_list = readExcel2(workbook_path)
    for company_name in company_excel_list:
        print(company_name)