pandas基本操作及pdf转excel的方法

基本操作

import pandas as pd

# Build a DataFrame from a list of dicts: every dict is one row and the
# dict keys become the column labels.
dict_list = [
    {"name": "Jack", "age": 22, "score": 100},
    {"name": "xuan", "age": 21, "score": 99},
    {"name": "Rose", "age": 18, "score": 60},
]
df1 = pd.DataFrame(data=dict_list)
print(df1)
# Expected output:
#    name  age  score
# 0  Jack   22    100
# 1  xuan   21     99
# 2  Rose   18     60
# Build a DataFrame from one flat dict of scalar values.
# NOTE: the variable is deliberately NOT called `dict`; that name would
# shadow the builtin and break any later `dict(...)` call in this module.
record = {"name": "Jack", "age": 22, "score": 100}

# With scalar values an explicit index is supplied; the same values are
# repeated for every index label.
df2 = pd.DataFrame(record, index=[3, 1, 6])
print(df2)
# Expected output:
#    name  age  score
# 3  Jack   22    100
# 1  Jack   22    100
# 6  Jack   22    100

# orient="index" turns the dict keys into row labels instead of columns;
# `columns` names the single resulting value column.
df2 = pd.DataFrame.from_dict(record, orient="index", columns=['test'])
print(df2)
# Expected output:
#        test
# name   Jack
# age      22
# score   100

# Build a single-column DataFrame from a flat list, with an explicit column
# name and custom integer row labels.
# NOTE: the variable is deliberately NOT called `list`; that name would
# shadow the builtin and break any later `list(...)` call in this module.
names = ['xuan', 'Jack', 'Rose', 'Luxi']
df3 = pd.DataFrame(names, columns=['name'], index=[1, 6, 7, 8])
print(df3)
# Expected output:
#    name
# 1  xuan
# 6  Jack
# 7  Rose
# 8  Luxi

# Build a DataFrame from a list of lists (one inner list per row).
# Because `columns` is passed explicitly, the first inner list
# ['name', 'age'] is kept as ordinary data in row 0 rather than being used
# as the header.
list_list = [
    ['name', 'age'],
    ['xuanRui1', 22],
    ['xuanRui2', 33],
    ['xuanRui3', 44],
]
df4 = pd.DataFrame(data=list_list, columns=['姓名', '年龄'])
print(df4)
# Expected output:
#          姓名   年龄
# 0      name  age
# 1  xuanRui1   22
# 2  xuanRui2   33
# 3  xuanRui3   44

# Add a new column by assigning a list with one value per row.
df4["性别"] = ["无", "m", "w", "m"]
print(df4)
# Expected output:
#          姓名   年龄 性别
# 0      name  age  无
# 1  xuanRui1   22  m
# 2  xuanRui2   33  w
# 3  xuanRui3   44  m

# iloc selects by integer position; it does not accept column names.
# All rows, columns at positions 0 and 2:
print(df4.iloc[:, [0, 2]])
# Expected output:
#          姓名 性别
# 0      name  无
# 1  xuanRui1  m
# 2  xuanRui2  w
# 3  xuanRui3  m

# Rows at positions 0 and 2, all columns:
print(df4.iloc[[0, 2], :])
# Expected output:
#          姓名   年龄 性别
# 0      name  age  无
# 2  xuanRui2   33  w

# loc selects by label (row index labels and column names).
# All rows, only the "姓名" column:
print(df4.loc[:, ["姓名"]])
# Expected output:
#          姓名
# 0      name
# 1  xuanRui1
# 2  xuanRui2
# 3  xuanRui3

# Rows labelled 2 and 3, all columns:
print(df4.loc[[2, 3], :])
# Expected output:
#          姓名  年龄 性别
# 2  xuanRui2  33  w
# 3  xuanRui3  44  m

# Replace the default integer row labels with custom string labels.
df4.index = ["一", "er", "san", "si"]
print(df4)
# Expected output:
#            姓名   年龄 性别
# 一        name  age  无
# er   xuanRui1   22  m
# san  xuanRui2   33  w
# si   xuanRui3   44  m

# With string row labels, loc addresses rows and columns by those labels.
print(df4.loc[["er", "si"], ["姓名"]])
# Expected output:
#          姓名
# er  xuanRui1
# si  xuanRui3

PDF格式转为Excel格式

import pandas as pd
import camelot.io as camelot

def pdfToExcel(pdf_path, max_pages=19, max_tables_per_page=5):
    """Extract tables from a PDF with camelot and save them to an .xlsx file.

    Pages are read one at a time with camelot's ``stream`` flavor. Tables
    whose shape is ``(1, 1)`` are skipped. Every kept table is written to its
    own sheet of a workbook stored next to the PDF (same path, ``.xlsx``
    extension). The workbook is only created when at least one table was
    kept, so no empty file is produced.

    Args:
        pdf_path: Path of the PDF file, as a string.
        max_pages: Highest page number to try. The default of 19 matches the
            previously hard-coded ``range(1, 20)``.
        max_tables_per_page: Maximum number of tables taken from one page.
            The default of 5 matches the previously hard-coded limit.

    Returns:
        The DataFrame of the last kept table, or ``None`` when no table was
        kept or when extraction/writing failed.
    """
    import os

    # splitext only strips the real extension; splitting on ".pdf" would cut
    # at the first ".pdf" anywhere in the path and miss ".PDF".
    excel_path = os.path.splitext(pdf_path)[0] + ".xlsx"

    sheets = []  # (sheet_name, DataFrame) pairs in extraction order
    try:
        for page_num in range(1, max_pages + 1):
            print(page_num)
            try:
                tables = camelot.read_pdf(pdf_path, flavor='stream', pages=str(page_num))
            except IndexError:
                # IndexError is treated as "no such page", as in the original
                # code. NOTE(review): confirm that the installed camelot
                # version signals an out-of-range page this way.
                break

            kept_on_page = 0
            for table_num in range(min(len(tables), max_tables_per_page)):
                table = tables[table_num]
                if table.shape == (1, 1):
                    continue
                kept_on_page += 1
                # Sheet names must be unique, so additional tables on the
                # same page get a numeric suffix.
                if kept_on_page == 1:
                    sheet_name = f"数据表{page_num}"
                else:
                    sheet_name = f"数据表{page_num}_{kept_on_page}"
                sheets.append((sheet_name, table.df))

        if sheets:
            with pd.ExcelWriter(excel_path) as writer:
                for sheet_name, sheet_df in sheets:
                    sheet_df.to_excel(writer, sheet_name=sheet_name)
    except Exception as exc:
        # Best-effort conversion: report the failure and return None, but do
        # not swallow KeyboardInterrupt/SystemExit as a bare except would.
        print("PDF TO EXCEL ERROR ...", repr(exc))
        return None

    if not sheets:
        return None
    return sheets[-1][1]
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值