Excel文件检查
from xlrd import open_workbook
input_file='sales_2013.xlsx'
workbook=open_workbook(input_file)
print('Number of worksheets',workbook.nsheets)
for worksheet in workbook.sheets():
print('sheet name:',worksheet.name,'\trows:',worksheet.nrows,'\tcolumns:',worksheet.ncols)
读写Excel文件
from xlrd import open_workbook
from xlwt import Workbook
input_file='sales_2013.xlsx'
output_file='excel_copy.xlsx'
output_workbook=Workbook()
output_worksheet=output_workbook.add_sheet('jan_2013_output')
with open_workbook(input_file) as workbook:
worksheet=workbook.sheet_by_name('january_2013')
for row_index in range(worksheet.nrows):
for column_index in range(worksheet.ncols):
output_worksheet.write(row_index,column_index,worksheet.cell_value(row_index,column_index))
output_workbook.save(output_file)
格式化Excel文件中的数据
from datetime import date
from xlrd import open_workbook,xldate_as_tuple
from xlwt import Workbook
input_file='sales_2013.xlsx'
output_file='date_format.xls'
output_workbook=Workbook()
output_worksheet=output_workbook.add_sheet('jan_2013_output')
with open_workbook(input_file) as workbook:
worksheet=workbook.sheet_by_name('january_2013')
for row_index in range(worksheet.nrows):
row_list_output=[]
for col_index in range(worksheet.ncols):
if worksheet.cell_type(row_index,col_index) ==3:
date_cell=xldate_as_tuple(worksheet.cell_value(row_index,col_index),workbook.datemode)
date_cell=date(*date_cell[0:3]).strftime('%m%d%Y')
row_list_output.append(date_cell)
output_worksheet.write(row_index,col_index,date_cell)
else:
non_date_cell=worksheet.cell_value(row_index,col_index)
row_list_output.append(non_date_cell)
output_worksheet.write(row_index,col_index,non_date_cell)
print(row_list_output)
output_workbook.save(output_file)
Excel行中的值满足某个条件
import sys
from datetime import date
from xlrd import open_workbook,xldate_as_tuple
from xlwt import Workbook
input_file='sales_2013.xlsx'
output_file='g.xls'
output_workbook=Workbook()
output_worksheet=output_workbook.add_sheet('jan_2013_output')
sale_amount_column_index=3
with open_workbook(input_file) as workbook:
worksheet=workbook.sheet_by_name('january_2013')
data=[]
header=worksheet.row_values(0)
data.append(header)
for row_index in range(1,worksheet.nrows):
row_list=[]
sale_amount=worksheet.cell_value(row_index,sale_amount_column_index)
if sale_amount >1400.0:
for column_index in range(worksheet.ncols):
cell_value=worksheet.cell_value(row_index,column_index)
cell_type=worksheet.cell_type(row_index,column_index)
if cell_type ==3:
date_cell=xldate_as_tuple(cell_value,workbook.datemode)
date_cell=date(*date_cell[0:3]).strftime('%m%d%Y')
row_list.append(date_cell)
else:
row_list.append(cell_value)
if row_list:
data.append(row_list)
for list_index,output_list in enumerate(data):
for element_index,element in enumerate(output_list):
output_worksheet.write(list_index,element_index,element)
output_workbook.save(output_file)
pandas版本
import pandas as pd
import sys
input_file='sales_2013.xlsx'
output_file='gp.xls'
data_frame=pd.read_excel(input_file,'january_2013',index_col=None)
data_frame_value_meets_condition=data_frame[data_frame['Sale Amount'].astype(float) > 1400.0]
writer=pd.ExcelWriter(output_file)
data_frame_value_meets_condition.to_excel(writer,sheet_name='jan_13_output',index=False)
writer.save()
Excel行中的值属于某个集合
import pandas as pd
import sys
input_file='sales_2013.xlsx'
output_file='value_in_set.xls'
data_frame=pd.read_excel(input_file,'january_2013',index_col=None)
inportant_dates=['01/24/2013','01/31/2013']
data_frame_value_in_set=data_frame[data_frame['Purchase Date'].isin(inportant_dates)]
writer=pd.ExcelWriter(output_file)
data_frame_value_in_set.to_excel(writer,sheet_name='jan_2013_output',index=False)
writer.save()
Excel行中的值匹配某个表达式
import pandas as pd
input_file='sales_2013.xlsx'
output_file='pandas_row_value_matches_re.xls'
data_frame=pd.read_excel(input_file,'january_2013',index_col=None)
data_frame_value_matches_pattern=data_frame[data_frame['Customer Name'].str.match("J")]
#或者data_frame_value_matches_pattern=data_frame[data_frame['Customer Name'].str.startswith("J")]
writer=pd.ExcelWriter(output_file)
data_frame_value_matches_pattern.to_excel(writer,sheet_name='jan_13_output',index=False)
writer.save()
Excel基于索引选取特定的列
import pandas as pd
input_file='sales_2013.xlsx'
output_file='value_in_column.xls'
data_frame=pd.read_excel(input_file,'january_2013',index_col=None)
data_frame_by_index=data_frame.iloc[:,[1,4]]
writer=pd.ExcelWriter(output_file)
data_frame_by_index.to_excel(writer,sheet_name='jan_13',index=False)
writer.save()
Excel基于列标题选取
import pandas as pd
input_file='sales_2013.xlsx'
output_file='pandas_column_title_select.xls'
data_frame=pd.read_excel(input_file,'january_2013',index_col=None)
data_frame_column_by_name=data_frame.loc[:,['Customer ID','Purchase Date']]
writer=pd.ExcelWriter(output_file)
data_frame_column_by_name.to_excel(writer,sheet_name='jan_13_output',index=False)
writer.save()
工作表计数以及每个工作表中的行列计数
import glob
import os
import sys
from xlrd import open_workbook
input_directory = sys.argv[1]
workbook_counter = 0
for input_file in glob.glob(os.path.join(input_directory, '*.xls*')):
workbook = open_workbook(input_file)
print('Workbook: {}'.format(os.path.basename(input_file)))
print('Number of worksheets: {}'.format(workbook.nsheets))
for worksheet in workbook.sheets():
print('Worksheet name:', worksheet.name, '\tRows:',\
worksheet.nrows, '\tColumns:', worksheet.ncols)
workbook_counter += 1
print('Number of Excel workbooks: {}'.format(workbook_counter))
在一个Excel文件所有工作表中选取特定行
import pandas as pd
input_file='sales_2013.xlsx'
output_file='all_sheeet_column_title.xls'
data_frame=pd.read_excel(input_file,sheet_name=None,index_col=None)
row_output=[]
for worksheet_name , data in data_frame.items():
row_output.append(data[data['Sale Amount'].astype(float) >2279.0])
filtered_rows=pd.concat(row_output,axis=0,ignore_index=True)
writer=pd.ExcelWriter(output_file)
filtered_rows.to_excel(writer,sheet_name='sale_amount_gt200',index=False)
writer.save()
在一个Excel所有工作表中选取特定列
import pandas as pd
import sys
input_file='sales_2013.xlsx'
output_file='pandas_column_by_name_all_worksheets.xls'
data_frame = pd.read_excel(input_file, sheet_name=None, index_col=None)
column_output = []
for worksheet_name, data in data_frame.items():
column_output.append(data.loc[:, ['Customer Name', 'Sale Amount']])
selected_columns = pd.concat(column_output, axis=0, ignore_index=True)
writer = pd.ExcelWriter(output_file)
selected_columns.to_excel(writer, sheet_name='selected_columns_all_worksheets', index=False)
writer.save()
在Excel工作簿中读取一组工作表
也就是说只读取某些工作表
下面包含两种筛选方式:1 索引筛选 2 根据工作表名字筛选
import pandas as pd
import sys
input_file='sales_2013.xlsx'
output_file='pandas_value_meets_condition_set_of_.xls'
my_sheets = ['january_2013','february_2013'] #按照名字
#my_sheets = [0,1] 按照索引
threshold = 1900.0
data_frame = pd.read_excel(input_file, sheet_name=my_sheets, index_col=None)
row_list = []
for worksheet_name, data in data_frame.items():
row_list.append(data[data['Sale Amount'].replace('$', '').replace(',', '').astype(float) > threshold])
filtered_rows = pd.concat(row_list, axis=0, ignore_index=True)
writer = pd.ExcelWriter(output_file)
filtered_rows.to_excel(writer, sheet_name='set_of_worksheets', index=False)
writer.save()
将多个工作簿中的多个工作表整合到一个Excel中
import pandas as pd
import glob
import os
import sys
input_path='./'
output_file='all_data_all_workbooks.xls'
all_workbooks = glob.glob(os.path.join(input_path,'*.xls*'))
data_frames = []
for workbook in all_workbooks:
all_worksheets = pd.read_excel(workbook, sheet_name=None, index_col=None)
for worksheet_name, data in all_worksheets.items():
data_frames.append(data)
all_data_concatenated = pd.concat(data_frames, axis=0, ignore_index=True)
writer = pd.ExcelWriter(output_file)
all_data_concatenated.to_excel(writer, sheet_name='all_data_all_workbooks', index=False)
writer.save()
为每个工作簿和工作表计算总数和平均值
import pandas as pd
import glob
import os
input_path = './'
output_file = 'pandas_sum_average_multiple_workbooks.xls'
all_workbooks = glob.glob(os.path.join(input_path, '*.xls*'))
data_frames = []
for workbook in all_workbooks:
all_worksheets = pd.read_excel(workbook, sheet_name=None, index_col=None)
workbook_total_sales = []
workbook_number_of_sales = []
worksheet_data_frames = []
worksheets_data_frame = None
workbook_data_frame = None
for worksheet_name, data in all_worksheets.items():
total_sales = pd.DataFrame(
[float(str(value).strip('$').replace(',', '')) for value in data.ix[:, 'Sale Amount']]).sum()
number_of_sales = len(data.loc[:, 'Sale Amount'])
average_sales = pd.DataFrame(total_sales / number_of_sales)
workbook_total_sales.append(total_sales)
workbook_number_of_sales.append(number_of_sales)
data = {'workbook': os.path.basename(workbook),
'worksheet': worksheet_name,
'worksheet_total': total_sales,
'worksheet_average': average_sales}
worksheet_data_frames.append(
pd.DataFrame(data, columns=['workbook', 'worksheet', 'worksheet_total', 'worksheet_average']))
worksheets_data_frame = pd.concat(worksheet_data_frames, axis=0, ignore_index=True)
workbook_total = pd.DataFrame(workbook_total_sales).sum()
workbook_total_number_of_sales = pd.DataFrame(workbook_number_of_sales).sum()
workbook_average = pd.DataFrame(workbook_total / workbook_total_number_of_sales)
workbook_stats = {'workbook': os.path.basename(workbook),
'workbook_total': workbook_total,
'workbook_average': workbook_average}
workbook_stats = pd.DataFrame(workbook_stats, columns=['workbook', 'workbook_total', 'workbook_average'])
#workbook_data_frame == worksheets_data_frame + workbook_stats
workbook_data_frame = pd.merge(worksheets_data_frame, workbook_stats, on='workbook', how='left')
data_frames.append(workbook_data_frame)
all_data_concatenated = pd.concat(data_frames, axis=0, ignore_index=True)
writer = pd.ExcelWriter(output_file)
all_data_concatenated.to_excel(writer, sheet_name='sums_and_averages', index=False)
writer.save()