目录
前言
在Python中,处理Excel数据是一个常见的需求,通常可以通过多个库来实现,如pandas、openpyxl、xlsxwriter等。
今天给大家带来20个常用的Excel数据处理脚本示例,涵盖了从基础的文件读写到复杂的数据分析和格式设置等操作(文末附python学习资料)。
示例数据
假设我们有一个名为 data.xlsx 的 Excel 文件,其中包含以下数据:
| Column1 | Column2 | Column3 |
|---------|---------|---------|
| A | 10 | 20 |
| B | 20 | 30 |
| C | 30 | 40 |
| D | 40 | 50 |
| E | 50 | 60 |
| F | 60 | 70 |
| G | 70 | 80 |
| H | 80 | 90 |
| I | 90 | 100 |
1. 读取 Excel 文件
import pandas as pd
def read_excel(file_path):
df = pd.read_excel(file_path, engine='openpyxl')
return df
if __name__ == "__main__":
file_path = "data.xlsx"
df = read_excel(file_path)
print(df)
2. 数据筛选
筛选出 Column1 为 'D' 的行。
import pandas as pd
def filter_data(file_path, column, value):
df = pd.read_excel(file_path, engine='openpyxl')
filtered_df = df[df[column] == value]
return filtered_df
if __name__ == "__main__":
file_path = "data.xlsx"
column = "Column1"
value = "D"
filtered_df = filter_data(file_path, column, value)
print(filtered_df)
3. 数据汇总
计算 Column2 的总和。
import pandas as pd
def summarize_data(file_path, column):
df = pd.read_excel(file_path, engine='openpyxl')
total = df[column].sum()
return total
if __name__ == "__main__":
file_path = "data.xlsx"
column = "Column2"
total = summarize_data(file_path, column)
print(total)
4. 数据分组
按 Column1 分组并计算 Column2 的平均值。
import pandas as pd
def group_data(file_path, by, agg):
df = pd.read_excel(file_path, engine='openpyxl')
grouped = df.groupby(by)[agg].mean()
return grouped
if __name__ == "__main__":
file_path = "data.xlsx"
by = "Column1"
agg = "Column2"
grouped = group_data(file_path, by, agg)
print(grouped)
5. 数据排序
按 Column2 升序排序。
import pandas as pd
def sort_data(file_path, column, ascending=True):
df = pd.read_excel(file_path, engine='openpyxl')
sorted_df = df.sort_values(by=column, ascending=ascending)
return sorted_df
if __name__ == "__main__":
file_path = "data.xlsx"
column = "Column2"
sorted_df = sort_data(file_path, column)
print(sorted_df)
6. 数据对比
比较两个 Excel 文件 data1.xlsx 和 data2.xlsx 中 Column1 的不同之处。
import pandas as pd
def compare_data(file_path1, file_path2, on):
df1 = pd.read_excel(file_path1, engine='openpyxl')
df2 = pd.read_excel(file_path2, engine='openpyxl')
merged = pd.merge(df1, df2, on=on, how='outer', indicator=True)
diff = merged[merged['_merge'] != 'both']
return diff
if __name__ == "__main__":
file_path1 = "data1.xlsx"
file_path2 = "data2.xlsx"
on = "Column1"
diff = compare_data(file_path1, file_path2, on)
print(diff)
7. 数据合并
将两个 Excel 文件 data1.xlsx 和 data2.xlsx 中具有相同 Column1 的行合并。
import pandas as pd
def merge_data(file_path1, file_path2, on):
df1 = pd.read_excel(file_path1, engine='openpyxl')
df2 = pd.read_excel(file_path2, engine='openpyxl')
merged = pd.merge(df1, df2, on=on, how='outer')
return merged
if __name__ == "__main__":
file_path1 = "data1.xlsx"
file_path2 = "data2.xlsx"
on = "Column1"
merged = merge_data(file_path1, file_path2, on)
print(merged)
8. Sheet 合并
合并同一个 Excel 文件 data.xlsx 中的所有 Sheet。
import pandas as pd
def merge_sheets(file_path):
sheets = pd.read_excel(file_path, sheet_name=None, engine='openpyxl')
combined = pd.concat(sheets.values(), ignore_index=True)
return combined
if __name__ == "__main__":
file_path = "data.xlsx"
combined = merge_sheets(file_path)
print(combined)
9. 多个 Excel 文件合并
合并所有名为 *.xlsx 的 Excel 文件。
import pandas as pd
import glob
def merge_multiple_files(file_pattern):
files = glob.glob(file_pattern)
dfs = [pd.read_excel(file, engine='openpyxl') for file in files]
combined = pd.concat(dfs, ignore_index=True)
return combined
if __name__ == "__main__":
file_pattern = "*.xlsx"
combined = merge_multiple_files(file_pattern)
print(combined)
作为一个IT的过来人,我自己整理了一些python学习资料,希望对你们有帮助。
朋友们如果需要可以点击下方链接或微信扫描下方二维码都可以免费获取【保证100%免费】。
CSDN大礼包:《2024年最新Python全套学习资料包】免费领取(安全链接,放心点击)
10. 缺失值填充
将 Column2 中的缺失值填充为 0。
import pandas as pd
def fill_missing_values(file_path, column, value):
df = pd.read_excel(file_path, engine='openpyxl')
df[column].fillna(value, inplace=True)
return df
if __name__ == "__main__":
file_path = "data.xlsx"
column = "Column2"
value = 0
df = fill_missing_values(file_path, column, value)
print(df)
11. 数据清洗
移除 Column1 中的空格。
import pandas as pd
def clean_data(file_path, column, regex):
df = pd.read_excel(file_path, engine='openpyxl')
df[column] = df[column].str.replace(regex, '')
return df
if __name__ == "__main__":
file_path = "data.xlsx"
column = "Column1"
regex = r'\s+'
df = clean_data(file_path, column, regex)
print(df)
12. 数据透视表
创建一个透视表,按 Column1 分类,显示 Column2 的总和。
import pandas as pd
def pivot_table(file_path, index, columns, values):
df = pd.read_excel(file_path, engine='openpyxl')
pivoted = pd.pivot_table(df, index=index, columns=columns, values=values, aggfunc='sum')
return pivoted
if __name__ == "__main__":
file_path = "data.xlsx"
index = "Column1"
columns = "Column2"
values = "Column3"
pivoted = pivot_table(file_path, index, columns, values)
print(pivoted)
13. 数据重塑
将 Column2 和 Column3 重塑成多行。
import pandas as pd
def melt_data(file_path, id_vars, value_vars):
df = pd.read_excel(file_path, engine='openpyxl')
melted = pd.melt(df, id_vars=id_vars, value_vars=value_vars)
return melted
if __name__ == "__main__":
file_path = "data.xlsx"
id_vars = "Column1"
value_vars = ["Column2", "Column3"]
melted = melt_data(file_path, id_vars, value_vars)
print(melted)
14. 数据聚合
按 Column1 分组,计算 Column2 的总和和 Column3 的平均值。
import pandas as pd
def aggregate_data(file_path, group_by, agg):
df = pd.read_excel(file_path, engine='openpyxl')
aggregated = df.groupby(group_by).agg(agg)
return aggregated
if __name__ == "__main__":
file_path = "data.xlsx"
group_by = "Column1"
agg = {"Column2": "sum", "Column3": "mean"}
aggregated = aggregate_data(file_path, group_by, agg)
print(aggregated)
15. 数据统计
计算 Column2 的统计数据。
import pandas as pd
def describe_data(file_path, columns):
df = pd.read_excel(file_path, engine='openpyxl')
stats = df[columns].describe()
return stats
if __name__ == "__main__":
file_path = "data.xlsx"
columns = ["Column2", "Column3"]
stats = describe_data(file_path, columns)
print(stats)
16. 数据类型转换
将 Column2 的数据类型转换为整数。
import pandas as pd
def convert_types(file_path, column, dtype):
df = pd.read_excel(file_path, engine='openpyxl')
df[column] = df[column].astype(dtype)
return df
if __name__ == "__main__":
file_path = "data.xlsx"
column = "Column2"
dtype = int
df = convert_types(file_path, column, dtype)
print(df)
17. 数据切片
选择前5行。
import pandas as pd
def slice_data(file_path, start, stop):
df = pd.read_excel(file_path, engine='openpyxl')
sliced_df = df[start:stop]
return sliced_df
if __name__ == "__main__":
file_path = "data.xlsx"
start = 0
stop = 5
sliced_df = slice_data(file_path, start, stop)
print(sliced_df)
18. 数据重命名
将 Column1 重命名为 NewColumn1。
import pandas as pd
def rename_columns(file_path, old_name, new_name):
df = pd.read_excel(file_path, engine='openpyxl')
df.rename(columns={old_name: new_name}, inplace=True)
return df
if __name__ == "__main__":
file_path = "data.xlsx"
old_name = "Column1"
new_name = "NewColumn1"
df = rename_columns(file_path, old_name, new_name)
print(df)
19. 数据去重
去除 Column1 中的重复项。
import pandas as pd
def remove_duplicates(file_path, subset=None):
df = pd.read_excel(file_path, engine='openpyxl')
unique_df = df.drop_duplicates(subset=subset)
return unique_df
if __name__ == "__main__":
file_path = "data.xlsx"
subset = ["Column1"]
unique_df = remove_duplicates(file_path, subset)
print(unique_df)
20. 写入 Excel 文件
将处理后的数据写回 Excel 文件。
import pandas as pd
def write_excel(df, file_path):
df.to_excel(file_path, index=False, engine='openpyxl')
if __name__ == "__main__":
df = pd.DataFrame({'Column1': ['A', 'B', 'C'], 'Column2': [10, 20, 30]})
file_path = "output.xlsx"
write_excel(df, file_path)
结语
学会了Python就业还是不用愁的,这些行业在薪资待遇上可能会有一些区别,但是整体来看还是很好的,我也不会说往哪个方向发展是最好的,各取所长选择自己最感兴趣的去学习就好。
作为一个IT的过来人,我自己整理了一些python学习资料,希望对你们有帮助。
朋友们如果需要可以点击下方链接或微信扫描下方二维码都可以免费获取【保证100%免费】。
CSDN大礼包:《2024年最新Python全套学习资料包】免费领取(安全链接,放心点击)
编程资料、学习路线图、源代码、软件安装包等!
① Python所有方向的学习路线图,清楚各个方向要学什么东西
② 100多节Python课程视频,涵盖必备基础、爬虫和数据分析
③ 100多个Python实战案例,学习不再是只会理论
④ 华为出品独家Python漫画教程,手机也能学习