【python】pandas vs Excel 课程笔记
准备
开发环境
- 安装包:Anaconda
检查更新:在Anaconda Prompt中使用以下代码:conda update conda \\更新Anaconda conda update --all \\更新其他包
- IDE:PyCharm
格式化代码:Ctrl+Alt+L
简单的示例代码
- 求指定半径圆的面积
from numpy import pi
def get_circle_area(r):
a = r ** 2 * pi
return a
radius = 10
area = get_circle_area(radius)
print(f"The area of circle with radius {radius} is {area}.")
课时1 创建文件
-创建xlsl文件,并填充数据
import pandas as pd
df = pd.DataFrame({'ID': [1, 2, 3], 'Name': ['DingRui', 'Qiqi', 'Harry']})
# 将ID设置为索引
df = df.set_index('ID')
print(df)
df.to_excel('E:/DR_Python/Temp/output.xlsx')
print('Done!')
课时2 读取文件
打开“people.xlsx”,基础代码
import pandas as pd
people = pd.read_excel('E:/DR_Python/Temp/People.xlsx')
# 获取行列数
print(people.shape)
#获取列名
print(people.columns)
#查看前3行内容
print(people.head(3))
#查看最后3行内容
print(people.tail(3))
- 情况1: 表头脏
解决方法:设置header属性
people = pd.read_excel('E:/DR_Python/Temp/People2.xlsx',header=1)
- 情况2:无表头
-header设为None,设置Conlumns属性
people = pd.read_excel('E:/DR_Python/Temp/People2.xlsx',header=None)
people.columns = ['ID','Type','Title','FirstName','MiddleName','LastName']
- 指定Index列
people = pd.read_excel('E:/DR_Python/Temp/002output.xlsx', index_col='ID')
课时3 行,列,单元格
- 序列 Series
1.将字典转换为Series
import pandas as pd
# 定义Python数据类型字典(键值对)
d = {'x': 100, 'y': 200, 'z': 300}
# 将字典格式转换为Series
s1 = pd.Series(d)
print(s1)
- 定义Series
L1 = [100, 200, 300]
L2 = ['x', 'y', 'z']
s1 = pd.Series(L1, L2)
print(s1)
- 创建DataFrame
s1 = pd.Series([1, 2, 3], index=[1, 2, 3], name='A')
s2 = pd.Series([10, 20, 30], index=[1, 2, 3], name='B')
s3 = pd.Series([100, 200, 300], index=[1, 2, 3], name='C')
# 以dictionary形式将序列加入DataFrame(列)
df = pd.DataFrame({s1.name: s1, s2.name: s2, s3.name: s3})
print(df)
# 以list形式将序列加入DataFrame(行)
df2 = pd.DataFrame([s1, s2, s3])
print(df2)
运行结果
A B C
1 1 10 100
2 2 20 200
3 3 30 300
1 2 3
A 1 2 3
B 10 20 30
C 100 200 300
课时4 数据区域的读取,填充整数、文字
数据准备:‘Books.xlsx’
代码
import pandas as pd
from datetime import date, timedelta # 调用日期格式
# 读取Excel文件,读取指定行列
books = pd.read_excel('E:/DR_Python/Temp/Books.xlsx', skiprows=3, usecols="C:F", index_col=None,
dtype={'ID': str, 'InStore': str, 'Date': str})
# 定义起始年份
start = date(2019, 7, 28)
# 自动填充
for i in books.index:
# 设置ID列自动填充
books['ID'].at[i] = i + 1
# 设置InStore列自动填充(Yes,No交替)
books['InStore'].at[i] = 'Yes' if i % 2 == 0 else 'NO'
# 设置Date
books['Date'].at[i] = start
print(books)
课时5 填充日期序列
- 示例代码
import pandas as pd
from datetime import date, timedelta # 调用日期格式
# 自增月函数(传入日期,增加月份数量)
def add_month(d, md):
yd = md // 12 # 年份数为月数除12取整
m = d.month + md % 12 # 当前月份加上对年取余的新增月份
if m != 12:
yd += m // 12 # 若m大于12,新增年份加上多余的年份
m = m % 12 # 再m对12取余为目前的月份
return date(d.year + yd, m, d.day)
# 读取Excel文件,读取指定行列
books = pd.read_excel('E:/DR_Python/Temp/Books.xlsx', skiprows=3, usecols="C:F", index_col=None,
dtype={'ID': str, 'InStore': str, 'Date': str})
# 定义起始年份
start = date(2019, 7, 28)
# 自动填充
for i in books.index:
# 设置Date列自动填充(从start日期自增1天)
books['Date'].at[i] = start + timedelta(days=i)
# 设置Date列自动填充(从start日期自增1年)
books['Date'].at[i] = date(start.year + i, start.month, start.day)
# 设置Date列自动填充(从start日期自增1月)
books['Date'].at[i] = add_month(start, i)
print(books)
- 方式2:以单元格为单位进行数据更改
# 方式2:以单元格为单位修改
for i in books.index:
books.at[i, 'ID'] = i + 1
books.at[i, 'InStore'] = 'Yes' if i % 2 == 0 else 'NO'
books.at[i, 'Date'] = add_month(start, i)
课时6 函数填充
- 两列相乘
import pandas as pd
books = pd.read_excel('E:/DR_Python/Temp/006Books.xlsx', index_col='ID')
books['Price'] = books['ListPrice'] * books['Discount']
print(books)
- 计算指定行相乘(5-10行)
for i in range(5, 10):
books['Price'].at[i] = books['ListPrice'].at[i] * books['Discount'].at[i]
- 指定ListPrice列加2
# 方法1:直接相加
books['ListPrice'] = books['ListPrice'] + 2
# 方法2:定义函数
def add_2(x):
return x + 2
books['ListPrice'] = books['ListPrice'].apply(add_2)
# 方法3:lambda表达式
books['ListPrice'] = books['ListPrice'].apply(lambda x: x + 2)
课时7 排序,多重排序
- 按Worthy和Price排序 不值得且最贵的在最前
import pandas as pd
products = pd.read_excel('E:/DR_Python/Temp/List.xlsx', index_col='ID')
products.sort_values(by=['Worthy', 'Price'], ascending=[True, False], inplace=True)
print(products)
课时8 数据筛选、过滤
- 按指定条件筛选
import pandas as pd
# 年龄在18-30岁
def validate_age(a):
return 18 <= a <= 30
# 分数在60-90分
def level_b(s):
return 60 <= s < 90
students = pd.read_excel('E:/DR_Python/Temp/008/Students.xlsx', index_col='ID')
students = students.loc[students['Age'].apply(validate_age)].loc[students.Score.apply(level_b)] # 两种语法
print(students)
- 方法二 使用lambda表达式筛选
import pandas as pd
students = pd.read_excel('E:/DR_Python/Temp/008/Students.xlsx', index_col='ID')
students = students.loc[students['Age'].apply(lambda a: 18 <= 18 <= 30)].loc[
students.Score.apply(lambda s: 60 <= s <= 90)]
print(students)