learn_pandas

pandas

Pandas 是 Python 语言的一个扩展程序库,用于数据分析。

import pandas as pd

pd.__version__
'2.2.2'

Series

相当于表格中的一列,带有索引的一维数组
索引唯一,缺省为range(0,n)
有序

pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)
# 指定索引和值
s1 = pd.Series(data=[1, 2, 3], index=['a', 'b', 'c'])
# 通过字典创建
s2 = pd.Series({1:'a', 2:'b', 3:'c'})

s1, s2
(a    1
 b    2
 c    3
 dtype: int64,
 1    a
 2    b
 3    c
 dtype: object)

基本信息

# 获取统计信息
s1.min(), s1.max(), s1.mean(), s1.std(), s1.sum(),s2.value_counts()
(np.int64(1),
 np.int64(3),
 np.float64(2.0),
 np.float64(1.0),
 np.int64(6),
 a    1
 b    1
 c    1
 Name: count, dtype: int64)
# 获取示例数据,默认5
s2.head(2), s2.tail(2), s2.sample(2)
(1    a
 2    b
 dtype: object,
 2    b
 3    c
 dtype: object,
 1    a
 2    b
 dtype: object)
# 获取属性
print(s1.dtype, s1.shape, s1.size)
print(s2.dtype, s2.shape, s2.size)

print(s1.index, s1.values)
int64 (3,) 3
object (3,) 3
Index(['a', 'b', 'c'], dtype='object') [1 2 3]

常用操作

# 通过索引更改、添加元素
s1['a'], s1['d'] = -1, 4

# 使用 del 删除指定索引标签的元素。
del s1['b'] 

# 使用 drop 方法删除索引,并返回一个新的 Series
s_dropped = s1.drop(['c']) 

print(s1)
print(s_dropped)
a   -1
c    3
d    4
dtype: int64
a   -1
d    4
dtype: int64
# 使用切片语法
s1['a':'c'], s1[-2:]
(a   -1
 c    3
 dtype: int64,
 c    3
 d    4
 dtype: int64)
# 更改索引,并返回一个新的 Series
s1.rename({'a':'A','c':'C'})
A   -1
C    3
d    4
dtype: int64
# 设置数据类型,并返回一个新的 Series
s1.astype('float')
a   -1.0
c    3.0
d    4.0
dtype: float64
# 索引和值的关系
for index, value in s2.items():
    print(f'{index=}, {value=}')
index=1, value='a'
index=2, value='b'
index=3, value='c'

DataFrame

相当于二维表

pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)

创建

list dict json numpy

# 通过二维表(list, tuple, numpy)创建
data1 = [('Google', 10), ['Runoob', 12], ['Wiki', 13]]
df1 = pd.DataFrame(data1, columns=['Site', 'Age'])

# 通过字典的列表创建
data2 = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df2 = pd.DataFrame(data2)

df1, df2
(     Site  Age
 0  Google   10
 1  Runoob   12
 2    Wiki   13,
    a   b     c
 0  1   2   NaN
 1  5  10  20.0)
# 通过列表的字典创建
data = {'name':['Tom', 'Bob', 'peter', 'Tom'],
         'age':[18, 19, 14, 10],
         'gender':['male', 'female', 'male', 'female']}
df = pd.DataFrame(data)
df
nameagegender
0Tom18male
1Bob19female
2peter14male
3Tom10female

基本信息

# 获取示例数据,默认5
df.head(2), df.tail(2), df.sample(2)
(  name  age  gender
 0  Tom   18    male
 1  Bob   19  female,
     name  age  gender
 2  peter   14    male
 3    Tom   10  female,
   name  age  gender
 1  Bob   19  female
 3  Tom   10  female)
# 获取信息
df.info(), df.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    4 non-null      object
 1   age     4 non-null      int64 
 2   gender  4 non-null      object
dtypes: int64(1), object(2)
memory usage: 228.0+ bytes





(None,
              age
 count   4.000000
 mean   15.250000
 std     4.112988
 min    10.000000
 25%    13.000000
 50%    16.000000
 75%    18.250000
 max    19.000000)
# 获取属性
df.columns, df.index, df.dtypes, df.shape
(Index(['name', 'age', 'gender'], dtype='object'),
 RangeIndex(start=0, stop=4, step=1),
 name      object
 age        int64
 gender    object
 dtype: object,
 (4, 3))

切片和定位

print("df[['name', 'age']]\n", df[['name', 'age']])  # 提取多列
print("\ndf[1:3]\n", df[1:3])               # 切片行
print("\ndf.loc[1:2, ['name', 'age']]\n"
      ,df.loc[1:2, ['name', 'age']])  # 标签索引提取指定行列
print("\ndf.iloc[:, 1:]\n",df.iloc[:, 1:])        # 位置索引提取指定列
df[['name', 'age']]
     name  age
0    Tom   18
1    Bob   19
2  peter   14
3    Tom   10

df[1:3]
     name  age  gender
1    Bob   19  female
2  peter   14    male

df.loc[1:2, ['name', 'age']]
     name  age
1    Bob   19
2  peter   14

df.iloc[:, 1:]
    age  gender
0   18    male
1   19  female
2   14    male
3   10  female
# 布尔索引,筛选True
r1 = df['name'] == 'Tom'
r2 = df['age'] < 18
df[r1], df[r2]
(  name  age  gender
 0  Tom   18    male
 3  Tom   10  female,
     name  age  gender
 2  peter   14    male
 3    Tom   10  female)
# 更改值
df.loc[df['name']=='Bob', 'age']=15
df
nameagegender
0Tom18male
1Bob15female
2peter14male
3Tom10female
# 重命名行(index,axis=0)和列(columns,axis=1)  
# 默认 axis=0,inplace=False
df.rename(columns={'name':'Name'},index={1:-1})
# df.rename({1:-1})
# df.rename({'name':'Name'},axis=1).rename({1:-1},axis=0)
Nameagegender
0Tom18male
-1Bob15female
2peter14male
3Tom10female

索引

index

# DataFrame.set_index(keys, drop=True, append=False, inplace=False, 
#                         verify_integrity=False)

# 设置keys列为索引
# drop=True表示删除该列
# append=False表示删除原索引。True保留形成多级索引
# verify_integrity=False不检查唯一性
df.set_index('name')
agegender
name
Tom18male
Bob15female
peter14male
Tom10female
# 重置索引
# drop=False,原索引会被添加为新的一列
df.reset_index(drop=False,inplace=False)
indexnameagegender
00Tom18male
11Bob15female
22peter14male
33Tom10female

排序

sort_values sort_index

# 值排序,默认升序(ascending=True),生成新的DateFrame  
# 可通过 key= 指定排序函数,接受一个 pd.Series 对象
df.sort_values(by='age',ascending=False,inplace=False)
df.sort_values(by='name',key=lambda x:x.str.len())
nameagegender
0Tom18male
1Bob15female
3Tom10female
2peter14male
# 索引排序,通过 axis=0 or 1 指定行列
# 默认ascending=True升序
df.sort_index(ascending=False)
nameagegender
3Tom10female
2peter14male
1Bob15female
0Tom18male

行列操作

insert concat drop

# 插入列
df['NAME'] = df['name'].str.upper()
df.insert(loc=4,column='note',value=['a','b','c','d'],allow_duplicates=True)

# 插入行
df.loc[df.index[-1]+1] = {'name':'Alice','age':1}
df.loc[df.index[-1]+1] = ['Joe',2,'male','',None]
new_line = pd.Series({'name':'Mike','age':11,'gender':'male'})
pd.concat([df,new_line]) # 推荐

df
nameagegenderNAMEnote
0Tom18maleTOMa
1Bob15femaleBOBb
2peter14malePETERc
3Tom10femaleTOMd
4Alice1NaNNaNNaN
5Joe2maleNone
# 删除行列,生成新的 DataFrame
df.drop(index=[0,1]).drop(columns='NAME')
nameagegendernote
2peter14malec
3Tom10femaled
4Alice1NaNNaN
5Joe2maleNone

分组

bins = [0, 10, 15, float('inf')]
labels = ['<10','10-15','>15']
df['age_priod'] = pd.cut(df['age'],bins=bins,labels=labels)
df
nameagegenderNAMEnoteage_priod
0Tom18maleTOMa>15
1Bob15femaleBOBb10-15
2peter14malePETERc10-15
3Tom10femaleTOMd<10
4Alice1NaNNaNNaN<10
5Joe2maleNone<10
df.groupby('gender')['age'].mean()
gender
female    12.500000
male      11.333333
Name: age, dtype: float64

长宽格式转换

pivot 和 melt

data = {'year': [2010, 2011, 2010, 2011],  
        'product': ['A', 'A', 'B', 'B'],  
        'sales': [5, 6, 7, 8]}  
df = pd.DataFrame(data)

pivoted_df = df.pivot(index='year', columns='product', values='sales')  
melted_df = (pivoted_df.reset_index()
            .melt(id_vars=['year'], var_name='product', value_name='sales'))

pivoted_df, melted_df
(product  A  B
 year         
 2010     5  7
 2011     6  8,
    year product  sales
 0  2010       A      5
 1  2011       A      6
 2  2010       B      7
 3  2011       B      8)

拼接、合并

concat 和 merge

# pandas.concat(objs, axis=0, join='outer', ignore_index=False, keys=None, 
#                 levels=None, names=None, verify_integrity=False, sort=False)
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2'],  
                    'B': ['B0', 'B1', 'B2']})  
df2 = pd.DataFrame({'A': ['A3', 'A4'],  
                    'B': ['B3', 'B4']})  
  
r1 = pd.concat([df1, df2])
r2 = pd.concat([df1, df2], axis=1)
r1, r2
(    A   B
 0  A0  B0
 1  A1  B1
 2  A2  B2
 0  A3  B3
 1  A4  B4,
     A   B    A    B
 0  A0  B0   A3   B3
 1  A1  B1   A4   B4
 2  A2  B2  NaN  NaN)
# pandas.merge(right, how='inner', on=None, left_on=None, right_on=None, 
#                 left_index=False, right_index=False, sort=False,
#                 suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],  
                    'A': ['A0', 'A1', 'A2', 'A3'],  
                    'B': ['B0', 'B1', 'B2', 'B3']})  
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K4'],  
                    'C': ['C0', 'C1', 'C2', 'C3'],  
                    'D': ['D0', 'D1', 'D2', 'D3']})  
  
result = pd.merge(df1, df2, on='key')  
result
keyABCD
0K0A0B0C0D0
1K1A1B1C1D1
2K2A2B2C2D2

文件操作

csv json excel

# 读取
df = pd.read_csv('inputs/data.csv')
df = pd.read_excel('inputs/data.xlsx')
df = pd.read_json('inputs/data.json')
# df = pd.read_html('inputs/data.html') # 需要lxml
df
yearproductsales
02010A5
12011A6
22010B7
32011B8
# 保存
df.to_csv('outputs/data.csv',index=False)
df.to_excel('outputs/data.xlsx',index=False)
df.to_json('outputs/data.json')
df.to_html('outputs/data.html')

数据清洗

pd.read_csv('inputs/property-data.csv')
PIDST_NUMST_NAMEOWN_OCCUPIEDNUM_BEDROOMSNUM_BATHSQ_FT
0100001000.0104.0PUTNAMY311000
1100002000.0197.0LEXINGTONN31.5--
2100003000.0NaNLEXINGTONNNaN1850
3100004000.0201.0BERKELEY121NaN700
4NaN203.0BERKELEYY321600
5100006000.0207.0BERKELEYYNaN1800
6100007000.0NaNWASHINGTONNaN2HURLEY950
7100008000.0213.0TREMONTY11NaN
8100009000.0215.0TREMONTYna21800
# 指定空值
missing_values = ['n/a', 'na', '--']
df = pd.read_csv('inputs/property-data.csv', na_values=missing_values)
df
PIDST_NUMST_NAMEOWN_OCCUPIEDNUM_BEDROOMSNUM_BATHSQ_FT
0100001000.0104.0PUTNAMY3.011000.0
1100002000.0197.0LEXINGTONN3.01.5NaN
2100003000.0NaNLEXINGTONNNaN1850.0
3100004000.0201.0BERKELEY121.0NaN700.0
4NaN203.0BERKELEYY3.021600.0
5100006000.0207.0BERKELEYYNaN1800.0
6100007000.0NaNWASHINGTONNaN2.0HURLEY950.0
7100008000.0213.0TREMONTY1.01NaN
8100009000.0215.0TREMONTYNaN21800.0
# 移除指定列有空值的数据
df.dropna(subset=['NUM_BEDROOMS'])
PIDST_NUMST_NAMEOWN_OCCUPIEDNUM_BEDROOMSNUM_BATHSQ_FT
0100001000.0104.0PUTNAMY3.011000.0
1100002000.0197.0LEXINGTONN3.01.5NaN
3100004000.0201.0BERKELEY121.0NaN700.0
4NaN203.0BERKELEYY3.021600.0
6100007000.0NaNWASHINGTONNaN2.0HURLEY950.0
7100008000.0213.0TREMONTY1.01NaN
# 替换空字段
# 通常使用均值mean(), 中位数median(), 众数mode()替换
(df.fillna({'PID':12345})
    .fillna({'ST_NUM':df["ST_NUM"].mean()})
    .fillna(-1))
PIDST_NUMST_NAMEOWN_OCCUPIEDNUM_BEDROOMSNUM_BATHSQ_FT
0100001000.0104.000000PUTNAMY3.011000.0
1100002000.0197.000000LEXINGTONN3.01.5-1.0
2100003000.0191.428571LEXINGTONN-1.01850.0
3100004000.0201.000000BERKELEY121.0-1700.0
412345.0203.000000BERKELEYY3.021600.0
5100006000.0207.000000BERKELEYY-1.01800.0
6100007000.0191.428571WASHINGTON-12.0HURLEY950.0
7100008000.0213.000000TREMONTY1.01-1.0
8100009000.0215.000000TREMONTY-1.021800.0
# 移去重复字段
df.drop_duplicates(['ST_NAME']) 
PIDST_NUMST_NAMEOWN_OCCUPIEDNUM_BEDROOMSNUM_BATHSQ_FT
0100001000.0104.0PUTNAMY3.011000.0
1100002000.0197.0LEXINGTONN3.01.5NaN
3100004000.0201.0BERKELEY121.0NaN700.0
6100007000.0NaNWASHINGTONNaN2.0HURLEY950.0
7100008000.0213.0TREMONTY1.01NaN
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

北辰2023

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值