Pandas数据分析01——基础数据结构

阡之尘埃

已于 2022-08-31 10:33:30 修改

阅读量727

点赞数 5

分类专栏： pandas数据分析文章标签： pandas 数据分析 python numpy

于 2022-07-26 11:58:35 首次发布

本文链接：https://blog.csdn.net/weixin_46277779/article/details/125991367

版权

pandas数据分析专栏收录该内容

42 篇文章 67 订阅

订阅专栏

参考书目：《深入浅出Pandas：利用Python进行数据处理与分析》

学习pandas之前，我们要了解一下Python的基础数据结构，还有pandas的底层库numpy的数据结构（数组矩阵之类的），然后才是pandas的两种基础数据结构，Series和DataFrame。

Python基础数据结构

数值

# 用科学计数法赋值
n = 1e4
n # 10000.0
m = 2e-2
m # 0.02
a = 10
b = 21
# 数值计算
a + b # 31
a - b # -11
a * b # 210
b / a # 2.1
a ** b # 表示 10 的 21 次幂
b % a # 1 （取余）
# 地板除（//）：做除法后向下取整，即结果向负无穷方向舍入
# 因此当结果为负数时并不是简单地截掉小数部分，例如 -11//3 得 -4 而不是 -3
9//2 # 4
9.0//2.0 # 4.0
-11//3 # -4
-11.0//3 # -4.0

isinstance 可以用来判断变量的类型

isinstance(123,int)
isinstance([123],list)

字符串

切片

#切片
var = 'Hello World!'
# 按索引取部分内容, 索引从0开始, 左必须小于右
# 支持字符、列表、元组
var[0] # 'H'
# 从右开始索引从 -1 开始
var[-1] # '!'
var[-3:-1] # 'ld'
var[1:7] # 'ello W'（有个空格，不包含最后一位）
var[6:] # 'World!' （前后省略按开头结尾）
var[:] # 'Hello World!'（相当于复制）
var[0:5:2] # 'Hlo'（2为步长，2的倍数取）
var[1:7:3] # 'ello W' -> 'eo'（步长为 3）
var[::-1] # !dlroW olleH 实现反转字符功能

转义字符

print("一行\n另一行") # 换行
print("一格\t另一格") # 制表
print("我是\b中国人") # 退格，会删除「是」
print('I \'m boy.') # 引号，双引号同
print("看到反斜杠了么？\\") # 反斜杠

分割和连接

len('good') # 4 字符的长度
'good'.replace('g', 'G') # 'Good' 替换字符
'山-水-风-雨'.split('-') # ['山', '水', '风', '雨'] 用指定字符分隔，默认空格
'好山好水好风光'.split('好') # ['', '山', '水', '风光']
'-'.join(['山','水','风','雨']) # '山-水-风-雨'
'和'.join(['诗', '远方']) # '诗和远方'

# 分割连接
# 按换行分隔，默认(False)不保留换行符
'Good\nbye\nbye'.splitlines(True) # ['Good\n', 'bye\n', 'bye']

# 去空格
'Good bye'.strip('e') # 去掉首尾指定字符, 默认去空格
' Good bye '.lstrip() # 'Good bye ' 去掉左边空格
' Good bye '.rstrip() # ' Good bye'去掉右边空格

字母大小写

'good'.upper() # 'GOOD' 全转大写
'GOOD'.lower() # 'good' 全转小写
'Good Bye'.swapcase() # 'gOOD bYE' 大小写互换
'good'.capitalize() # 'Good' 首字母转大写
'good'.islower() # True 是否全是小写
'good'.isupper() # False 是否全是大写
'good bYe'.title() # 'Good Bye' 所有的单词首字母转为大写，且其他字母转小写
'Good Bye'.istitle() # True 检测所有的单词首字母是否为大写，且其他字母为小写

索引填充等操作

'我和你'.endswith('你') # True 是否以指定字符结尾
'我和你'.startswith('你') # False 是否以指定字符开始
' and你'.isspace() # False 是否全是空白字符

'good'.center(10, '*') # '***good***' 字符居中, 其余用指定字符填充, 共多少位
'good'.ljust(10, '-') # 'good------' 左对齐，默认是空格补全
'good'.rjust(10, '-') # '------good' 右对齐
'good'.count('o') # 2 指定字符在字符中的数量
'good'.count('o', 2, 3) # 1 在索引范围内字符出现的数量
'3月'.zfill(3) # '03月' 指定宽度，不够前边补 0

max('good') # 'o' 按字母顺序最大的字母
min('good') # 'd' 最小的字母

'Good Good Study'.find('y') # 14 返回指定字符第一次出现的索引, 如果不包含返回-1
'Good Good Study'.find('o', 3) # 6 指定开始位第一次出现的索引, 如果不包含返回-1
'Good Good Study'.find('o', 2, 7) # 2 指定区间内第一次出现的索引, 如果不包含返回-1
'Good Good Study'.find('up') # -1 不包含返回-1
'Good Good Study'.rfind('o') # 7 从右开始的 find()，参数同 find(sub, start, end)

'Good Bye'.index('d') # 3 指定字符第一个索引
'Good Bye'.index('s') # 找不到会 ValueError 错误, 可以先 in 去判断是否包含
'Good Bye'.rindex('o') # 2 从右开始的 index()，参数同 index(sub, start, end)

字符串格式化

# 格式化, 此处推荐 f-string: https://www.gairuo.com/p/python-format-string4
name='tom'
f'{name}是好人' # 'tom是好人' 推荐此方法, name = 'tom'
'%s %s' % ('age', 18) # 'age 18' 
'{}, {}'.format(18, 'age') # '18, age'
'{0}, {1}, {0}'.format('age', 18) # 'age, 18, age'
'{name}: {age}'.format(age=18, name='tom') # 'tom: 18'

布尔值

判断

a,b,c=0,1,2
a and b # 0 a 为假返回假的值
b and a # 0 b为真，返回a的值
a or b # 1 输出为真值的结果
a and b or c # 2
a and (b or c) # 0 用类似数学中的括号提高运算优先级

# not 的注意事项
not a # True
not a == b # True
not (a == b) # True 同上逻辑
#a == not b # ！这个是错误的语法, 正确如下：
a == (not b) # True

# and 优先级高 'a' 为真，返回 'b', '' or 'b' 返回 'b'
'' or 'a' and 'b' # 'b'

赋值运算

x = a or b # 返回第一个为真的值（都为假则返回最后一个）并赋值给 x
x = a and b # 返回第一个为假的值（都为真则返回最后一个）并赋值给 x
x = not a # 将结果赋值给 x, a 为 0 时结果是 True

bool(None) # False
bool(0) # False
bool([]) # False
bool(()) # False

列表

生成列表

# 生成列表
y = '1345'
list(y) # ['1', '3', '4', '5'] 将字符串转换成列表
list('刮风那天，我试过握着你手')# ['刮', '风', '那', '天', '，', '我', '试', '过', '握', '着', '你', '手']
# 元组
z = ('a', 'b', 'c')
list(z) # ['a', 'b', 'c'] 将元组转换成列表

# 字典
d = {'Name': 'Tom', 'Age': 7, 'Class': 'First'}
list(d) # ['Name', 'Age', 'Class'] 字典 key 转成列表
list(d.values()) # ['Tom', 7, 'First'] 字典 value 转成列表
# 字典键值对（一个元组）转成列表
list(d.items()) # [('Name', 'Tom'), ('Age', 7), ('Class', 'First')]

#列表操作
['a', 'b'] + ['c', 'd'] # ['a', 'b', 'c', 'd'] 拼接
['a', 'b'] * 2 # ['a', 'b', 'a', 'b'] 复制

常见用法

a = [1, 2, 3]
len(a) # 3 元素个数
max(a) # 3 最大值
min(a) # 1 最小值
sum(a) # 6 求和
a.index(2) # 1 指定元素位置
a.count(1) # 1 求元素的个数
for i in a: print(i) # 迭代元素
sorted(a) # 返回一个排序的列表，但不改变原列表
any(a) # True 是否至少有一个元素为真
all(a) # True 是否所有元素为真

增加删除

a = [1, 2, 3]
a.append(4) # a: [1, 2, 3, 4] 增加一个元素
a.pop() # 每执行一次删除最后一个元素
a.extend([9,8]) # a: [1, 2, 3, 9, 8] # 和其他列表合并
a.insert(1, 'a') # a: [1, 'a', 2, 3, 9, 8] 指定索引位插入元素
a.remove('a') # 删除第一个指定元素
a.clear() # [] 清空

排序

#排序  立即修改
a.reverse() # 反转顺序
a.sort() # 排序  立即修改
a.sort(reverse=True) # 反序
a.sort(key=abs) # 传入函数关键字作为排序规则

列表解析式

# 将一个可迭代的对象展开形成一个列表
[i for i in range(5)]    # [0, 1, 2, 3, 4]

# 可以将结果进行处理
['第'+str(i) for i in range(5)]   # ['第0', '第1', '第2', '第3', '第4']

# 可以进行条件筛选, 实现取偶数
[i for i in range(5) if i%2==0]

# 拆开字符, 过滤空格，全变成大写
[i.upper() for i in 'Hello world' if i != ' ']
# ['H', 'E', 'L', 'L', 'O', 'W', 'O', 'R', 'L', 'D']

# 条件分支
data= ['good','bad','bad','good','bad']
[1 if x == 'good' else 0 for x in data]    # [1, 0, 0, 1, 0]

元组

生成元组

a = () # 空元组
a = (1, ) # 只有一个元素
a = (1, 2, 3) # 定义一个元组
tuple() # 生成空元组
tuple('hello') # ('h', 'e', 'l', 'l', 'o')
type(a) # tuple 类型检测

# 没有括号也可以定义一个元组
a = 1,23,4,56 # a: (1, 23, 4, 56)
a = 1, # a: (1, )

元组解包

x = (1,2,3,4,5)
a, *b = x # a 占第一个，剩余的组成列表全给 b
# a -> 1
# b -> [2, 3, 4, 5]
# a, b -> (1, [2, 3, 4, 5])

a, *b, c = x # a 占第一个，c 占最后一个, 剩余的组成列表全给 b
# a -> 1
# b -> [2, 3, 4]
# c -> 5
# a, b, c -> (1, [2, 3, 4], 5)

字典

生成字典

d = {} # 定义空字典
d = dict() # 定义空字典
d = {'a': 1, 'b': 2, 'c': 3}
d = {'a': 1, 'a': 2, 'a': 3} # {'a': 3} key 不能重复, 重复时取最后一个
d = {'a': 1, 'b': {'x': 3}} # 嵌套字典
d = {'a': [1,2,3], 'b': [4,5,6]} # 嵌套列表

# 以下均可定义如下结果
# {'name': 'Tom', 'age': 18, 'height': 180}
d = dict(name='Tom', age=18, height=180)
d = dict([('name', 'Tom'), ('age', 18), ('height', 180)])
d = dict(zip(['name', 'age', 'height'], ['Tom', 18, 180]))

访问

d['name']  # 'Tom' 获取键的值
d['age'] = 20  # 将 age 的值更新为 20
d['Female'] = 'man'  # 增加属性
d.get('height', 180)  # 180

# 嵌套取值
d = {'a': {'name': 'Tom', 'age':18}, 'b': [4,5,6]}
d['b'][1] # 5
d['a']['age'] # 18

# 注意这不是切片操作，访问键返回值
d = {0: 10, 2: 20}
d[0]   # 10

增加删除访问等

d.pop('name') # 'Tom' 删除指定 key
d.popitem() # 删除并返回最后插入的键值对（Python 3.7 起为后进先出，不再是随机删除）
del d['name']  # 删除键值对
d.clear()  # 清空字典

# 按类型访问，可迭代
d.keys() # 列出所有 key
d.values() # 列出所有 值
d.items() # 列出所有键值对元组（k, v）可迭代 for k,v in d.items():

# 操作
d.setdefault('a', 3) # 键不存在时插入该键并设为默认值（不指定则为 None）；键已存在则直接返回原值
d1.update(dict2) # 将字典 dict2 的键值对添加到字典 d1，键相同时覆盖 d1 中的值

d.get('math', 100) # 对于键(key)存在则返回其对应值，如果键不在字典中，则返回默认值
d2 = d.copy() # 浅拷贝, 对 d 增删键不影响 d2；但嵌套的可变对象（如列表）仍然共享，需要完全独立时用 copy.deepcopy(d)

# update 更新方式
d = {}
d.update(a=1)
d.update(c=2, d=3)
d   # {'a': 1, 'c': 2, 'd': 3}

常见操作

d = {'a': 1, 'b': 2, 'c': 3}
max(d) # 'c' 最大的 k
min(d) # 'a' 最小的 k
len(d) # 3 字典的长度
str(d) # "{'a': 1, 'b': 2, 'c': 3}" 字符串形式
any(d) # True 只要一个键为 True
all(d) # True 所有键都为 True
sorted(d) # ['a', 'b', 'c'] 所有key当列表排序

解析式

d = {'ABCDE'[i]: i*10 for i in range(1,5)}
# {'B': 10, 'C': 20, 'D': 30, 'E': 40}

# 键值互换
d = {'name': 'Tom', 'age': 18, 'height': 180}
{v:k for k,v in d.items()}
# {'Tom': 'name', 18: 'age', 180: 'height'}

逻辑分支

#逻辑分支
route = {True: 'case1', False: 'case2'} # 定义路由
route[7>6] # 'case1' 传入结果为布尔的变量、表达式、函数调用

# 定义计算方法
cal = {'+': lambda x,y: x+y, '*':lambda x,y: x*y}
cal['*'](4,9) # 36 使用

集合

s = {'5元', '10元', '20元'} # 定义集合
s = set() # 空集合
s = set([1,2,3,4,5]) # {1, 2, 3, 4, 5} 使用列表定义
s = {1, True, 'a'} # {1, 'a'} True 与 1 相等，会被当作重复元素去掉
s = {1, 1, 1} # {1} 去重
type(s) # set 类型检测

增加删除

#添加删除
s = {'a', 'b', 'c'}
s.add(2) # {2, 'a', 'b', 'c'}
s.update([1,3,4]) # {1, 2, 3, 4, 'a', 'b', 'c'}
s = {'a', 'b', 'c'}
s.remove('a') # {'b', 'c'} 删除不存在的会报错
s.discard('3') # 删除一个元素，无则忽略不报错
s.clear() # set() 清空

数学集合运算

s1 = {1,2,3}
s2 = {2,3,4}

s1 & s2 # {2, 3} 交集
s1.intersection(s2) # {2, 3} 交集
s1.intersection_update(s2) # 原地取交集，s1 变为 {2, 3}，返回 None（后面的示例仍按 s1 = {1, 2, 3} 计算）

s1 | s2  # {1, 2, 3, 4} 并集
s1.union(s2) # {1, 2, 3, 4} 并集

s1.difference(s2) # {1} 差集
s1.difference_update(s2) # 原地取差集，s1 变为 {1}，返回 None（后面的示例仍按 s1 = {1, 2, 3} 计算）

s1.symmetric_difference(s2) # {1, 4} 对称差集，即两个集合中除交集之外的元素

s1.isdisjoint(s2) # False 是否没有交集
s1.issubset(s2) # False s1 是否 s2 的子集
s1.issuperset(s2) # False s1 是否 s2 的超集, 即 s1 是否包含 s2 的所有元素

numpy数组

数组生成

import numpy as np
np.arange(3)
# array([0, 1, 2])
np.arange(3.0)
# array([ 0.,  1.,  2.])
np.arange(3,7)
# array([3, 4, 5, 6])
np.arange(3,7,2)
# array([3, 5])
np.arange(3,4,.2)
# array([3. , 3.2, 3.4, 3.6, 3.8])

# 区间内等差数据  指定数量
np.linspace(2.0, 3.0, num=5)
# array([2.  , 2.25, 2.5 , 2.75, 3.  ])
# 右开区间（不包含右值）
np.linspace(2.0, 3.0, num=5, endpoint=False)
# array([2. ,  2.2,  2.4,  2.6,  2.8])
# (数组, 样本之间的间距)
np.linspace(2.0, 3.0, num=5, retstep=True)#(array([2.  , 2.25, 2.5 , 2.75, 3.  ]), 0.25)

全是0或1数组

#创建值为0的数组
np.zeros(6)#6个浮点0.  #行向量
np.zeros((2,3,4),dtype=int)#指定形状的0矩阵
np.ones((2,3,4))  #指定形状的全1数组
np.empty((3,4)) #未初始化的数组，只分配内存，元素值是任意的（不保证为0）
#结构相同的0矩阵
np.arange(8).reshape(1,-1).shape  #(1, 8)
np.arange(8).shape   #(8,)
np.zeros_like(np.arange(8).reshape(-1,1))#列矩阵  (8,1)
np.ones_like(np.arange(8).reshape(4,2))
np.empty_like(np.arange(8).reshape(2,2,2))

随机数组

np.random.randn(6,4)             #生成6*4的随机矩阵，标准正态分布浮点
np.random.random(size=(6,4))     #生成6*4的随机矩阵，0-1均匀分布浮点
np.random.rand(6, 4)             #同上均匀分布
np.random.randint(1,7,size=(6,4))#指定范围指定形状，整数

r1=np.random.standard_normal(size=5)      #标准正态分布
r2=np.random.normal(loc=50,scale=5,size=5)#均值为50，标准差为5的正态分布
r3=np.random.uniform(low=0,high=10,size=5)#0-10之间的均匀分布
r4=np.random.chisquare(df=15,size=5)      #自由度为15的卡方分布

抽样

import random   #这里用的是Python标准库的random模块，不是np.random
a=range(1,21)
n1=random.sample(population=a,k=10);n1   #无放回抽取10次
n2=random.choices(population=a,k=10);n2  #有放回抽取10次

常见操作

a=np.linspace(2.0, 3.0, num=5)  #array([2.  , 2.25, 2.5 , 2.75, 3.  ])
a.max()
a.min()
a.sum()
a.std()
a.all()
a.any()
a.cumsum() #累计求和
np.sin(a)
np.log(a)

Pandas数据结构

Pandas基础的数据结构就两种，一种就是类似Excel表的二维数据框DataFrame，第二种就是数据框的一列，就是一条向量，叫Series。

数据框DataFrame生成

import pandas as pd 
df = pd.DataFrame({'国家': ['中国', '美国', '日本'],
                   '地区': ['亚洲', '北美', '亚洲'],
                   '人口': [14.33, 3.29, 1.26],
                   'GDP': [14.22, 21.34, 5.18],})
df

df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
df2.B.dtype  ##dtype('<M8[ns]')
df2

pd.DataFrame.from_dict({'国家':['中国','美国','日本'],'人口':[13.9,3.28,1.26]})  #字典生成
pd.DataFrame.from_records([('中国','美国','日本'),(13.9,3.28,1.26)]).T          #列表数组生成

Series常见用法

# 由索引为 a、b.. ， 五个随机浮点数数组组成
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s.index # 查看索引
s = pd.Series(np.random.randn(5)) # 未指定索引
pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])

s = pd.Series([1,2,3,4,5,6,7,8])
s[3] # 按索引标签取值，类似列表按下标取元素
s[2:] # 切片
s.median() # 中位数，其他数学函数用法类似
s[s > s.median()] # 筛选大于中位数的内容
s[[1, 2, 1]] # 指定索引的内容，括号的列表是索引
s.dtype # 数据类型
s.array # 返回值的数列
s.to_numpy() # 转为 numpy 的 ndarray
3 in s # 逻辑运算，检测索引
s.to_numpy()==s.values  # array([ True,  True,  True,  True,  True,  True,  True,  True])

s = pd.Series([1,2,3,4], name='数字')
s.add(1) # 每个元素加1
s.add_prefix(3) # 给索引标签前加个3，索引变为 30,31,32,33（字符串）
s.add_suffix(4) # 同上，在索引标签后加4，索引变为 04,14,24,34（字符串）
s.sum() # 总和
s.count() # 数量，长度
s.agg('std') # 聚合，仅返回标准差, 与 s.std() 相同
s.agg(['min', 'max']) # 聚合，返回最大最小值
s2 = s.rename("number") # 修改名称
s.align(s2) # 对齐两个 Series 的索引，返回对齐后的两个 Series 组成的元组
s.any() # 是否至少有一个为真
s.all() # 是否全是真
pd.concat([s, s2]) # 追加另外一个 Series（Series.append 已在 pandas 2.0 中移除，改用 pd.concat）
s.apply(lambda x:x+1) # 应用方法
s.empty # 是否为空
s3 = s.copy() # 深拷贝

判断类型

pd.api.types.is_bool_dtype(s)
pd.api.types.is_categorical_dtype(s)
pd.api.types.is_datetime64_any_dtype(s)
pd.api.types.is_datetime64_ns_dtype(s)
pd.api.types.is_datetime64_dtype(s)
pd.api.types.is_float_dtype(s)
pd.api.types.is_int64_dtype(s)
pd.api.types.is_numeric_dtype(s)
pd.api.types.is_object_dtype(s)
pd.api.types.is_string_dtype(s)
pd.api.types.is_timedelta64_dtype(s)
pd.api.types.is_bool_dtype(s)

pandas数据怎么读取，切片，筛选，画图等操作后面每一章再详细介绍