04python中的数据结构_ls1=['a','b','c','d'] ls2=['1','2','3'] ls3=[] a=0-CSDN博客

本文链接：https://blog.csdn.net/muyashui/article/details/115047165

自带数据格式

数值

# 整数与小数
a1 = 100  # 定义一个变量a1(建议定义的变量用英文数字下划线等符合组成), 并将100赋值给变量a1
a2 = 3.14

# 用print函数输出变量内容
print(a1)
print(a2)

100
3.14

# 数值格式
print(type(a1))
print(type(a2))

<class 'int'>
<class 'float'>

# 整数转小数
float(a1)

100.0

# Float to integer (the fractional part is dropped: 3.14 -> 3)  ※
int(a2)

# 四则运算
a1 + 50

a2 * 2

6.28

字符串

# 字符串
s1 = 'python'   #也可以是双引号 ""
s2 = 'pandas'

print(s1)
print(type(s1))

python
<class 'str'>

# 字符串组合  ※
s1 + s2

'pythonpandas'

# 拆分字符串:列表  ※
list(s1)

['p', 'y', 't', 'h', 'o', 'n']

# 拆分字符串:元组
tuple(s2)

('p', 'a', 'n', 'd', 'a', 's')

# 字符串长度  ※
len(s1)

# 字符串格式化
name = '小米'
s = '北京'
i = 5
print('%s在%s,今天气温%i度'%(name,s,i))

小米在北京,今天气温5度

# 数字转字符串  ※
a = 3.14
str(a)

'3.14'

布尔值

a = 100  # 一个等号表示"赋值"
b = 90

a == b   # 两个等号表示判断两个变量是否相等 , 不等于用 != 表示 ※

False

c = True
type(c)

bool

自带数据结构

列表

- 中括号表示的一组元素. 元素可以是数值 字符串 列表 字典等多种样式

ls1 = [1,2,3,4,5]
ls2 = ['a','b','c','d','e']
ls3 = ['a','b',[1,2],'d','e']

print(ls1)
print(ls2)
print(ls3)

[1, 2, 3, 4, 5]
['a', 'b', 'c', 'd', 'e']
['a', 'b', [1, 2], 'd', 'e']

# 0号是第1位
ls1[0]

# 切片:左包含右不包含
ls1[2:3]

[3]

# 修改列表值
ls1[4]= 100
ls1

[1, 2, 3, 4, 100]

# 列表常用于for循环中  ※
for i in ls1:
    print(i+100)

# 字符串转列表
list('python数据分析基础')

['p', 'y', 't', 'h', 'o', 'n', '数', '据', '分', '析', '基', '础']

range函数

# 使用range函数 ※
range(10)

range(0, 10)

# 生成0-10之间的元素(不包含10) ※
list(range(10))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# 生成1-10之间的元素(不包含10)
list(range(1,10))

[1, 2, 3, 4, 5, 6, 7, 8, 9]

# 生成3-10之间的,每间隔2的元素(不包含10)
list(range(3,10,2))

[3, 5, 7, 9]

进阶用法

# 用一个列表生成另一个列表
lis1 = [1,2,3,4,5]

lis2 = [i+100 for i in lis1]  #※
lis2

[101, 102, 103, 104, 105]

lis3 = [i*i for i in lis1 if i%2==0]
lis3

[4, 16]

lis4 = ['No.'+str(i) for i in lis1]  #※
lis4

['No.1', 'No.2', 'No.3', 'No.4', 'No.5']

lis5 = ['No.'+str(i) for i in range(100)]
lis5[:10]

['No.0',
 'No.1',
 'No.2',
 'No.3',
 'No.4',
 'No.5',
 'No.6',
 'No.7',
 'No.8',
 'No.9']

lis6 = [1,2,3,4,5] 
lis7 = list('abcde')
lis8 = [list(z) for z in zip(lis6,lis7)] #※
lis8

[[1, 'a'], [2, 'b'], [3, 'c'], [4, 'd'], [5, 'e']]

字典

- 大括号表示的成对组成的数据

dic1 = {'A':1,
        'B':2}

dic2 = {'A':'中国',
        'B':'美国'}

dic3 = {'A':[1,2,3],
        'B':[4,2,5]}

print(dic1)
print(dic2)
print(dic3)

{'A': 1, 'B': 2}
{'A': '中国', 'B': '美国'}
{'A': [1, 2, 3], 'B': [4, 2, 5]}

dic1['A']

# Build a dict from two lists by pairing their elements position by position.
# zip() walks both lists in lockstep. Two nested `for` clauses in a dict
# comprehension would instead visit every (key, value) combination, so each
# key would be overwritten until it held the last value (5).
lis1 = [1, 2, 3, 4, 5]
lis2 = ['a', 'b', 'c', 'd', 'e']

dic4 = dict(zip(lis2, lis1))  #※
dic4

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}

type(dic4)

dict

元组

tup1 = 4,5,6,7
print(tup1)

(4, 5, 6, 7)

# 元组转列表
list(tup1)

[4, 5, 6, 7]

集合

s1 = set([2,2,2,1,3,3,'a','a'])  #有去重功能 ※
print(s1)

{1, 2, 3, 'a'}

# 集合转列表(集合是无序的, 转换后的元素顺序不保证固定)
lis1 = list(s1)
lis1

[1, 2, 3, 'a']

数据结构

numpy中的数组

数组

#导入工具包
import numpy as np

# 通过列表生成一维数组
arr1 = np.array([1,2,3,4,5])
arr1

array([1, 2, 3, 4, 5])

# 通过列表生成二维数组
arr2 = np.array([[1,2,3,4,5],
                 [6,7,8,9,10]])
arr2

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

# 数组属性: 形状(shape)、元素个数(size)、数据类型(dtype)
print(arr2.shape) #※
print(arr2.size)
print(arr2.dtype)

(2, 5)
10
int32

# 数组运算
arr3 = arr2 + 100
arr3

array([[101, 102, 103, 104, 105],
       [106, 107, 108, 109, 110]])

随机数

- numpy中的random模块

# 导入工具包
import numpy as np

# 0-1之间的随机数
np.random.rand(5)  # 5个数

array([0.46591503, 0.87362145, 0.93809249, 0.25925983, 0.63894833])

# 2行3列数组
np.random.rand(2,3)

array([[0.96901492, 0.30594543, 0.22570567],
       [0.42208673, 0.99117033, 0.87474965]])

# 均值为0，标准差为1的随机数
np.random.randn(5) # 5个数

array([-1.61944511,  0.38982079, -0.63948306,  0.36588547,  0.63185553])

np.random.randn(2,3)  # 2行3列数组

array([[-0.91993249,  0.61362181, -0.67974097],
       [ 1.21197129,  1.84427274, -0.63912597]])

# 生成0-1的随机数
np.random.random(5)

array([0.54859105, 0.35911591, 0.91295063, 0.04102102, 0.82447624])

np.random.random((2,3))  # 2个括号

array([[0.61639853, 0.61691637, 0.483431  ],
       [0.95587659, 0.17188186, 0.04471951]])

# 随机整数(包含下限, 不包含上限; 例如 randint(5,20) 的取值范围是5到19)
np.random.randint(5,20,4)  # 从5-20的随机整数

array([13, 16, 11, 17])

np.random.randint(5,20,(2,3))

array([[19, 13,  7],
       [13, 14,  7]])

# 随机选择
np.random.choice(["A","B","C"],10)

array(['B', 'A', 'B', 'C', 'A', 'B', 'B', 'C', 'A', 'B'], dtype='<U1')

# 正态分布
np.random.normal(loc=60,scale=15,size=10000)   #loc 均值  scale 标准差

array([81.19617159, 48.43510876, 51.33395763, ..., 82.90992432,
       61.88327572, 54.07451554])

pandas中的数据结构

import numpy as np
import pandas as pd

Series

#用随机数创建一个series
s1 = pd.Series(np.random.random(5))     # 生成5个 0到1 间的随机数 ,默认索引为0开始的整数
s1

0    0.720470
1    0.502933
2    0.465361
3    0.348212
4    0.317928
dtype: float64

# 用列表构建一个series
lis1 = [100,200,300,400,500]
s2 = pd.Series(lis1)
s2

0    100
1    200
2    300
3    400
4    500
dtype: int64

# 设置series的索引
s3 = pd.Series(lis1,index=list('ABCDE'))
s3

A    100
B    200
C    300
D    400
E    500
dtype: int64

s3['A']

s3['B':'D']

B    200
C    300
D    400
dtype: int64

s3 + 50

A    150
B    250
C    350
D    450
E    550
dtype: int64

# 转换成list ※
s3.tolist()

[100, 200, 300, 400, 500]

DataFrame

# 通过随机数组创建
data3 = pd.DataFrame(np.random.randint(5,20,(10,5)),  # 生成一个10行5列整数数组，值在5~20之间
                     columns=list('ABCDE'), # 设置列的名称为 A B C D E
                     index=list('abcdefghij'))  # 设置索引为小写的 a b c ... ,不设置时默认为0开始的数字序列
data3.head()

	A	B	C	D	E
a	15	14	10	12	14
b	14	18	8	16	17
c	7	6	10	11	11
d	15	10	14	16	19
e	11	7	13	18	9

# 通过字典创建
data4 = pd.DataFrame({'山东':[100,200,300,400],
                     '青岛':[30,60,70,90]},
                    index=['一','二','三','四'])
data4

	山东	青岛
一	100	30
二	200	60
三	300	70
四	400	90

# data3的属性
# 数据的形状: (行数, 列数)
data3.shape

(10, 5)

# shape的第一个数, 即行数 ※
data3.shape[0]

# 数据的索引
data3.index

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')

# 数值变量描述性统计 ※
data3.describe()

	A	B	C	D	E
count	10.000000	10.000000	10.000000	10.000000	10.000000
mean	10.200000	13.400000	11.600000	13.400000	11.500000
std	4.077036	4.671426	3.687818	3.949684	4.478343
min	5.000000	6.000000	5.000000	6.000000	5.000000
25%	6.250000	10.250000	10.000000	11.250000	9.000000
50%	10.000000	14.500000	11.500000	14.000000	11.500000
75%	14.000000	17.500000	13.750000	16.000000	13.750000
max	15.000000	19.000000	18.000000	18.000000	19.000000

# 描述性统计,保留2位小数
data3.describe().round(2)

	A	B	C	D	E
count	10.00	10.00	10.00	10.00	10.00
mean	10.20	13.40	11.60	13.40	11.50
std	4.08	4.67	3.69	3.95	4.48
min	5.00	6.00	5.00	6.00	5.00
25%	6.25	10.25	10.00	11.25	9.00
50%	10.00	14.50	11.50	14.00	11.50
75%	14.00	17.50	13.75	16.00	13.75
max	15.00	19.00	18.00	18.00	19.00

# 变量的格式
data3.dtypes

A    int32
B    int32
C    int32
D    int32
E    int32
dtype: object

# 数据的列名 ※
data3.columns.tolist()

['A', 'B', 'C', 'D', 'E']

# 数据基本信息
data3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       10 non-null     int32
 1   B       10 non-null     int32
 2   C       10 non-null     int32
 3   D       10 non-null     int32
 4   E       10 non-null     int32
dtypes: int32(5)
memory usage: 280.0+ bytes

# 选择单列
data3['A']

a    15
b    14
c     7
d    15
e    11
f     5
g     6
h     6
i    14
j     9
Name: A, dtype: int32

# 转换成列表 ※
data3['A'].tolist()

[15, 14, 7, 15, 11, 5, 6, 6, 14, 9]

# 选择行
data3[:5]

	A	B	C	D	E
a	15	14	10	12	14
b	14	18	8	16	17
c	7	6	10	11	11
d	15	10	14	16	19
e	11	7	13	18	9

# 选择行列,按索引名称和列名
data3.loc[['d','e'],['A','B']]

	A	B
d	15	10
e	11	7

# 选择列 ※
data3[['A','B']]

	A	B
a	15	14
b	14	18
c	7	6
d	15	10
e	11	7
f	5	19
g	6	11
h	6	18
i	14	15
j	9	16

# 前几行
data3.head(6)

	A	B	C	D	E
a	15	14	10	12	14
b	14	18	8	16	17
c	7	6	10	11	11
d	15	10	14	16	19
e	11	7	13	18	9
f	5	19	11	13	12

# 后几行
data3.tail(2)

	A	B	C	D	E
i	14	15	15	15	13
j	9	16	5	6	6

扫码关注微信, 赠送《pandas数据读取与清洗》视频及课程代码!
在这里插入图片描述

	A	B	C	D	E
a	15	14	10	12	14
b	14	18	8	16	17
c	7	6	10	11	11
d	15	10	14	16	19
e	11	7	13	18	9
f	5	19	11	13	12

	A	B	C	D	E
a	15	14	10	12	14
b	14	18	8	16	17
c	7	6	10	11	11
d	15	10	14	16	19
e	11	7	13	18	9
f	5	19	11	13	12

	A	B	C	D	E
a	15	14	10	12	14
b	14	18	8	16	17
c	7	6	10	11	11
d	15	10	14	16	19
e	11	7	13	18	9
f	5	19	11	13	12