自带数据格式
数值
a1 = 100
a2 = 3.14
print ( a1)
print ( a2)
100
3.14
print ( type ( a1) )
print ( type ( a2) )
<class 'int'>
<class 'float'>
float ( a1)
100.0
int ( a1)
100
a1 + 50
150
a2 * 2
6.28
字符串
s1 = 'python'
s2 = 'pandas'
print ( s1)
print ( type ( s1) )
python
<class 'str'>
s1 + s2
'pythonpandas'
list ( s1)
['p', 'y', 't', 'h', 'o', 'n']
tuple ( s2)
('p', 'a', 'n', 'd', 'a', 's')
len ( s1)
6
name = '小米'
s = '北京'
i = 5
print ( '%s在%s,今天气温%i度' % ( name, s, i) )
小米在北京,今天气温5度
a = 3.14
str ( a)
'3.14'
布尔值
a = 100
b = 90
a == b
False
c = True
type ( c)
bool
自带数据结构
列表
- 用中括号表示的一组元素。元素可以是数值、字符串、列表、字典等多种类型。
ls1 = [ 1 , 2 , 3 , 4 , 5 ]
ls2 = [ 'a' , 'b' , 'c' , 'd' , 'e' ]
ls3 = [ 'a' , 'b' , [ 1 , 2 ] , 'd' , 'e' ]
print ( ls1)
print ( ls2)
print ( ls3)
[1, 2, 3, 4, 5]
['a', 'b', 'c', 'd', 'e']
['a', 'b', [1, 2], 'd', 'e']
ls1[ 0 ]
1
ls1[ 2 : 3 ]
[3]
ls1[ 4 ] = 100
ls1
[1, 2, 3, 4, 100]
# Print every element of ls1 increased by 100, one value per line.
for i in ls1:
    print(i + 100)
101
102
103
104
200
list ( 'python数据分析基础' )
['p', 'y', 't', 'h', 'o', 'n', '数', '据', '分', '析', '基', '础']
range函数
range ( 10 )
range(0, 10)
list ( range ( 10 ) )
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
list ( range ( 1 , 10 ) )
[1, 2, 3, 4, 5, 6, 7, 8, 9]
list ( range ( 3 , 10 , 2 ) )
[3, 5, 7, 9]
进阶用法
lis1 = [ 1 , 2 , 3 , 4 , 5 ]
lis2 = [ i+ 100 for i in lis1]
lis2
[101, 102, 103, 104, 105]
lis3 = [ i* i for i in lis1 if i% 2 == 0 ]
lis3
[4, 16]
lis4 = [ 'No.' + str ( i) for i in lis1]
lis4
['No.1', 'No.2', 'No.3', 'No.4', 'No.5']
lis5 = [ 'No.' + str ( i) for i in range ( 100 ) ]
lis5[ : 10 ]
['No.0',
'No.1',
'No.2',
'No.3',
'No.4',
'No.5',
'No.6',
'No.7',
'No.8',
'No.9']
lis6 = [ 1 , 2 , 3 , 4 , 5 ]
lis7 = list ( 'abcde' )
lis8 = [ list ( z) for z in zip ( lis6, lis7) ]
lis8
[[1, 'a'], [2, 'b'], [3, 'c'], [4, 'd'], [5, 'e']]
字典
- 用大括号表示的、由键值对组成的数据
dic1 = { 'A' : 1 ,
'B' : 2 }
dic2 = { 'A' : '中国' ,
'B' : '美国' }
dic3 = { 'A' : [ 1 , 2 , 3 ] ,
'B' : [ 4 , 2 , 5 ] }
print ( dic1)
print ( dic2)
print ( dic3)
{'A': 1, 'B': 2}
{'A': '中国', 'B': '美国'}
{'A': [1, 2, 3], 'B': [4, 2, 5]}
dic1[ 'A' ]
1
lis1 = [ 1 , 2 , 3 , 4 , 5 ]
lis2 = [ 'a' , 'b' , 'c' , 'd' , 'e' ]
# Build the dict by pairing each key with the value at the same position.
# Two nested `for` clauses would instead visit every (key, value)
# combination, leaving each key mapped to the last value only (5).
dic4 = {i: j for i, j in zip(lis2, lis1)}
dic4
{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
type ( dic4)
dict
元组
tup1 = 4 , 5 , 6 , 7
print ( tup1)
(4, 5, 6, 7)
list ( tup1)
[4, 5, 6, 7]
集合
s1 = set ( [ 2 , 2 , 2 , 1 , 3 , 3 , 'a' , 'a' ] )
print ( s1)
{1, 2, 3, 'a'}
lis1 = list ( s1)
lis1
[1, 2, 3, 'a']
数据结构
numpy中的数组
数组
import numpy as np
arr1 = np. array( [ 1 , 2 , 3 , 4 , 5 ] )
arr1
array([1, 2, 3, 4, 5])
arr2 = np. array( [ [ 1 , 2 , 3 , 4 , 5 ] ,
[ 6 , 7 , 8 , 9 , 10 ] ] )
arr2
array([[ 1, 2, 3, 4, 5],
[ 6, 7, 8, 9, 10]])
print ( arr2. shape)
print ( arr2. size)
print ( arr2. dtype)
(2, 5)
10
int32
arr3 = arr2 + 100
arr3
array([[101, 102, 103, 104, 105],
[106, 107, 108, 109, 110]])
随机数
- numpy中的random模块
import numpy as np
np. random. rand( 5 )
array([0.46591503, 0.87362145, 0.93809249, 0.25925983, 0.63894833])
np. random. rand( 2 , 3 )
array([[0.96901492, 0.30594543, 0.22570567],
[0.42208673, 0.99117033, 0.87474965]])
np. random. randn( 5 )
array([-1.61944511, 0.38982079, -0.63948306, 0.36588547, 0.63185553])
np. random. randn( 2 , 3 )
array([[-0.91993249, 0.61362181, -0.67974097],
[ 1.21197129, 1.84427274, -0.63912597]])
np. random. random( 5 )
array([0.54859105, 0.35911591, 0.91295063, 0.04102102, 0.82447624])
np. random. random( ( 2 , 3 ) )
array([[0.61639853, 0.61691637, 0.483431 ],
[0.95587659, 0.17188186, 0.04471951]])
np. random. randint( 5 , 20 , 4 )
array([13, 16, 11, 17])
np. random. randint( 5 , 20 , ( 2 , 3 ) )
array([[19, 13, 7],
[13, 14, 7]])
np. random. choice( [ "A" , "B" , "C" ] , 10 )
array(['B', 'A', 'B', 'C', 'A', 'B', 'B', 'C', 'A', 'B'], dtype='<U1')
np. random. normal( loc= 60 , scale= 15 , size= 10000 )
array([81.19617159, 48.43510876, 51.33395763, ..., 82.90992432,
61.88327572, 54.07451554])
pandas中的数据结构
import numpy as np
import pandas as pd
Series
s1 = pd. Series( np. random. random( 5 ) )
s1
0 0.720470
1 0.502933
2 0.465361
3 0.348212
4 0.317928
dtype: float64
lis1 = [ 100 , 200 , 300 , 400 , 500 ]
s2 = pd. Series( lis1)
s2
0 100
1 200
2 300
3 400
4 500
dtype: int64
s3 = pd. Series( lis1, index= list ( 'ABCDE' ) )
s3
A 100
B 200
C 300
D 400
E 500
dtype: int64
s3[ 'A' ]
100
s3[ 'B' : 'D' ]
B 200
C 300
D 400
dtype: int64
s3 + 50
A 150
B 250
C 350
D 450
E 550
dtype: int64
s3. tolist( )
[100, 200, 300, 400, 500]
DataFrame
data3 = pd. DataFrame( np. random. randint( 5 , 20 , ( 10 , 5 ) ) ,
columns= list ( 'ABCDE' ) ,
index= list ( 'abcdefghij' ) )
data3. head( )
    A   B   C   D   E
a  15  14  10  12  14
b  14  18   8  16  17
c   7   6  10  11  11
d  15  10  14  16  19
e  11   7  13  18   9
data4 = pd. DataFrame( { '山东' : [ 100 , 200 , 300 , 400 ] ,
'青岛' : [ 30 , 60 , 70 , 90 ] } ,
index= [ '一' , '二' , '三' , '四' ] )
data4
     山东  青岛
一  100  30
二  200  60
三  300  70
四  400  90
data3. shape
(10, 5)
data3. shape[ 0 ]
10
data3. index
Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')
data3. describe( )
               A          B          C          D          E
count  10.000000  10.000000  10.000000  10.000000  10.000000
mean   10.200000  13.400000  11.600000  13.400000  11.500000
std     4.077036   4.671426   3.687818   3.949684   4.478343
min     5.000000   6.000000   5.000000   6.000000   5.000000
25%     6.250000  10.250000  10.000000  11.250000   9.000000
50%    10.000000  14.500000  11.500000  14.000000  11.500000
75%    14.000000  17.500000  13.750000  16.000000  13.750000
max    15.000000  19.000000  18.000000  18.000000  19.000000
data3. describe( ) . round ( 2 )
           A      B      C      D      E
count  10.00  10.00  10.00  10.00  10.00
mean   10.20  13.40  11.60  13.40  11.50
std     4.08   4.67   3.69   3.95   4.48
min     5.00   6.00   5.00   6.00   5.00
25%     6.25  10.25  10.00  11.25   9.00
50%    10.00  14.50  11.50  14.00  11.50
75%    14.00  17.50  13.75  16.00  13.75
max    15.00  19.00  18.00  18.00  19.00
data3. dtypes
A int32
B int32
C int32
D int32
E int32
dtype: object
data3. columns. tolist( )
['A', 'B', 'C', 'D', 'E']
data3. info( )
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 10 non-null int32
1 B 10 non-null int32
2 C 10 non-null int32
3 D 10 non-null int32
4 E 10 non-null int32
dtypes: int32(5)
memory usage: 280.0+ bytes
data3[ 'A' ]
a 15
b 14
c 7
d 15
e 11
f 5
g 6
h 6
i 14
j 9
Name: A, dtype: int32
data3[ 'A' ] . tolist( )
[15, 14, 7, 15, 11, 5, 6, 6, 14, 9]
data3[ : 5 ]
    A   B   C   D   E
a  15  14  10  12  14
b  14  18   8  16  17
c   7   6  10  11  11
d  15  10  14  16  19
e  11   7  13  18   9
data3. loc[ [ 'd' , 'e' ] , [ 'A' , 'B' ] ]
data3[ [ 'A' , 'B' ] ]
    A   B
a  15  14
b  14  18
c   7   6
d  15  10
e  11   7
f   5  19
g   6  11
h   6  18
i  14  15
j   9  16
data3. head( 6 )
    A   B   C   D   E
a  15  14  10  12  14
b  14  18   8  16  17
c   7   6  10  11  11
d  15  10  14  16  19
e  11   7  13  18   9
f   5  19  11  13  12
data3. tail( 2 )
扫码关注微信, 赠送《pandas数据读取与清洗》视频及课程代码!