基础训练,第二章(Python / pandas)

import pandas as pd
import numpy as np

# Small demo frame mixing int, bool, float and string columns, used by the
# dtype-inspection examples below.
demo_columns = {
    'a': [1, 2] * 3,
    'b': [True, False] * 3,
    'c': [1.0, 2.0] * 3,
    'e': ['asian', 'white', 'black', 'white', 'asian', 'white'],
    'd': ['low', 'low', 'low', 'median', 'high', 'high'],
}
df = pd.DataFrame(demo_columns)
df
abced
01True1.0asianlow
12False2.0whitelow
21True1.0blacklow
32False2.0whitemedian
41True1.0asianhigh
52False2.0whitehigh
# Inspect the dtype pandas inferred for each column.
df.dtypes
a      int64
b       bool
c    float64
e     object
d     object
dtype: object
# Convert column 'd' from generic object strings to the categorical dtype;
# the displayed values are unchanged, only the storage/dtype differs.
df['d'] = df['d'].astype('category')
df
abced
01True1.0asianlow
12False2.0whitelow
21True1.0blacklow
32False2.0whitemedian
41True1.0asianhigh
52False2.0whitehigh
df.dtypes
a       int64
b        bool
c     float64
e      object
d    category
dtype: object
# Column 'd' is now the 'category' dtype.
# Keep only the boolean column(s).
df.select_dtypes(include='bool')
b
0True
1False
2True
3False
4True
5False
# Keep only float64 columns.
df.select_dtypes(include='float64')
c
01.0
12.0
21.0
32.0
41.0
52.0
# 'number' matches every numeric dtype (here int64 and float64).
df.select_dtypes(include='number')
ac
011.0
122.0
211.0
322.0
411.0
522.0
# Keep only categorical columns (column 'd' was converted above).
df.select_dtypes(include='category')
d
0low
1low
2low
3median
4high
5high
# Keep only object (string) columns.
df.select_dtypes(include='object')
e
0asian
1white
2black
3white
4asian
5white
# exclude= drops the float column and keeps everything else.
df.select_dtypes(exclude=['float64'])

abed
01Trueasianlow
12Falsewhitelow
21Trueblacklow
32Falsewhitemedian
41Trueasianhigh
52Falsewhitehigh
# 3x4 frame of consecutive integers 0..11 with columns A-D,
# used by the drop() examples below.
values = np.arange(12).reshape(3, 4)
df = pd.DataFrame(values, columns=list('ABCD'))

df
ABCD
00123
14567
2891011
# Drop columns B and C; axis=1 selects columns (equivalent spelling shown).
df.drop(['B', 'C'], axis=1)#df.drop(columns=['B', 'C'])
AD
003
147
2811
#Drop rows by index
# Default axis=0 drops the rows with index labels 0 and 1.

df.drop([0, 1])

ABCD
2891011
# Object Series mixing real values with two flavours of missing marker
# (np.nan and None) — both are treated as null by isnull().
raw_values = ["a", "b", np.nan, "c", None]
s = pd.Series(raw_values)
print(s)
0       a
1       b
2     NaN
3       c
4    None
dtype: object
# Boolean mask: True where the value is missing (np.nan and None both count).
print(s.isnull())
0    False
1    False
2     True
3    False
4     True
dtype: bool
# NOTE(review): 'a' is created here but the next line filters 's' —
# presumably 'print(a[a.isnull()])' was intended; the shown output is for 's'.
a  = pd.Series([1,2,np.nan,3,None])
print(s[s.isnull()])
2     NaN
4    None
dtype: object
a  = pd.Series([1,2,np.nan,3,None])
# sum() skips missing values by default: 1 + 2 + 3 = 6.0.
a.sum()
6.0
# pandas provides four functions for working with missing data:
# isnull(), notnull(), dropna() and fillna().
# Frame with NaN holes scattered across rows and columns.
a = [[1, np.nan, 2],[9,None,np.nan],[3, 4, None],[5,6,7]]
data = pd.DataFrame(a)
data
012
01NaN2.0
19NaNNaN
234.0NaN
356.07.0
# Default dropna() removes every ROW containing at least one NaN —
# only row 3 survives.
data.dropna()
012
356.07.0
# axis=1 removes every COLUMN containing a NaN — only column 0 survives.
data.dropna(axis=1)
0
01
19
23
35
# Frame whose row 1 and column 1 are entirely missing, to contrast
# how="all" on rows versus columns.
rows = [
    [1, np.nan, 2],
    [np.nan, None, np.nan],
    [3, None, None],
    [5, None, 7],
]
data = pd.DataFrame(rows)
print(data)                               # original frame with NaN holes
print(data.dropna(how="all"))             # drops only row 1 (all-NaN row)
print(data.dropna(how="all",axis=1))      # drops only column 1 (all-NaN column)
     0   1    2
0  1.0 NaN  2.0
1  NaN NaN  NaN
2  3.0 NaN  NaN
3  5.0 NaN  7.0
     0   1    2
0  1.0 NaN  2.0
2  3.0 NaN  NaN
3  5.0 NaN  7.0
     0    2
0  1.0  2.0
1  NaN  NaN
2  3.0  NaN
3  5.0  7.0
# Frame with scattered missing values in columns 1 and 2.
rows = [[1, 2, 2], [3, None, 6], [3, 7, None], [5, None, 7]]
data = pd.DataFrame(rows)
print(data)
# Replace every missing value with 0.
print(data.fillna(0))
   0    1    2
0  1  2.0  2.0
1  3  NaN  6.0
2  3  7.0  NaN
3  5  NaN  7.0
   0    1    2
0  1  2.0  2.0
1  3  0.0  6.0
2  3  7.0  0.0
3  5  0.0  7.0
# Use a different fill value per column: the dict maps column label -> fill.
print(data.fillna({1:1,2:2}))
# Fill each column's NaNs with that column's own mean.
print(data.fillna(data.mean()))
   0    1    2
0  1  2.0  2.0
1  3  1.0  6.0
2  3  7.0  2.0
3  5  1.0  7.0
   0    1    2
0  1  2.0  2.0
1  3  4.5  6.0
2  3  7.0  5.0
3  5  4.5  7.0
from sklearn import preprocessing
import numpy as np
X_train = np.array([[ 1., -1.,  2.],
                     [ 2.,  0.,  0.],
                     [ 0.,  1., -1.]])
# Standardize each column to zero mean and unit variance (z-score scaling).
X_scaled = preprocessing.scale(X_train)

X_scaled
array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])
# After scaling, every column mean is 0 ...
X_scaled.mean(axis=0)
#Out[29]: array([0., 0., 0.])


array([0., 0., 0.])
# ... and every column standard deviation is 1.
X_scaled.std(axis=0)
#Out[30]: array([1., 1., 1.])
array([1., 1., 1.])
# Demo frame: sequential ints, standard-normal draws, random categories and a
# random 0/1 flag.  NOTE: no RNG seed is set, so col_b/col_c/col_d differ
# between runs.
random_columns = {
    'col_a': np.arange(10),
    'col_b': np.random.randn(10),
    'col_c': np.random.choice(['A', 'B', 'C'], 10),
    'col_d': np.random.choice([0, 1], 10),
}
df = pd.DataFrame(random_columns)

df
col_acol_bcol_ccol_d
002.182928B1
11-0.830507B0
22-0.497002B0
331.485496B0
441.302028C1
550.480743A1
66-0.828251B0
77-1.771108C0
88-0.607708A1
991.938848C1
# R code:
# df <- data.frame(col_a = 0:9,
#                  col_b = rnorm(10),
#                  col_c = sample(c('A', 'B', 'C'), size = 10, replace = TRUE),
#                  col_d = sample(c(0, 1), size = 10, replace = TRUE), 
#                  stringsAsFactors = FALSE)
# head(df, 5)
# Frame dimensions: (rows, cols) tuple, row count, column count.
print(df.shape, df.shape[0], df.shape[1])

# R code:
# dim(df), nrow(df), ncol(df)


(10, 4) 10 4
df.columns
# R code:
# names(df)
Index(['col_a', 'col_b', 'col_c', 'col_d'], dtype='object')
# Select the first 5 rows (iloc is positional).
df.iloc[:5]

# R code:
# df[1:5, ]
col_acol_bcol_ccol_d
002.182928B1
11-0.830507B0
22-0.497002B0
331.485496B0
441.302028C1
# Select the col_a and col_b columns.
df[['col_a', 'col_b']]

# R code:
# df[, c('col_a', 'col_b')]
col_acol_b
002.182928
11-0.830507
22-0.497002
331.485496
441.302028
550.480743
66-0.828251
77-1.771108
88-0.607708
991.938848
# Select the first 5 rows and first 2 columns.
df.iloc[:5, :2]

# R code:
# df[1:5, 1:2]
col_acol_b
002.182928
11-0.830507
22-0.497002
331.485496
441.302028
# Select a single scalar value by (row, column) position.
df.iat[0, 1]

# R code:
# df[1, 2]
2.182928374642522
# Boolean filtering: combine conditions with & (each wrapped in parentheses).
df[(df['col_a'] > 3) & (df['col_b'] < 0)]
# or 
# df.query('col_a > 3 & col_b < 0')

# R code:
# df[df$col_a > 3 & df$col_b < 0, ]
col_acol_bcol_ccol_d
66-0.828251B0
77-1.771108C0
88-0.607708A1
# Keep rows whose col_c value is in the given membership list.
df[df['col_c'].isin(['A', 'B'])]

# R code:
# df[df$col_c %in% c('A', 'B'), ]
col_acol_bcol_ccol_d
002.182928B1
11-0.830507B0
22-0.497002B0
331.485496B0
550.480743A1
66-0.828251B0
88-0.607708A1
# Add a derived column (element-wise sum of two columns).
df['col_e'] = df['col_a'] + df['col_b']
df

# df$col_e <- df$col_a + df$col_b
col_acol_bcol_ccol_dcol_e
002.182928B12.182928
11-0.830507B00.169493
22-0.497002B01.502998
331.485496B04.485496
441.302028C15.302028
550.480743A15.480743
66-0.828251B05.171749
77-1.771108C05.228892
88-0.607708A17.392292
991.938848C110.938848
# Drop the col_e column (drop returns a copy; reassigning makes it stick).
df = df.drop(columns='col_e')
df

# R code:
# df <- df[, !names(df) == 'col_e']
col_acol_bcol_ccol_d
002.182928B1
11-0.830507B0
22-0.497002B0
331.485496B0
441.302028C1
550.480743A1
66-0.828251B0
77-1.771108C0
88-0.607708A1
991.938848C1
# Drop the first column (by looking up its label positionally).
df.drop(columns=df.columns[0])

# R code:
# df[, -1]
col_bcol_ccol_d
02.182928B1
1-0.830507B0
2-0.497002B0
31.485496B0
41.302028C1
50.480743A1
6-0.828251B0
7-1.771108C0
8-0.607708A1
91.938848C1
# Transpose: columns become rows and vice versa.
df.T

# R code:
# t(df)
0123456789
col_a0123456789
col_b2.18293-0.830507-0.4970021.48551.302030.480743-0.828251-1.77111-0.6077081.93885
col_cBBBBCABCAC
col_d1000110011
# Cast the integer column to strings (dtype becomes object).
df['col_a'].astype(str)

# as.character(df$col_a)
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
Name: col_a, dtype: object
# Build a Categorical from the string column (like an R factor).
pd.Categorical(df['col_c'])

# factor(df$col_c)
[B, B, B, B, C, A, B, C, A, C]
Categories (3, object): [A, B, C]
# Row-wise sum across the two selected columns (axis=1 = across columns).
df[['col_a', 'col_b']].sum(axis=1)

# R code:
# apply(df[, c('col_a', 'col_b')], 1, sum)
0     2.182928
1     0.169493
2     1.502998
3     4.485496
4     5.302028
5     5.480743
6     5.171749
7     5.228892
8     7.392292
9    10.938848
dtype: float64
# Column-wise mean (axis=0 = down each column).
df[['col_a', 'col_b']].mean(axis=0)

# R code:
# apply(df[, c('col_a', 'col_b')], 2, mean)
col_a    4.500000
col_b    0.285547
dtype: float64
# apply() maps an arbitrary function over each column (default axis=0).
df[['col_a', 'col_b']].apply(lambda x: x.mean() + 10)

# R code:
# apply(df[, c('col_a', 'col_b')], 2, function(x) mean(x) + 10)
col_a    14.500000
col_b    10.285547
dtype: float64
# Companion frame: col_x counts up 0..9 while col_y counts down 9..0.
ascending = np.arange(10)
df2 = pd.DataFrame({'col_x': ascending, 'col_y': ascending[::-1]})
df2
col_xcol_y
009
118
227
336
445
554
663
772
881
990
# Column-bind: axis=1 puts df and df2 side by side (rows aligned on index).
pd.concat([df, df2], axis=1)


# R code:
# cbind(df, df2)
col_acol_bcol_ccol_dcol_xcol_y
002.182928B109
11-0.830507B018
22-0.497002B027
331.485496B036
441.302028C145
550.480743A154
66-0.828251B063
77-1.771108C072
88-0.607708A181
991.938848C190
# Two extra rows sharing df's column labels, for row-binding below.
df3 = pd.DataFrame({'col_a': [-1, -2], 
                    'col_b' : [0, 1], 
                    'col_c': ['B', 'C'], 
                    'col_d': [1, 0]})
df3
col_acol_bcol_ccol_d
0-10B1
1-21C0
# Row-bind: axis=0 stacks frames; ignore_index=True renumbers 0..11.
pd.concat([df, df3], axis=0, ignore_index=True)

# R code:
# rbind(df, df3)
col_acol_bcol_ccol_d
002.182928B1
11-0.830507B0
22-0.497002B0
331.485496B0
441.302028C1
550.480743A1
66-0.828251B0
77-1.771108C0
88-0.607708A1
991.938848C1
10-10.000000B1
11-21.000000C0
# Load the ggplot2 'diamonds' dataset (53,940 rows, 10 columns) over HTTP;
# index_col=0 makes the first CSV column the row index.
data =pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv',index_col=0)
data
caratcutcolorclaritydepthtablepricexyz
10.23IdealESI261.555.03263.953.982.43
20.21PremiumESI159.861.03263.893.842.31
30.23GoodEVS156.965.03274.054.072.31
40.29PremiumIVS262.458.03344.204.232.63
50.31GoodJSI263.358.03354.344.352.75
.................................
539360.72IdealDSI160.857.027575.755.763.50
539370.72GoodDSI163.155.027575.695.753.61
539380.70Very GoodDSI162.860.027575.665.683.56
539390.86PremiumHSI261.058.027576.156.123.74
539400.75IdealDSI262.255.027575.835.873.64

53940 rows × 10 columns

# Pairwise Pearson correlation matrix of the numeric columns.
# The diamonds frame also holds string columns (cut/color/clarity):
# pandas >= 2.0 raises a TypeError on data.corr() unless numeric_only=True,
# while older pandas silently dropped the non-numeric columns — passing it
# explicitly preserves the original (numeric-only) result on both.
# NOTE(review): numeric_only= requires pandas >= 1.5; drop the keyword if
# this must run on an older pandas.
cor_matrix = data.corr(numeric_only=True)

cor_matrix
# Gives the frame's correlation-coefficient matrix directly.
caratdepthtablepricexyz
carat1.0000000.0282240.1816180.9215910.9750940.9517220.953387
depth0.0282241.000000-0.295779-0.010647-0.025289-0.0293410.094924
table0.181618-0.2957791.0000000.1271340.1953440.1837600.150929
price0.921591-0.0106470.1271341.0000000.8844350.8654210.861249
x0.975094-0.0252890.1953440.8844351.0000000.9747010.970772
y0.951722-0.0293410.1837600.8654210.9747011.0000000.952006
z0.9533870.0949240.1509290.8612490.9707720.9520061.000000
# Correlation of 'price' with every other numeric variable
# (one column of the full correlation matrix).
# NOTE(review): on pandas >= 2.0 these DataFrame.corr() calls need
# numeric_only=True because the frame holds string columns — confirm the
# pandas version in use.
data.corr()['price']  
carat    0.921591
depth   -0.010647
table    0.127134
price    1.000000
x        0.884435
y        0.865421
z        0.861249
Name: price, dtype: float64
# Correlation between just 'price' and 'x' (Series.corr).
data['price'].corr(data["x"])    
0.8844351610161268
# Rank-based (Spearman) correlation matrix.
data.corr(method='spearman')   
caratdepthtablepricexyz
carat1.0000000.0301040.1949800.9628830.9961170.9955720.993183
depth0.0301041.000000-0.2450610.010020-0.023442-0.0254250.103498
table0.194980-0.2450611.0000000.1717840.2022310.1957340.159878
price0.9628830.0100200.1717841.0000000.9631960.9627190.957232
x0.996117-0.0234420.2022310.9631961.0000000.9978950.987355
y0.995572-0.0254250.1957340.9627190.9978951.0000000.987068
z0.9931830.1034980.1598780.9572320.9873550.9870681.000000
# 'pearson' is the default method, so this matches data.corr()['price'].
data.corr(method='pearson')['price']  
carat    0.921591
depth   -0.010647
table    0.127134
price    1.000000
x        0.884435
y        0.865421
z        0.861249
Name: price, dtype: float64
data['price'].corr(data["x"],method='pearson')  
# method= also accepts 'spearman' and 'kendall'.
0.8844351610161268
from numpy.random import rand
from numpy.random import seed
from scipy.stats import spearmanr
# seed random number generator
# NOTE(review): spearmanr is deterministic, so the seed (and the unused
# 'rand' import) have no effect here.
seed(1)
# prepare data
data1 = data['x']
data2 = data['price']
# calculate spearman's correlation
# Returns the rank correlation coefficient and the two-sided p-value.
coef, p = spearmanr(data1, data2)
print('Spearmans correlation coefficient: %.3f' % coef)
Spearmans correlation coefficient: 0.963
# interpret the significance
alpha = 0.05
if p > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)

else:
    print('Samples are correlated (reject H0) p=%.3f' % p)
Samples are correlated (reject H0) p=0.000
p
0.0
from scipy.stats import kendalltau
# seed random number generator
# NOTE(review): kendalltau is deterministic — the seed has no effect.
seed(1)

# calculate kendall's correlation
# Returns Kendall's tau and the two-sided p-value for tau == 0.
coef, p = kendalltau(data1, data2)
print('Kendall correlation coefficient: %.3f' % coef)
# interpret the significance
alpha = 0.05
if p > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
else:
    print('Samples are correlated (reject H0) p=%.3f' % p)

Kendall correlation coefficient: 0.831
Samples are correlated (reject H0) p=0.000
from scipy import stats

from scipy.stats import pearsonr
# seed random number generator
# NOTE(review): pearsonr is deterministic — the seed has no effect.
seed(1)

# calculate pearsonr's correlation
# Returns Pearson's r and the two-sided p-value for r == 0.
coef, p = pearsonr(data1, data2)
print('pearsonr correlation coefficient: %.3f' % coef)
# interpret the significance
alpha = 0.05
if p > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
else:
    print('Samples are correlated (reject H0) p=%.3f' % p)
pearsonr correlation coefficient: 0.884
Samples are correlated (reject H0) p=0.000

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值