python数据分析学习笔记三

第三章 统计学与线性代数

1 numpy和scipy模块

示例代码如下:


import pkgutil as pu
import numpy as np
import matplotlib as mpl
import scipy as sp
import pydoc

# 输出库的版本信息
print('Numpy version: ', np.__version__)
print('Scip version: ', sp.__version__)
print('Matplotlib: ', mpl.__version__)


# 移除multiple 中的空格
def clean(astr):
    s = astr

    s = ' '.join(s.split())
    s = s.replace('=', '')
    return s


def print_desc(prefix, pkg_path):
    # 取得指定模块下的所有子库
    for pkg in pu.iter_modules(path=pkg_path):
        name = prefix + '.' + pkg[1]

        if pkg[2] == True:
            try:
                # render_doc取得子库或函数的文字说明
                #
                docstr = pydoc.plain(pydoc.render_doc(name))
                docstr = clean(docstr)
                # 设定所取内容的起始位置 
                start = docstr.find('DESCRIPTION')
                docstr = docstr[start:start + 40]
                print(name, docstr)
            except:
                continue


print_desc('numpy', np.__path__)
print()
print()
print()
print_desc('scipy', sp.__path__)

 

运行结果如下:

Numpy version:  1.11.2

Scip version:  0.18.1

Matplotlib: 1.5.3

numpy.compat DESCRIPTION This modulecontains duplica

numpy.core DESCRIPTION Functions - array -NumPy Ar

numpy.distutils

numpy.doc DESCRIPTION Topicaldocumentation  The f

numpy.f2py

numpy.fft DESCRIPTION Discrete FourierTransform (

numpy.lib DESCRIPTION Basic functions usedby seve

numpy.linalg DESCRIPTION Core LinearAlgebra Tools --

numpy.ma DESCRIPTION  Masked Arrays Arrays somet

numpy.matrixlib

numpy.polynomial DESCRIPTION Within thedocumentation for

numpy.random DESCRIPTION  Random Number Generation  

numpy.testing DESCRIPTION This singlemodule should pr

 

 

 

scipy._build_utils

scipy._lib DESCRIPTION Module containing privateut

scipy.cluster DESCRIPTION  Clustering package (:mod:`s

scipy.constants DESCRIPTION  Constants (:mod:`scipy.cons

scipy.fftpack DESCRIPTION  Discrete Fourier transforms

scipy.integrate DESCRIPTION  Integration and ODEs (:mod:

scipy.interpolate DESCRIPTION  Interpolation (:mod:`scipy.

scipy.io DESCRIPTION  Input and output (:mod:`sci

scipy.linalg DESCRIPTION  Linear algebra (:mod:`scipy

scipy.misc DESCRIPTION  Miscellaneous routines (:mo

scipy.ndimage DESCRIPTION  Multi-dimensional image pro

scipy.odr DESCRIPTION  Orthogonal distance regress

scipy.optimize DESCRIPTION  Optimization and root findi

scipy.signal DESCRIPTION  Signal processing (:mod:`sc

scipy.sparse DESCRIPTION  Sparse matrices (:mod:`scip

scipy.spatial DESCRIPTION  Spatial algorithms and data

scipy.special DESCRIPTION  Special functions (:mod:`sc

scipy.stats DESCRIPTION  Statistical functions (:mod

2 使用numpy进行简单的描述性统计计算

示例代码如下:

 

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2016/12/29 17:16
# @Author  : Retacn
# @Site    : 使用numpy进行统计计算
# @File    : basic_stats.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np
from scipy.stats import scoreatpercentile

# 加载csv文件
#文件中Curaao,0  Cte d'Ivoire,2.5 两个字符器有问题,修改后正常

data = np.loadtxt('mdrtb_2012.csv', delimiter=',', usecols=(1,), skiprows=1, unpack=True)
#data = np.loadtxt('MLB2008.csv', delimiter=',', usecols=(1,), skiprows=1, unpack=True)

# 最大值
print('Max method', data.max())
print('Max function', np.max(data))

# 最小值
print('Min method', data.min())
print('Min function', np.min(data))

# 平均值
print('Mean method', data.mean())
print('Mean function', np.mean(data))

# 标准差
print('Std method', data.std())
print('Std function', np.std(data))

# 中位数
print('Median', np.median(data))
print('Score at percentile 50', scoreatpercentile(data, 50))

 

运行结果如下:

Max method 50.0

Max function 50.0

Min method 0.0

Min function 0.0

Mean method 3.2787037037

Mean function 3.2787037037

Std method 5.76332073654

Std function 5.76332073654

Median 1.8

Score at percentile 50 1.8

 

3 使用numpy进行线性代数运算

A 求矩阵逆

示例代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2016/12/29 17:53
# @Author  : Retacn
# @Site    : 使用numpy来求矩阵的逆
# @File    : inversion.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np

# 创建示例矩阵
A = np.mat('2 4 6;4 2 6;10 -4 18')
#使用mit公开课中老师用的矩阵例子进行测试
#A = np.mat('1 0 0;-3 1 0;0 0 1')
print('A\n', A)

# 求矩阵的逆
inverse = np.linalg.inv(A)
print('inverse of A\n', inverse)

# 利用乘法进行验算
print('Check\n', A * inverse)
print('Error\n', A * inverse - np.eye(3))

 

运行结果如下:

A

 [[2  4 6]

 [4  2 6]

 [10-4 18]]

inverse of A

 [[-0.41666667 0.66666667 -0.08333333]

 [0.08333333  0.16666667 -0.08333333]

 [0.25       -0.33333333  0.08333333]]

Check

 [[ 1.00000000e+00 -2.22044605e-16   0.00000000e+00]

 [ 5.55111512e-17  1.00000000e+00   0.00000000e+00]

 [ 3.88578059e-16 -4.44089210e-16   1.00000000e+00]]

Error

 [[ 0.00000000e+00 -2.22044605e-16   0.00000000e+00]

 [ 5.55111512e-17 -4.44089210e-16   0.00000000e+00]

 [ 3.88578059e-16 -4.44089210e-16   0.00000000e+00]]

 

#可逆矩阵的列子

A

 [[1  0 0]

 [-3 1  0]

 [0  0 1]]

inverse of A

 [[1.  0. 0.]

 [3.  1. 0.]

 [0.  0. 1.]]

Check

 [[1.  0. 0.]

 [0.  1. 0.]

 [0.  0. 1.]]

Error

 [[0.  0. 0.]

 [0.  0. 0.]

 [0.  0. 0.]]

 

B 解线性方程组

示例代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/20 13:52
# @Author  : Retacn
# @Site    : 用numpy解线性方程
# @File    : linner_equation.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np

# 创建矩阵A和数组b
A = np.mat("1 -2 1;0 2 -8;-4 5 9")
print("A\n", A)
b = np.array([0, 8, -9])
print("b\n", b)

# 调用solve()函数,解线性方程组
x = np.linalg.solve(A, b)
print("solution", x)

# 使用dot()函数进行验算
print("check\n", np.dot(A, x))

 

运行结果如下:

A

 [[ 1-2  1]

 [0  2 -8]

 [-4 5  9]]

b

 [0  8 -9]

solution [ 29.  16.  3.]

check

 [[0.  8. -9.]]

 

 

4 使用numpy计算特征值和特征向量

示例代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/21 12:47
# @Author  : Retacn
# @Site    : 计算特征值和特征向量
# @File    : eigenvalues.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np

A = np.mat("3 -2;1 0")
print("A\n", A)

print("Eigenvalues", np.linalg.eigvals(A))

eigenvalues, eigenvectors = np.linalg.eig(A)
print("first tuple of eig", eigenvalues)
print("second tuple of eig\n", eigenvectors)

for i in range(len(eigenvalues)):
    print("left", np.dot(A, eigenvectors[:, i]))
    print("right", eigenvalues[i] * eigenvectors[:, i])
    print()

 

 

运行结果如下:

A

 [[ 3-2]

 [1  0]]

Eigenvalues [ 2.  1.]

first tuple of eig [ 2.  1.]

second tuple of eig

 [[0.89442719  0.70710678]

 [0.4472136   0.70710678]]

left [[ 1.78885438]

 [0.89442719]]

right [[ 1.78885438]

 [0.89442719]]

 

Numpy随机数

用二项式分布进行博弈

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/21 13:17
# @Author  : Retacn
# @Site    : 
# @File    : headortail.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np
from matplotlib.pyplot import plot, show

cash = np.zeros(10000)
cash[0] = 1000
outcome = np.random.binomial(9, 0.5, size=len(cash))

for i in range(1, len(cash)):
    if outcome[i] < 5:
        cash[i] = cash[i - 1] - 1
    elif outcome[i] < 10:
        cash[i] = cash[i - 1] + 1
    else:
        raise AssertionError("Unexpected outcome" + outcome)

print(outcome.min(), outcome.max())

plot(np.arange(len(cash)), cash)
show()

 

 

 

 

正态分面采样

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/21 13:30
# @Author  : Retacn
# @Site    : 正态分布采样
# @File    : normaldist.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np
import matplotlib.pyplot as plt

N = 10000

normal_values = np.random.normal(size=N)
dummy, bins, dummy = plt.hist(normal_values, np.sqrt(N), normed=True, lw=1)
sigma = 1
mu = 0
plt.plot(bins, 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(bins - mu) ** 2 / (2 * sigma ** 2)), lw=2)
plt.show()

 

 

使用scipy 进行正态检验

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/21 13:41
# @Author  : Retacn
# @Site    : 用scipy进行正态检验
# @File    : normality_test.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np
from scipy.stats import shapiro
from scipy.stats import anderson
from scipy.stats import normaltest

# 读入流感趋势数据
flutrends = np.loadtxt("goog_flutrends.csv", delimiter=',', usecols=(1,), skiprows=1,
                       converters={1: lambda s: float(s or 0)}, unpack=True)
N = len(flutrends)
normal_values = np.random.normal(size=N)
zero_values = np.zeros(N)

print("Normal Value Shapiro", shapiro(normal_values))
print("Zero Shapiro", shapiro(zero_values))
print("Flu Shapiro", shapiro(flutrends))
print()

print("Normal Value Anderson", anderson(normal_values))
print("Zero Anderson", anderson(zero_values))
print("Flu Anderson", anderson(flutrends))
print()

print("Normal Value normaltest", normaltest(normal_values))
print("Zero normaltest", normaltest(zero_values))
print("Flu normaltest", normaltest(flutrends))
print()

 

运行结果如下:

Normal Value Shapiro (0.9982795119285583,0.8293641209602356)

Zero Shapiro (1.0, 1.0)

Flu Shapiro (0.9351992011070251,2.2946666759787607e-15)

 

Normal Value AndersonAndersonResult(statistic=0.25335473134487074, critical_values=array([ 0.572,  0.652, 0.782,  0.912,  1.085]), significance_level=array([ 15. ,  10. ,  5. ,   2.5,   1. ]))

 

Zero Anderson AndersonResult(statistic=nan,critical_values=array([ 0.572, 0.652,  0.782,  0.912, 1.085]), significance_level=array([ 15. ,  10. ,  5. ,   2.5,   1. ]))

Flu AndersonAndersonResult(statistic=8.258614154768793, critical_values=array([ 0.572,  0.652, 0.782,  0.912,  1.085]), significance_level=array([ 15.,  10. ,  5. ,   2.5,   1. ]))

 

Normal Value normaltestNormaltestResult(statistic=0.18497418424851336, pvalue=0.91166097849514516)

Zero normaltestNormaltestResult(statistic=1.0095473240349975, pvalue=0.60364218712103535)

Flu normaltestNormaltestResult(statistic=99.643733363569538, pvalue=2.3048264115368721e-22)

 

 

创建掩码式numpy数组

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/21 14:00
# @Author  : Retacn
# @Site    : 掩码式numpy数组
# @File    : masked.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np
from scipy.misc import ascent, face
import matplotlib.pyplot as plt

face = face()
random_mask = np.random.randint(0, 2, size=face.shape)

plt.subplot(221), plt.title("Original"), plt.imshow(face), plt.axis('off')

masked_array = np.ma.array(face, mask=random_mask)
print(masked_array)

plt.subplot(222)
plt.title("Masked")
plt.imshow(masked_array)
plt.axis("off")

plt.subplot(223)
plt.title("Log")
plt.imshow(np.log(face))
plt.axis("off")

plt.subplot(224)
plt.title("Log Masked")
plt.imshow(np.log(masked_array))
plt.axis("off")

plt.show()

 

运行结果如下:

[[[-- -- --]

 [138 129 --]

 [153 -- 165]

 ...,

  [--126 74]

 [131 136 --]

  [--144 90]]

 

 [[89-- 100]

  [---- 121]

  [--122 --]

 ...,

 [118 125 71]

 [134 141 87]

 [146 153 99]]

 

 [[73-- 84]

  [---- --]

  [---- 126]

 ...,

 [117 126 --]

  [---- 87]

 [144 153 98]]

 

 ...,

 [[87106 76]

  [94110 81]

 [107 124 --]

 ...,

  [---- 97]

  [--157 --]

  [--158 --]]

 

 [[---- --]

  [---- --]

  [---- --]

 ...,

 [121 157 --]

  [--156 94]

  [--156 94]]

 

 [[---- 74]

  [---- --]

 [111 126 --]

 ...,

  [---- --]

  [--155 --]

 [118 -- 92]]]

 

忽略负值和极值

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/21 14:46
# @Author  : Retacn
# @Site    : 忽略负值和极值
# @File    : masked_funcs.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np
from matplotlib.finance import _quotes_historical_yahoo
from datetime import date
import sys
import matplotlib.pyplot as plt

salary = np.loadtxt("MLB2008.csv", delimiter=",", usecols=(1,), skiprows=1, unpack=True)
triples = np.arange(0, len(salary), 3)
print("Triples", triples[:10], "...")

signs = np.ones(len(salary))
print("Signs", signs[:10], "...")

signs[triples] = -1
print("Sings", signs[:10], "...")

ma_log = np.ma.log(salary * signs)
print("Masked logs", ma_log[:10], "...")

dev = salary.std()
avg = salary.mean()
inside = np.ma.masked_outside(salary, avg - dev, avg + dev)
print("Inside", inside[:10], "...")

plt.subplot(311)
plt.title("Original")
plt.plot(salary)

plt.subplot(312)
plt.title("Log Masked")
plt.plot(np.exp(ma_log))

plt.subplot(313)
plt.title("Not Extreme")
plt.plot(inside)

plt.show()

 

 

运行结果如下:

Triples [ 0 3  6  9 12 15 18 21 24 27] ...

Signs [ 1. 1.  1.  1. 1.  1.  1.  1. 1.  1.] ...

Sings [-1. 1.  1. -1.  1.  1.-1.  1. 1. -1.] ...

Masked logs [-- 14.97081819030892915.830413578506539 -- 13.458835614025542

 15.319587954740548 -- 15.64809202171258413.864300722133706 --] ...

Inside [3750000.0 3175000.0 7500000.0 3000000.0700000.0 4500000.0 3000000.0

 6250000.0 1050000.0 4600000.0] ...

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值