第三章 统计学与线性代数
1 numpy和scipy模块
示例代码如下:
import pkgutil as pu import numpy as np import matplotlib as mpl import scipy as sp import pydoc # 输出库的版本信息 print('Numpy version: ', np.__version__) print('Scip version: ', sp.__version__) print('Matplotlib: ', mpl.__version__) # 移除multiple 中的空格 def clean(astr): s = astr s = ' '.join(s.split()) s = s.replace('=', '') return s def print_desc(prefix, pkg_path): # 取得指定模块下的所有子库 for pkg in pu.iter_modules(path=pkg_path): name = prefix + '.' + pkg[1] if pkg[2] == True: try: # render_doc取得子库或函数的文字说明 # docstr = pydoc.plain(pydoc.render_doc(name)) docstr = clean(docstr) # 设定所取内容的起始位置 start = docstr.find('DESCRIPTION') docstr = docstr[start:start + 40] print(name, docstr) except: continue print_desc('numpy', np.__path__) print() print() print() print_desc('scipy', sp.__path__)
运行结果如下:
Numpy version: 1.11.2
Scip version: 0.18.1
Matplotlib: 1.5.3
numpy.compat DESCRIPTION This modulecontains duplica
numpy.core DESCRIPTION Functions - array -NumPy Ar
numpy.distutils
numpy.doc DESCRIPTION Topicaldocumentation The f
numpy.f2py
numpy.fft DESCRIPTION Discrete FourierTransform (
numpy.lib DESCRIPTION Basic functions usedby seve
numpy.linalg DESCRIPTION Core LinearAlgebra Tools --
numpy.ma DESCRIPTION Masked Arrays Arrays somet
numpy.matrixlib
numpy.polynomial DESCRIPTION Within thedocumentation for
numpy.random DESCRIPTION Random Number Generation
numpy.testing DESCRIPTION This singlemodule should pr
scipy._build_utils
scipy._lib DESCRIPTION Module containing privateut
scipy.cluster DESCRIPTION Clustering package (:mod:`s
scipy.constants DESCRIPTION Constants (:mod:`scipy.cons
scipy.fftpack DESCRIPTION Discrete Fourier transforms
scipy.integrate DESCRIPTION Integration and ODEs (:mod:
scipy.interpolate DESCRIPTION Interpolation (:mod:`scipy.
scipy.io DESCRIPTION Input and output (:mod:`sci
scipy.linalg DESCRIPTION Linear algebra (:mod:`scipy
scipy.misc DESCRIPTION Miscellaneous routines (:mo
scipy.ndimage DESCRIPTION Multi-dimensional image pro
scipy.odr DESCRIPTION Orthogonal distance regress
scipy.optimize DESCRIPTION Optimization and root findi
scipy.signal DESCRIPTION Signal processing (:mod:`sc
scipy.sparse DESCRIPTION Sparse matrices (:mod:`scip
scipy.spatial DESCRIPTION Spatial algorithms and data
scipy.special DESCRIPTION Special functions (:mod:`sc
scipy.stats DESCRIPTION Statistical functions (:mod
2 使用numpy进行简单的描述性统计计算
示例代码如下:
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2016/12/29 17:16 # @Author : Retacn # @Site : 使用numpy进行统计计算 # @File : basic_stats.py # @Software: PyCharm __author__ = "retacn" __copyright__ = "property of mankind." __license__ = "CN" __version__ = "0.0.1" __maintainer__ = "retacn" __email__ = "zhenhuayue@sina.com" __status__ = "Development" import numpy as np from scipy.stats import scoreatpercentile # 加载csv文件 #文件中Curaao,0 Cte d'Ivoire,2.5 两个字符器有问题,修改后正常 data = np.loadtxt('mdrtb_2012.csv', delimiter=',', usecols=(1,), skiprows=1, unpack=True) #data = np.loadtxt('MLB2008.csv', delimiter=',', usecols=(1,), skiprows=1, unpack=True) # 最大值 print('Max method', data.max()) print('Max function', np.max(data)) # 最小值 print('Min method', data.min()) print('Min function', np.min(data)) # 平均值 print('Mean method', data.mean()) print('Mean function', np.mean(data)) # 标准差 print('Std method', data.std()) print('Std function', np.std(data)) # 中位数 print('Median', np.median(data)) print('Score at percentile 50', scoreatpercentile(data, 50))
运行结果如下:
Max method 50.0
Max function 50.0
Min method 0.0
Min function 0.0
Mean method 3.2787037037
Mean function 3.2787037037
Std method 5.76332073654
Std function 5.76332073654
Median 1.8
Score at percentile 50 1.8
3 使用numpy进行线性代数运算
A 求矩阵逆
示例代码如下:
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2016/12/29 17:53 # @Author : Retacn # @Site : 使用numpy来求矩阵的逆 # @File : inversion.py # @Software: PyCharm __author__ = "retacn" __copyright__ = "property of mankind." __license__ = "CN" __version__ = "0.0.1" __maintainer__ = "retacn" __email__ = "zhenhuayue@sina.com" __status__ = "Development" import numpy as np # 创建示例矩阵 A = np.mat('2 4 6;4 2 6;10 -4 18') #使用mit公开课中老师用的矩阵例子进行测试 #A = np.mat('1 0 0;-3 1 0;0 0 1') print('A\n', A) # 求矩阵的逆 inverse = np.linalg.inv(A) print('inverse of A\n', inverse) # 利用乘法进行验算 print('Check\n', A * inverse) print('Error\n', A * inverse - np.eye(3))
运行结果如下:
A
[[2 4 6]
[4 2 6]
[10-4 18]]
inverse of A
[[-0.41666667 0.66666667 -0.08333333]
[0.08333333 0.16666667 -0.08333333]
[0.25 -0.33333333 0.08333333]]
Check
[[ 1.00000000e+00 -2.22044605e-16 0.00000000e+00]
[ 5.55111512e-17 1.00000000e+00 0.00000000e+00]
[ 3.88578059e-16 -4.44089210e-16 1.00000000e+00]]
Error
[[ 0.00000000e+00 -2.22044605e-16 0.00000000e+00]
[ 5.55111512e-17 -4.44089210e-16 0.00000000e+00]
[ 3.88578059e-16 -4.44089210e-16 0.00000000e+00]]
#可逆矩阵的列子
A
[[1 0 0]
[-3 1 0]
[0 0 1]]
inverse of A
[[1. 0. 0.]
[3. 1. 0.]
[0. 0. 1.]]
Check
[[1. 0. 0.]
[0. 1. 0.]
[0. 0. 1.]]
Error
[[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]]
B 解线性方程组
示例代码如下:
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2017/1/20 13:52 # @Author : Retacn # @Site : 用numpy解线性方程 # @File : linner_equation.py # @Software: PyCharm __author__ = "retacn" __copyright__ = "property of mankind." __license__ = "CN" __version__ = "0.0.1" __maintainer__ = "retacn" __email__ = "zhenhuayue@sina.com" __status__ = "Development" import numpy as np # 创建矩阵A和数组b A = np.mat("1 -2 1;0 2 -8;-4 5 9") print("A\n", A) b = np.array([0, 8, -9]) print("b\n", b) # 调用solve()函数,解线性方程组 x = np.linalg.solve(A, b) print("solution", x) # 使用dot()函数进行验算 print("check\n", np.dot(A, x))
运行结果如下:
A
[[ 1-2 1]
[0 2 -8]
[-4 5 9]]
b
[0 8 -9]
solution [ 29. 16. 3.]
check
[[0. 8. -9.]]
4 使用numpy计算特征值和特征向量
示例代码如下:
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2017/1/21 12:47 # @Author : Retacn # @Site : 计算特征值和特征向量 # @File : eigenvalues.py # @Software: PyCharm __author__ = "retacn" __copyright__ = "property of mankind." __license__ = "CN" __version__ = "0.0.1" __maintainer__ = "retacn" __email__ = "zhenhuayue@sina.com" __status__ = "Development" import numpy as np A = np.mat("3 -2;1 0") print("A\n", A) print("Eigenvalues", np.linalg.eigvals(A)) eigenvalues, eigenvectors = np.linalg.eig(A) print("first tuple of eig", eigenvalues) print("second tuple of eig\n", eigenvectors) for i in range(len(eigenvalues)): print("left", np.dot(A, eigenvectors[:, i])) print("right", eigenvalues[i] * eigenvectors[:, i]) print()
运行结果如下:
A
[[ 3-2]
[1 0]]
Eigenvalues [ 2. 1.]
first tuple of eig [ 2. 1.]
second tuple of eig
[[0.89442719 0.70710678]
[0.4472136 0.70710678]]
left [[ 1.78885438]
[0.89442719]]
right [[ 1.78885438]
[0.89442719]]
Numpy随机数
用二项式分布进行博弈
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2017/1/21 13:17 # @Author : Retacn # @Site : # @File : headortail.py # @Software: PyCharm __author__ = "retacn" __copyright__ = "property of mankind." __license__ = "CN" __version__ = "0.0.1" __maintainer__ = "retacn" __email__ = "zhenhuayue@sina.com" __status__ = "Development" import numpy as np from matplotlib.pyplot import plot, show cash = np.zeros(10000) cash[0] = 1000 outcome = np.random.binomial(9, 0.5, size=len(cash)) for i in range(1, len(cash)): if outcome[i] < 5: cash[i] = cash[i - 1] - 1 elif outcome[i] < 10: cash[i] = cash[i - 1] + 1 else: raise AssertionError("Unexpected outcome" + outcome) print(outcome.min(), outcome.max()) plot(np.arange(len(cash)), cash) show()
正态分面采样
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2017/1/21 13:30 # @Author : Retacn # @Site : 正态分布采样 # @File : normaldist.py # @Software: PyCharm __author__ = "retacn" __copyright__ = "property of mankind." __license__ = "CN" __version__ = "0.0.1" __maintainer__ = "retacn" __email__ = "zhenhuayue@sina.com" __status__ = "Development" import numpy as np import matplotlib.pyplot as plt N = 10000 normal_values = np.random.normal(size=N) dummy, bins, dummy = plt.hist(normal_values, np.sqrt(N), normed=True, lw=1) sigma = 1 mu = 0 plt.plot(bins, 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(bins - mu) ** 2 / (2 * sigma ** 2)), lw=2) plt.show()
使用scipy 进行正态检验
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2017/1/21 13:41 # @Author : Retacn # @Site : 用scipy进行正态检验 # @File : normality_test.py # @Software: PyCharm __author__ = "retacn" __copyright__ = "property of mankind." __license__ = "CN" __version__ = "0.0.1" __maintainer__ = "retacn" __email__ = "zhenhuayue@sina.com" __status__ = "Development" import numpy as np from scipy.stats import shapiro from scipy.stats import anderson from scipy.stats import normaltest # 读入流感趋势数据 flutrends = np.loadtxt("goog_flutrends.csv", delimiter=',', usecols=(1,), skiprows=1, converters={1: lambda s: float(s or 0)}, unpack=True) N = len(flutrends) normal_values = np.random.normal(size=N) zero_values = np.zeros(N) print("Normal Value Shapiro", shapiro(normal_values)) print("Zero Shapiro", shapiro(zero_values)) print("Flu Shapiro", shapiro(flutrends)) print() print("Normal Value Anderson", anderson(normal_values)) print("Zero Anderson", anderson(zero_values)) print("Flu Anderson", anderson(flutrends)) print() print("Normal Value normaltest", normaltest(normal_values)) print("Zero normaltest", normaltest(zero_values)) print("Flu normaltest", normaltest(flutrends)) print()
运行结果如下:
Normal Value Shapiro (0.9982795119285583,0.8293641209602356)
Zero Shapiro (1.0, 1.0)
Flu Shapiro (0.9351992011070251,2.2946666759787607e-15)
Normal Value AndersonAndersonResult(statistic=0.25335473134487074, critical_values=array([ 0.572, 0.652, 0.782, 0.912, 1.085]), significance_level=array([ 15. , 10. , 5. , 2.5, 1. ]))
Zero Anderson AndersonResult(statistic=nan,critical_values=array([ 0.572, 0.652, 0.782, 0.912, 1.085]), significance_level=array([ 15. , 10. , 5. , 2.5, 1. ]))
Flu AndersonAndersonResult(statistic=8.258614154768793, critical_values=array([ 0.572, 0.652, 0.782, 0.912, 1.085]), significance_level=array([ 15., 10. , 5. , 2.5, 1. ]))
Normal Value normaltestNormaltestResult(statistic=0.18497418424851336, pvalue=0.91166097849514516)
Zero normaltestNormaltestResult(statistic=1.0095473240349975, pvalue=0.60364218712103535)
Flu normaltestNormaltestResult(statistic=99.643733363569538, pvalue=2.3048264115368721e-22)
创建掩码式numpy数组
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2017/1/21 14:00 # @Author : Retacn # @Site : 掩码式numpy数组 # @File : masked.py # @Software: PyCharm __author__ = "retacn" __copyright__ = "property of mankind." __license__ = "CN" __version__ = "0.0.1" __maintainer__ = "retacn" __email__ = "zhenhuayue@sina.com" __status__ = "Development" import numpy as np from scipy.misc import ascent, face import matplotlib.pyplot as plt face = face() random_mask = np.random.randint(0, 2, size=face.shape) plt.subplot(221), plt.title("Original"), plt.imshow(face), plt.axis('off') masked_array = np.ma.array(face, mask=random_mask) print(masked_array) plt.subplot(222) plt.title("Masked") plt.imshow(masked_array) plt.axis("off") plt.subplot(223) plt.title("Log") plt.imshow(np.log(face)) plt.axis("off") plt.subplot(224) plt.title("Log Masked") plt.imshow(np.log(masked_array)) plt.axis("off") plt.show()
运行结果如下:
[[[-- -- --]
[138 129 --]
[153 -- 165]
...,
[--126 74]
[131 136 --]
[--144 90]]
[[89-- 100]
[---- 121]
[--122 --]
...,
[118 125 71]
[134 141 87]
[146 153 99]]
[[73-- 84]
[---- --]
[---- 126]
...,
[117 126 --]
[---- 87]
[144 153 98]]
...,
[[87106 76]
[94110 81]
[107 124 --]
...,
[---- 97]
[--157 --]
[--158 --]]
[[---- --]
[---- --]
[---- --]
...,
[121 157 --]
[--156 94]
[--156 94]]
[[---- 74]
[---- --]
[111 126 --]
...,
[---- --]
[--155 --]
[118 -- 92]]]
忽略负值和极值
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2017/1/21 14:46 # @Author : Retacn # @Site : 忽略负值和极值 # @File : masked_funcs.py # @Software: PyCharm __author__ = "retacn" __copyright__ = "property of mankind." __license__ = "CN" __version__ = "0.0.1" __maintainer__ = "retacn" __email__ = "zhenhuayue@sina.com" __status__ = "Development" import numpy as np from matplotlib.finance import _quotes_historical_yahoo from datetime import date import sys import matplotlib.pyplot as plt salary = np.loadtxt("MLB2008.csv", delimiter=",", usecols=(1,), skiprows=1, unpack=True) triples = np.arange(0, len(salary), 3) print("Triples", triples[:10], "...") signs = np.ones(len(salary)) print("Signs", signs[:10], "...") signs[triples] = -1 print("Sings", signs[:10], "...") ma_log = np.ma.log(salary * signs) print("Masked logs", ma_log[:10], "...") dev = salary.std() avg = salary.mean() inside = np.ma.masked_outside(salary, avg - dev, avg + dev) print("Inside", inside[:10], "...") plt.subplot(311) plt.title("Original") plt.plot(salary) plt.subplot(312) plt.title("Log Masked") plt.plot(np.exp(ma_log)) plt.subplot(313) plt.title("Not Extreme") plt.plot(inside) plt.show()
运行结果如下:
Triples [ 0 3 6 9 12 15 18 21 24 27] ...
Signs [ 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.] ...
Sings [-1. 1. 1. -1. 1. 1.-1. 1. 1. -1.] ...
Masked logs [-- 14.97081819030892915.830413578506539 -- 13.458835614025542
15.319587954740548 -- 15.64809202171258413.864300722133706 --] ...
Inside [3750000.0 3175000.0 7500000.0 3000000.0700000.0 4500000.0 3000000.0
6250000.0 1050000.0 4600000.0] ...