python数据分析学习笔记三

最新推荐文章于 2022-09-29 10:14:23 发布

retacn

最新推荐文章于 2022-09-29 10:14:23 发布

阅读量1k

点赞数

分类专栏： python数据分析

本文链接：https://blog.csdn.net/retacn_yue/article/details/55096845

版权

python数据分析专栏收录该内容

8 篇文章 2 订阅

订阅专栏

第三章统计学与线性代数

1 numpy和scipy模块

示例代码如下:


import pkgutil as pu
import numpy as np
import matplotlib as mpl
import scipy as sp
import pydoc

# 输出库的版本信息
print('Numpy version: ', np.__version__)
print('Scip version: ', sp.__version__)
print('Matplotlib: ', mpl.__version__)


# 移除multiple 中的空格
def clean(astr):
    s = astr

    s = ' '.join(s.split())
    s = s.replace('=', '')
    return s


def print_desc(prefix, pkg_path):
    # 取得指定模块下的所有子库
    for pkg in pu.iter_modules(path=pkg_path):
        name = prefix + '.' + pkg[1]

        if pkg[2] == True:
            try:
                # render_doc取得子库或函数的文字说明
                #
                docstr = pydoc.plain(pydoc.render_doc(name))
                docstr = clean(docstr)
                # 设定所取内容的起始位置 
                start = docstr.find('DESCRIPTION')
                docstr = docstr[start:start + 40]
                print(name, docstr)
            except:
                continue


print_desc('numpy', np.__path__)
print()
print()
print()
print_desc('scipy', sp.__path__)

运行结果如下:

Numpy version: 1.11.2

Scip version: 0.18.1

Matplotlib: 1.5.3

numpy.compat DESCRIPTION This modulecontains duplica

numpy.core DESCRIPTION Functions - array -NumPy Ar

numpy.distutils

numpy.doc DESCRIPTION Topicaldocumentation The f

numpy.f2py

numpy.fft DESCRIPTION Discrete FourierTransform (

numpy.lib DESCRIPTION Basic functions usedby seve

numpy.linalg DESCRIPTION Core LinearAlgebra Tools --

numpy.ma DESCRIPTION Masked Arrays Arrays somet

numpy.matrixlib

numpy.polynomial DESCRIPTION Within thedocumentation for

numpy.random DESCRIPTION Random Number Generation

numpy.testing DESCRIPTION This singlemodule should pr

scipy._build_utils

scipy._lib DESCRIPTION Module containing privateut

scipy.cluster DESCRIPTION Clustering package (:mod:`s

scipy.constants DESCRIPTION Constants (:mod:`scipy.cons

scipy.fftpack DESCRIPTION Discrete Fourier transforms

scipy.integrate DESCRIPTION Integration and ODEs (:mod:

scipy.interpolate DESCRIPTION Interpolation (:mod:`scipy.

scipy.io DESCRIPTION Input and output (:mod:`sci

scipy.linalg DESCRIPTION Linear algebra (:mod:`scipy

scipy.misc DESCRIPTION Miscellaneous routines (:mo

scipy.ndimage DESCRIPTION Multi-dimensional image pro

scipy.odr DESCRIPTION Orthogonal distance regress

scipy.optimize DESCRIPTION Optimization and root findi

scipy.signal DESCRIPTION Signal processing (:mod:`sc

scipy.sparse DESCRIPTION Sparse matrices (:mod:`scip

scipy.spatial DESCRIPTION Spatial algorithms and data

scipy.special DESCRIPTION Special functions (:mod:`sc

scipy.stats DESCRIPTION Statistical functions (:mod

2 使用numpy进行简单的描述性统计计算

示例代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2016/12/29 17:16
# @Author  : Retacn
# @Site    : 使用numpy进行统计计算
# @File    : basic_stats.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np
from scipy.stats import scoreatpercentile

# 加载csv文件
#文件中Curaao,0  Cte d'Ivoire,2.5 两个字符器有问题,修改后正常

data = np.loadtxt('mdrtb_2012.csv', delimiter=',', usecols=(1,), skiprows=1, unpack=True)
#data = np.loadtxt('MLB2008.csv', delimiter=',', usecols=(1,), skiprows=1, unpack=True)

# 最大值
print('Max method', data.max())
print('Max function', np.max(data))

# 最小值
print('Min method', data.min())
print('Min function', np.min(data))

# 平均值
print('Mean method', data.mean())
print('Mean function', np.mean(data))

# 标准差
print('Std method', data.std())
print('Std function', np.std(data))

# 中位数
print('Median', np.median(data))
print('Score at percentile 50', scoreatpercentile(data, 50))

运行结果如下:

Max method 50.0

Max function 50.0

Min method 0.0

Min function 0.0

Mean method 3.2787037037

Mean function 3.2787037037

Std method 5.76332073654

Std function 5.76332073654

Median 1.8

Score at percentile 50 1.8

3 使用numpy进行线性代数运算

A 求矩阵逆

示例代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2016/12/29 17:53
# @Author  : Retacn
# @Site    : 使用numpy来求矩阵的逆
# @File    : inversion.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np

# 创建示例矩阵
A = np.mat('2 4 6;4 2 6;10 -4 18')
#使用mit公开课中老师用的矩阵例子进行测试
#A = np.mat('1 0 0;-3 1 0;0 0 1')
print('A\n', A)

# 求矩阵的逆
inverse = np.linalg.inv(A)
print('inverse of A\n', inverse)

# 利用乘法进行验算
print('Check\n', A * inverse)
print('Error\n', A * inverse - np.eye(3))

运行结果如下:

[[2 4 6]

[4 2 6]

[10-4 18]]

inverse of A

[[-0.41666667 0.66666667 -0.08333333]

[0.08333333 0.16666667 -0.08333333]

[0.25 -0.33333333 0.08333333]]

Check

[[ 1.00000000e+00 -2.22044605e-16 0.00000000e+00]

[ 5.55111512e-17 1.00000000e+00 0.00000000e+00]

[ 3.88578059e-16 -4.44089210e-16 1.00000000e+00]]

Error

[[ 0.00000000e+00 -2.22044605e-16 0.00000000e+00]

[ 5.55111512e-17 -4.44089210e-16 0.00000000e+00]

[ 3.88578059e-16 -4.44089210e-16 0.00000000e+00]]

#可逆矩阵的列子

[[1 0 0]

[-3 1 0]

[0 0 1]]

inverse of A

[[1. 0. 0.]

[3. 1. 0.]

[0. 0. 1.]]

Check

[[1. 0. 0.]

[0. 1. 0.]

[0. 0. 1.]]

Error

[[0. 0. 0.]

[0. 0. 0.]

[0. 0. 0.]]

B 解线性方程组

示例代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/20 13:52
# @Author  : Retacn
# @Site    : 用numpy解线性方程
# @File    : linner_equation.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np

# 创建矩阵A和数组b
A = np.mat("1 -2 1;0 2 -8;-4 5 9")
print("A\n", A)
b = np.array([0, 8, -9])
print("b\n", b)

# 调用solve()函数,解线性方程组
x = np.linalg.solve(A, b)
print("solution", x)

# 使用dot()函数进行验算
print("check\n", np.dot(A, x))

运行结果如下:

[[ 1-2 1]

[0 2 -8]

[-4 5 9]]

[0 8 -9]

solution [ 29. 16. 3.]

check

[[0. 8. -9.]]

4 使用numpy计算特征值和特征向量

示例代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/21 12:47
# @Author  : Retacn
# @Site    : 计算特征值和特征向量
# @File    : eigenvalues.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np

A = np.mat("3 -2;1 0")
print("A\n", A)

print("Eigenvalues", np.linalg.eigvals(A))

eigenvalues, eigenvectors = np.linalg.eig(A)
print("first tuple of eig", eigenvalues)
print("second tuple of eig\n", eigenvectors)

for i in range(len(eigenvalues)):
    print("left", np.dot(A, eigenvectors[:, i]))
    print("right", eigenvalues[i] * eigenvectors[:, i])
    print()

运行结果如下:

[[ 3-2]

[1 0]]

Eigenvalues [ 2. 1.]

first tuple of eig [ 2. 1.]

second tuple of eig

[[0.89442719 0.70710678]

[0.4472136 0.70710678]]

left [[ 1.78885438]

[0.89442719]]

right [[ 1.78885438]

[0.89442719]]

Numpy随机数

用二项式分布进行博弈

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/21 13:17
# @Author  : Retacn
# @Site    : 
# @File    : headortail.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np
from matplotlib.pyplot import plot, show

cash = np.zeros(10000)
cash[0] = 1000
outcome = np.random.binomial(9, 0.5, size=len(cash))

for i in range(1, len(cash)):
    if outcome[i] < 5:
        cash[i] = cash[i - 1] - 1
    elif outcome[i] < 10:
        cash[i] = cash[i - 1] + 1
    else:
        raise AssertionError("Unexpected outcome" + outcome)

print(outcome.min(), outcome.max())

plot(np.arange(len(cash)), cash)
show()

正态分面采样

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/21 13:30
# @Author  : Retacn
# @Site    : 正态分布采样
# @File    : normaldist.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np
import matplotlib.pyplot as plt

N = 10000

normal_values = np.random.normal(size=N)
dummy, bins, dummy = plt.hist(normal_values, np.sqrt(N), normed=True, lw=1)
sigma = 1
mu = 0
plt.plot(bins, 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(bins - mu) ** 2 / (2 * sigma ** 2)), lw=2)
plt.show()

使用scipy 进行正态检验

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/21 13:41
# @Author  : Retacn
# @Site    : 用scipy进行正态检验
# @File    : normality_test.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np
from scipy.stats import shapiro
from scipy.stats import anderson
from scipy.stats import normaltest

# 读入流感趋势数据
flutrends = np.loadtxt("goog_flutrends.csv", delimiter=',', usecols=(1,), skiprows=1,
                       converters={1: lambda s: float(s or 0)}, unpack=True)
N = len(flutrends)
normal_values = np.random.normal(size=N)
zero_values = np.zeros(N)

print("Normal Value Shapiro", shapiro(normal_values))
print("Zero Shapiro", shapiro(zero_values))
print("Flu Shapiro", shapiro(flutrends))
print()

print("Normal Value Anderson", anderson(normal_values))
print("Zero Anderson", anderson(zero_values))
print("Flu Anderson", anderson(flutrends))
print()

print("Normal Value normaltest", normaltest(normal_values))
print("Zero normaltest", normaltest(zero_values))
print("Flu normaltest", normaltest(flutrends))
print()

运行结果如下:

Normal Value Shapiro (0.9982795119285583,0.8293641209602356)

Zero Shapiro (1.0, 1.0)

Flu Shapiro (0.9351992011070251,2.2946666759787607e-15)

Normal Value AndersonAndersonResult(statistic=0.25335473134487074, critical_values=array([ 0.572, 0.652, 0.782, 0.912, 1.085]), significance_level=array([ 15. , 10. , 5. , 2.5, 1. ]))

Zero Anderson AndersonResult(statistic=nan,critical_values=array([ 0.572, 0.652, 0.782, 0.912, 1.085]), significance_level=array([ 15. , 10. , 5. , 2.5, 1. ]))

Flu AndersonAndersonResult(statistic=8.258614154768793, critical_values=array([ 0.572, 0.652, 0.782, 0.912, 1.085]), significance_level=array([ 15., 10. , 5. , 2.5, 1. ]))

Normal Value normaltestNormaltestResult(statistic=0.18497418424851336, pvalue=0.91166097849514516)

Zero normaltestNormaltestResult(statistic=1.0095473240349975, pvalue=0.60364218712103535)

Flu normaltestNormaltestResult(statistic=99.643733363569538, pvalue=2.3048264115368721e-22)

创建掩码式numpy数组

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/21 14:00
# @Author  : Retacn
# @Site    : 掩码式numpy数组
# @File    : masked.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np
from scipy.misc import ascent, face
import matplotlib.pyplot as plt

face = face()
random_mask = np.random.randint(0, 2, size=face.shape)

plt.subplot(221), plt.title("Original"), plt.imshow(face), plt.axis('off')

masked_array = np.ma.array(face, mask=random_mask)
print(masked_array)

plt.subplot(222)
plt.title("Masked")
plt.imshow(masked_array)
plt.axis("off")

plt.subplot(223)
plt.title("Log")
plt.imshow(np.log(face))
plt.axis("off")

plt.subplot(224)
plt.title("Log Masked")
plt.imshow(np.log(masked_array))
plt.axis("off")

plt.show()

运行结果如下:

[[[-- -- --]

[138 129 --]

[153 -- 165]

...,

[--126 74]

[131 136 --]

[--144 90]]

[[89-- 100]

[---- 121]

[--122 --]

...,

[118 125 71]

[134 141 87]

[146 153 99]]

[[73-- 84]

[---- --]

[---- 126]

...,

[117 126 --]

[---- 87]

[144 153 98]]

...,

[[87106 76]

[94110 81]

[107 124 --]

...,

[---- 97]

[--157 --]

[--158 --]]

[[---- --]

[---- --]

...,

[121 157 --]

[--156 94]

[--156 94]]

[[---- 74]

[---- --]

[111 126 --]

...,

[---- --]

[--155 --]

[118 -- 92]]]

忽略负值和极值

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/1/21 14:46
# @Author  : Retacn
# @Site    : 忽略负值和极值
# @File    : masked_funcs.py
# @Software: PyCharm
__author__ = "retacn"
__copyright__ = "property of mankind."
__license__ = "CN"
__version__ = "0.0.1"
__maintainer__ = "retacn"
__email__ = "zhenhuayue@sina.com"
__status__ = "Development"

import numpy as np
from matplotlib.finance import _quotes_historical_yahoo
from datetime import date
import sys
import matplotlib.pyplot as plt

salary = np.loadtxt("MLB2008.csv", delimiter=",", usecols=(1,), skiprows=1, unpack=True)
triples = np.arange(0, len(salary), 3)
print("Triples", triples[:10], "...")

signs = np.ones(len(salary))
print("Signs", signs[:10], "...")

signs[triples] = -1
print("Sings", signs[:10], "...")

ma_log = np.ma.log(salary * signs)
print("Masked logs", ma_log[:10], "...")

dev = salary.std()
avg = salary.mean()
inside = np.ma.masked_outside(salary, avg - dev, avg + dev)
print("Inside", inside[:10], "...")

plt.subplot(311)
plt.title("Original")
plt.plot(salary)

plt.subplot(312)
plt.title("Log Masked")
plt.plot(np.exp(ma_log))

plt.subplot(313)
plt.title("Not Extreme")
plt.plot(inside)

plt.show()

运行结果如下:

Triples [ 0 3 6 9 12 15 18 21 24 27] ...

Signs [ 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.] ...

Sings [-1. 1. 1. -1. 1. 1.-1. 1. 1. -1.] ...

Masked logs [-- 14.97081819030892915.830413578506539 -- 13.458835614025542

15.319587954740548 -- 15.64809202171258413.864300722133706 --] ...

Inside [3750000.0 3175000.0 7500000.0 3000000.0700000.0 4500000.0 3000000.0

6250000.0 1050000.0 4600000.0] ...

retacn

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python数据分析学习笔记三

第三章统计学与线性代数1 numpy和scipy模块示例代码如下:import pkgutil as puimport numpy as npimport matplotlib as mplimport scipy as spimport pydoc# 输出库的版本信息print('Numpy version: ', np.__version__)print('Sc
复制链接

扫一扫