数据分析04

4.最值

  1. max/min:返回一个数组中最大/最小元素
  2. argmax/argmin:返回一个数组中最大/最小元素的下标
  3. maximum/minimum:将两个同维数组中对应元素中最大/最小元素构成一个新的数组
  4. ptp:返回一个数组中最大值和最小值之差
    代码:
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import numpy as np
    # Draw nine random integers from [10, 100) and arrange them as 3x3.
    a = np.random.randint(10, 100, size=9).reshape(3, 3)
    print(a)
    # Largest value, smallest value, and their difference (peak-to-peak).
    print(a.max(), a.min(), np.ptp(a))
    # Indices of the largest / smallest value in the flattened array.
    print(a.argmax(), a.argmin())
    b = np.random.randint(10, 100, size=9).reshape(3, 3)
    print(b)
    # Element-wise larger and smaller values of a and b, one array per line.
    print(np.maximum(a, b), np.minimum(a, b), sep='\n')
    
    # a [[69 25 50]
    #  [70 30 18]
    #  [47 84 32]]
    # 84 18 66
    # 7 5
    # b [[38 71 71]
    #  [82 34 42]
    #  [39 50 85]]
    # a,b
    # [[69 71 71]
    #  [82 34 42]
    #  [47 84 85]]
    # [[38 25 50]
    #  [70 30 18]
    #  [39 50 32]]
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import numpy as np
    # Read columns 1, 4 and 5 as a 10-character string and two floats.
    dates, highest_prices, lowest_prices = np.loadtxt(
        'aapl.csv', delimiter=',', usecols=(1, 4, 5),
        dtype='U10, f8, f8', unpack=True)
    # Overall range: smallest of the lows up to largest of the highs.
    max_price = highest_prices.max()
    min_price = lowest_prices.min()
    print(min_price, '~', max_price)
    # Dates on which those two extremes occur.
    max_index = highest_prices.argmax()
    min_index = lowest_prices.argmin()
    print(dates[min_index], dates[max_index])
    # Spread (max - min) inside each of the two series.
    highest_ptp = np.ptp(highest_prices)
    lowest_ptp = np.ptp(lowest_prices)
    print(lowest_ptp, highest_ptp)

     

5.中位数

  • 将多个样本按照大小排序,取中间位置的元素
    10 20 30 40 50
          ^ (a[(5-1)//2]+a[5//2])/2
    10 20 30 40 50 60
          ^  ^
           \/
          平均 (a[(6-1)//2]+a[6//2])/2
    (a[(s-1)//2]+a[s//2])/2,其中 // 表示整除
    np.median(无序样本)->中位数
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import numpy as np
    # Read column 6 of the CSV file as a 1-D float array.
    closing_prices = np.loadtxt(
        'aapl.csv', delimiter=',',
        usecols=(6), unpack=True)
    size = closing_prices.size
    # np.msort was deprecated in NumPy 1.24 and removed in NumPy 2.0;
    # np.sort(..., axis=0) is its documented replacement.
    sorted_prices = np.sort(closing_prices, axis=0)
    # Manual median: average of the two middle elements of the sorted data
    # (both indices coincide when the number of samples is odd).
    median = (sorted_prices[(size - 1) // 2] +
              sorted_prices[size // 2]) / 2
    print(median)
    # Same value from the library routine, which accepts unsorted input.
    median = np.median(closing_prices)
    print(median)

     

6.标准差

  • 样本:[s1,s2,...,sn]
    平均值:m = (s1+s2+...+sn)/n
    离差:D = [d1,d2,...,dn],di=si-m
    离差方:Q = [q1,q2,...,qn],qi= di**2
    总体方差:v = (q1+q2+...+qn)/n
    总体标准差:s = sqrt(v),方均根
    样本方差:v' = (q1+q2+...+qn)/(n-1)
    样本标准差:s' = sqrt(v'),方均根
    10
    2 + 2 + ... + 2 = 20
    10000
    -------
     10-8
    numpy.std(S)->s
    numpy.std(S, ddof=1)->s'
    代码:
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import numpy as np
    # Read column 6 of the CSV file as a 1-D float array.
    closing_prices = np.loadtxt(
        '../../data/aapl.csv', delimiter=',', usecols=(6), unpack=True)
    # Step-by-step computation from the definitions.
    mean = closing_prices.mean()            # arithmetic mean
    devs = closing_prices - mean            # deviations from the mean
    dsqs = devs ** 2                        # squared deviations
    pvar = dsqs.sum() / dsqs.size           # population variance (divide by n)
    svar = dsqs.sum() / (dsqs.size - 1)     # sample variance (divide by n - 1)
    pstd = np.sqrt(pvar)                    # population standard deviation
    sstd = np.sqrt(svar)                    # sample standard deviation
    print(pstd, sstd)
    # The same two quantities from the library routine.
    pstd = closing_prices.std()             # population standard deviation
    sstd = closing_prices.std(ddof=1)       # sample standard deviation
    print(pstd, sstd)

     

7.时间数据处理

  1. 按星期取平均值
    代码:
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import datetime as dt
    import numpy as np
    
    
    # Converter: turn a day-month-year date string into a weekday number.
    def dmy2wday(dmy):
        """Return the weekday of a 'dd-mm-YYYY' date (0 = Monday ... 6 = Sunday).

        np.loadtxt passes converters ``bytes`` on older NumPy releases and
        ``str`` from NumPy 2.0 on (default ``encoding=None``), so both input
        types are accepted.

        Raises:
            ValueError: if the text does not match the 'dd-mm-YYYY' format.
        """
        if isinstance(dmy, bytes):
            dmy = dmy.decode('utf-8')
        date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
        return date.weekday()
    
    
    # Column 1 goes through dmy2wday (weekday numbers); column 6 is read as is.
    wdays, closing_prices = np.loadtxt(
        '../../data/aapl.csv', delimiter=',', usecols=(1, 6),
        converters={1: dmy2wday}, unpack=True)
    print(wdays)
    # Mean of the column-6 values for each weekday number 0..4.
    # Equivalent ways of selecting one weekday's values:
    #   np.take(closing_prices, np.where(wdays == wday))
    #   closing_prices[np.where(wdays == wday)]
    #   closing_prices[wdays == wday]          (boolean mask, used here)
    ave_closing_prices = np.array([
        closing_prices[wdays == wday].mean() for wday in range(5)])
    labels = ['MON', 'TUE', 'WED', 'THU', 'FRI']
    for wday, ave_closing_price in zip(labels, ave_closing_prices):
        print(wday, np.round(ave_closing_price, 2))

     

  2. 按星期汇总数据
    数组的轴向汇总
    np.apply_along_axis(处理函数, 轴向, 数组)
    沿着数组中所指定的轴向,调用处理函数,并将每次调用的返回值重新组织成数组返回。
    代码:
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import numpy as np
    
    
    def foo(arg):
        """Print the received array, then return the sum of its elements."""
        print('foo:', arg)
        total = arg.sum()
        return total
    
    
    a = np.arange(1, 10).reshape(3, 3)
    print(a)
    # foo is called once per 1-D slice taken along axis 0.
    b = np.apply_along_axis(foo, axis=0, arr=a)
    print(b)
    # Same again, this time with slices taken along axis 1.
    c = np.apply_along_axis(foo, axis=1, arr=a)
    print(c)
    sum.py
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import datetime as dt
    import numpy as np
    
    
    def dmy2wday(dmy):
        """Map a 'dd-mm-YYYY' date to its weekday number (Monday = 0).

        Accepts both ``bytes`` (what np.loadtxt passed to converters before
        NumPy 2.0) and ``str`` (what it passes by default since NumPy 2.0).

        Raises:
            ValueError: if the text does not match the 'dd-mm-YYYY' format.
        """
        if isinstance(dmy, bytes):
            dmy = dmy.decode('utf-8')
        return dt.datetime.strptime(dmy, '%d-%m-%Y').date().weekday()
    
    
    # Column 1 goes through dmy2wday; columns 3, 4, 5, 6 are read as floats.
    columns = np.loadtxt(
        '../../data/aapl.csv', delimiter=',',
        usecols=(1, 3, 4, 5, 6), unpack=True,
        converters={1: dmy2wday})
    # Keep only the first 16 rows of every column.
    wdays, opening_prices, highest_prices, \
        lowest_prices, closing_prices = (column[:16] for column in columns)
    # Row positions of every Monday (0) and every Friday (4).
    monday_rows = np.where(wdays == 0)[0]
    friday_rows = np.where(wdays == 4)[0]
    first_monday = monday_rows[0]
    last_friday = friday_rows[-1]
    # Consecutive row numbers from the first Monday through the last Friday,
    # cut into 3 equal parts (np.split raises if the count is not divisible).
    indices = np.split(np.arange(first_monday, last_friday + 1), 3)
    
    
    def week_summary(indices):
        """Condense the rows selected by ``indices`` into four values.

        Returns a tuple of: opening_prices at the first index, the maximum of
        highest_prices over all indices, the minimum of lowest_prices over all
        indices, and closing_prices at the last index. The four price arrays
        are read from module scope.
        """
        first, last = indices[0], indices[-1]
        return (opening_prices[first],
                highest_prices[indices].max(),
                lowest_prices[indices].min(),
                closing_prices[last])
    
    
    # Call week_summary once per row of indices and stack the results.
    summaries = np.apply_along_axis(week_summary, axis=1, arr=indices)
    print(summaries)
    # Write the summary table as comma-separated values.
    np.savetxt('../../data/summary.csv', summaries, fmt='%g', delimiter=',')

     

8.卷积

  • 卷积积分
    激励函数:g(t)
    单位激励下的响应函数:f(t)

    响应函数:y(t) = ∫ g(τ) f(t-τ) dτ
  • a = [1 2 3 4 5]
    b = [6 7 8]
    c = numpy.convolve(a,b,卷积类型)
                  40  61  82           - 有效卷积(valid)
              19  40  61  82  67       - 同维卷积(same)
           6  19  40  61  82  67  40   - 完全卷积(full)
       0   0   1   2   3   4   5   0   0
       8   7   6
           8   7   6
               8   7   6
                   8   7   6
                       8   7   6
                           8   7   6
                               8   7   6
    代码:
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import numpy as np
    a = np.array([1, 2, 3, 4, 5])   # sequence being convolved
    b = np.array([6, 7, 8])         # convolution kernel
    print(a, b)
    # 'full': every position where the two sequences overlap at all.
    c = np.convolve(a, b, mode='full')
    print(c)
    # 'same': output has the same length as the longer input.
    d = np.convolve(a, b, mode='same')
    print(d)
    # 'valid': only positions where the kernel overlaps completely.
    e = np.convolve(a, b, mode='valid')
    print(e)
    

     

9.移动均线
a b c d e f g h i j [1/5 1/5 1/5 1/5 1/5]
(a+b+c+d+e)/5
(b+c+d+e+f)/5
(c+d+e+f+g)/5
...
(f+g+h+i+j)/5
[A, B, C, D, E]
S=A+B+C+D+E
(aA+bB+cC+dD+eE)/S
aA/S+bB/S+cC/S+dD/S+eE/S
[A/S, B/S, C/S, D/S, E/S]
代码

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np
import matplotlib.pyplot as mp
import matplotlib.dates as md


def dmy2ymd(dmy):
    """Convert a 'dd-mm-YYYY' date into a 'YYYY-mm-dd' string.

    np.loadtxt passes converters ``bytes`` on older NumPy releases and
    ``str`` from NumPy 2.0 on (default ``encoding=None``), so both input
    types are accepted.

    Raises:
        ValueError: if the text does not match the 'dd-mm-YYYY' format.
    """
    if isinstance(dmy, bytes):
        dmy = dmy.decode('utf-8')
    date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
    return date.strftime('%Y-%m-%d')


# Plot a price series together with its 5- and 10-point simple moving
# averages. Column 1 is converted by dmy2ymd and parsed as datetime64[D];
# column 6 is read as float64.
dates, closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',',
    usecols=(1, 6), unpack=True,
    dtype='M8[D], f8', converters={1: dmy2ymd})
# 5-point average, variant 1: explicit loop over every full 5-element window.
sma51 = np.zeros(closing_prices.size - 4)
for i in range(sma51.size):
    sma51[i] = closing_prices[i:i + 5].mean()
# 5-point average, variant 2: convolution with five equal weights of 1/5;
# 'valid' keeps only the positions where the kernel overlaps completely.
sma52 = np.convolve(
    closing_prices, np.ones(5) / 5, 'valid')
# 10-point average computed the same way.
sma10 = np.convolve(
    closing_prices, np.ones(10) / 10, 'valid')
mp.figure('Simple Moving Average',
          facecolor='lightgray')
mp.title('Simple Moving Average', fontsize=20)
mp.xlabel('Date', fontsize=14)
mp.ylabel('Price', fontsize=14)
ax = mp.gca()
# Major ticks on the x axis: every Monday.
ax.xaxis.set_major_locator(md.WeekdayLocator(
    byweekday=md.MO))
# Minor ticks on the x axis: every day.
ax.xaxis.set_minor_locator(md.DayLocator())
# Label format of the major ticks on the x axis.
ax.xaxis.set_major_formatter(md.DateFormatter(
    '%d %b %Y'))
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
# Convert the datetime64 values to Python date/time objects for plotting.
dates = dates.astype(md.datetime.datetime)
mp.plot(dates, closing_prices, c='lightgray',
        label='Closing Price')
# Each average is drawn at the last date of its window, hence the
# dates[4:] / dates[9:] offsets.
mp.plot(dates[4:], sma51, c='orangered',
        label='SMA-5(1)')
mp.plot(dates[4:], sma52, c='limegreen', alpha=0.5,
        linewidth=6, label='SMA-5(2)')
mp.plot(dates[9:], sma10, c='dodgerblue',
        label='SMA-10')
mp.legend()
mp.gcf().autofmt_xdate()
mp.show()

ema.py

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np
import matplotlib.pyplot as mp
import matplotlib.dates as md


def dmy2ymd(dmy):
    """Rewrite a 'dd-mm-YYYY' date as 'YYYY-mm-dd'.

    Accepts both ``bytes`` (what np.loadtxt passed to converters before
    NumPy 2.0) and ``str`` (what it passes by default since NumPy 2.0).

    Raises:
        ValueError: if the text does not match the 'dd-mm-YYYY' format.
    """
    if isinstance(dmy, bytes):
        dmy = dmy.decode('utf-8')
    return dt.datetime.strptime(dmy, '%d-%m-%Y').date().strftime('%Y-%m-%d')


# Plot a price series together with 5- and 10-point exponentially weighted
# moving averages. Column 1 is converted by dmy2ymd and parsed as
# datetime64[D]; column 6 is read as float64.
dates, closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',',
    usecols=(1, 6), unpack=True,
    dtype='M8[D], f8', converters={1: dmy2ymd})
# Five weights growing exponentially from e**-1 to e**0, scaled to sum to 1.
weights = np.exp(np.linspace(-1, 0, 5))
weights /= weights.sum()
# np.convolve flips its kernel, so the weights are passed reversed; the
# largest weight then falls on the last element of each window.
ema5 = np.convolve(
    closing_prices, weights[::-1], 'valid')
# Same construction with a 10-element window.
weights = np.exp(np.linspace(-1, 0, 10))
weights /= weights.sum()
ema10 = np.convolve(
    closing_prices, weights[::-1], 'valid')
mp.figure('Exponential Moving Average',
          facecolor='lightgray')
mp.title('Exponential Moving Average', fontsize=20)
mp.xlabel('Date', fontsize=14)
mp.ylabel('Price', fontsize=14)
ax = mp.gca()
# Major ticks on the x axis: every Monday.
ax.xaxis.set_major_locator(md.WeekdayLocator(
    byweekday=md.MO))
# Minor ticks on the x axis: every day.
ax.xaxis.set_minor_locator(md.DayLocator())
# Label format of the major ticks on the x axis.
ax.xaxis.set_major_formatter(md.DateFormatter(
    '%d %b %Y'))
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
# Convert the datetime64 values to Python date/time objects for plotting.
dates = dates.astype(md.datetime.datetime)
mp.plot(dates, closing_prices, c='lightgray',
        label='Closing Price')
# Each average is drawn at the last date of its window, hence the
# dates[4:] / dates[9:] offsets.
mp.plot(dates[4:], ema5, c='orangered',
        label='EMA-5')
mp.plot(dates[9:], ema10, c='dodgerblue',
        label='EMA-10')
mp.legend()
mp.gcf().autofmt_xdate()
mp.show()

10.布林带

  • 中轨:移动平均线
  • 上轨:中轨+2x标准差
  • 下轨:中轨-2x标准差
    代码:
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import datetime as dt
    import numpy as np
    import matplotlib.pyplot as mp
    import matplotlib.dates as md
    
    
    def dmy2ymd(dmy):
        """Convert a 'dd-mm-YYYY' date into a 'YYYY-mm-dd' string.

        np.loadtxt passes converters ``bytes`` on older NumPy releases and
        ``str`` from NumPy 2.0 on (default ``encoding=None``), so both input
        types are accepted.

        Raises:
            ValueError: if the text does not match the 'dd-mm-YYYY' format.
        """
        if isinstance(dmy, bytes):
            dmy = dmy.decode('utf-8')
        date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
        return date.strftime('%Y-%m-%d')
    
    
    # Plot a price series with its Bollinger Bands: a 5-point exponentially
    # weighted moving average as the middle band, and that average plus/minus
    # two standard deviations of the same 5-element window as the outer bands.
    # Column 1 is converted by dmy2ymd and parsed as datetime64[D]; column 6
    # is read as float64.
    dates, closing_prices = np.loadtxt(
        'aapl.csv', delimiter=',',
        usecols=(1, 6), unpack=True,
        dtype='M8[D], f8', converters={1: dmy2ymd})
    # Five weights growing exponentially from e**-1 to e**0, scaled to sum
    # to 1. np.convolve flips its kernel, so they are passed reversed and the
    # largest weight falls on the last element of each window.
    weights = np.exp(np.linspace(-1, 0, 5))
    weights /= weights.sum()
    medios = np.convolve(
        closing_prices, weights[::-1], 'valid')
    # Population standard deviation of every 5-element window, doubled.
    stds = np.zeros(medios.size)
    for i in range(stds.size):
        stds[i] = closing_prices[i:i + 5].std()
    stds *= 2
    lowers = medios - stds
    uppers = medios + stds
    # The figure name and title previously read 'Exponential Moving Average',
    # left over from the EMA example; this chart shows Bollinger Bands.
    mp.figure('Bollinger Bands',
              facecolor='lightgray')
    mp.title('Bollinger Bands', fontsize=20)
    mp.xlabel('Date', fontsize=14)
    mp.ylabel('Price', fontsize=14)
    ax = mp.gca()
    # Major ticks on the x axis: every Monday.
    ax.xaxis.set_major_locator(md.WeekdayLocator(
        byweekday=md.MO))
    # Minor ticks on the x axis: every day.
    ax.xaxis.set_minor_locator(md.DayLocator())
    # Label format of the major ticks on the x axis.
    ax.xaxis.set_major_formatter(md.DateFormatter(
        '%d %b %Y'))
    mp.tick_params(labelsize=10)
    mp.grid(linestyle=':')
    # Convert the datetime64 values to Python date/time objects for plotting.
    dates = dates.astype(md.datetime.datetime)
    mp.plot(dates, closing_prices, c='lightgray',
            label='Closing Price')
    # Each band value is drawn at the last date of its 5-element window.
    mp.plot(dates[4:], medios, c='dodgerblue',
            label='Medio')
    mp.plot(dates[4:], lowers, c='limegreen',
            label='Lower')
    mp.plot(dates[4:], uppers, c='orangered',
            label='Upper')
    mp.legend()
    mp.gcf().autofmt_xdate()
    mp.show()

     

11.线性模型

  • 1    2    3    4     5
    60  65  70  75  <80>
  1. 线性预测

    a b c d e f
    aA+bB+cC=d \
    bA+cB+dC=e  > -> A B C
    cA+dB+eC= f /
    dA+eB+fC -> ?
    / a b c \     / A \    / d \
    | b c d  | X |  B | = | e  |
    \ c d e /     \ C /     \ f /
    ---------     -----     ----
        a           x          b
                          = np.linalg.lstsq(a, b)[0]

    代码:

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import datetime as dt
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as mp
    import matplotlib.dates as md
    
    
    def dmy2ymd(dmy):
        """Rewrite a 'dd-mm-YYYY' date as 'YYYY-mm-dd'.

        Accepts both ``bytes`` (what np.loadtxt passed to converters before
        NumPy 2.0) and ``str`` (what it passes by default since NumPy 2.0).

        Raises:
            ValueError: if the text does not match the 'dd-mm-YYYY' format.
        """
        if isinstance(dmy, bytes):
            dmy = dmy.decode('utf-8')
        date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
        return date.strftime('%Y-%m-%d')
    
    
    # Predict each price from the prices before it with a least-squares
    # linear model, then plot the predictions against the real series.
    # Column 1 is converted by dmy2ymd and parsed as datetime64[D]; column 6
    # is read as float64.
    dates, closing_prices = np.loadtxt(
        '../../data/aapl.csv', delimiter=',',
        usecols=(1, 6), unpack=True,
        dtype='M8[D], f8', converters={1: dmy2ymd})
    N = 5
    # One prediction per start position i that leaves room for 2 * N known
    # prices; the final prediction is for the element one past the data.
    pred_prices = np.zeros(
        closing_prices.size - 2 * N + 1)
    for i in range(pred_prices.size):
        # Row j of a is the N-element window starting at i + j; b[j] is the
        # price immediately after that window.
        a = np.zeros((N, N))
        for j in range(N):
            a[j] = closing_prices[i + j:i + j + N]
        b = closing_prices[i + N:i + N * 2]
        # rcond=None selects the machine-precision cutoff explicitly, which
        # avoids the FutureWarning NumPy 1.14+ emits when rcond is omitted
        # and gives the same cutoff on every NumPy version.
        x = np.linalg.lstsq(a, b, rcond=None)[0]
        # b is also the most recent N-element window, so applying the fitted
        # coefficients to it predicts the price at index i + 2 * N.
        pred_prices[i] = b.dot(x)
    mp.figure('Linear Prediction',
              facecolor='lightgray')
    mp.title('Linear Prediction', fontsize=20)
    mp.xlabel('Date', fontsize=14)
    mp.ylabel('Price', fontsize=14)
    ax = mp.gca()
    # Major ticks on the x axis: every Monday.
    ax.xaxis.set_major_locator(md.WeekdayLocator(
        byweekday=md.MO))
    # Minor ticks on the x axis: every day.
    ax.xaxis.set_minor_locator(md.DayLocator())
    # Label format of the major ticks on the x axis.
    ax.xaxis.set_major_formatter(md.DateFormatter(
        '%d %b %Y'))
    mp.tick_params(labelsize=10)
    mp.grid(linestyle=':')
    # Convert the datetime64 values to Python date/time objects for plotting.
    dates = dates.astype(md.datetime.datetime)
    mp.plot(dates, closing_prices, 'o-', c='lightgray',
            label='Closing Price')
    # Append the next business day so the final prediction has an x value.
    dates = np.append(dates,
                      dates[-1] + pd.tseries.offsets.BDay())
    mp.plot(dates[2 * N:], pred_prices, 'o-',
            c='orangered', linewidth=3,
            label='Predicted Price')
    mp.legend()
    mp.gcf().autofmt_xdate()
    mp.show()
    

     

  2. 线性拟合
    kx + b = y
    kx1 + b = y1
    kx2 + b = y2
    ...
    kxn + b = yn
    / x1 1 \    / k \    / y1 \
    | x2 1 |  X \ b /  = | y2 |
    | ...  |             | ...|
    \ xn 1 /             \ yn /
        a         x         b
                  = np.linalg.lstsq(a, b)[0]

 


 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值