数据分析04

最新推荐文章于 2024-04-23 18:44:03 发布

CSDN时光

最新推荐文章于 2024-04-23 18:44:03 发布

阅读量273

点赞数

文章标签：数据分析

本文链接：https://blog.csdn.net/qq_42584444/article/details/83817739

版权

4.最值

max/min：返回一个数组中最大/最小元素
argmax/argmin：返回一个数组中最大/最小元素的下标
maximum/minimum：将两个同维数组中对应元素中最大/最小元素构成一个新的数组

ptp：返回一个数组中最大值和最小值之差
代码：

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Draw nine random integers from [10, 100) and arrange them as a 3x3 matrix.
a = np.random.randint(10, 100, size=9).reshape((3, 3))
print(a)
# Largest element, smallest element and their difference (peak-to-peak).
print(a.max(), a.min(), np.ptp(a))
# Flat (row-major) positions of the largest and smallest elements.
print(a.argmax(), a.argmin())
# A second random 3x3 matrix to compare against element by element.
b = np.random.randint(10, 100, size=9).reshape((3, 3))
print(b)
# Element-wise larger / smaller value of the two matrices.
elementwise_max = np.maximum(a, b)
elementwise_min = np.minimum(a, b)
print(elementwise_max, elementwise_min, sep='\n')

# a [[69 25 50]
#  [70 30 18]
#  [47 84 32]]
# 84 18 66
# 7 5
# b [[38 71 71]
#  [82 34 42]
#  [39 50 85]]
# a,b
# [[69 71 71]
#  [82 34 42]
#  [47 84 85]]
# [[38 25 50]
#  [70 30 18]
#  [39 50 32]]

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Read columns 1, 4 and 5 of the CSV file as date strings, daily highs and
# daily lows; the structured dtype lets the text column sit next to floats.
dates, highest_prices, lowest_prices = np.loadtxt(
    'aapl.csv', delimiter=',', usecols=(1, 4, 5),
    dtype='U10, f8, f8', unpack=True)
# Overall price range: highest high and lowest low.
max_price = highest_prices.max()
min_price = lowest_prices.min()
print(min_price, '~', max_price)
# Dates on which the lowest low and the highest high occurred.
max_index = highest_prices.argmax()
min_index = lowest_prices.argmin()
print(dates[min_index], dates[max_index])
# Spread (max - min) of the daily highs and of the daily lows.
highest_ptp = np.ptp(highest_prices)
lowest_ptp = np.ptp(lowest_prices)
print(lowest_ptp, highest_ptp)

5.中位数

将多个样本按照大小排序，取中间位置的元素
奇数个样本：取正中间的元素
10 20 30 40 50
      ^
      (a[(5-1)//2] + a[5//2]) / 2
偶数个样本：取中间两个元素的平均值
10 20 30 40 50 60
      ^  ^
       \/
      平均 (a[(6-1)//2] + a[6//2]) / 2
通式（s 为样本数，// 表示整除）：(a[(s-1)//2] + a[s//2]) / 2
np.median(无序样本)->中位数

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Read column 6 of the CSV file (used here as the closing prices).
closing_prices = np.loadtxt(
    'aapl.csv', delimiter=',',
    usecols=6, unpack=True)
size = closing_prices.size
# np.msort was deprecated and later removed from NumPy; np.sort on a 1-D
# array gives the same ascending order.
sorted_prices = np.sort(closing_prices)
# Manual median: average of the two middle elements. For an odd size both
# indices coincide, so the formula covers odd and even lengths alike.
median = (sorted_prices[(size - 1) // 2] +
          sorted_prices[size // 2]) / 2
print(median)
# Same value via the library routine, which does not need sorted input.
median = np.median(closing_prices)
print(median)

6.标准差

样本：[s1,s2,...,sn]
平均值：m = (s1+s2+...sn)/n
离差：D = [d1,d2,...,dn],di=si-m
离差方：Q = [q1,q2,...,qn],qi= di**2
总体方差：v = (q1+q2+...+qn)/n
总体标准差：s = sqrt(v),方均根
样本方差：v' = (q1+q2+...+qn)/(n-1)
样本标准差：s' = sqrt(v'),方均根
10
2 + 2 + ... + 2 = 20
10000
-------
10-8
numpy.std(S)->s
numpy.std(S, ddof=1)->s'
代码：

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Read column 6 of the CSV file (used here as the closing prices).
closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',', usecols=(6), unpack=True)

# Step-by-step computation.
mean = closing_prices.mean()             # arithmetic mean
devs = closing_prices - mean             # deviations from the mean
dsqs = np.square(devs)                   # squared deviations
sum_of_squares = dsqs.sum()
pvar = sum_of_squares / dsqs.size        # population variance (divide by n)
svar = sum_of_squares / (dsqs.size - 1)  # sample variance (divide by n - 1)
pstd = np.sqrt(pvar)                     # population standard deviation
sstd = np.sqrt(svar)                     # sample standard deviation
print(pstd, sstd)

# The same two quantities straight from NumPy; ddof=1 selects the n - 1
# divisor used for the sample standard deviation.
pstd = closing_prices.std()
sstd = closing_prices.std(ddof=1)
print(pstd, sstd)

7.时间数据处理

按星期取平均值
代码：

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np


# Converter: turn a day-month-year date string into a weekday number.
def dmy2wday(dmy):
    """Return the weekday (0 = Monday ... 6 = Sunday) of a 'DD-MM-YYYY' date.

    np.loadtxt hands converters either bytes or str depending on the NumPy
    version and its ``encoding`` argument, so both are accepted here.
    Raises ValueError if the text does not match the '%d-%m-%Y' format.
    """
    if isinstance(dmy, bytes):
        dmy = dmy.decode('utf-8')
    return dt.datetime.strptime(dmy, '%d-%m-%Y').date().weekday()


# Column 1 goes through dmy2wday, so it arrives as weekday numbers; column 6
# is read as plain floats (used here as the closing prices).
wdays, closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',', usecols=(1, 6),
    unpack=True, converters={1: dmy2wday})
print(wdays)
# Mean closing price for each weekday 0..4, selected with a boolean mask.
# Equivalent selections would be
#   np.take(closing_prices, np.where(wdays == wday))  or
#   closing_prices[np.where(wdays == wday)].
ave_closing_prices = np.array(
    [closing_prices[wdays == wday].mean() for wday in range(5)])
weekday_names = ('MON', 'TUE', 'WED', 'THU', 'FRI')
for wday, ave_closing_price in zip(weekday_names, ave_closing_prices):
    print(wday, np.round(ave_closing_price, 2))

按星期汇总数据
数组的轴向汇总
np.apply_along_axis(处理函数, 轴向, 数组)
沿着数组中所指定的轴向，调用处理函数，并将每次调用的返回值重新组织成数组返回。
代码：

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np


def foo(arg):
    """Print the array slice received and return the sum of its elements."""
    print('foo:', arg)
    total = arg.sum()
    return total


# 3x3 matrix holding 1..9 in row-major order.
a = np.arange(1, 10).reshape((3, 3))
print(a)
# Along axis 0, foo receives each column in turn.
b = np.apply_along_axis(foo, axis=0, arr=a)
print(b)
# Along axis 1, foo receives each row in turn.
c = np.apply_along_axis(foo, axis=1, arr=a)
print(c)

sum.py

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np


def dmy2wday(dmy):
    """Return the weekday (0 = Monday ... 6 = Sunday) of a 'DD-MM-YYYY' date.

    np.loadtxt hands converters either bytes or str depending on the NumPy
    version and its ``encoding`` argument, so both are accepted here.
    Raises ValueError if the text does not match the '%d-%m-%Y' format.
    """
    if isinstance(dmy, bytes):
        dmy = dmy.decode('utf-8')
    return dt.datetime.strptime(dmy, '%d-%m-%Y').date().weekday()


# Read the weekday (converted from the date column) plus columns 3-6, used
# here as open/high/low/close, and keep only the first 16 rows of each.
wdays, opening_prices, highest_prices, \
    lowest_prices, closing_prices = (
        column[:16] for column in np.loadtxt(
            '../../data/aapl.csv',
            delimiter=',', usecols=(1, 3, 4, 5, 6),
            unpack=True, converters={1: dmy2wday}))
# Row index of the first Monday (weekday 0) ...
first_monday = (wdays == 0).nonzero()[0][0]
# ... and of the last Friday (weekday 4).
last_friday = (wdays == 4).nonzero()[0][-1]
# Consecutive row indices from that Monday through that Friday, cut into
# three equally sized groups (np.split raises if they do not divide evenly).
indices = np.split(np.arange(first_monday, last_friday + 1), 3)


def week_summary(indices):
    """Summarise the rows selected by ``indices`` as (open, high, low, close).

    Open is taken at the first index, close at the last index, and high/low
    are the extremes over all selected rows of the module-level price arrays.
    """
    first, last = indices[0], indices[-1]
    return (opening_prices[first],
            highest_prices[indices].max(),
            lowest_prices[indices].min(),
            closing_prices[last])


# Call week_summary once per group of indices (one row per group).
summaries = np.apply_along_axis(week_summary, axis=1, arr=indices)
print(summaries)
# Write the summary table as comma-separated text in compact '%g' format.
np.savetxt('../../data/summary.csv', summaries,
           fmt='%g', delimiter=',')

8.卷积

卷积积分
激励函数：g(t)
单位激励下的响应函数：f(t)

响应函数（卷积积分）：∫ g(τ) f(t-τ) dτ
a = [1 2 3 4 5]
b = [6 7 8]
c = numpy.convolve(a,b,卷积类型)
40 61 82 - 有效卷积(valid)
19 40 61 82 67 - 同维卷积(same)
6 19 40 61 82 67 40 - 完全卷积(full)
卷积核翻转为 [8 7 6] 后，在两端补零的序列上逐位滑动，对应元素相乘再求和：
0 0 1 2 3 4 5 0 0
8 7 6               -> 6
  8 7 6             -> 19
    8 7 6           -> 40
      8 7 6         -> 61
        8 7 6       -> 82
          8 7 6     -> 67
            8 7 6   -> 40
代码：
```
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Sequence to be convolved and the convolution kernel.
a = np.array([1, 2, 3, 4, 5])
b = np.array([6, 7, 8])
print(a, b)
# Full convolution: every position where the two sequences overlap at all.
c = np.convolve(a, b, mode='full')
print(c)
# Same-size convolution: the central part, as long as the longer input.
d = np.convolve(a, b, mode='same')
print(d)
# Valid convolution: only positions where the kernel overlaps completely.
e = np.convolve(a, b, mode='valid')
print(e)
```

9.移动均线
a b c d e f g h i j [1/5 1/5 1/5 1/5 1/5]
(a+b+c+d+e)/5
(b+c+d+e+f)/5
(c+d+e+f+g)/5
...
(f+g+h+i+j)/5
[A, B, C, D, E]
S=A+B+C+D+E
(aA+bB+cC+dD+eE)/S
aA/S+bB/S+cC/S+dD/S+eE/S
[A/S, B/S, C/S, D/S, E/S]
代码

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np
import matplotlib.pyplot as mp
import matplotlib.dates as md


def dmy2ymd(dmy):
    """Convert a 'DD-MM-YYYY' date into an ISO 'YYYY-MM-DD' string.

    np.loadtxt hands converters either bytes or str depending on the NumPy
    version and its ``encoding`` argument, so both are accepted here.
    Raises ValueError if the text does not match the '%d-%m-%Y' format.
    """
    if isinstance(dmy, bytes):
        dmy = dmy.decode('utf-8')
    date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
    return date.strftime('%Y-%m-%d')


# Load column 1 as dates (dmy2ymd rewrites them to ISO text so they parse as
# datetime64[D]) and column 6 as floats, used here as closing prices.
dates, closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',',
    usecols=(1, 6), unpack=True,
    dtype='M8[D], f8', converters={1: dmy2ymd})
# 5-day simple moving average, method 1: mean of each 5-element window.
sma51 = np.zeros(closing_prices.size - 4)
for i in range(sma51.size):
    sma51[i] = closing_prices[i:i + 5].mean()
# Method 2: convolve with five equal weights of 1/5; 'valid' keeps only the
# positions where the whole kernel overlaps the data.
sma52 = np.convolve(
    closing_prices, np.ones(5) / 5, 'valid')
# 10-day simple moving average by the same convolution approach.
sma10 = np.convolve(
    closing_prices, np.ones(10) / 10, 'valid')
mp.figure('Simple Moving Average',
          facecolor='lightgray')
mp.title('Simple Moving Average', fontsize=20)
mp.xlabel('Date', fontsize=14)
mp.ylabel('Price', fontsize=14)
ax = mp.gca()
# Major ticks on the horizontal axis: every Monday.
ax.xaxis.set_major_locator(md.WeekdayLocator(
    byweekday=md.MO))
# Minor ticks on the horizontal axis: every day.
ax.xaxis.set_minor_locator(md.DayLocator())
# Label format for the major ticks on the horizontal axis.
ax.xaxis.set_major_formatter(md.DateFormatter(
    '%d %b %Y'))
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
# Convert the datetime64 values to Python date objects for plotting.
# NOTE(review): md.datetime is presumably the datetime module as imported
# inside matplotlib.dates - confirm it stays available there.
dates = dates.astype(md.datetime.datetime)
mp.plot(dates, closing_prices, c='lightgray',
        label='Closing Price')
# A 5-day window first has a value on the 5th sample, hence dates[4:].
mp.plot(dates[4:], sma51, c='orangered',
        label='SMA-5(1)')
# Drawn wide and half transparent so both 5-day curves remain visible.
mp.plot(dates[4:], sma52, c='limegreen', alpha=0.5,
        linewidth=6, label='SMA-5(2)')
# A 10-day window first has a value on the 10th sample, hence dates[9:].
mp.plot(dates[9:], sma10, c='dodgerblue',
        label='SMA-10')
mp.legend()
mp.gcf().autofmt_xdate()
mp.show()

ema.py

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np
import matplotlib.pyplot as mp
import matplotlib.dates as md


def dmy2ymd(dmy):
    """Convert a 'DD-MM-YYYY' date into an ISO 'YYYY-MM-DD' string.

    np.loadtxt hands converters either bytes or str depending on the NumPy
    version and its ``encoding`` argument, so both are accepted here.
    Raises ValueError if the text does not match the '%d-%m-%Y' format.
    """
    if isinstance(dmy, bytes):
        dmy = dmy.decode('utf-8')
    date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
    return date.strftime('%Y-%m-%d')


# Load column 1 as dates (dmy2ymd rewrites them to ISO text so they parse as
# datetime64[D]) and column 6 as floats, used here as closing prices.
dates, closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',',
    usecols=(1, 6), unpack=True,
    dtype='M8[D], f8', converters={1: dmy2ymd})
# Five weights growing exponentially from exp(-1) to exp(0), normalised so
# they sum to 1.
weights = np.exp(np.linspace(-1, 0, 5))
weights /= weights.sum()
# The weights are passed reversed because convolution flips its kernel, so
# the largest weight ends up on the most recent sample of each window.
ema5 = np.convolve(
    closing_prices, weights[::-1], 'valid')
# Same construction with a 10-sample window.
weights = np.exp(np.linspace(-1, 0, 10))
weights /= weights.sum()
ema10 = np.convolve(
    closing_prices, weights[::-1], 'valid')
mp.figure('Exponential Moving Average',
          facecolor='lightgray')
mp.title('Exponential Moving Average', fontsize=20)
mp.xlabel('Date', fontsize=14)
mp.ylabel('Price', fontsize=14)
ax = mp.gca()
# Major ticks on the horizontal axis: every Monday.
ax.xaxis.set_major_locator(md.WeekdayLocator(
    byweekday=md.MO))
# Minor ticks on the horizontal axis: every day.
ax.xaxis.set_minor_locator(md.DayLocator())
# Label format for the major ticks on the horizontal axis.
ax.xaxis.set_major_formatter(md.DateFormatter(
    '%d %b %Y'))
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
# Convert the datetime64 values to Python date objects for plotting.
# NOTE(review): md.datetime is presumably the datetime module as imported
# inside matplotlib.dates - confirm it stays available there.
dates = dates.astype(md.datetime.datetime)
mp.plot(dates, closing_prices, c='lightgray',
        label='Closing Price')
# A 5-sample window first has a value on the 5th sample, hence dates[4:].
mp.plot(dates[4:], ema5, c='orangered',
        label='EMA-5')
# A 10-sample window first has a value on the 10th sample, hence dates[9:].
mp.plot(dates[9:], ema10, c='dodgerblue',
        label='EMA-10')
mp.legend()
mp.gcf().autofmt_xdate()
mp.show()

10.布林带

中轨：移动平均线
上轨：中轨+2x标准差

下轨：中轨-2x标准差
代码：

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np
import matplotlib.pyplot as mp
import matplotlib.dates as md


def dmy2ymd(dmy):
    """Convert a 'DD-MM-YYYY' date into an ISO 'YYYY-MM-DD' string.

    np.loadtxt hands converters either bytes or str depending on the NumPy
    version and its ``encoding`` argument, so both are accepted here.
    Raises ValueError if the text does not match the '%d-%m-%Y' format.
    """
    if isinstance(dmy, bytes):
        dmy = dmy.decode('utf-8')
    date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
    return date.strftime('%Y-%m-%d')


# Load column 1 as dates (dmy2ymd rewrites them to ISO text so they parse as
# datetime64[D]) and column 6 as floats, used here as closing prices.
dates, closing_prices = np.loadtxt(
    'aapl.csv', delimiter=',',
    usecols=(1, 6), unpack=True,
    dtype='M8[D], f8', converters={1: dmy2ymd})
# Middle band: 5-sample moving average with exponentially growing weights,
# normalised to sum to 1. The weights are reversed because convolution flips
# its kernel, so the largest weight lands on the newest sample.
weights = np.exp(np.linspace(-1, 0, 5))
weights /= weights.sum()
medios = np.convolve(
    closing_prices, weights[::-1], 'valid')
# Band half-width: twice the standard deviation of the same 5-sample window.
stds = 2 * np.array(
    [closing_prices[i:i + 5].std() for i in range(medios.size)])
lowers = medios - stds
uppers = medios + stds
# The window name and title used to say 'Exponential Moving Average', a
# leftover from the EMA script; this figure shows Bollinger bands.
mp.figure('Bollinger Bands',
          facecolor='lightgray')
mp.title('Bollinger Bands', fontsize=20)
mp.xlabel('Date', fontsize=14)
mp.ylabel('Price', fontsize=14)
ax = mp.gca()
# Major ticks on the horizontal axis: every Monday.
ax.xaxis.set_major_locator(md.WeekdayLocator(
    byweekday=md.MO))
# Minor ticks on the horizontal axis: every day.
ax.xaxis.set_minor_locator(md.DayLocator())
# Label format for the major ticks on the horizontal axis.
ax.xaxis.set_major_formatter(md.DateFormatter(
    '%d %b %Y'))
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
# Convert datetime64 values to Python date objects for plotting; use the
# script's own datetime import rather than reaching through matplotlib.dates.
dates = dates.astype(dt.datetime)
mp.plot(dates, closing_prices, c='lightgray',
        label='Closing Price')
# A 5-sample window first has a value on the 5th sample, hence dates[4:].
mp.plot(dates[4:], medios, c='dodgerblue',
        label='Medio')
mp.plot(dates[4:], lowers, c='limegreen',
        label='Lower')
mp.plot(dates[4:], uppers, c='orangered',
        label='Upper')
mp.legend()
mp.gcf().autofmt_xdate()
mp.show()

11.线性模型

1 2 3 4 5
60 65 70 75 <80>

线性预测

a b c d e f
aA + bB + cC = d \
bA + cB + dC = e  > -> 解出 A B C
cA + dB + eC = f /
dA + eB + fC -> ?（用解出的 A B C 预测下一个值）
/ a b c \   / A \   / d \
| b c d | × | B | = | e |
\ c d e /   \ C /   \ f /
    a         x       b
x = np.linalg.lstsq(a, b)[0]

代码：

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as mp
import matplotlib.dates as md


def dmy2ymd(dmy):
    """Convert a 'DD-MM-YYYY' date into an ISO 'YYYY-MM-DD' string.

    np.loadtxt hands converters either bytes or str depending on the NumPy
    version and its ``encoding`` argument, so both are accepted here.
    Raises ValueError if the text does not match the '%d-%m-%Y' format.
    """
    if isinstance(dmy, bytes):
        dmy = dmy.decode('utf-8')
    date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
    return date.strftime('%Y-%m-%d')


# Load column 1 as dates (dmy2ymd rewrites them to ISO text so they parse as
# datetime64[D]) and column 6 as floats, used here as closing prices.
dates, closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',',
    usecols=(1, 6), unpack=True,
    dtype='M8[D], f8', converters={1: dmy2ymd})
# Order of the linear model: each price is modelled as a weighted sum of the
# N prices before it.
N = 5
# Each prediction consumes 2N consecutive prices, and the last one predicts
# the sample after the end of the data, hence size - 2N + 1 results.
pred_prices = np.zeros(
    closing_prices.size - 2 * N + 1)
for i in range(pred_prices.size):
    # Row j of `a` holds the N prices preceding the j-th entry of `b`.
    a = np.array(
        [closing_prices[i + j:i + j + N] for j in range(N)])
    b = closing_prices[i + N:i + N * 2]
    # rcond=None selects NumPy's current default cutoff explicitly and avoids
    # the FutureWarning raised when the argument is omitted on NumPy 1.14-1.x.
    x = np.linalg.lstsq(a, b, rcond=None)[0]
    # Apply the fitted weights to the latest N prices to get the next price.
    pred_prices[i] = b.dot(x)
mp.figure('Linear Prediction',
          facecolor='lightgray')
mp.title('Linear Prediction', fontsize=20)
mp.xlabel('Date', fontsize=14)
mp.ylabel('Price', fontsize=14)
ax = mp.gca()
# Major ticks on the horizontal axis: every Monday.
ax.xaxis.set_major_locator(md.WeekdayLocator(
    byweekday=md.MO))
# Minor ticks on the horizontal axis: every day.
ax.xaxis.set_minor_locator(md.DayLocator())
# Label format for the major ticks on the horizontal axis.
ax.xaxis.set_major_formatter(md.DateFormatter(
    '%d %b %Y'))
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
# Convert datetime64 values to Python date objects for plotting; use the
# script's own datetime import rather than reaching through matplotlib.dates.
dates = dates.astype(dt.datetime)
mp.plot(dates, closing_prices, 'o-', c='lightgray',
        label='Closing Price')
# Append the next business day so the final prediction has an x position.
dates = np.append(dates,
                  dates[-1] + pd.tseries.offsets.BDay())
# The first prediction targets the sample at index 2N.
mp.plot(dates[2 * N:], pred_prices, 'o-',
        c='orangered', linewidth=3,
        label='Predicted Price')
mp.legend()
mp.gcf().autofmt_xdate()
mp.show()

线性拟合
kx + b = y
kx1 + b = y1
kx2 + b = y2
...
kxn + b = yn
/ x1  1 \           / y1  \
| x2  1 |   / k \   | y2  |
| ...   | × |   | = | ... |
\ xn  1 /   \ b /   \ yn  /
    a         x        b
x = np.linalg.lstsq(a, b)[0]

CSDN时光

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
数据分析04

4.最值max/min：返回一个数组中最大/最小元素 argmax/argmin：返回一个数组中最大/最小元素的下标 maximum/minimum：将两个同维数组中对应元素中最大/最小元素构成一个新的数组 ptp：返回一个数组中最大值和最小值之差代码： # -*- coding: utf-8 -*-from __future__ import unicode_literals...
复制链接

扫一扫