基于机器学习的BTC时间序列数据异常检测(Python)

191 篇文章 1 订阅
152 篇文章 5 订阅
pip install orion-ml
pip install ml-stars
import pandas as pd
from binance import Client
from dotenv import dotenv_values
from datetime import datetime


# Load Binance API credentials from .env and create the client.
config = dotenv_values('.env')
client = Client(config.get('KEY'), config.get('SECRET_KEY'))
TICKER = 'BTCUSDT'
start_date = datetime(2017, 9, 1)
end_date = datetime(2024, 4, 1)

# Binance's history endpoint expects dates formatted like "01 Sep, 2017".
start_date_str = start_date.strftime('%d %b, %Y')
end_date_str = end_date.strftime('%d %b, %Y')

# Each kline row is [open_time, open, high, low, close, volume, close_time, ...];
# row[1:7] keeps OHLCV plus the close timestamp, which becomes the index.
klines = client.get_historical_klines(TICKER, client.KLINE_INTERVAL_1DAY, start_date_str, end_date_str)
dataBTC = pd.DataFrame(
    data=[row[1:7] for row in klines],
    columns=['Open', 'High', 'Low', 'Close', 'Volume', 'Date'],
).set_index('Date')
dataBTC.index = pd.to_datetime(dataBTC.index, unit='ms')
dataBTC = dataBTC.sort_index()
# Convert column-wise (the default) instead of row-wise (axis=1): row-wise
# apply makes one Python-level call per row and can leave object dtype;
# column-wise conversion is vectorized and yields proper numeric columns.
dataBTC = dataBTC.apply(pd.to_numeric)
dataBTC
OpenHighLowCloseVolume
Date
2017-09-01 23:59:59.9994689.894885.554654.884834.91560.666366
2017-09-02 23:59:59.9994796.164939.194286.874472.14929.148595
2017-09-03 23:59:59.9994508.504714.764298.334509.08691.216198
2017-09-04 23:59:59.9994505.004527.493972.514100.111394.644614
2017-09-05 23:59:59.9994106.974484.993603.004366.471228.938157
..................
2024-03-28 23:59:59.99969469.9971552.0668903.6270780.6035439.032390
2024-03-29 23:59:59.99970780.6070916.1669009.0069850.5425445.083530
2024-03-30 23:59:59.99969850.5370321.1069540.0069582.1813644.611420
2024-03-31 23:59:59.99969582.1771366.0069562.9971280.0119396.344330
2024-04-01 23:59:59.99971280.0071288.2368062.8669649.8041445.320390

2405 rows × 5 columns

# Move the datetime index back into a regular column (gives an integer index).
dataBTC=dataBTC.reset_index()
dataBTC
Date_onlyOpenHighLowCloseVolume
02017-09-014689.894885.554654.884834.91560.666366
12017-09-024796.164939.194286.874472.14929.148595
22017-09-034508.504714.764298.334509.08691.216198
32017-09-044505.004527.493972.514100.111394.644614
42017-09-054106.974484.993603.004366.471228.938157
.....................
24002024-03-2869469.9971552.0668903.6270780.6035439.032390
24012024-03-2970780.6070916.1669009.0069850.5425445.083530
24022024-03-3069850.5370321.1069540.0069582.1813644.611420
24032024-03-3169582.1771366.0069562.9971280.0119396.344330
24042024-04-0171280.0071288.2368062.8669649.8041445.320390

2405 rows × 6 columns

# Train/test split: first 70% of rows become the training set.
train = dataBTC.iloc[:int(len(dataBTC)*0.7)]
# Orion expects a two-column frame (timestamp + value); keep only the close
# price. Pass `columns` as a list — DataFrame.drop documents a list-like,
# and a set literal has no defined order.
train_data = train.drop(columns=['Date', 'Open', 'High', 'Low', 'Volume'])
train_data = train_data.rename(columns={'Close': 'value'})
train_data
timestampvalue
01.504224e+124834.91
11.504310e+124472.14
21.504397e+124509.08
31.504483e+124100.11
41.504570e+124366.47
.........
16781.649203e+1243170.47
16791.649290e+1243444.19
16801.649376e+1242252.01
16811.649462e+1242753.97
16821.649549e+1242158.85

1683 rows × 2 columns

# Remaining ~30% for testing. Start exactly where the training slice ended:
# iloc's stop bound is exclusive, so the original `+1` silently dropped one
# row that belonged to neither split.
test = dataBTC.iloc[int(len(dataBTC)*0.7):]
from orion import Orion
train_data
timestampvalue
01.504224e+124834.91
11.504310e+124472.14
21.504397e+124509.08
31.504483e+124100.11
41.504570e+124366.47
.........
16781.649203e+1243170.47
16791.649290e+1243444.19
16801.649376e+1242252.01
16811.649462e+1242753.97
16821.649549e+1242158.85

1683 rows × 2 columns

from orion.analysis import analyze

# Override the LSTM regressor defaults: a short 5-epoch run with progress output.
hyperparameters = {
    'keras.Sequential.LSTMTimeSeriesRegressor#1': {
        'epochs': 5,
        'verbose': True,
    },
}

# Run the LSTM + dynamic-threshold pipeline over the training signal.
pipeline = 'lstm_dynamic_threshold'
anomalies = analyze(pipeline, train_data, hyperparams=hyperparameters)
from orion.data import load_signal
train_data
timestampvalue
01222819200-0.366359
11222840800-0.394108
212228624000.403625
31222884000-0.362759
41222905600-0.370746
.........
28131283580000-0.365308
281412836016001.000000
28151283623200-0.341357
28161283644800-0.392546
281712836664001.000000

2818 rows × 2 columns

# Derive a midnight-normalized date column from the datetime index and use it
# as the index (the column itself remains alongside the new index).
day_strings = dataBTC.index.strftime('%Y-%m-%d')
dataBTC['Date_only'] = pd.to_datetime(day_strings)
dataBTC = dataBTC.set_index(dataBTC['Date_only'])
dataBTC
OpenHighLowCloseVolumeDate_only
Date_only
2017-09-014689.894885.554654.884834.91560.6663662017-09-01
2017-09-024796.164939.194286.874472.14929.1485952017-09-02
2017-09-034508.504714.764298.334509.08691.2161982017-09-03
2017-09-044505.004527.493972.514100.111394.6446142017-09-04
2017-09-054106.974484.993603.004366.471228.9381572017-09-05
.....................
2024-03-2869469.9971552.0668903.6270780.6035439.0323902024-03-28
2024-03-2970780.6070916.1669009.0069850.5425445.0835302024-03-29
2024-03-3069850.5370321.1069540.0069582.1813644.6114202024-03-30
2024-03-3169582.1771366.0069562.9971280.0119396.3443302024-03-31
2024-04-0171280.0071288.2368062.8669649.8041445.3203902024-04-01

2405 rows × 6 columns

# Date_only now lives in the index; drop the redundant column.
dataBTC=dataBTC.drop(columns='Date_only')
dataBTC
OpenHighLowCloseVolume
Date_only
2017-09-014689.894885.554654.884834.91560.666366
2017-09-024796.164939.194286.874472.14929.148595
2017-09-034508.504714.764298.334509.08691.216198
2017-09-044505.004527.493972.514100.111394.644614
2017-09-054106.974484.993603.004366.471228.938157
..................
2024-03-2869469.9971552.0668903.6270780.6035439.032390
2024-03-2970780.6070916.1669009.0069850.5425445.083530
2024-03-3069850.5370321.1069540.0069582.1813644.611420
2024-03-3169582.1771366.0069562.9971280.0119396.344330
2024-04-0171280.0071288.2368062.8669649.8041445.320390

2405 rows × 5 columns

from sklearn.preprocessing import StandardScaler
from orion import Orion
from orion.primitives.tadgan import TadGAN

# Pipeline overrides: aggregate the raw signal into hourly buckets, scale
# values into [-1, 1] for the GAN, and train for a short 5-epoch run.
parameters = {
    "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
        "interval": 3600,  # hour-level aggregation
    },
    'sklearn.preprocessing.MinMaxScaler#1': {
        'feature_range': (-1, 1),
    },
    'orion.primitives.tadgan.TadGAN#1': {
        'epochs': 5,
    },
}

# Build the TadGAN pipeline with the overrides above.
orion = Orion('tadgan', parameters)

# Fit on the training signal and detect anomalies within it.
anomalies = orion.fit_detect(train_data)
Epoch: 1/5, Losses: {'cx_loss': -1.0073, 'cz_loss': -30.4356, 'eg_loss': 43.7877}
Epoch: 2/5, Losses: {'cx_loss': -0.8315, 'cz_loss': 0.0683, 'eg_loss': 4.7679}
Epoch: 3/5, Losses: {'cx_loss': -0.8058, 'cz_loss': 2.8823, 'eg_loss': -1.6866}
Epoch: 4/5, Losses: {'cx_loss': -0.7831, 'cz_loss': 3.6786, 'eg_loss': -1.7423}
Epoch: 5/5, Losses: {'cx_loss': -1.0498, 'cz_loss': 4.5926, 'eg_loss': -7.7526}
1259/1259 [==============================] - 87s 68ms/step
1259/1259 [==============================] - 76s 59ms/step
1259/1259 [==============================] - 11s 9ms/step
anomalies
startendseverity
0161308440016145640000.652743
1161749080016184916000.209329
train_data
timestampvalue
015042240004834.91
115043104004472.14
215043968004509.08
315044832004100.11
415045696004366.47
.........
1678164920320043170.47
1679164928960043444.19
1680164937600042252.01
1681164946240042753.97
1682164954880042158.85

1683 rows × 2 columns

import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

# Plot the training signal and shade the two anomalous windows reported by
# the pipeline (epoch-second boundaries) in translucent red.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(train_data['timestamp'], train_data['value'])
for window_start, window_end in ((1613084400, 1614564000), (1617490800, 1618491600)):
    ax.axvspan(window_start, window_end, alpha=0.2, color='red')

# Customize the plot
ax.set_xlabel("Time")
ax.set_ylabel("Value")
ax.set_title("TadGan detection")
ax.grid(True)
plt.show()

dataBTC
Date_onlyOpenHighLowCloseVolume
02017-09-014689.894885.554654.884834.91560.666366
12017-09-024796.164939.194286.874472.14929.148595
22017-09-034508.504714.764298.334509.08691.216198
32017-09-044505.004527.493972.514100.111394.644614
42017-09-054106.974484.993603.004366.471228.938157
.....................
24002024-03-2869469.9971552.0668903.6270780.6035439.032390
24012024-03-2970780.6070916.1669009.0069850.5425445.083530
24022024-03-3069850.5370321.1069540.0069582.1813644.611420
24032024-03-3169582.1771366.0069562.9971280.0119396.344330
24042024-04-0171280.0071288.2368062.8669649.8041445.320390

2405 rows × 6 columns

train
DatetimestampOpenHighLowCloseVolume
02017-09-01 23:59:59.9991.504224e+124689.894885.554654.884834.91560.666366
12017-09-02 23:59:59.9991.504310e+124796.164939.194286.874472.14929.148595
22017-09-03 23:59:59.9991.504397e+124508.504714.764298.334509.08691.216198
32017-09-04 23:59:59.9991.504483e+124505.004527.493972.514100.111394.644614
42017-09-05 23:59:59.9991.504570e+124106.974484.993603.004366.471228.938157
........................
16782022-04-06 23:59:59.9991.649203e+1245497.5445507.1443121.0043170.4760849.329360
16792022-04-07 23:59:59.9991.649290e+1243170.4743900.9942727.3543444.1937396.541560
16802022-04-08 23:59:59.9991.649376e+1243444.2043970.6242107.1442252.0142375.042030
16812022-04-09 23:59:59.9991.649462e+1242252.0242800.0042125.4842753.9717891.660470
16822022-04-10 23:59:59.9991.649549e+1242753.9643410.3041868.0042158.8522771.094030

1683 rows × 7 columns

train_data
timestampvalue
01.504224e+124834.91
11.504310e+124472.14
21.504397e+124509.08
31.504483e+124100.11
41.504570e+124366.47
.........
16781.649203e+1243170.47
16791.649290e+1243444.19
16801.649376e+1242252.01
16811.649462e+1242753.97
16821.649549e+1242158.85

1683 rows × 2 columns

# Convert millisecond epoch timestamps to whole seconds (the unit Orion
# expects). Use astype('int64') on the datetime64 values — the original used
# `np.int64`, but numpy is never imported in this script, so that line
# raised NameError at runtime.
timestamps = pd.to_datetime(train['timestamp'], unit='ms')
train_data['timestamp'] = timestamps.values.astype('int64') // 10 ** 9
train_data
timestampvalue
015042240004834.91
115043104004472.14
215043968004509.08
315044832004100.11
415045696004366.47
.........
1678164920320043170.47
1679164928960043444.19
1680164937600042252.01
1681164946240042753.97
1682164954880042158.85

1683 rows × 2 columns

# Z-score the close price: (x - mean) / std, as a new Series.
close = dataBTC['Close']
df = (close - close.mean()) / close.std()
df
0      -1.027892
1      -1.049285
2      -1.047106
3      -1.071224
4      -1.055516
          ...   
2400    2.860970
2401    2.806123
2402    2.790298
2403    2.890420
2404    2.794285
Name: Close, Length: 2405, dtype: float64
# Attach the z-scored close price and (re-)index by the Date_only column.
dataBTC['Standardized_Close']=df
dataBTC=dataBTC.set_index(dataBTC['Date_only'])
dataBTC
Date_onlyOpenHighLowCloseVolumeStandardized_Close
Date_only
2017-09-012017-09-014689.894885.554654.884834.91560.666366-1.027892
2017-09-022017-09-024796.164939.194286.874472.14929.148595-1.049285
2017-09-032017-09-034508.504714.764298.334509.08691.216198-1.047106
2017-09-042017-09-044505.004527.493972.514100.111394.644614-1.071224
2017-09-052017-09-054106.974484.993603.004366.471228.938157-1.055516
........................
2024-03-282024-03-2869469.9971552.0668903.6270780.6035439.0323902.860970
2024-03-292024-03-2970780.6070916.1669009.0069850.5425445.0835302.806123
2024-03-302024-03-3069850.5370321.1069540.0069582.1813644.6114202.790298
2024-03-312024-03-3169582.1771366.0069562.9971280.0119396.3443302.890420
2024-04-012024-04-0171280.0071288.2368062.8669649.8041445.3203902.794285

2405 rows × 7 columns

# Visualize the standardized close price over the full period.
dataBTC['Standardized_Close'].plot(figsize=(16,8), title='Standardized BTC price')

知乎学术咨询:https://www.zhihu.com/consult/people/792359672131756032?isMe=1

担任《Mechanical System and Signal Processing》等审稿专家,擅长领域:信号滤波/降噪,机器学习/深度学习,时间序列预分析/预测,设备故障诊断/缺陷检测/异常检测。

分割线

基于机器学习(霍特林统计量,高斯混合模型,支持向量机)的NASA涡扇发动机退化模拟数据异常检测(MATLAB R2021B)

完整代码可通过知乎学术咨询获得:

https://www.zhihu.com/consult/people/792359672131756032?isMe=1

  • 10
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

哥廷根数学学派

码字不易,且行且珍惜

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值