[Notes] Realized Volatility Code

1.Data Description

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import random
import glob
import gc

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_colwidth', None)

import os
for dirname, _, filenames in os.walk('../optiver-realized-volatility-prediction'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
../optiver-realized-volatility-prediction\sample_submission.csv
../optiver-realized-volatility-prediction\test.csv
../optiver-realized-volatility-prediction\train.csv
../optiver-realized-volatility-prediction\book_test.parquet\stock_id=0\7832c05caae3489cbcbbb9b02cf61711.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=0\c439ef22282f412ba39e9137a3fdabac.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=1\31a1c5cd6d8546b383d10373db762236.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=10\d671bb2b87f447d4ba3fa4b18b3656f9.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=100\9495f2c64cee42078601dee7408e3c36.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=101\dbbf6dc725ee4bcb8656c427165c888d.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=102\9d16ca233fea42c7a8ec91d621ca113a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=103\cc7984f01a8747299339cb441632d6c0.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=104\6875618fb8144a80a033ae8de7809493.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=105\a37b8bf4be2e4fa5b3ea63b7c4240d3d.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=107\0a58f297fa7b471a8c39b79420f9990d.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=108\662f6789c2be46e28884e9caceb1f67a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=109\fede24892e38482a9b0a0af7adb1030c.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=11\755d9c4058914e2a901aaeccf2406e30.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=110\2bba26018b9c482e90ca9be02e856382.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=111\76fdfb893640493f8822cec944e35f0d.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=112\89c931289b58423bb27b45ce8109c148.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=113\2698c014aa014ce0a476ccaf13795a43.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=114\22c84a10a4b4477b85b5bf7688037e77.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=115\32688ffcc77d40a5ab6ad8054cf44cab.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=116\bd6b0dc7386c49a79856b8df04e4a33b.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=118\eefc9574b4024ddb90eefc065ccd2918.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=119\a942847a75914186b47fec1c857e45fd.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=120\3068890f5521450db46c271190fa24d8.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=122\12dc61893ff0440e8972b8dc7d3988d8.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=123\50d0df62ffc241dea0ae635e7858aca5.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=124\a88197568c3e441cb1b45dccc5f3bd8b.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=125\996b501aec144de8ae4149a72633db84.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=126\c877c4dda91f41018dbf6ccb1182ef28.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=13\e576f0cd91c347a7b1cdb87c3bd1abf4.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=14\ae1ae9deed8048fbb1191a77ca68ab24.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=15\a47deb70de554fdcb7b780bd060d4564.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=16\eaa06704bf15493fae496cdf6c016b6c.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=17\4539ddf198a64337bdeca019b4edb1bb.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=18\cbc56bf4cc424b12ae18493920465a3a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=19\cae027e29b0749b5b2e139134b75fe63.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=2\36f27d865da54e8d92ff54e07ac4afd2.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=20\fe0e467dc2ca4092afc151d4069c54a7.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=21\60cc70a1e25d4da7b39fbfac2bd9fab1.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=22\3f2ddd48d0b147f09c852cc2034ece80.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=23\9229d9611b4e45b4bf81d60939a8274a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=26\1adc4a39b6fe4f8bbbedd7551006a170.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=27\0e5f57dd508b46cb9d6d88bd63aeaedd.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=28\e956e66d12664e509d27a680731a4057.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=29\9534c4ef29c049f6af22f4ca8305f315.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=3\ccc6d426f595409680d6f422ed911bee.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=30\ee59d4b4cf4847adaee7de785d7918e4.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=31\c0b9eb5ed0bf40bc838fab4da5459f76.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=32\446dd6db2d064e5b86666c66ec669755.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=33\85142fc4257b4ece8157a174ae6d2f4f.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=34\4cb0d70566654bf480518b3a1cee2ad2.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=35\8803efd6c9c44895a6fda59448a8eede.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=36\0ffca9c0dae84890ad8c923d4b95e840.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=37\7163ec1911204a10a479e421c0a41a35.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=38\c49c83c6eb1c41f0b8f0d61de1e3bb89.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=39\99d14712047840fbb3a93eb1f6ebc7a2.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=4\6ae39a36676a419f8847cb217e5ec75d.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=40\ab33096950f647ddb7b705f17b21f11c.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=41\3090fd7adaf447679b6e836778168be8.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=42\01281c6eedfc46c1b72357c484e640b5.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=43\ce8f54442b1142338d2e4b02b9dc578a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=44\31274a12b2ec4c24a4a57a054e67d704.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=46\db00d635f1614686837abc1c71f6e16b.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=47\ede300b0202a43caaaefc8ae07475c33.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=48\5c08c8582813448e8e87eda2c3c36b84.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=5\c39da423b1404b00946ceedf32ae3f07.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=50\0f5c98a29be64774a136b60f71033ec6.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=51\ace8af4c1a474efd80544609ea4a47c4.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=52\d5eb854d66174f5483449da151fbca50.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=53\46c591d10f0940de908317001e533db7.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=55\d3bdb39f65d34bb18278d789729f9323.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=56\55f4ad52ac2347f1a468ea3bf0cb7adc.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=58\33de4ffff5494e3a8ecc02c7b6d9433a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=59\fd5b747386604005b18594c00f0312ab.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=6\84352cd5e3424aeaadf11f8d89c8a07a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=60\6242aca22efd458fbec9a13c86ab42ee.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=61\2522f71edc4d47419f602cc920f906d6.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=62\c2a348a3d0b04a1c945607779c2128dd.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=63\37bff36e80a0485bba2bf875cb3188d4.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=64\acbff79109e44474bfa3a6145bd4244a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=66\3d1df975704b49cbb407d74d1adb53b6.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=67\2a7fd60b816b431597dec0e80d0ec3b7.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=68\ab42bd95488842f5aa15b750052e79ad.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=69\16f9fe0eb6544a1d89fbf25f290b8466.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=7\4f66c3336dec449280ed3719530f9486.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=70\afecffe028274a66a0e674d0db2d3c3c.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=72\0a18126395144a0a8ccca55bdbd31f1b.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=73\42238482fd7443838a8fc6490cd329c8.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=74\8229e924b5ef4716a0078d9e8ee397bd.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=75\f0354d49be4e4552b548b410b349bbc8.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=76\494ecaad37f9430384ad0a825aab00ec.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=77\5f6ce3ce569548c78c2ad17035454639.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=78\b6deb113746a4de0a752ca0b2917f7ef.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=8\e40cd587617e4bfc86f0125aef383932.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=80\705f7c3bde584dadba222d4ef6f7973e.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=81\3736a90d604d4b8eaa15c6eb5f83fd8c.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=82\ae830997c75f405986957af2984493f3.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=83\4c10ab2a0f324a1189af12bd5458ef15.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=84\53e9faaf11e745debb3c246acea61a84.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=85\1b7a536b29724b60990ee6c368f1fbf0.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=86\6ecdb8f8c4da483b8fd3fbdf6aee4381.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=87\670ab2a9e8b548bdaa8b9f8bc9057331.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=88\2c4ed518efb54a4f880a0f945c317fae.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=89\a1cd471332f24761a90f6fadb8b54ccf.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=9\8a87bd6ec9934a889026174cceb9e30a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=90\380689865a7c4af8aa8625a7a7c40533.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=93\036725ac1ba94d40bf15a75c36adb184.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=94\d4c922016df84141ad176dac1455b725.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=95\8e4a80c1b7eb44e8844cc0f2ec4badff.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=96\605c29fae948465db8c2ab233852e982.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=97\52e74e4ef0d84c5c989fc4704e46b527.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=98\59ad203a6ac040fdb891702430440b01.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=99\903e3170136d4ce7a7761664482d61c2.parquet
../optiver-realized-volatility-prediction\trade_test.parquet\stock_id=0\31c83a67d81349208e7d5eace9dbbac8.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=0\ef805fd82ff54fadb363094e3b122ab9.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=1\170b39f1f7144bb3b4554aabc336106e.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=10\4be9b4d23e0149af9dc20ec10d5a360f.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=100\c6b5a2974aac4210a6803c5907baaa07.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=101\3ac4a780531644b1b9534b3ac6c8d5f6.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=102\0bdf7ff7bb4847bab154eb59fd1345bd.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=103\9e3f7f7b452a49e591da6115c17246af.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=104\06b735024acc45cbb4c81392e6d040e0.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=105\cca562a08dc24721b0dd837564abc3b5.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=107\f1b306dc9c24482390bf2083c301c588.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=108\581fd1b484db41e8a17449344c2ffb0e.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=109\0621bfa8cb8545a189878af2727f8178.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=11\5e8cea995bff4682a377e1d3932b08e2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=110\8870109f748e4ff4b9044e0debe8a54c.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=111\24bc76eb35d843d28cf4aeca325e9084.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=112\cd283097a5b54293ba400a19e811a7f9.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=113\7cc3c1fd9452435c83518f3c7c5b74a8.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=114\6e085d4ca2ba44359673a073bc8e41e9.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=115\4dfaedf40be3485085d926ab58672bde.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=116\96cd7e2d2ea74a62bd8b6d739fce17ca.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=118\d5c040f03683418f87556865166846df.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=119\52f2e172b3444107903ec31242331923.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=120\0d06cf1025d84e19be32c9d9ffed9fd2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=122\a2e2aa41640f4986bebc1b95bd29966f.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=123\168edf076f064342ad8aa3b37551cb8a.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=124\8e3d48a3b163471eb121216a37c6de62.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=125\a3f2c4430e1f4a3fb5837af2d970825d.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=126\3baa2ff424c8435d92fe32cfeed80ae7.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=13\2aeef69f03154de5835b9ba611d16172.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=14\50621e92b84e47a197071584bea4bf1d.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=15\43c3ace09a0a45b889073ce646bc6832.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=16\09bbef3150f744b2979fcab0ebf9e001.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=17\687d7abe51a24e2084bcb95eb8174e9b.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=18\06bb948e2275475e92fd90735050657f.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=19\7876f5e5338c4521982a7001d4489727.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=2\ca5a4d1f67024204ac7fd496a0b46ba3.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=20\4c8b1e3b619a4ee496d68962d30c6da4.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=21\1d8dc18ebfee47ffbb54b04e6afc0634.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=22\dfb544bbc3c34211bf12a6dfe6b584af.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=23\dfe7edcd05564b07983496d1b8a3b3c1.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=26\16a87247231944c4a0be45f797d14eb8.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=27\5eac589708444020a196eb738f06ce08.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=28\ec6f88c0ae114ff482cac8fb6f78e87d.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=29\ccea57f8a6324f969fe7144950c7369b.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=3\e0843aaf024f49228b281081a2524b39.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=30\8d772f1265d64c06add811d2ae912c7e.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=31\26f4296741054f9da7e7848521a80526.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=32\af90f42692874cefaa07e585d739bc7b.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=33\917a91c91ce04f62939b8710660ebb3b.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=34\a55d7f7c8ff5406c9ba9e4b0f8af0ada.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=35\e2480196f2a3426ea84e9b4284414bc0.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=36\4f713fbe94f542579ec55561a22db006.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=37\941cf9679a80466893da714884f5c4b0.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=38\e3ed94a67a444dc1a04838588ffa443f.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=39\364c0ad490cb4b9cb493c952b2124f17.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=4\761268d671f9429abb29d9d2895e9bd2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=40\61456b44107d4b24bfe5d4ad90baaafe.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=41\bbde7b09f5e743508840782fac034d10.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=42\caab969eb87b4f7abfdd18d606a22ed0.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=43\bb0efa57f511470e817880842e3e2afa.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=44\bdfeb97d57a149049aecb2250af2c82a.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=46\7c55f1b4f4a34f83981f05974369ff6f.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=47\79bac282f1da4cb4aa24f7c667740341.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=48\507fb85f3c2a484f9fa4a78bcbfa992b.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=5\a5fd25253e3f43db884fe1e6fc2a06c3.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=50\935a65e6f4fc4e5990d086d6f6b4a932.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=51\f868a356824b4825a3c804206f513f70.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=52\54a5eea04ed042759b75b0bca90d1ced.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=53\caf58d3fd60c4a699d78ef114cf9ee17.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=55\1f19ad38e0d54c9680dc34dadf4dce47.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=56\b8615a9991bd4eada46a0483f9f9e3e6.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=58\b0e92dc583a544a08308d8c4e1df52af.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=59\2653e96292d24c64b27c0b70de53b2f9.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=6\fcf85e8ee88944d7bcbc9fc9862ee3f1.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=60\ae35d2b5825d4f1c9b52f462862eae04.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=61\fab533c5a5a741d8929f391890a4d54d.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=62\8df3e07ae2b74ad19fd98d4b9acfcbf2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=63\107939e77b8642a3a222419adc1ae0f2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=64\338f683c092048ebaba1708e6dd1e774.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=66\c544fc443c6042ab9bb4ddec47179a3d.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=67\703c7481d60a43839cd67d09fa3ad6c2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=68\b077ddd5f9304972b0595a31899d4f8a.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=69\bcd636dbdd0b42bea8ad6a62a0d1fdf4.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=7\5ea290c973aa45d587602b4efb7205f2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=70\23dc744f46e4478eb2f5253bcbffae15.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=72\60f62a03d8854605901dda072c84db39.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=73\1443c5278293485cb099fd2fad5348fa.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=74\8c298c007a0a4e078dd4efd4addb3057.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=75\5c163b1fc27c4c18acd83eb5ba5ef87d.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=76\8ee1f2bb5bed4bffb273db8b743f0ee7.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=77\3110dbc5706f4c388802faed11845bf9.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=78\c40d74d0331c465c9f3466df2abdd524.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=8\34dec4dac8ad49e2b4104f7c0ba105b3.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=80\a8c6fe433ef84012b38e96af9014ba2c.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=81\253855077c314a04be294ce53c3cdaa2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=82\6614a1b49c5c4c8ba6a9edab2ed8708a.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=83\b8f39ee0b1eb4d3d8cda8e83c4f81c2b.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=84\98d3ff55665c4405af0688fae832e5e7.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=85\a12f69f3bf7e4deaa8fe7e2c9357bc41.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=86\3991563a90bb4206bb6b780546d938d4.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=87\927eb8a18153489b9f222d748b121473.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=88\108ca1fef317486f98e9dfab977b6f16.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=89\a05080679e8647978809f25206219ae5.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=9\01a3aa0496884cba833aa60b1c67ae6c.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=90\405e4073fed64deda32f8d74362b15b1.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=93\572fb394eb5745dcb36dfdc478b222c8.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=94\8037e3f4a763480a83b9bb0b1def9ce3.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=95\7396f50037c84b029a5132dfec6f7896.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=96\7a9cbfbbdfc54eef806d6fcc69f8ce5e.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=97\888f813404d8417ca8d6b8aebd5f2951.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=98\fb8bce8063fc4370917b71963ddbd1cf.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=99\d27f7f229131410399c78101da6e624b.parquet
# train
train = pd.read_csv("../optiver-realized-volatility-prediction/train.csv")
train
        stock_id  time_id    target
0              0        5  0.004136
1              0       11  0.001445
2              0       16  0.002168
3              0       31  0.002195
4              0       62  0.001747
...          ...      ...       ...
428927       126    32751  0.003461
428928       126    32753  0.003113
428929       126    32758  0.004070
428930       126    32763  0.003357
428931       126    32767  0.002090

428932 rows × 3 columns
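As a reminder of what the target column means (this paraphrases the competition description, it is not shown in the data above): target is the realized volatility of the 10-minute window that follows each (stock_id, time_id) bucket, computed from log returns of the weighted average price,

\text{target} = \sqrt{\sum_t r_{t-1,t}^{2}}, \qquad r_{t-1,t} = \log\!\left(\frac{WAP_t}{WAP_{t-1}}\right)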

# book
book_example = pd.read_parquet('../optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
book_example
        time_id  seconds_in_bucket  bid_price1  ask_price1  bid_price2  ask_price2  bid_size1  ask_size1  bid_size2  ask_size2
0             5                  0    1.001422    1.002301    1.001370    1.002353          3        226          2        100
1             5                  1    1.001422    1.002301    1.001370    1.002353          3        100          2        100
2             5                  5    1.001422    1.002301    1.001370    1.002405          3        100          2        100
3             5                  6    1.001422    1.002301    1.001370    1.002405          3        126          2        100
4             5                  7    1.001422    1.002301    1.001370    1.002405          3        126          2        100
...         ...                ...         ...         ...         ...         ...        ...        ...        ...        ...
917548    32767                568    0.998275    0.998754    0.997796    0.998946         90         90         48         28
917549    32767                569    0.998275    0.998754    0.997892    0.998946         91         90        200         28
917550    32767                571    0.998275    0.998754    0.997892    0.998946         91         90        100         28
917551    32767                572    0.998275    0.998754    0.997892    0.998946         92         90        100         28
917552    32767                582    0.998275    0.998754    0.998179    0.998946         92         90         26         28

917553 rows × 10 columns

# trade
trade_example = pd.read_parquet("../optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0")
trade_example
        time_id  seconds_in_bucket     price  size  order_count
0             5                 21  1.002301   326           12
1             5                 46  1.002778   128            4
2             5                 50  1.002818    55            1
3             5                 57  1.003155   121            5
4             5                 68  1.003646     4            1
...         ...                ...       ...   ...          ...
123438    32767                471  0.998659   200            3
123439    32767                517  0.998515    90            1
123440    32767                523  0.998563     1            1
123441    32767                542  0.998803    90            4
123442    32767                567  0.998547   300            3

123443 rows × 5 columns

# submission
sample = pd.read_csv("../optiver-realized-volatility-prediction/sample_submission.csv")
sample
  row_id    target
0    0-4  0.003048
1   0-32  0.003048
2   0-34  0.003048

2.Data Preprocessing and Exploratory Analysis

for col in train.columns:
    print(col, ":", len(train[col].unique()))
stock_id : 112
time_id : 3830
target : 414287
stock = train.groupby("stock_id")["target"].agg(["mean", "median", "std", "count", "sum"]).reset_index()
stock
     stock_id      mean    median       std  count        sum
0           0  0.004028  0.003180  0.002855   3830  15.429071
1           1  0.004362  0.003719  0.002433   3830  16.704962
2           2  0.002385  0.001748  0.002339   3830   9.133223
3           3  0.006182  0.005422  0.003201   3830  23.675414
4           4  0.004197  0.003397  0.002879   3830  16.073408
..        ...       ...       ...       ...    ...        ...
107       122  0.003762  0.003176  0.002300   3830  14.407997
108       123  0.002506  0.001967  0.001816   3830   9.599594
109       124  0.003691  0.003143  0.001953   3830  14.135579
110       125  0.002016  0.001575  0.001703   3830   7.719409
111       126  0.005321  0.004443  0.003179   3830  20.377571

112 rows × 6 columns

print("mean value=" ,stock["mean"].mean())
plt.hist(stock["mean"])
plt.show()
mean value= 0.0038805243039130312

[Figure: histogram of the per-stock mean target]

print("sum value=" ,stock["sum"].mean())
plt.hist(stock["sum"])
plt.show()
sum value= 14.860998424142858

[Figure: histogram of the per-stock target sum]

book_test = book_example[book_example["time_id"] == 5]
book_test
     time_id  seconds_in_bucket  bid_price1  ask_price1  bid_price2  ask_price2  bid_size1  ask_size1  bid_size2  ask_size2
0          5                  0    1.001422    1.002301    1.001370    1.002353          3        226          2        100
1          5                  1    1.001422    1.002301    1.001370    1.002353          3        100          2        100
2          5                  5    1.001422    1.002301    1.001370    1.002405          3        100          2        100
3          5                  6    1.001422    1.002301    1.001370    1.002405          3        126          2        100
4          5                  7    1.001422    1.002301    1.001370    1.002405          3        126          2        100
..       ...                ...         ...         ...         ...         ...        ...        ...        ...        ...
297        5                585    1.003129    1.003749    1.003025    1.003801        100          3         26          3
298        5                586    1.003129    1.003749    1.002612    1.003801        100          3          2          3
299        5                587    1.003129    1.003749    1.003025    1.003801        100          3         26          3
300        5                588    1.003129    1.003749    1.002612    1.003801        100          3          2          3
301        5                593    1.003129    1.003749    1.003025    1.003801        100          3         26          3

302 rows × 10 columns

samples = ["bid_price1", "bid_price2", "ask_price1", "ask_price2"]

for num, a in enumerate(samples):
    plt.figure(figsize=(20,5))
    plt.subplot(4,1,num+1)
    plt.plot(book_test["seconds_in_bucket"], book_test[a])
    plt.title(a)
    
plt.show()
plt.figure(figsize=(20,5))

for num,a in enumerate(samples):
    plt.plot(book_test["seconds_in_bucket"],book_test[a],label=a)
    
plt.legend(fontsize=12)
plt.show()

[Figures: bid/ask price levels for time_id 5, plotted separately and overlaid]

trade_test = trade_example[trade_example["time_id"]==5]
trade_test.head(5)
   time_id  seconds_in_bucket     price  size  order_count
0        5                 21  1.002301   326           12
1        5                 46  1.002778   128            4
2        5                 50  1.002818    55            1
3        5                 57  1.003155   121            5
4        5                 68  1.003646     4            1
plt.figure(figsize=(20,5))

for num,a in enumerate(samples):
    plt.plot(book_test["seconds_in_bucket"],book_test[a],label=a)
    
plt.scatter(trade_test["seconds_in_bucket"],trade_test["price"],label="trade_parquet",lw=7)
plt.legend(fontsize=12)
<matplotlib.legend.Legend at 0x7efe641c4750>

[Figure: bid/ask prices with trade prices overlaid, time_id 5]

# Time bucket with the lowest volatility
stock0 = train[train["stock_id"]==0]
min_index = stock0["target"].idxmin()   # idxmin returns an index label, so use .loc
min_time_id = stock0.loc[min_index]["time_id"]
print("min index is",min_time_id,"and min target is",stock0.loc[min_index]["target"])
min index is 24253.0 and min target is 0.000593833
book_test_min = book_example[book_example["time_id"]==min_time_id]
trade_test_min = trade_example[trade_example["time_id"]==min_time_id]


plt.figure(figsize=(20,5))

for num,a in enumerate(samples): 
    plt.plot(book_test_min["seconds_in_bucket"],book_test_min[a],label=a)
    
plt.scatter(trade_test_min["seconds_in_bucket"],trade_test_min["price"],label="trade_parquet",lw=7)
plt.legend(fontsize=12)
plt.show()

[Figure: book prices and trades in the minimum-volatility bucket]

# Time bucket with the highest volatility
stock0 = train[train["stock_id"]==0]
max_index = stock0["target"].idxmax()   # idxmax returns an index label, so use .loc
max_time_id = stock0.loc[max_index]["time_id"]
print("max index is",max_time_id,"and max target is",stock0.loc[max_index]["target"])
max index is 19725.0 and max target is 0.036311154
book_test_max = book_example[book_example["time_id"]==max_time_id]
trade_test_max = trade_example[trade_example["time_id"]==max_time_id]


plt.figure(figsize=(20,5))

for num,a in enumerate(samples):
    plt.plot(book_test_max["seconds_in_bucket"],book_test_max[a],label=a)
    
plt.scatter(trade_test_max["seconds_in_bucket"],trade_test_max["price"],label="trade_parquet",lw=7)
plt.legend(fontsize=12)
plt.show()

[Figure: book prices and trades in the maximum-volatility bucket]

plt.figure(figsize=(20,5))
plt.scatter(trade_test_min["seconds_in_bucket"],trade_test_min["price"],lw=10,label="min_vol_time")
plt.scatter(trade_test_max["seconds_in_bucket"],trade_test_max["price"],lw=10,label = "max_vol_time")
plt.legend(fontsize=15)
plt.show()

[Figure: trade prices of the min- and max-volatility buckets on the same scale]

Comparing the two buckets on the same scale, the difference in volatility is immediately apparent.

3.Feature Engineering

from scipy.stats import probplot  # needed for the probability plot below

def visualize_target(target):
    
    print(f'{target}\n{"-" * len(target)}')
        
    print(f'Mean: {df_train[target].mean():.4f}  -  Median: {df_train[target].median():.4f}  -  Std: {df_train[target].std():.4f}')
    print(f'Min: {df_train[target].min():.4f}  -  25%: {df_train[target].quantile(0.25):.4f}  -  50%: {df_train[target].quantile(0.5):.4f}  -  75%: {df_train[target].quantile(0.75):.4f}  -  Max: {df_train[target].max():.4f}')
    print(f'Skew: {df_train[target].skew():.4f}  -  Kurtosis: {df_train[target].kurtosis():.4f}')
    missing_values_count = df_train[df_train[target].isnull()].shape[0]
    training_samples_count = df_train.shape[0]
    print(f'Missing Values: {missing_values_count}/{training_samples_count} ({missing_values_count * 100 / training_samples_count:.4f}%)')

    fig, axes = plt.subplots(ncols=2, figsize=(24, 8), dpi=100)
    sns.kdeplot(df_train[target], label=target, fill=True, ax=axes[0])
    axes[0].axvline(df_train[target].mean(), label=f'{target} Mean', color='r', linewidth=2, linestyle='--')
    axes[0].axvline(df_train[target].median(), label=f'{target} Median', color='b', linewidth=2, linestyle='--')
    probplot(df_train[target], plot=axes[1])
    axes[0].legend(prop={'size': 16})
    
    for i in range(2):
        axes[i].tick_params(axis='x', labelsize=12.5, pad=10)
        axes[i].tick_params(axis='y', labelsize=12.5, pad=10)
        axes[i].set_xlabel('')
        axes[i].set_ylabel('')
    axes[0].set_title(f'{target} Distribution in Training Set', fontsize=20, pad=15)
    axes[1].set_title(f'{target} Probability Plot', fontsize=20, pad=15)

    plt.show()

visualize_target('target')
target
------
Mean: 0.0039  -  Median: 0.0030  -  Std: 0.0029
Min: 0.0001  -  25%: 0.0020  -  50%: 0.0030  -  75%: 0.0047  -  Max: 0.0703
Skew: 2.8226  -  Kurtosis: 14.9611
Missing Values: 0/428932 (0.0000%)

[Figure: target distribution (KDE) and probability plot]

def log_return(x):
    return np.log(x).diff()

def realized_volatility(x):
    return np.sqrt(np.sum(log_return(x) ** 2))
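These two helpers implement the definitions used throughout this post: within a bucket, the realized volatility of a series x_t is

\sigma = \sqrt{\sum_t \left(\log\frac{x_t}{x_{t-1}}\right)^{2}},

and the loop below also builds the level-1 and level-2 weighted average prices

WAP_1 = \frac{BidPrice_1 \cdot AskSize_1 + AskPrice_1 \cdot BidSize_1}{BidSize_1 + AskSize_1}, \qquad WAP_2 = \frac{BidPrice_2 \cdot AskSize_2 + AskPrice_2 \cdot BidSize_2}{BidSize_2 + AskSize_2}.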


for stock_id in tqdm(sorted(df_train['stock_id'].unique())):

    df_book = read_book_data('train', stock_id)

    # Bid/ask price ratio feature, aggregated per time_id
    df_book['bid_ask_price_ratio'] = df_book['bid_price1'] / df_book['ask_price1']
    for agg in ['mean', 'std', 'min', 'max', realized_volatility]:
        bid_ask_price_ratio_aggregation = df_book.groupby('time_id')['bid_ask_price_ratio'].agg(agg)
        feature_name = agg.__name__ if callable(agg) else agg
        df_train.loc[df_train['stock_id'] == stock_id, f'book_bid_ask_price_ratio_{feature_name}'] = df_train[df_train['stock_id'] == stock_id]['time_id'].map(bid_ask_price_ratio_aggregation)
        
    # Weighted average prices (WAP) from the two book levels
    df_book['wap1'] = (df_book['bid_price1'] * df_book['ask_size1'] + df_book['ask_price1'] * df_book['bid_size1']) /\
                      (df_book['bid_size1'] + df_book['ask_size1'])
    df_book['wap2'] = (df_book['bid_price2'] * df_book['ask_size2'] + df_book['ask_price2'] * df_book['bid_size2']) /\
                      (df_book['bid_size2'] + df_book['ask_size2'])

    for wap in [1, 2]:
        for agg in ['mean', 'std', 'min', 'max', realized_volatility]:
            wap_aggregation = df_book.groupby('time_id')[f'wap{wap}'].agg(agg)
            feature_name = agg.__name__ if callable(agg) else agg
            df_train.loc[df_train['stock_id'] == stock_id, f'wap{wap}_{feature_name}'] = df_train[df_train['stock_id'] == stock_id]['time_id'].map(wap_aggregation)

  0%|          | 0/112 [00:00<?, ?it/s]
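The loop above calls read_book_data, which is never defined in this post. Something along these lines would match how it is used (the signature and path are my guess):

def read_book_data(dataset, stock_id):
    # Assumed helper: load the order book partition of one stock;
    # 'dataset' is 'train' or 'test', matching the folder layout listed in section 1.
    path = f'../optiver-realized-volatility-prediction/book_{dataset}.parquet/stock_id={stock_id}'
    return pd.read_parquet(path)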
def visualize_continuous_feature(continuous_feature):
            
    print(f'{continuous_feature}\n{"-" * len(continuous_feature)}')

    print(f'Training Mean: {float(df_train[continuous_feature].mean()):.4}  - Training Median: {float(df_train[continuous_feature].median()):.4} - Training Std: {float(df_train[continuous_feature].std()):.4}')
    print(f'Training Min: {float(df_train[continuous_feature].min()):.4}  - Training Max: {float(df_train[continuous_feature].max()):.4}')
    print(f'Training Skew: {float(df_train[continuous_feature].skew()):.4}  - Training Kurtosis: {float(df_train[continuous_feature].kurtosis()):.4}')

    fig, axes = plt.subplots(ncols=2, figsize=(24, 6), dpi=100, constrained_layout=True)
    title_size = 18
    label_size = 18

    sns.kdeplot(df_train[continuous_feature], label='Training', fill=True, ax=axes[0])
    axes[0].set_xlabel('')
    axes[0].tick_params(axis='x', labelsize=label_size)
    axes[0].tick_params(axis='y', labelsize=label_size)
    axes[0].legend()
    axes[0].set_title(f'{continuous_feature} Distribution in Training Set', size=title_size, pad=title_size)
    
    sns.scatterplot(x=df_train[continuous_feature], y=df_train['target'], ax=axes[1])
    axes[1].set_title(f'{continuous_feature} vs target', size=title_size, pad=title_size)
    axes[1].set_xlabel('')
    axes[1].set_ylabel('')
    axes[1].tick_params(axis='x', labelsize=label_size)
    axes[1].tick_params(axis='y', labelsize=label_size)
    
    plt.show()
    
    
for continuous_feature in df_train.columns[6:]:
    visualize_continuous_feature(continuous_feature)

book_bid_ask_price_ratio_mean
-----------------------------
Training Mean: 0.9993  - Training Median: 0.9996 - Training Std: 0.000889
Training Min: 0.9813  - Training Max: 1.0
Training Skew: -3.975  - Training Kurtosis: 32.48

[Figure: distribution and scatter vs. target]

book_bid_ask_price_ratio_std
----------------------------
Training Mean: 0.0002196  - Training Median: 0.0001479 - Training Std: 0.0002727
Training Min: 0.0  - Training Max: 0.01006
Training Skew: 5.675  - Training Kurtosis: 64.22

[Figure: distribution and scatter vs. target]

book_bid_ask_price_ratio_min
----------------------------
Training Mean: 0.9986  - Training Median: 0.9991 - Training Std: 0.001899
Training Min: 0.9522  - Training Max: 0.9999
Training Skew: -4.826  - Training Kurtosis: 44.94

[Figure: distribution and scatter vs. target]

book_bid_ask_price_ratio_max
----------------------------
Training Mean: 0.9998  - Training Median: 0.9998 - Training Std: 0.0003422
Training Min: 0.992  - Training Max: 1.0
Training Skew: -3.949  - Training Kurtosis: 30.06

[Figure: distribution and scatter vs. target]

book_bid_ask_price_ratio_realized_volatility
--------------------------------------------
Training Mean: 0.002893  - Training Median: 0.002083 - Training Std: 0.002879
Training Min: 0.0  - Training Max: 0.06794
Training Skew: 4.321  - Training Kurtosis: 35.27

[Figure: distribution and scatter vs. target]

wap1_mean
---------
Training Mean: 1.0  - Training Median: 1.0 - Training Std: 0.00337
Training Min: 0.9173  - Training Max: 1.079
Training Skew: -0.2131  - Training Kurtosis: 29.89

[Figure: distribution and scatter vs. target]

wap1_std
--------
Training Mean: 0.00111  - Training Median: 0.0008073 - Training Std: 0.001052
Training Min: 1.631e-05  - Training Max: 0.03437
Training Skew: 4.461  - Training Kurtosis: 42.59

[Figure: distribution and scatter vs. target]

wap1_min
--------
Training Mean: 0.9977  - Training Median: 0.9985 - Training Std: 0.003891
Training Min: 0.8831  - Training Max: 1.049
Training Skew: -3.839  - Training Kurtosis: 43.12

[Figure: distribution and scatter vs. target]

wap1_max
--------
Training Mean: 1.002  - Training Median: 1.001 - Training Std: 0.003804
Training Min: 0.9479  - Training Max: 1.127
Training Skew: 3.534  - Training Kurtosis: 37.32

[Figure: distribution and scatter vs. target]

wap1_realized_volatility
------------------------
Training Mean: 0.004233  - Training Median: 0.003159 - Training Std: 0.003586
Training Min: 8.066e-05  - Training Max: 0.08642
Training Skew: 3.386  - Training Kurtosis: 22.65

[Figure: distribution and scatter vs. target]

wap2_mean
---------
Training Mean: 1.0  - Training Median: 1.0 - Training Std: 0.003374
Training Min: 0.9173  - Training Max: 1.079
Training Skew: -0.2179  - Training Kurtosis: 29.88

[Figure: distribution and scatter vs. target]

wap2_std
--------
Training Mean: 0.001149  - Training Median: 0.0008419 - Training Std: 0.001065
Training Min: 1.155e-06  - Training Max: 0.03441
Training Skew: 4.397  - Training Kurtosis: 41.1

[Figure: distribution and scatter vs. target]

wap2_min
--------
Training Mean: 0.9976  - Training Median: 0.9984 - Training Std: 0.003933
Training Min: 0.8815  - Training Max: 1.044
Training Skew: -3.873  - Training Kurtosis: 42.63

[Figure: distribution and scatter vs. target]

wap2_max
--------
Training Mean: 1.002  - Training Median: 1.002 - Training Std: 0.003849
Training Min: 0.9482  - Training Max: 1.126
Training Skew: 3.569  - Training Kurtosis: 36.6

[Figure: distribution and scatter vs. target]

wap2_realized_volatility
------------------------
Training Mean: 0.005808  - Training Median: 0.004347 - Training Std: 0.005026
Training Min: 7.879e-06  - Training Max: 0.1359
Training Skew: 3.591  - Training Kurtosis: 25.17

[Figure: distribution and scatter vs. target]

fig = plt.figure(figsize=(16, 16), dpi=100)
sns.heatmap(
    df_train[['target'] + df_train.columns[6:].tolist()].corr(),
    annot=True,
    square=True,
    cmap='coolwarm',
    annot_kws={'size': 11},
    fmt='.2f'
)
plt.tick_params(axis='x', labelsize=10, rotation=90)
plt.tick_params(axis='y', labelsize=10, rotation=0)
plt.title('Target and Feature Correlations', size=20, pad=20)
plt.show()

[Figure: correlation heatmap of target and features]

stock_features = [] 

# Loop over the realized-volatility columns; a plain loop variable name is used so the
# realized_volatility() helper defined above is not shadowed. (These realized_volatility_from_*
# columns come from a fuller feature set than the one built in the excerpt above.)
for rv_column in ['target', 'realized_volatility_from_wap1', 'realized_volatility_from_wap2', 'realized_volatility_from_price']:
    for agg in ['mean', 'std', 'min', 'max']:
        df_train[f'stock_{rv_column}_{agg}'] = df_train.groupby('stock_id')[rv_column].transform(agg)
        stock_features.append(f'stock_{rv_column}_{agg}')
        
df_stocks = df_train.groupby('stock_id')[stock_features].first().reset_index()
df_train.drop(columns=stock_features, inplace=True)
from sklearn.cluster import KMeans  # clustering/PCA imports not shown earlier
from sklearn.decomposition import PCA

kmeans = KMeans(n_clusters=6, random_state=42)
kmeans.fit(df_stocks[stock_features])

pca = PCA(n_components=2)
stocks_2d = pca.fit_transform(df_stocks[stock_features])

fig, ax = plt.subplots(figsize=(32, 10))
ax.scatter(stocks_2d[:, 0], stocks_2d[:, 1], s=200, c=kmeans.labels_, cmap='RdBu')
for idx, stock_id in enumerate(df_stocks['stock_id'].values):
    ax.annotate(stock_id, (stocks_2d[idx, 0], stocks_2d[idx, 1]), fontsize=20)
    
ax.tick_params(axis='x', labelsize=20, pad=10)
ax.tick_params(axis='y', labelsize=20, pad=10)
ax.set_title('Stock Clusters', size=25, pad=20)

plt.show()
D:\Anaconda\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
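As the warning itself suggests, it can be avoided by capping the OpenMP thread count before scikit-learn is imported:

import os
os.environ['OMP_NUM_THREADS'] = '1'  # set this before importing sklearn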

[Figure: 2-D PCA projection of the stock clusters]

4.Naive

import os
from sklearn.metrics import r2_score
import glob
list_order_book_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')
def realized_volatility_per_time_id(file_path, prediction_column_name):
    df_book_data = pd.read_parquet(file_path)
    df_book_data['wap'] =(df_book_data['bid_price1'] * df_book_data['ask_size1']+df_book_data['ask_price1'] * df_book_data['bid_size1'])  / (
                                      df_book_data['bid_size1']+ df_book_data[
                                  'ask_size1'])
    df_book_data['log_return'] = df_book_data.groupby(['time_id'])['wap'].apply(log_return)
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    df_realized_vol_per_stock =  pd.DataFrame(df_book_data.groupby(['time_id'])['log_return'].agg(realized_volatility)).reset_index()
    df_realized_vol_per_stock = df_realized_vol_per_stock.rename(columns = {'log_return':prediction_column_name})
    stock_id = file_path.split('=')[1]
    df_realized_vol_per_stock['row_id'] = df_realized_vol_per_stock['time_id'].apply(lambda x:f'{stock_id}-{x}')
    return df_realized_vol_per_stock[['row_id',prediction_column_name]]
def past_realized_volatility_per_stock(list_file,prediction_column_name):
    df_past_realized = pd.DataFrame()
    for file in list_file:
        df_past_realized = pd.concat([df_past_realized,
                                     realized_volatility_per_time_id(file,prediction_column_name)])
    return df_past_realized
df_past_realized_train = past_realized_volatility_per_stock(list_file=list_order_book_file_train,
                                                           prediction_column_name='pred')
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]
df_joined = train.merge(df_past_realized_train[['row_id','pred']], on = ['row_id'], how = 'left')
from sklearn.metrics import r2_score
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))
R2 = round(r2_score(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
RMSPE = round(rmspe(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')
Performance of the naive prediction: R2 score: 0.628, RMSPE: 0.341
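For reference, the rmspe function above computes the root mean squared percentage error,

RMSPE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\frac{y_i - \hat{y}_i}{y_i}\right)^{2}},

so simply reusing the current window's realized volatility as the prediction already reaches R2 ≈ 0.63.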

5.GARCH
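The code in this section reuses order_book_train_files and calculate_wap, neither of which is defined in the excerpt above. They presumably look roughly like this (a sketch under that assumption):

# Assumed setup for this section: list of per-stock book folders and a level-1 WAP helper.
order_book_train_files = glob.glob('../optiver-realized-volatility-prediction/book_train.parquet/*')

def calculate_wap(df):
    # Level-1 weighted average price, same formula as the 'wap' column used earlier
    return (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / \
           (df['bid_size1'] + df['ask_size1'])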

def abs_log_returns_topn(list_file, n_top):
    '''Returns log returns for top n stocks from shuffled stock list'''
    df_list = []
    for stock_file in list_file[:n_top]:
        df_book_data = pd.read_parquet(stock_file)
        df_book_data['wap'] = calculate_wap(df_book_data) 
        df_book_data['log_return'] = df_book_data.groupby(['time_id'])['wap'].apply(log_return)
        df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
        df_book_data['log_return'] = df_book_data['log_return'].apply(abs)
        df_book_data['stock_id'] = int(stock_file.split('=')[1])
        df_list.append(df_book_data)
    df = pd.concat(df_list, ignore_index=True)
    return df
# Shuffle the stock files so the sample below is random
random.shuffle(order_book_train_files)

# Absolute log returns for a sample of 5 stocks
df_log_returns = abs_log_returns_topn(order_book_train_files, 5)

selected_stocks = df_log_returns['stock_id'].unique()
print('Stock Ids:', ', '.join(str(s) for s in selected_stocks))
Stock Ids: 82, 80, 48, 50, 89
df_log_returns['stock_id'].value_counts()
50    2144862
80    1429737
89    1282438
48    1244721
82    1039429
Name: stock_id, dtype: int64
def plot_auto_correlation(series, lags=30, stock=''):
    plt.rcParams["figure.figsize"] = 20, 5
    fig, axes = plt.subplots(1, 2)
    acf = plot_acf(series, lags=lags, ax = axes[0])
    pacf = plot_pacf(series, lags=lags, ax = axes[1])
    acf.suptitle(f'Autocorrelation and Partial Autocorrelation - stock {stock}', fontsize=20)
    plt.show()
for stock_id in selected_stocks:
    df_log_returns_stock = df_log_returns[df_log_returns['stock_id'] == stock_id]
    random_time_id = random.choice(df_log_returns_stock['time_id'].unique())
    df_time_slice = df_log_returns_stock[df_log_returns_stock['time_id'] == random_time_id]
    plot_auto_correlation(df_time_slice['log_return'], stock=stock_id)

[Figures: ACF and PACF of absolute log returns for each sampled stock]
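The grid search below picks an ARIMA(p, d, q) order per stock by minimising the AIC. For reference, with y_t the d-times differenced series, the fitted model is

y_t = c + \sum_{i=1}^{p} \phi_i\, y_{t-i} + \sum_{j=1}^{q} \theta_j\, \varepsilon_{t-j} + \varepsilon_t.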

from itertools import product  # (p, d, q) grid over {2, 1, 0}

arima_order = []
for stock_id in selected_stocks:
    for order_seq in product((2, 1, 0), repeat=3):
        # ARIMA(0,0,0) is just white noise, skip it
        if order_seq == (0, 0, 0):
            continue
        try:
            df_log_returns_stock = df_log_returns[df_log_returns['stock_id'] == stock_id]
            # Fit the ARIMA model on a randomly chosen time bucket
            random_time_id = random.choice(df_log_returns_stock['time_id'].unique())
            df_time_slice = df_log_returns_stock[df_log_returns_stock['time_id'] == random_time_id]
            arima_model = ARIMA(df_time_slice['log_return'], order = order_seq)
            results_ARIMA = arima_model.fit(disp=-1)
            arima_order.append([stock_id, order_seq, results_ARIMA.aic])
            # print(f'{order_seq} AIC: {results_ARIMA.aic}')
        except:
            pass
df_arima_order = pd.DataFrame(arima_order, columns=['stock_id', 'order', 'aic'])
df_arima_order_min = pd.merge(df_arima_order.groupby('stock_id')['aic'].min().reset_index(),
                          df_arima_order, 
                          how='left', 
                          on=['stock_id','aic'])
df_arima_order_min
   stock_id          aic      order
0        48 -7214.905180  (0, 1, 0)
1        50 -9740.828208  (2, 1, 0)
2        80 -7425.750374  (1, 1, 2)
3        82 -5490.511465  (0, 0, 1)
4        89 -7134.379857  (1, 0, 2)
def plot_resid_sqr(series, stock_id='', time_id=''):
    plt.rcParams["figure.figsize"] = 20, 5
    plt.plot(series.values)
    plt.title(f'Squared Residual Plot - stock {stock_id} - time_id {time_id}', fontsize=14)
    plt.ylabel('Squared Residual', fontsize=14)
    plt.show()
for idx, row in df_arima_order_min.iterrows():
    stock_id = row['stock_id']
    df_log_returns_stock = df_log_returns[df_log_returns['stock_id'] == stock_id]
    # Fit the ARIMA model on a randomly chosen time bucket
    random_time_id = random.choice(df_log_returns_stock['time_id'].unique())
    df_time_slice = df_log_returns_stock[df_log_returns_stock['time_id'] == random_time_id]
    arima_model = ARIMA(df_time_slice['log_return'], order = row['order'])
    results_ARIMA = arima_model.fit(disp=-1)
    sqr_resid = np.power(results_ARIMA.resid, 2)
    plot_resid_sqr(sqr_resid, stock_id, random_time_id)

[Figures: squared ARIMA residuals for each stock]

for idx, row in df_arima_order_min.iterrows():
    stock_id = row['stock_id']
    df_log_returns_stock = df_log_returns[df_log_returns['stock_id'] == stock_id]
    # Slicing random time period to train ARIMA model
    random_time_id = random.choice(df_log_returns_stock['time_id'].unique())
    df_time_slice = df_log_returns_stock[df_log_returns_stock['time_id'] == random_time_id]
    arima_model = ARIMA(df_time_slice['log_return'], order = row['order'])
    results_ARIMA = arima_model.fit(disp=-1)
    sqr_resid = np.power(results_ARIMA.resid, 2)
    plot_auto_correlation(sqr_resid, stock=stock_id)

[Figures: ACF and PACF of squared ARIMA residuals for each stock]
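Note that despite the section heading, the excerpt above only fits ARIMA models and inspects their squared residuals; no GARCH model is actually estimated here. A minimal sketch of that next step with the arch package (my addition, assuming a return series such as one df_time_slice['log_return'] from the loops above) could be:

from arch import arch_model

returns = df_time_slice['log_return'] * 1000   # rescale: arch warns when values are tiny
garch = arch_model(returns, mean='Zero', vol='GARCH', p=1, q=1)
garch_fit = garch.fit(disp='off')
print(garch_fit.summary())
print(garch_fit.forecast(horizon=1).variance.iloc[-1])   # one-step-ahead variance forecast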

# Define how the order book data is turned into features
def wap1(df):
    wap=(df['bid_price1']*df['ask_size1']+df['ask_price1']*df['bid_size1'])/(df['bid_size1']+df['ask_size1'])
    return wap

def wap2(df):
    wap=(df['bid_price2']*df['ask_size2']+df['ask_price2']*df['bid_size2'])/(df['bid_size2']+df['ask_size2'])
    return wap

def count_unique(series):
    return len(np.unique(series))

def book_prep(path):
    
    df=pd.read_parquet(path)
    df['wap']=wap1(df)
    df['log_return']=df.groupby('time_id')['wap'].apply(log_return)
    df['wap2']=wap2(df)
    df['log_return2']=df.groupby('time_id')['wap2'].apply(log_return)
    df['price_spread']=(df['ask_price1']-df['bid_price1'])/(df['ask_price1']+df['bid_price1'])
    df['price_spread2']=(df['ask_price2']-df['bid_price2'])/(df['ask_price2']+df['bid_price2'])
    df['bid_spread']=df['bid_price1']-df['bid_price2']
    df['ask_spread']=df['ask_price1']-df['ask_price2']
    df['total_volume']=df['ask_size1'] + df['ask_size2'] + df['bid_size1'] + df['bid_size2']
    df['volume_imbalance']=abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    
    # Aggregation dictionary: statistics to compute per time_id
    create_feature_dict = {
        'log_return':[realized_volatility],
        'log_return2':[realized_volatility],
        'wap':[np.mean],
        'wap2':[np.mean],
        'price_spread':[np.mean],
        'price_spread2':[np.mean],
        'bid_spread':[np.mean],
        'ask_spread':[np.mean],
        'volume_imbalance':[np.mean],
        'total_volume':[np.mean],
            }
    # With the raw book features built, aggregate them over each time_id
    df_feature=pd.DataFrame(df.groupby(['time_id']).agg(create_feature_dict)).reset_index()
    
    # Flatten the MultiIndex columns (time_id becomes time_id_)
    df_feature.columns=['_'.join(col) for col in df_feature.columns] 
    
    # Recompute the same aggregations on the tail of each bucket
    # (seconds_in_bucket >= 450 / 300 / 150, i.e. the last 150 / 300 / 450 seconds)
    last_seconds=[150,300,450]
    for second in last_seconds:
        second=600-second
        df_feature_sec=pd.DataFrame(df.query(f'seconds_in_bucket >={second}').groupby(['time_id']).agg(create_feature_dict)).reset_index()
        df_feature_sec.columns = ['_'.join(col) for col in df_feature_sec.columns]
        df_feature_sec=df_feature_sec.add_suffix('_'+str(second))
        df_feature=pd.merge(df_feature,df_feature_sec,how='left',left_on='time_id_',right_on=f'time_id__{second}')
        df_feature=df_feature.drop([f'time_id__{second}'],axis=1)
        
    # Build row_id as stock_id-time_id
    stock_id=path.split('=')[1]
    df_feature['row_id']=df_feature['time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature=df_feature.drop(['time_id_'],axis=1)
    return df_feature
# Check what the processed output looks like and how long it takes

%%time
data_dir = '../optiver-realized-volatility-prediction/'
path = data_dir + "book_train.parquet/stock_id=0"
book_prep(path)
CPU times: total: 6.17 s
Wall time: 6.03 s
(Output truncated: one row per time_id with columns log_return_realized_volatility, log_return2_realized_volatility, wap_mean, wap2_mean, price_spread_mean, price_spread2_mean, bid_spread_mean, ask_spread_mean, volume_imbalance_mean, total_volume_mean, the same statistics recomputed on the tail windows with _450/_300/_150 suffixes, and row_id.)

3830 rows × 41 columns

look=pd.read_parquet('../optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0')
look
        time_id  seconds_in_bucket     price  size  order_count
0             5                 21  1.002301   326           12
1             5                 46  1.002778   128            4
2             5                 50  1.002818    55            1
3             5                 57  1.003155   121            5
4             5                 68  1.003646     4            1
...         ...                ...       ...   ...          ...
123438    32767                471  0.998659   200            3
123439    32767                517  0.998515    90            1
123440    32767                523  0.998563     1            1
123441    32767                542  0.998803    90            4
123442    32767                567  0.998547   300            3

123443 rows × 5 columns

# Build features from the trade data
def trade_prep(path):
    df = pd.read_parquet(path)
    df['log_return'] = df.groupby('time_id')['price'].apply(log_return)
    
    
    aggregate_dictionary = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.mean],
    }
    
    df_feature = df.groupby('time_id').agg(aggregate_dictionary).reset_index()
    
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]

    
    
    last_seconds = [150,300,450]
    
    for second in last_seconds:
        second = 600 - second
    
        df_feature_sec = df.query(f'seconds_in_bucket >= {second}').groupby('time_id').agg(aggregate_dictionary)
        df_feature_sec = df_feature_sec.reset_index()
        
        df_feature_sec.columns = ['_'.join(col) for col in df_feature_sec.columns]
        df_feature_sec = df_feature_sec.add_suffix('_' + str(second))
        
        df_feature = pd.merge(df_feature,df_feature_sec,how='left',left_on='time_id_',right_on=f'time_id__{second}')
        df_feature = df_feature.drop([f'time_id__{second}'],axis=1)
    
    df_feature = df_feature.add_prefix('trade_')
    stock_id = path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature = df_feature.drop(['trade_time_id_'],axis=1)
    
    return df_feature
%%time
data_dir = '../optiver-realized-volatility-prediction/'
path = data_dir + "trade_train.parquet/stock_id=0"
trade_prep(path)
CPU times: total: 3.03 s
Wall time: 3.03 s
[trade feature preview omitted — columns: trade_log_return_realized_volatility, trade_seconds_in_bucket_count_unique, trade_size_sum, trade_order_count_mean, the same four with _450 / _300 / _150 window suffixes, and row_id]

3830 rows × 17 columns

Joblib

from joblib import Parallel, delayed

def preprocessor(list_stock_ids, is_train=True):

    def for_joblib(stock_id):
        # Build book and trade features for a single stock and join them on row_id
        if is_train:
            file_path_book = data_dir + "book_train.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "trade_train.parquet/stock_id=" + str(stock_id)
        else:
            file_path_book = data_dir + "book_test.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "trade_test.parquet/stock_id=" + str(stock_id)

        return pd.merge(book_prep(file_path_book), trade_prep(file_path_trade), on='row_id', how='left')

    # Process the stocks in parallel, then stack the per-stock frames
    dfs = Parallel(n_jobs=-1, verbose=1)(
        delayed(for_joblib)(stock_id) for stock_id in list_stock_ids
    )
    return pd.concat(dfs, ignore_index=True)
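For readers unfamiliar with joblib, Parallel/delayed above simply maps for_joblib over the stock ids across worker processes. A minimal standalone sketch of the same pattern (toy function, not part of the pipeline):
from joblib import Parallel, delayed

def square(x):
    # stand-in for the per-stock feature builder
    return x * x

# runs square(0) ... square(4) across up to 4 worker processes
results = Parallel(n_jobs=4)(delayed(square)(i) for i in range(5))
print(results)   # [0, 1, 4, 9, 16]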
  • Train_data
train = pd.read_csv(data_dir + 'train.csv')
train_ids = train.stock_id.unique()
# Build and concatenate features for all training stocks
%%time
df_train = preprocessor(list_stock_ids= train_ids, is_train = True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   48.5s


CPU times: total: 2.05 s
Wall time: 2min 51s


[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed:  2.8min finished
# Merge the features with the target
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]
df_train = train.merge(df_train, on = ['row_id'], how = 'left')
  • Test_data
test = pd.read_csv(data_dir + 'test.csv')
test_ids = test.stock_id.unique()
%%time
df_test = preprocessor(list_stock_ids= test_ids, is_train = False)
CPU times: total: 0 ns
Wall time: 78.8 ms


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
df_test = test.merge(df_test, on = ['row_id'], how = 'left')
from sklearn.model_selection import KFold

# Recover stock_id from row_id ("stock_id-time_id")
df_train['stock_id'] = df_train['row_id'].apply(lambda x: x.split('-')[0])
df_test['stock_id'] = df_test['row_id'].apply(lambda x: x.split('-')[0])

# Target-encode stock_id: the test set uses the mean target over the full training set
stock_id_target_mean = df_train.groupby('stock_id')['target'].mean()
df_test['stock_id_target_enc'] = df_test['stock_id'].map(stock_id_target_mean)

# For the training set, use out-of-fold means so a row is never encoded with its own target
tmp = np.repeat(np.nan, df_train.shape[0])
kf = KFold(n_splits=10, shuffle=True)
for idx_1, idx_2 in kf.split(df_train):
    target_mean = df_train.iloc[idx_1].groupby('stock_id')['target'].mean()
    tmp[idx_2] = df_train['stock_id'].iloc[idx_2].map(target_mean)
df_train['stock_id_target_enc'] = tmp
# Cast stock_id back to int
df_train['stock_id'] = df_train['stock_id'].astype(int)
df_test['stock_id'] = df_test['stock_id'].astype(int)
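The block above is standard out-of-fold target encoding: each training row's stock_id encoding comes only from rows outside its fold, which avoids leaking the row's own target. A minimal toy illustration with made-up data and 2 folds:
# Toy illustration (hypothetical data): out-of-fold target encoding with 2 folds
toy = pd.DataFrame({'stock_id': ['0', '1', '0', '1', '0', '1'],
                    'target':   [0.002, 0.001, 0.004, 0.003, 0.006, 0.005]})
enc = np.repeat(np.nan, len(toy))
for tr_idx, val_idx in KFold(n_splits=2).split(toy):
    # the means are computed on the other fold only
    means = toy.iloc[tr_idx].groupby('stock_id')['target'].mean()
    enc[val_idx] = toy['stock_id'].iloc[val_idx].map(means)
toy['stock_id_target_enc'] = enc
print(toy)   # each row is encoded without using its own target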
X=df_train.drop(['row_id','target'],axis=1)
y=df_train['target']
# Define RMSPE (the competition metric) and its LightGBM eval wrapper
def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

def feval_RMSPE(preds, lgbm_train):
    labels = lgbm_train.get_label()
    return 'RMSPE', round(rmspe(y_true=labels, y_pred=preds), 5), False

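Since the LightGBM objective below is plain rmse, the training loop later weights each sample by 1/y², which makes the sample-weighted mean squared error equal to the mean squared percentage error that RMSPE measures. A quick sanity check with made-up numbers:
# Sanity check (made-up values): weighting squared errors by 1/y_true**2
# turns RMSE into RMSPE, which is why the folds below pass weight = 1/np.square(y)
y_true = np.array([0.002, 0.001, 0.004])
y_hat = np.array([0.0018, 0.0012, 0.0035])
w = 1 / np.square(y_true)
weighted_rmse = np.sqrt(np.mean(w * np.square(y_true - y_hat)))
print(rmspe(y_true, y_hat), weighted_rmse)   # both ≈ 0.148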
# LightGBM hyperparameters
import lightgbm as lgbm   # used below as lgbm.Dataset / lgbm.train

params = {
    "objective": "rmse",
    "metric": "rmse",
    "boosting_type": "gbdt",
    'early_stopping_rounds': 30,
    'learning_rate': 0.01,
    'lambda_l1': 1,
    'lambda_l2': 1,
}
# K-fold cross-validation setup
kf = KFold(n_splits=4, shuffle=True)
oof = pd.DataFrame()   # placeholder for out-of-fold predictions
models = []            # one fitted model per fold
scores = 0.0           # running average of fold RMSPE
# Training loop
for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):

    print("Fold :", fold + 1)

    X_train, y_train = X.loc[trn_idx], y[trn_idx]
    X_valid, y_valid = X.loc[val_idx], y[val_idx]

    # https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/250324
    # Weight samples by 1/target^2 so the rmse objective approximates RMSPE
    weights = 1 / np.square(y_train)
    lgbm_train = lgbm.Dataset(X_train, y_train, weight=weights)

    weights = 1 / np.square(y_valid)
    lgbm_valid = lgbm.Dataset(X_valid, y_valid, reference=lgbm_train, weight=weights)

    # Train the LightGBM model
    model = lgbm.train(params=params,
                       train_set=lgbm_train,
                       valid_sets=[lgbm_train, lgbm_valid],
                       num_boost_round=1000,
                       feval=feval_RMSPE,
                       verbose_eval=200,
                       categorical_feature=['stock_id']
                       )

    # Predict on the validation fold
    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

    # Report the fold's RMSPE
    RMSPE = round(rmspe(y_true=y_valid, y_pred=y_pred), 4)
    print(f'Performance of the prediction: , RMSPE: {RMSPE}')

    # Keep the model and accumulate the average score
    scores += RMSPE / 4
    models.append(model)
    print("*" * 100)
Fold : 1


D:\Anaconda\lib\site-packages\lightgbm\engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
  _log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1705: UserWarning: categorical_feature in Dataset is overridden.
New categorical_feature is ['stock_id']
  _log_warning('categorical_feature in Dataset is overridden.\n'


[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14543
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 58
[LightGBM] [Info] Start training from score 0.001795
Training until validation scores don't improve for 30 rounds


D:\Anaconda\lib\site-packages\lightgbm\basic.py:1433: UserWarning: Overriding the parameters from Reference Dataset.
  _log_warning('Overriding the parameters from Reference Dataset.')
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1245: UserWarning: categorical_column in param dict is overridden.
  _log_warning('{} in param dict is overridden.'.format(cat_alias))


[200]	training's rmse: 0.000533368	training's RMSPE: 0.24722	valid_1's rmse: 0.0005513	valid_1's RMSPE: 0.254
[400]	training's rmse: 0.000499109	training's RMSPE: 0.23134	valid_1's rmse: 0.00052092	valid_1's RMSPE: 0.24
[600]	training's rmse: 0.000492161	training's RMSPE: 0.22812	valid_1's rmse: 0.000516319	valid_1's RMSPE: 0.23788
[800]	training's rmse: 0.000487324	training's RMSPE: 0.22588	valid_1's rmse: 0.000514602	valid_1's RMSPE: 0.23709
[1000]	training's rmse: 0.000483462	training's RMSPE: 0.22409	valid_1's rmse: 0.000513685	valid_1's RMSPE: 0.23667
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.000483462	training's RMSPE: 0.22409	valid_1's rmse: 0.000513685	valid_1's RMSPE: 0.23667
Performance of the prediction: , RMSPE: 0.2367
****************************************************************************************************
Fold : 2


D:\Anaconda\lib\site-packages\lightgbm\engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
  _log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1705: UserWarning: categorical_feature in Dataset is overridden.
New categorical_feature is ['stock_id']
  _log_warning('categorical_feature in Dataset is overridden.\n'


[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049546 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14543
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 58
[LightGBM] [Info] Start training from score 0.001810
Training until validation scores don't improve for 30 rounds


D:\Anaconda\lib\site-packages\lightgbm\basic.py:1433: UserWarning: Overriding the parameters from Reference Dataset.
  _log_warning('Overriding the parameters from Reference Dataset.')
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1245: UserWarning: categorical_column in param dict is overridden.
  _log_warning('{} in param dict is overridden.'.format(cat_alias))


[200]	training's rmse: 0.000534933	training's RMSPE: 0.24681	valid_1's rmse: 0.000542633	valid_1's RMSPE: 0.25343
[400]	training's rmse: 0.000501284	training's RMSPE: 0.23129	valid_1's rmse: 0.000512115	valid_1's RMSPE: 0.23918
[600]	training's rmse: 0.000494241	training's RMSPE: 0.22804	valid_1's rmse: 0.000510521	valid_1's RMSPE: 0.23844
Early stopping, best iteration is:
[577]	training's rmse: 0.000494887	training's RMSPE: 0.22834	valid_1's rmse: 0.000510407	valid_1's RMSPE: 0.23838
Performance of the prediction: , RMSPE: 0.2384
****************************************************************************************************
Fold : 3


D:\Anaconda\lib\site-packages\lightgbm\engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
  _log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1705: UserWarning: categorical_feature in Dataset is overridden.
New categorical_feature is ['stock_id']
  _log_warning('categorical_feature in Dataset is overridden.\n'


[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.050070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14543
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 58
[LightGBM] [Info] Start training from score 0.001799
Training until validation scores don't improve for 30 rounds


D:\Anaconda\lib\site-packages\lightgbm\basic.py:1433: UserWarning: Overriding the parameters from Reference Dataset.
  _log_warning('Overriding the parameters from Reference Dataset.')
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1245: UserWarning: categorical_column in param dict is overridden.
  _log_warning('{} in param dict is overridden.'.format(cat_alias))


[200]	training's rmse: 0.000533172	training's RMSPE: 0.24683	valid_1's rmse: 0.000539364	valid_1's RMSPE: 0.24942
[400]	training's rmse: 0.000499037	training's RMSPE: 0.23102	valid_1's rmse: 0.000509534	valid_1's RMSPE: 0.23563
[600]	training's rmse: 0.000492018	training's RMSPE: 0.22777	valid_1's rmse: 0.000506017	valid_1's RMSPE: 0.234
[800]	training's rmse: 0.000487172	training's RMSPE: 0.22553	valid_1's rmse: 0.000503676	valid_1's RMSPE: 0.23292
[1000]	training's rmse: 0.000483288	training's RMSPE: 0.22373	valid_1's rmse: 0.000502225	valid_1's RMSPE: 0.23225
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.000483288	training's RMSPE: 0.22373	valid_1's rmse: 0.000502225	valid_1's RMSPE: 0.23225
Performance of the prediction: , RMSPE: 0.2322
****************************************************************************************************
Fold : 4


D:\Anaconda\lib\site-packages\lightgbm\engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
  _log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1705: UserWarning: categorical_feature in Dataset is overridden.
New categorical_feature is ['stock_id']
  _log_warning('categorical_feature in Dataset is overridden.\n'


[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044247 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14544
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 58
[LightGBM] [Info] Start training from score 0.001796
Training until validation scores don't improve for 30 rounds


D:\Anaconda\lib\site-packages\lightgbm\basic.py:1433: UserWarning: Overriding the parameters from Reference Dataset.
  _log_warning('Overriding the parameters from Reference Dataset.')
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1245: UserWarning: categorical_column in param dict is overridden.
  _log_warning('{} in param dict is overridden.'.format(cat_alias))


[200]	training's rmse: 0.000533089	training's RMSPE: 0.24704	valid_1's rmse: 0.000543243	valid_1's RMSPE: 0.25045
[400]	training's rmse: 0.000498918	training's RMSPE: 0.2312	valid_1's rmse: 0.000512558	valid_1's RMSPE: 0.2363
[600]	training's rmse: 0.000492014	training's RMSPE: 0.228	valid_1's rmse: 0.000508456	valid_1's RMSPE: 0.23441
[800]	training's rmse: 0.000487206	training's RMSPE: 0.22578	valid_1's rmse: 0.000506189	valid_1's RMSPE: 0.23336
[1000]	training's rmse: 0.000483353	training's RMSPE: 0.22399	valid_1's rmse: 0.000504692	valid_1's RMSPE: 0.23267
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.000483353	training's RMSPE: 0.22399	valid_1's rmse: 0.000504692	valid_1's RMSPE: 0.23267
Performance of the prediction: , RMSPE: 0.2327
****************************************************************************************************
scores
0.235
# Prepare the test set
y_pred = df_test[['row_id']]
X_test = df_test.drop(['time_id', 'row_id'], axis = 1)
X_test
[X_test preview omitted — book/trade features (largely NaN in the public test rows) plus stock_id_target_enc]
3 rows × 58 columns

target = np.zeros(len(X_test))
# Average the predictions of all fold models on the test set
for model in models:
    # X_valid.columns (from the last fold) gives the training feature order
    pred = model.predict(X_test[X_valid.columns], num_iteration=model.best_iteration)
    target += pred / len(models)
y_pred = y_pred.assign(target=target)
y_pred
  row_id    target
0    0-4  0.000970
1   0-32  0.000951
2   0-34  0.000951
y_pred.to_csv('submission.csv',index = False)
