1.数据介绍
# Core stack for the notebook: numerics, dataframes, parquet I/O,
# plotting, progress bars, and classical time-series tooling.
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import random
import glob
import gc
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
# Fix: `statsmodels.tsa.arima_model` was deprecated in statsmodels 0.12 and
# removed in 0.13 — the supported path is `statsmodels.tsa.arima.model`.
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import warnings
warnings.filterwarnings('ignore')
# Fix: the bare 'max_colwidth' key is deprecated; use the fully-qualified
# display option so cells render untruncated text.
pd.set_option('display.max_colwidth', None)
import os
# Enumerate every file shipped with the competition dataset so the layout
# (train/test CSVs plus per-stock book/trade parquet partitions) is visible.
for dirname, _, filenames in os.walk('../optiver-realized-volatility-prediction'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
../optiver-realized-volatility-prediction\sample_submission.csv
../optiver-realized-volatility-prediction\test.csv
../optiver-realized-volatility-prediction\train.csv
../optiver-realized-volatility-prediction\book_test.parquet\stock_id=0\7832c05caae3489cbcbbb9b02cf61711.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=0\c439ef22282f412ba39e9137a3fdabac.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=1\31a1c5cd6d8546b383d10373db762236.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=10\d671bb2b87f447d4ba3fa4b18b3656f9.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=100\9495f2c64cee42078601dee7408e3c36.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=101\dbbf6dc725ee4bcb8656c427165c888d.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=102\9d16ca233fea42c7a8ec91d621ca113a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=103\cc7984f01a8747299339cb441632d6c0.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=104\6875618fb8144a80a033ae8de7809493.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=105\a37b8bf4be2e4fa5b3ea63b7c4240d3d.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=107\0a58f297fa7b471a8c39b79420f9990d.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=108\662f6789c2be46e28884e9caceb1f67a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=109\fede24892e38482a9b0a0af7adb1030c.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=11\755d9c4058914e2a901aaeccf2406e30.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=110\2bba26018b9c482e90ca9be02e856382.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=111\76fdfb893640493f8822cec944e35f0d.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=112\89c931289b58423bb27b45ce8109c148.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=113\2698c014aa014ce0a476ccaf13795a43.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=114\22c84a10a4b4477b85b5bf7688037e77.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=115\32688ffcc77d40a5ab6ad8054cf44cab.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=116\bd6b0dc7386c49a79856b8df04e4a33b.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=118\eefc9574b4024ddb90eefc065ccd2918.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=119\a942847a75914186b47fec1c857e45fd.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=120\3068890f5521450db46c271190fa24d8.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=122\12dc61893ff0440e8972b8dc7d3988d8.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=123\50d0df62ffc241dea0ae635e7858aca5.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=124\a88197568c3e441cb1b45dccc5f3bd8b.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=125\996b501aec144de8ae4149a72633db84.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=126\c877c4dda91f41018dbf6ccb1182ef28.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=13\e576f0cd91c347a7b1cdb87c3bd1abf4.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=14\ae1ae9deed8048fbb1191a77ca68ab24.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=15\a47deb70de554fdcb7b780bd060d4564.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=16\eaa06704bf15493fae496cdf6c016b6c.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=17\4539ddf198a64337bdeca019b4edb1bb.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=18\cbc56bf4cc424b12ae18493920465a3a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=19\cae027e29b0749b5b2e139134b75fe63.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=2\36f27d865da54e8d92ff54e07ac4afd2.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=20\fe0e467dc2ca4092afc151d4069c54a7.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=21\60cc70a1e25d4da7b39fbfac2bd9fab1.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=22\3f2ddd48d0b147f09c852cc2034ece80.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=23\9229d9611b4e45b4bf81d60939a8274a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=26\1adc4a39b6fe4f8bbbedd7551006a170.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=27\0e5f57dd508b46cb9d6d88bd63aeaedd.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=28\e956e66d12664e509d27a680731a4057.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=29\9534c4ef29c049f6af22f4ca8305f315.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=3\ccc6d426f595409680d6f422ed911bee.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=30\ee59d4b4cf4847adaee7de785d7918e4.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=31\c0b9eb5ed0bf40bc838fab4da5459f76.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=32\446dd6db2d064e5b86666c66ec669755.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=33\85142fc4257b4ece8157a174ae6d2f4f.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=34\4cb0d70566654bf480518b3a1cee2ad2.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=35\8803efd6c9c44895a6fda59448a8eede.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=36\0ffca9c0dae84890ad8c923d4b95e840.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=37\7163ec1911204a10a479e421c0a41a35.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=38\c49c83c6eb1c41f0b8f0d61de1e3bb89.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=39\99d14712047840fbb3a93eb1f6ebc7a2.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=4\6ae39a36676a419f8847cb217e5ec75d.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=40\ab33096950f647ddb7b705f17b21f11c.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=41\3090fd7adaf447679b6e836778168be8.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=42\01281c6eedfc46c1b72357c484e640b5.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=43\ce8f54442b1142338d2e4b02b9dc578a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=44\31274a12b2ec4c24a4a57a054e67d704.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=46\db00d635f1614686837abc1c71f6e16b.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=47\ede300b0202a43caaaefc8ae07475c33.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=48\5c08c8582813448e8e87eda2c3c36b84.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=5\c39da423b1404b00946ceedf32ae3f07.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=50\0f5c98a29be64774a136b60f71033ec6.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=51\ace8af4c1a474efd80544609ea4a47c4.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=52\d5eb854d66174f5483449da151fbca50.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=53\46c591d10f0940de908317001e533db7.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=55\d3bdb39f65d34bb18278d789729f9323.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=56\55f4ad52ac2347f1a468ea3bf0cb7adc.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=58\33de4ffff5494e3a8ecc02c7b6d9433a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=59\fd5b747386604005b18594c00f0312ab.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=6\84352cd5e3424aeaadf11f8d89c8a07a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=60\6242aca22efd458fbec9a13c86ab42ee.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=61\2522f71edc4d47419f602cc920f906d6.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=62\c2a348a3d0b04a1c945607779c2128dd.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=63\37bff36e80a0485bba2bf875cb3188d4.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=64\acbff79109e44474bfa3a6145bd4244a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=66\3d1df975704b49cbb407d74d1adb53b6.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=67\2a7fd60b816b431597dec0e80d0ec3b7.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=68\ab42bd95488842f5aa15b750052e79ad.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=69\16f9fe0eb6544a1d89fbf25f290b8466.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=7\4f66c3336dec449280ed3719530f9486.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=70\afecffe028274a66a0e674d0db2d3c3c.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=72\0a18126395144a0a8ccca55bdbd31f1b.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=73\42238482fd7443838a8fc6490cd329c8.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=74\8229e924b5ef4716a0078d9e8ee397bd.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=75\f0354d49be4e4552b548b410b349bbc8.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=76\494ecaad37f9430384ad0a825aab00ec.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=77\5f6ce3ce569548c78c2ad17035454639.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=78\b6deb113746a4de0a752ca0b2917f7ef.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=8\e40cd587617e4bfc86f0125aef383932.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=80\705f7c3bde584dadba222d4ef6f7973e.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=81\3736a90d604d4b8eaa15c6eb5f83fd8c.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=82\ae830997c75f405986957af2984493f3.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=83\4c10ab2a0f324a1189af12bd5458ef15.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=84\53e9faaf11e745debb3c246acea61a84.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=85\1b7a536b29724b60990ee6c368f1fbf0.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=86\6ecdb8f8c4da483b8fd3fbdf6aee4381.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=87\670ab2a9e8b548bdaa8b9f8bc9057331.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=88\2c4ed518efb54a4f880a0f945c317fae.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=89\a1cd471332f24761a90f6fadb8b54ccf.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=9\8a87bd6ec9934a889026174cceb9e30a.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=90\380689865a7c4af8aa8625a7a7c40533.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=93\036725ac1ba94d40bf15a75c36adb184.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=94\d4c922016df84141ad176dac1455b725.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=95\8e4a80c1b7eb44e8844cc0f2ec4badff.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=96\605c29fae948465db8c2ab233852e982.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=97\52e74e4ef0d84c5c989fc4704e46b527.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=98\59ad203a6ac040fdb891702430440b01.parquet
../optiver-realized-volatility-prediction\book_train.parquet\stock_id=99\903e3170136d4ce7a7761664482d61c2.parquet
../optiver-realized-volatility-prediction\trade_test.parquet\stock_id=0\31c83a67d81349208e7d5eace9dbbac8.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=0\ef805fd82ff54fadb363094e3b122ab9.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=1\170b39f1f7144bb3b4554aabc336106e.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=10\4be9b4d23e0149af9dc20ec10d5a360f.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=100\c6b5a2974aac4210a6803c5907baaa07.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=101\3ac4a780531644b1b9534b3ac6c8d5f6.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=102\0bdf7ff7bb4847bab154eb59fd1345bd.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=103\9e3f7f7b452a49e591da6115c17246af.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=104\06b735024acc45cbb4c81392e6d040e0.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=105\cca562a08dc24721b0dd837564abc3b5.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=107\f1b306dc9c24482390bf2083c301c588.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=108\581fd1b484db41e8a17449344c2ffb0e.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=109\0621bfa8cb8545a189878af2727f8178.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=11\5e8cea995bff4682a377e1d3932b08e2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=110\8870109f748e4ff4b9044e0debe8a54c.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=111\24bc76eb35d843d28cf4aeca325e9084.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=112\cd283097a5b54293ba400a19e811a7f9.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=113\7cc3c1fd9452435c83518f3c7c5b74a8.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=114\6e085d4ca2ba44359673a073bc8e41e9.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=115\4dfaedf40be3485085d926ab58672bde.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=116\96cd7e2d2ea74a62bd8b6d739fce17ca.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=118\d5c040f03683418f87556865166846df.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=119\52f2e172b3444107903ec31242331923.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=120\0d06cf1025d84e19be32c9d9ffed9fd2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=122\a2e2aa41640f4986bebc1b95bd29966f.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=123\168edf076f064342ad8aa3b37551cb8a.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=124\8e3d48a3b163471eb121216a37c6de62.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=125\a3f2c4430e1f4a3fb5837af2d970825d.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=126\3baa2ff424c8435d92fe32cfeed80ae7.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=13\2aeef69f03154de5835b9ba611d16172.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=14\50621e92b84e47a197071584bea4bf1d.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=15\43c3ace09a0a45b889073ce646bc6832.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=16\09bbef3150f744b2979fcab0ebf9e001.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=17\687d7abe51a24e2084bcb95eb8174e9b.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=18\06bb948e2275475e92fd90735050657f.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=19\7876f5e5338c4521982a7001d4489727.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=2\ca5a4d1f67024204ac7fd496a0b46ba3.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=20\4c8b1e3b619a4ee496d68962d30c6da4.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=21\1d8dc18ebfee47ffbb54b04e6afc0634.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=22\dfb544bbc3c34211bf12a6dfe6b584af.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=23\dfe7edcd05564b07983496d1b8a3b3c1.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=26\16a87247231944c4a0be45f797d14eb8.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=27\5eac589708444020a196eb738f06ce08.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=28\ec6f88c0ae114ff482cac8fb6f78e87d.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=29\ccea57f8a6324f969fe7144950c7369b.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=3\e0843aaf024f49228b281081a2524b39.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=30\8d772f1265d64c06add811d2ae912c7e.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=31\26f4296741054f9da7e7848521a80526.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=32\af90f42692874cefaa07e585d739bc7b.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=33\917a91c91ce04f62939b8710660ebb3b.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=34\a55d7f7c8ff5406c9ba9e4b0f8af0ada.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=35\e2480196f2a3426ea84e9b4284414bc0.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=36\4f713fbe94f542579ec55561a22db006.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=37\941cf9679a80466893da714884f5c4b0.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=38\e3ed94a67a444dc1a04838588ffa443f.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=39\364c0ad490cb4b9cb493c952b2124f17.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=4\761268d671f9429abb29d9d2895e9bd2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=40\61456b44107d4b24bfe5d4ad90baaafe.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=41\bbde7b09f5e743508840782fac034d10.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=42\caab969eb87b4f7abfdd18d606a22ed0.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=43\bb0efa57f511470e817880842e3e2afa.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=44\bdfeb97d57a149049aecb2250af2c82a.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=46\7c55f1b4f4a34f83981f05974369ff6f.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=47\79bac282f1da4cb4aa24f7c667740341.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=48\507fb85f3c2a484f9fa4a78bcbfa992b.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=5\a5fd25253e3f43db884fe1e6fc2a06c3.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=50\935a65e6f4fc4e5990d086d6f6b4a932.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=51\f868a356824b4825a3c804206f513f70.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=52\54a5eea04ed042759b75b0bca90d1ced.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=53\caf58d3fd60c4a699d78ef114cf9ee17.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=55\1f19ad38e0d54c9680dc34dadf4dce47.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=56\b8615a9991bd4eada46a0483f9f9e3e6.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=58\b0e92dc583a544a08308d8c4e1df52af.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=59\2653e96292d24c64b27c0b70de53b2f9.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=6\fcf85e8ee88944d7bcbc9fc9862ee3f1.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=60\ae35d2b5825d4f1c9b52f462862eae04.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=61\fab533c5a5a741d8929f391890a4d54d.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=62\8df3e07ae2b74ad19fd98d4b9acfcbf2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=63\107939e77b8642a3a222419adc1ae0f2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=64\338f683c092048ebaba1708e6dd1e774.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=66\c544fc443c6042ab9bb4ddec47179a3d.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=67\703c7481d60a43839cd67d09fa3ad6c2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=68\b077ddd5f9304972b0595a31899d4f8a.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=69\bcd636dbdd0b42bea8ad6a62a0d1fdf4.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=7\5ea290c973aa45d587602b4efb7205f2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=70\23dc744f46e4478eb2f5253bcbffae15.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=72\60f62a03d8854605901dda072c84db39.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=73\1443c5278293485cb099fd2fad5348fa.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=74\8c298c007a0a4e078dd4efd4addb3057.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=75\5c163b1fc27c4c18acd83eb5ba5ef87d.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=76\8ee1f2bb5bed4bffb273db8b743f0ee7.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=77\3110dbc5706f4c388802faed11845bf9.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=78\c40d74d0331c465c9f3466df2abdd524.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=8\34dec4dac8ad49e2b4104f7c0ba105b3.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=80\a8c6fe433ef84012b38e96af9014ba2c.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=81\253855077c314a04be294ce53c3cdaa2.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=82\6614a1b49c5c4c8ba6a9edab2ed8708a.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=83\b8f39ee0b1eb4d3d8cda8e83c4f81c2b.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=84\98d3ff55665c4405af0688fae832e5e7.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=85\a12f69f3bf7e4deaa8fe7e2c9357bc41.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=86\3991563a90bb4206bb6b780546d938d4.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=87\927eb8a18153489b9f222d748b121473.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=88\108ca1fef317486f98e9dfab977b6f16.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=89\a05080679e8647978809f25206219ae5.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=9\01a3aa0496884cba833aa60b1c67ae6c.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=90\405e4073fed64deda32f8d74362b15b1.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=93\572fb394eb5745dcb36dfdc478b222c8.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=94\8037e3f4a763480a83b9bb0b1def9ce3.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=95\7396f50037c84b029a5132dfec6f7896.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=96\7a9cbfbbdfc54eef806d6fcc69f8ce5e.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=97\888f813404d8417ca8d6b8aebd5f2951.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=98\fb8bce8063fc4370917b71963ddbd1cf.parquet
../optiver-realized-volatility-prediction\trade_train.parquet\stock_id=99\d27f7f229131410399c78101da6e624b.parquet
# Training targets: one realized-volatility value per (stock_id, time_id) pair.
train_path = "../optiver-realized-volatility-prediction/train.csv"
train = pd.read_csv(train_path)
train
stock_id | time_id | target | |
---|---|---|---|
0 | 0 | 5 | 0.004136 |
1 | 0 | 11 | 0.001445 |
2 | 0 | 16 | 0.002168 |
3 | 0 | 31 | 0.002195 |
4 | 0 | 62 | 0.001747 |
... | ... | ... | ... |
428927 | 126 | 32751 | 0.003461 |
428928 | 126 | 32753 | 0.003113 |
428929 | 126 | 32758 | 0.004070 |
428930 | 126 | 32763 | 0.003357 |
428931 | 126 | 32767 | 0.002090 |
428932 rows × 3 columns
# Order-book snapshots for a single stock (the stock_id=0 parquet partition).
book_dir = '../optiver-realized-volatility-prediction/book_train.parquet/stock_id=0'
book_example = pd.read_parquet(book_dir)
book_example
time_id | seconds_in_bucket | bid_price1 | ask_price1 | bid_price2 | ask_price2 | bid_size1 | ask_size1 | bid_size2 | ask_size2 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 5 | 0 | 1.001422 | 1.002301 | 1.001370 | 1.002353 | 3 | 226 | 2 | 100 |
1 | 5 | 1 | 1.001422 | 1.002301 | 1.001370 | 1.002353 | 3 | 100 | 2 | 100 |
2 | 5 | 5 | 1.001422 | 1.002301 | 1.001370 | 1.002405 | 3 | 100 | 2 | 100 |
3 | 5 | 6 | 1.001422 | 1.002301 | 1.001370 | 1.002405 | 3 | 126 | 2 | 100 |
4 | 5 | 7 | 1.001422 | 1.002301 | 1.001370 | 1.002405 | 3 | 126 | 2 | 100 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
917548 | 32767 | 568 | 0.998275 | 0.998754 | 0.997796 | 0.998946 | 90 | 90 | 48 | 28 |
917549 | 32767 | 569 | 0.998275 | 0.998754 | 0.997892 | 0.998946 | 91 | 90 | 200 | 28 |
917550 | 32767 | 571 | 0.998275 | 0.998754 | 0.997892 | 0.998946 | 91 | 90 | 100 | 28 |
917551 | 32767 | 572 | 0.998275 | 0.998754 | 0.997892 | 0.998946 | 92 | 90 | 100 | 28 |
917552 | 32767 | 582 | 0.998275 | 0.998754 | 0.998179 | 0.998946 | 92 | 90 | 26 | 28 |
917553 rows × 10 columns
# Executed trades for the same stock (the stock_id=0 parquet partition).
trade_dir = "../optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0"
trade_example = pd.read_parquet(trade_dir)
trade_example
time_id | seconds_in_bucket | price | size | order_count | |
---|---|---|---|---|---|
0 | 5 | 21 | 1.002301 | 326 | 12 |
1 | 5 | 46 | 1.002778 | 128 | 4 |
2 | 5 | 50 | 1.002818 | 55 | 1 |
3 | 5 | 57 | 1.003155 | 121 | 5 |
4 | 5 | 68 | 1.003646 | 4 | 1 |
... | ... | ... | ... | ... | ... |
123438 | 32767 | 471 | 0.998659 | 200 | 3 |
123439 | 32767 | 517 | 0.998515 | 90 | 1 |
123440 | 32767 | 523 | 0.998563 | 1 | 1 |
123441 | 32767 | 542 | 0.998803 | 90 | 4 |
123442 | 32767 | 567 | 0.998547 | 300 | 3 |
123443 rows × 5 columns
# Sample submission: shows the expected row_id ("stockid-timeid") / target format.
submission_path = "../optiver-realized-volatility-prediction/sample_submission.csv"
sample = pd.read_csv(submission_path)
sample
row_id | target | |
---|---|---|
0 | 0-4 | 0.003048 |
1 | 0-32 | 0.003048 |
2 | 0-34 | 0.003048 |
2.数据预处理与探索性分析
# Cardinality of every training column. `nunique(dropna=False)` counts exactly
# what `len(train[col].unique())` did (NaN included, if any) without
# materializing the unique-value array first.
for col in train.columns:
    print(col, ":", train[col].nunique(dropna=False))
stock_id : 112
time_id : 3830
target : 414287
# Per-stock summary statistics of the target volatility; reset_index turns the
# stock_id group key back into an ordinary column.
stock = train.groupby("stock_id")["target"].agg(["mean", "median", "std", "count", "sum"]).reset_index()
stock
stock_id | mean | median | std | count | sum | |
---|---|---|---|---|---|---|
0 | 0 | 0.004028 | 0.003180 | 0.002855 | 3830 | 15.429071 |
1 | 1 | 0.004362 | 0.003719 | 0.002433 | 3830 | 16.704962 |
2 | 2 | 0.002385 | 0.001748 | 0.002339 | 3830 | 9.133223 |
3 | 3 | 0.006182 | 0.005422 | 0.003201 | 3830 | 23.675414 |
4 | 4 | 0.004197 | 0.003397 | 0.002879 | 3830 | 16.073408 |
... | ... | ... | ... | ... | ... | ... |
107 | 122 | 0.003762 | 0.003176 | 0.002300 | 3830 | 14.407997 |
108 | 123 | 0.002506 | 0.001967 | 0.001816 | 3830 | 9.599594 |
109 | 124 | 0.003691 | 0.003143 | 0.001953 | 3830 | 14.135579 |
110 | 125 | 0.002016 | 0.001575 | 0.001703 | 3830 | 7.719409 |
111 | 126 | 0.005321 | 0.004443 | 0.003179 | 3830 | 20.377571 |
112 rows × 6 columns
# Cross-stock average of the per-stock mean target, plus its histogram.
mean_per_stock = stock["mean"]
print("mean value=", mean_per_stock.mean())
plt.hist(mean_per_stock)
plt.show()
mean value= 0.0038805243039130312
# Cross-stock average of the per-stock target sum, plus its histogram.
sum_per_stock = stock["sum"]
print("sum value=", sum_per_stock.mean())
plt.hist(sum_per_stock)
plt.show()
sum value= 14.860998424142858
# Order-book rows belonging to a single ten-minute bucket (time_id == 5).
bucket_mask = book_example["time_id"] == 5
book_test = book_example.loc[bucket_mask]
book_test
time_id | seconds_in_bucket | bid_price1 | ask_price1 | bid_price2 | ask_price2 | bid_size1 | ask_size1 | bid_size2 | ask_size2 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 5 | 0 | 1.001422 | 1.002301 | 1.001370 | 1.002353 | 3 | 226 | 2 | 100 |
1 | 5 | 1 | 1.001422 | 1.002301 | 1.001370 | 1.002353 | 3 | 100 | 2 | 100 |
2 | 5 | 5 | 1.001422 | 1.002301 | 1.001370 | 1.002405 | 3 | 100 | 2 | 100 |
3 | 5 | 6 | 1.001422 | 1.002301 | 1.001370 | 1.002405 | 3 | 126 | 2 | 100 |
4 | 5 | 7 | 1.001422 | 1.002301 | 1.001370 | 1.002405 | 3 | 126 | 2 | 100 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
297 | 5 | 585 | 1.003129 | 1.003749 | 1.003025 | 1.003801 | 100 | 3 | 26 | 3 |
298 | 5 | 586 | 1.003129 | 1.003749 | 1.002612 | 1.003801 | 100 | 3 | 2 | 3 |
299 | 5 | 587 | 1.003129 | 1.003749 | 1.003025 | 1.003801 | 100 | 3 | 26 | 3 |
300 | 5 | 588 | 1.003129 | 1.003749 | 1.002612 | 1.003801 | 100 | 3 | 2 | 3 |
301 | 5 | 593 | 1.003129 | 1.003749 | 1.003025 | 1.003801 | 100 | 3 | 26 | 3 |
302 rows × 10 columns
# One stacked panel per quoted price level for the sample bucket.
# Bug fix: the original called plt.figure() INSIDE the loop and then placed a
# single subplot at slot num+1 of a 4x1 grid within each brand-new figure, so
# every successive chart shrank and drifted downwards. Create one figure and
# fill its four rows instead.
samples = ["bid_price1", "bid_price2", "ask_price1", "ask_price2"]
plt.figure(figsize=(20, 20))
for num, a in enumerate(samples):
    plt.subplot(4, 1, num + 1)
    plt.plot(book_test["seconds_in_bucket"], book_test[a])
    plt.title(a)
plt.show()
# Overlay all four price series on a single axis for direct comparison.
plt.figure(figsize=(20, 5))
for column in samples:
    plt.plot(book_test["seconds_in_bucket"], book_test[column], label=column)
plt.legend(fontsize=12)
plt.show()
# Trades that occurred inside the same bucket (time_id == 5).
trade_mask = trade_example["time_id"] == 5
trade_test = trade_example.loc[trade_mask]
trade_test.head(5)
time_id | seconds_in_bucket | price | size | order_count | |
---|---|---|---|---|---|
0 | 5 | 21 | 1.002301 | 326 | 12 |
1 | 5 | 46 | 1.002778 | 128 | 4 |
2 | 5 | 50 | 1.002818 | 55 | 1 |
3 | 5 | 57 | 1.003155 | 121 | 5 |
4 | 5 | 68 | 1.003646 | 4 | 1 |
# Superimpose the executed-trade prints on the quoted book prices.
plt.figure(figsize=(20, 5))
for column in samples:
    plt.plot(book_test["seconds_in_bucket"], book_test[column], label=column)
plt.scatter(trade_test["seconds_in_bucket"], trade_test["price"], label="trade_parquet", lw=7)
plt.legend(fontsize=12)
<matplotlib.legend.Legend at 0x7efe641c4750>
# Time bucket with the MINIMUM realized volatility for stock 0.
# Bug fix: idxmin() returns the row LABEL, so it must be dereferenced with
# .loc, not .iloc. The original .iloc only worked because stock 0 happens to
# occupy positions 0..N-1 of `train`; for any other stock_id the label and
# position diverge and .iloc would select the wrong row (or raise).
stock0 = train[train["stock_id"] == 0]
min_index = stock0["target"].idxmin()
min_time_id = stock0.loc[min_index]["time_id"]
print("min index is", min_time_id, "and min target is", stock0.loc[min_index]["target"])
min index is 24253.0 and min target is 0.000593833
# Book quotes and trade prints during the least-volatile bucket.
book_test_min = book_example.loc[book_example["time_id"] == min_time_id]
trade_test_min = trade_example.loc[trade_example["time_id"] == min_time_id]
plt.figure(figsize=(20, 5))
for column in samples:
    plt.plot(book_test_min["seconds_in_bucket"], book_test_min[column], label=column)
plt.scatter(trade_test_min["seconds_in_bucket"], trade_test_min["price"], label="trade_parquet", lw=7)
plt.legend(fontsize=12)
plt.show()
# Time bucket with the MAXIMUM realized volatility for stock 0.
# Bug fix: idxmax() returns the row LABEL, so dereference with .loc, not
# .iloc (the original .iloc only coincidentally worked for stock 0, whose
# positions equal its labels in `train`).
stock0 = train[train["stock_id"] == 0]
max_index = stock0["target"].idxmax()
max_time_id = stock0.loc[max_index]["time_id"]
print("max index is", max_time_id, "and max target is", stock0.loc[max_index]["target"])
max index is 19725.0 and max target is 0.036311154
# Book quotes and trade prints during the most-volatile bucket.
book_test_max = book_example.loc[book_example["time_id"] == max_time_id]
trade_test_max = trade_example.loc[trade_example["time_id"] == max_time_id]
plt.figure(figsize=(20, 5))
for column in samples:
    plt.plot(book_test_max["seconds_in_bucket"], book_test_max[column], label=column)
plt.scatter(trade_test_max["seconds_in_bucket"], trade_test_max["price"], label="trade_parquet", lw=7)
plt.legend(fontsize=12)
plt.show()
# Trade prices from the calmest and wildest buckets on a common scale.
plt.figure(figsize=(20, 5))
for frame, tag in ((trade_test_min, "min_vol_time"), (trade_test_max, "max_vol_time")):
    plt.scatter(frame["seconds_in_bucket"], frame["price"], lw=10, label=tag)
plt.legend(fontsize=15)
plt.show()
可以发现,放在同一尺度下比较的话,可以很明显的分辨波动率大小。
3.特征工程
def visualize_target(target):
    """Print summary statistics of df_train[target] and plot its KDE
    alongside a normal probability plot.

    Parameters:
        target: column name in the (module-level) df_train frame.
    """
    # probplot is not imported at the top of this file; scipy.stats.probplot
    # matches the `plot=axes[1]` usage below.
    from scipy.stats import probplot

    print(f'{target}\n{"-" * len(target)}')
    print(f'Mean: {df_train[target].mean():.4f} - Median: {df_train[target].median():.4f} - Std: {df_train[target].std():.4f}')
    print(f'Min: {df_train[target].min():.4f} - 25%: {df_train[target].quantile(0.25):.4f} - 50%: {df_train[target].quantile(0.5):.4f} - 75%: {df_train[target].quantile(0.75):.4f} - Max: {df_train[target].max():.4f}')
    print(f'Skew: {df_train[target].skew():.4f} - Kurtosis: {df_train[target].kurtosis():.4f}')
    missing_values_count = df_train[df_train[target].isnull()].shape[0]
    training_samples_count = df_train.shape[0]
    print(f'Missing Values: {missing_values_count}/{training_samples_count} ({missing_values_count * 100 / training_samples_count:.4f}%)')
    fig, axes = plt.subplots(ncols=2, figsize=(24, 8), dpi=100)
    sns.kdeplot(df_train[target], label=target, fill=True, ax=axes[0])
    axes[0].axvline(df_train[target].mean(), label=f'{target} Mean', color='r', linewidth=2, linestyle='--')
    axes[0].axvline(df_train[target].median(), label=f'{target} Median', color='b', linewidth=2, linestyle='--')
    probplot(df_train[target], plot=axes[1])
    axes[0].legend(prop={'size': 16})
    for i in range(2):
        axes[i].tick_params(axis='x', labelsize=12.5, pad=10)
        axes[i].tick_params(axis='y', labelsize=12.5, pad=10)
        axes[i].set_xlabel('')
        axes[i].set_ylabel('')
    axes[0].set_title(f'{target} Distribution in Training Set', fontsize=20, pad=15)
    axes[1].set_title(f'{target} Probability Plot', fontsize=20, pad=15)
    plt.show()


visualize_target('target')
target
------
Mean: 0.0039 - Median: 0.0030 - Std: 0.0029
Min: 0.0001 - 25%: 0.0020 - 50%: 0.0030 - 75%: 0.0047 - Max: 0.0703
Skew: 2.8226 - Kurtosis: 14.9611
Missing Values: 0/428932 (0.0000%)
def log_return(x):
    """Log returns of a price series: log(p_t) - log(p_{t-1}).

    The first element is NaN (no previous price to diff against).
    """
    return np.log(x).diff()


def realized_volatility(x):
    """Realized volatility: sqrt of the sum of squared log returns.

    The leading NaN from the diff is skipped by the pandas sum.
    """
    return np.sqrt(np.sum(log_return(x) ** 2))
# Per-stock order-book features, mapped back onto df_train via time_id.
# (Indentation restored from the notebook export.)
for stock_id in tqdm(sorted(df_train['stock_id'].unique())):
    df_book = read_book_data('train', stock_id)
    # Level-1 bid/ask price ratio — values close to 1 mean a tight spread.
    df_book['bid_ask_price_ratio'] = df_book['bid_price1'] / df_book['ask_price1']
    for agg in ['mean', 'std', 'min', 'max', realized_volatility]:
        bid_ask_price_ratio_aggregation = df_book.groupby('time_id')['bid_ask_price_ratio'].agg(agg)
        feature_name = agg.__name__ if callable(agg) else agg
        df_train.loc[df_train['stock_id'] == stock_id, f'book_bid_ask_price_ratio_{feature_name}'] = df_train[df_train['stock_id'] == stock_id]['time_id'].map(bid_ask_price_ratio_aggregation)
    # Weighted average prices: each side's price is weighted by the opposite
    # side's size.
    df_book['wap1'] = (df_book['bid_price1'] * df_book['ask_size1'] + df_book['ask_price1'] * df_book['bid_size1']) /\
                      (df_book['bid_size1'] + df_book['ask_size1'])
    df_book['wap2'] = (df_book['bid_price2'] * df_book['ask_size2'] + df_book['ask_price2'] * df_book['bid_size2']) /\
                      (df_book['bid_size2'] + df_book['ask_size2'])
    for wap in [1, 2]:
        for agg in ['mean', 'std', 'min', 'max', realized_volatility]:
            wap_aggregation = df_book.groupby('time_id')[f'wap{wap}'].agg(agg)
            feature_name = agg.__name__ if callable(agg) else agg
            df_train.loc[df_train['stock_id'] == stock_id, f'wap{wap}_{feature_name}'] = df_train[df_train['stock_id'] == stock_id]['time_id'].map(wap_aggregation)
0%| | 0/112 [00:00<?, ?it/s]
def visualize_continuous_feature(continuous_feature):
    """Print summary stats for one engineered feature of df_train and plot
    its distribution (KDE) next to a scatter of the feature vs target."""
    print(f'{continuous_feature}\n{"-" * len(continuous_feature)}')
    print(f'Training Mean: {float(df_train[continuous_feature].mean()):.4} - Training Median: {float(df_train[continuous_feature].median()):.4} - Training Std: {float(df_train[continuous_feature].std()):.4}')
    print(f'Training Min: {float(df_train[continuous_feature].min()):.4} - Training Max: {float(df_train[continuous_feature].max()):.4}')
    print(f'Training Skew: {float(df_train[continuous_feature].skew()):.4} - Training Kurtosis: {float(df_train[continuous_feature].kurtosis()):.4}')
    fig, axes = plt.subplots(ncols=2, figsize=(24, 6), dpi=100, constrained_layout=True)
    title_size = 18
    label_size = 18
    sns.kdeplot(df_train[continuous_feature], label='Training', fill=True, ax=axes[0])
    axes[0].set_xlabel('')
    axes[0].tick_params(axis='x', labelsize=label_size)
    axes[0].tick_params(axis='y', labelsize=label_size)
    axes[0].legend()
    axes[0].set_title(f'{continuous_feature} Distribution in Training Set', size=title_size, pad=title_size)
    sns.scatterplot(x=df_train[continuous_feature], y=df_train['target'], ax=axes[1])
    axes[1].set_title(f'{continuous_feature} vs target', size=title_size, pad=title_size)
    axes[1].set_xlabel('')
    axes[1].set_ylabel('')
    axes[1].tick_params(axis='x', labelsize=label_size)
    axes[1].tick_params(axis='y', labelsize=label_size)
    plt.show()
# Visualize every engineered feature column (columns before index 6 are
# identifiers/targets). Indentation restored from the notebook export.
for continuous_feature in df_train.columns[6:]:
    visualize_continuous_feature(continuous_feature)
book_bid_ask_price_ratio_mean
-----------------------------
Training Mean: 0.9993 - Training Median: 0.9996 - Training Std: 0.000889
Training Min: 0.9813 - Training Max: 1.0
Training Skew: -3.975 - Training Kurtosis: 32.48
book_bid_ask_price_ratio_std
----------------------------
Training Mean: 0.0002196 - Training Median: 0.0001479 - Training Std: 0.0002727
Training Min: 0.0 - Training Max: 0.01006
Training Skew: 5.675 - Training Kurtosis: 64.22
book_bid_ask_price_ratio_min
----------------------------
Training Mean: 0.9986 - Training Median: 0.9991 - Training Std: 0.001899
Training Min: 0.9522 - Training Max: 0.9999
Training Skew: -4.826 - Training Kurtosis: 44.94
book_bid_ask_price_ratio_max
----------------------------
Training Mean: 0.9998 - Training Median: 0.9998 - Training Std: 0.0003422
Training Min: 0.992 - Training Max: 1.0
Training Skew: -3.949 - Training Kurtosis: 30.06
book_bid_ask_price_ratio_realized_volatility
--------------------------------------------
Training Mean: 0.002893 - Training Median: 0.002083 - Training Std: 0.002879
Training Min: 0.0 - Training Max: 0.06794
Training Skew: 4.321 - Training Kurtosis: 35.27
wap1_mean
---------
Training Mean: 1.0 - Training Median: 1.0 - Training Std: 0.00337
Training Min: 0.9173 - Training Max: 1.079
Training Skew: -0.2131 - Training Kurtosis: 29.89
wap1_std
--------
Training Mean: 0.00111 - Training Median: 0.0008073 - Training Std: 0.001052
Training Min: 1.631e-05 - Training Max: 0.03437
Training Skew: 4.461 - Training Kurtosis: 42.59
wap1_min
--------
Training Mean: 0.9977 - Training Median: 0.9985 - Training Std: 0.003891
Training Min: 0.8831 - Training Max: 1.049
Training Skew: -3.839 - Training Kurtosis: 43.12
wap1_max
--------
Training Mean: 1.002 - Training Median: 1.001 - Training Std: 0.003804
Training Min: 0.9479 - Training Max: 1.127
Training Skew: 3.534 - Training Kurtosis: 37.32
wap1_realized_volatility
------------------------
Training Mean: 0.004233 - Training Median: 0.003159 - Training Std: 0.003586
Training Min: 8.066e-05 - Training Max: 0.08642
Training Skew: 3.386 - Training Kurtosis: 22.65
wap2_mean
---------
Training Mean: 1.0 - Training Median: 1.0 - Training Std: 0.003374
Training Min: 0.9173 - Training Max: 1.079
Training Skew: -0.2179 - Training Kurtosis: 29.88
wap2_std
--------
Training Mean: 0.001149 - Training Median: 0.0008419 - Training Std: 0.001065
Training Min: 1.155e-06 - Training Max: 0.03441
Training Skew: 4.397 - Training Kurtosis: 41.1
wap2_min
--------
Training Mean: 0.9976 - Training Median: 0.9984 - Training Std: 0.003933
Training Min: 0.8815 - Training Max: 1.044
Training Skew: -3.873 - Training Kurtosis: 42.63
wap2_max
--------
Training Mean: 1.002 - Training Median: 1.002 - Training Std: 0.003849
Training Min: 0.9482 - Training Max: 1.126
Training Skew: 3.569 - Training Kurtosis: 36.6
wap2_realized_volatility
------------------------
Training Mean: 0.005808 - Training Median: 0.004347 - Training Std: 0.005026
Training Min: 7.879e-06 - Training Max: 0.1359
Training Skew: 3.591 - Training Kurtosis: 25.17
# Correlation heatmap between the target and every engineered feature.
corr_columns = ['target'] + df_train.columns[6:].tolist()
fig = plt.figure(figsize=(16, 16), dpi=100)
sns.heatmap(
    df_train[corr_columns].corr(),
    annot=True,
    square=True,
    cmap='coolwarm',
    annot_kws={'size': 11},
    fmt='.2f',
)
plt.tick_params(axis='x', labelsize=10, rotation=90)
plt.tick_params(axis='y', labelsize=10, rotation=0)
plt.title('Target and Feature Correlations', size=20, pad=20)
plt.show()
# KMeans/PCA were not imported at the top of the file.
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Aggregate per-stock volatility statistics, cluster the stocks with KMeans,
# and display them on a 2-D PCA projection.
stock_features = []
# NOTE: the loop variable must NOT be named `realized_volatility` — that
# shadows the realized_volatility() function used by later aggregations.
for rv_column in ['target', 'realized_volatility_from_wap1', 'realized_volatility_from_wap2', 'realized_volatility_from_price']:
    for agg in ['mean', 'std', 'min', 'max']:
        df_train[f'stock_{rv_column}_{agg}'] = df_train.groupby('stock_id')[rv_column].transform(agg)
        stock_features.append(f'stock_{rv_column}_{agg}')
# Per-stock values are constant within a stock, so .first() collapses them.
df_stocks = df_train.groupby('stock_id')[stock_features].first().reset_index()
df_train.drop(columns=stock_features, inplace=True)
kmeans = KMeans(n_clusters=6, random_state=42)
kmeans.fit(df_stocks[stock_features])
pca = PCA(n_components=2)
stocks_2d = pca.fit_transform(df_stocks[stock_features])
fig, ax = plt.subplots(figsize=(32, 10))
ax.scatter(stocks_2d[:, 0], stocks_2d[:, 1], s=200, c=kmeans.labels_, cmap='RdBu')
for idx, stock_id in enumerate(df_stocks['stock_id'].values):
    ax.annotate(stock_id, (stocks_2d[idx, 0], stocks_2d[idx, 1]), fontsize=20)
ax.tick_params(axis='x', labelsize=20, pad=10)
ax.tick_params(axis='y', labelsize=20, pad=10)
ax.set_title('Stock Clusters', size=25, pad=20)
plt.show()
D:\Anaconda\lib\site-packages\sklearn\cluster\_kmeans.py:1334: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
warnings.warn(
4.Naive
# Naive baseline: use the current window's realized volatility as the
# prediction for the next window.
import os
from sklearn.metrics import r2_score
import glob
# One directory per stock id under book_train.parquet (hive partitioning).
list_order_book_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')
def realized_volatility_per_time_id(file_path, prediction_column_name):
    """Compute per-time_id realized volatility of the level-1 WAP for one
    stock's book parquet file.

    Returns a frame with columns [row_id, prediction_column_name], where
    row_id is "<stock_id>-<time_id>".
    """
    df_book_data = pd.read_parquet(file_path)
    # Level-1 weighted average price: prices weighted by the opposite side's size.
    df_book_data['wap'] = (df_book_data['bid_price1'] * df_book_data['ask_size1'] + df_book_data['ask_price1'] * df_book_data['bid_size1']) / (
        df_book_data['bid_size1'] + df_book_data['ask_size1'])
    df_book_data['log_return'] = df_book_data.groupby(['time_id'])['wap'].apply(log_return)
    # Drop the leading NaN each bucket gets from the diff.
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    df_realized_vol_per_stock = pd.DataFrame(df_book_data.groupby(['time_id'])['log_return'].agg(realized_volatility)).reset_index()
    df_realized_vol_per_stock = df_realized_vol_per_stock.rename(columns={'log_return': prediction_column_name})
    # stock_id comes from the "stock_id=<n>" segment of the hive-partitioned path.
    stock_id = file_path.split('=')[1]
    df_realized_vol_per_stock['row_id'] = df_realized_vol_per_stock['time_id'].apply(lambda x: f'{stock_id}-{x}')
    return df_realized_vol_per_stock[['row_id', prediction_column_name]]
def past_realized_volatility_per_stock(list_file, prediction_column_name):
    """Concatenate per-stock realized-volatility frames for every file in list_file."""
    # Collect all frames first and concat once: concatenating inside the loop
    # re-copies the accumulated frame on every iteration (quadratic cost).
    frames = [realized_volatility_per_time_id(file, prediction_column_name)
              for file in list_file]
    # Preserve the original's behavior of returning an empty frame for no files.
    return pd.concat(frames) if frames else pd.DataFrame()
# Compute the naive prediction for every training file, then join it onto the
# targets by row_id.
df_past_realized_train = past_realized_volatility_per_stock(
    list_file=list_order_book_file_train,
    prediction_column_name='pred',
)
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id', 'target']]
df_joined = train.merge(df_past_realized_train[['row_id', 'pred']], on=['row_id'], how='left')
from sklearn.metrics import r2_score
def rmspe(y_true, y_pred):
    """Root mean squared percentage error (the competition metric)."""
    return (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))
# Score the naive baseline on the training set.
y_true_naive = df_joined['target']
y_pred_naive = df_joined['pred']
R2 = round(r2_score(y_true=y_true_naive, y_pred=y_pred_naive), 3)
RMSPE = round(rmspe(y_true=y_true_naive, y_pred=y_pred_naive), 3)
print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')
Performance of the naive prediction: R2 score: 0.628, RMSPE: 0.341
5.GARCH
def abs_log_returns_topn(list_file, n_top):
    '''Return absolute log returns for the first n_top stock files in list_file
    (shuffle the list beforehand for a random sample).'''
    df_list = []
    for stock_file in list_file[:n_top]:
        df_book_data = pd.read_parquet(stock_file)
        df_book_data['wap'] = calculate_wap(df_book_data)
        df_book_data['log_return'] = df_book_data.groupby(['time_id'])['wap'].apply(log_return)
        # Drop the per-bucket leading NaN produced by the diff.
        df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
        # Vectorized .abs() instead of the elementwise .apply(abs).
        df_book_data['log_return'] = df_book_data['log_return'].abs()
        # stock_id comes from the "stock_id=<n>" segment of the path.
        df_book_data['stock_id'] = int(stock_file.split('=')[1])
        df_list.append(df_book_data)
    df = pd.concat(df_list, ignore_index=True)
    return df
# Shuffle so the sampled stocks differ between runs.
random.shuffle(order_book_train_files)
# Absolute log returns for a 5-stock sample.
df_log_returns = abs_log_returns_topn(order_book_train_files, 5)
selected_stocks = df_log_returns['stock_id'].unique()
print('Stock Ids:', ', '.join(map(str, selected_stocks)))
Stock Ids: 82, 80, 48, 50, 89
# Number of sampled book rows per stock.
df_log_returns['stock_id'].value_counts()
50 2144862
80 1429737
89 1282438
48 1244721
82 1039429
Name: stock_id, dtype: int64
def plot_auto_correlation(series, lags=30, stock=''):
    """Plot the ACF and PACF of `series` side by side.

    Parameters:
        series: 1-D series of observations.
        lags:   number of lags to display.
        stock:  stock id used only in the figure title.
    """
    plt.rcParams["figure.figsize"] = 20, 5
    fig, axes = plt.subplots(1, 2)
    acf = plot_acf(series, lags=lags, ax=axes[0])
    pacf = plot_pacf(series, lags=lags, ax=axes[1])
    acf.suptitle(f'Autocorrelation and Partial Autocorrelation - stock {stock}', fontsize=20)
    plt.show()
# ACF/PACF of one randomly chosen time bucket per sampled stock.
# (Indentation restored from the notebook export.)
for stock_id in selected_stocks:
    df_log_returns_stock = df_log_returns[df_log_returns['stock_id'] == stock_id]
    random_time_id = random.choice(df_log_returns_stock['time_id'].unique())
    df_time_slice = df_log_returns_stock[df_log_returns_stock['time_id'] == random_time_id]
    plot_auto_correlation(df_time_slice['log_return'], stock=stock_id)
# `product` was not imported at the top of the file.
from itertools import product

# Grid-search small ARIMA (p, d, q) orders per stock and keep each fit's AIC.
arima_order = []
for stock_id in selected_stocks:
    # The stock filter does not change between orders — hoist it out of the loop.
    df_log_returns_stock = df_log_returns[df_log_returns['stock_id'] == stock_id]
    for order_seq in product((2, 1, 0), repeat=3):
        # ARIMA(0,0,0) is white noise — skip it.
        if order_seq == (0, 0, 0):
            continue
        try:
            # Fit on a randomly chosen time bucket.
            random_time_id = random.choice(df_log_returns_stock['time_id'].unique())
            df_time_slice = df_log_returns_stock[df_log_returns_stock['time_id'] == random_time_id]
            arima_model = ARIMA(df_time_slice['log_return'], order=order_seq)
            results_ARIMA = arima_model.fit(disp=-1)
            arima_order.append([stock_id, order_seq, results_ARIMA.aic])
            # print(f'{order_seq} AIC: {results_ARIMA.aic}')
        except Exception:
            # Some (p, d, q) combinations are invalid or fail to converge;
            # deliberately skip them. (Narrowed from a bare `except:` so
            # KeyboardInterrupt/SystemExit are not swallowed.)
            pass
df_arima_order = pd.DataFrame(arima_order, columns=['stock_id', 'order', 'aic'])
# Keep, per stock, the order with the minimal AIC.
df_arima_order_min = pd.merge(df_arima_order.groupby('stock_id')['aic'].min().reset_index(),
                              df_arima_order,
                              how='left',
                              on=['stock_id', 'aic'])
df_arima_order_min
stock_id | aic | order | |
---|---|---|---|
0 | 48 | -7214.905180 | (0, 1, 0) |
1 | 50 | -9740.828208 | (2, 1, 0) |
2 | 80 | -7425.750374 | (1, 1, 2) |
3 | 82 | -5490.511465 | (0, 0, 1) |
4 | 89 | -7134.379857 | (1, 0, 2) |
def plot_resid_sqr(series, stock_id='', time_id=''):
    """Plot squared ARIMA residuals for one stock/time bucket.

    Note: the labels originally read "Residial" — typo fixed to "Residual".
    """
    plt.rcParams["figure.figsize"] = 20, 5
    plt.plot(series.values)
    plt.title(f'Squared Residual Plot - stock {stock_id} - time_id {time_id}', fontsize=14)
    plt.ylabel('Squared Residual', fontsize=14)
    plt.show()
# Refit each stock's best ARIMA order on a random bucket and plot the squared
# residuals. (Indentation restored from the notebook export.)
for idx, row in df_arima_order_min.iterrows():
    stock_id = row['stock_id']
    df_log_returns_stock = df_log_returns[df_log_returns['stock_id'] == stock_id]
    # Random time bucket for fitting.
    random_time_id = random.choice(df_log_returns_stock['time_id'].unique())
    df_time_slice = df_log_returns_stock[df_log_returns_stock['time_id'] == random_time_id]
    arima_model = ARIMA(df_time_slice['log_return'], order=row['order'])
    results_ARIMA = arima_model.fit(disp=-1)
    sqr_resid = np.power(results_ARIMA.resid, 2)
    plot_resid_sqr(sqr_resid, stock_id, random_time_id)
# Same refit as above, but examine the ACF/PACF of the squared residuals
# (ARCH-effect check). Indentation restored from the notebook export.
for idx, row in df_arima_order_min.iterrows():
    stock_id = row['stock_id']
    df_log_returns_stock = df_log_returns[df_log_returns['stock_id'] == stock_id]
    # Slicing random time period to train ARIMA model
    random_time_id = random.choice(df_log_returns_stock['time_id'].unique())
    df_time_slice = df_log_returns_stock[df_log_returns_stock['time_id'] == random_time_id]
    arima_model = ARIMA(df_time_slice['log_return'], order=row['order'])
    results_ARIMA = arima_model.fit(disp=-1)
    sqr_resid = np.power(results_ARIMA.resid, 2)
    plot_auto_correlation(sqr_resid, stock=stock_id)
# Helpers for turning raw order-book rows into features.
def wap1(df):
    """Level-1 weighted average price: each side's price weighted by the
    opposite side's size."""
    wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
    return wap


def wap2(df):
    """Level-2 weighted average price (same weighting as wap1)."""
    wap = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])
    return wap


def count_unique(series):
    """Number of distinct values in `series`."""
    return len(np.unique(series))
def book_prep(path):
    """Build per-time_id order-book features for one stock's parquet file.

    Returns one row per time_id with realized volatility, mean WAPs, spread
    and volume statistics — over the whole 600 s bucket and over the last
    150/300/450 seconds — keyed by row_id ("<stock_id>-<time_id>").
    """
    df = pd.read_parquet(path)
    df['wap'] = wap1(df)
    df['log_return'] = df.groupby('time_id')['wap'].apply(log_return)
    df['wap2'] = wap2(df)
    df['log_return2'] = df.groupby('time_id')['wap2'].apply(log_return)
    # Relative (normalized) spreads at both book levels.
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / (df['ask_price1'] + df['bid_price1'])
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / (df['ask_price2'] + df['bid_price2'])
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = df['ask_size1'] + df['ask_size2'] + df['bid_size1'] + df['bid_size2']
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    # Aggregations applied per time_id.
    create_feature_dict = {
        'log_return': [realized_volatility],
        'log_return2': [realized_volatility],
        'wap': [np.mean],
        'wap2': [np.mean],
        'price_spread': [np.mean],
        'price_spread2': [np.mean],
        'bid_spread': [np.mean],
        'ask_spread': [np.mean],
        'volume_imbalance': [np.mean],
        'total_volume': [np.mean],
    }
    # Full-bucket aggregation.
    df_feature = pd.DataFrame(df.groupby(['time_id']).agg(create_feature_dict)).reset_index()
    # Flatten the MultiIndex columns; 'time_id' becomes 'time_id_'.
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]
    # Repeat the aggregation on only the last 150/300/450 seconds of the bucket.
    last_seconds = [150, 300, 450]
    for second in last_seconds:
        second = 600 - second
        df_feature_sec = pd.DataFrame(df.query(f'seconds_in_bucket >={second}').groupby(['time_id']).agg(create_feature_dict)).reset_index()
        df_feature_sec.columns = ['_'.join(col) for col in df_feature_sec.columns]
        df_feature_sec = df_feature_sec.add_suffix('_' + str(second))
        df_feature = pd.merge(df_feature, df_feature_sec, how='left', left_on='time_id_', right_on=f'time_id__{second}')
        df_feature = df_feature.drop([f'time_id__{second}'], axis=1)
    # Build row_id as "<stock_id>-<time_id>" from the hive-partitioned path.
    stock_id = path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(['time_id_'], axis=1)
    return df_feature
# 查看执行上述处理后的样子,并看需要多少时间
%%time
# Run book_prep on stock 0 to inspect the output shape and timing.
data_dir = '../optiver-realized-volatility-prediction/'
path = data_dir + "book_train.parquet/stock_id=0"
book_prep(path)
CPU times: total: 6.17 s
Wall time: 6.03 s
log_return_realized_volatility | log_return2_realized_volatility | wap_mean | wap2_mean | price_spread_mean | price_spread2_mean | bid_spread_mean | ask_spread_mean | volume_imbalance_mean | total_volume_mean | ... | log_return2_realized_volatility_150 | wap_mean_150 | wap2_mean_150 | price_spread_mean_150 | price_spread2_mean_150 | bid_spread_mean_150 | ask_spread_mean_150 | volume_imbalance_mean_150 | total_volume_mean_150 | row_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.004499 | 0.006999 | 1.003725 | 1.003661 | 0.000426 | 0.000588 | 0.000176 | -0.000151 | 134.894040 | 323.496689 | ... | 0.006087 | 1.003832 | 1.003753 | 0.000429 | 0.000596 | 0.000188 | -0.000147 | 123.586207 | 327.431034 | 0-5 |
1 | 0.001204 | 0.002476 | 1.000239 | 1.000206 | 0.000197 | 0.000335 | 0.000142 | -0.000135 | 142.050000 | 411.450000 | ... | 0.002262 | 1.000301 | 1.000245 | 0.000176 | 0.000310 | 0.000141 | -0.000127 | 151.566474 | 419.277457 | 0-11 |
2 | 0.002369 | 0.004801 | 0.999542 | 0.999680 | 0.000363 | 0.000560 | 0.000197 | -0.000198 | 141.414894 | 416.351064 | ... | 0.004019 | 0.999126 | 0.999312 | 0.000340 | 0.000540 | 0.000161 | -0.000241 | 132.084034 | 428.537815 | 0-16 |
3 | 0.002574 | 0.003637 | 0.998832 | 0.998633 | 0.000430 | 0.000579 | 0.000190 | -0.000108 | 146.216667 | 435.266667 | ... | 0.003273 | 0.998464 | 0.998353 | 0.000460 | 0.000600 | 0.000170 | -0.000108 | 151.765432 | 424.234568 | 0-31 |
4 | 0.001894 | 0.003257 | 0.999619 | 0.999626 | 0.000199 | 0.000349 | 0.000191 | -0.000109 | 123.846591 | 343.221591 | ... | 0.002927 | 0.999618 | 0.999670 | 0.000198 | 0.000349 | 0.000187 | -0.000117 | 131.474074 | 371.266667 | 0-62 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3825 | 0.002579 | 0.003821 | 0.997938 | 0.997864 | 0.000276 | 0.000409 | 0.000083 | -0.000182 | 197.144781 | 374.235690 | ... | 0.003507 | 0.997784 | 0.997666 | 0.000286 | 0.000405 | 0.000066 | -0.000171 | 196.365639 | 349.916300 | 0-32751 |
3826 | 0.002206 | 0.002847 | 1.000310 | 1.000487 | 0.000271 | 0.000403 | 0.000092 | -0.000172 | 233.781553 | 621.131068 | ... | 0.002396 | 1.000528 | 1.000742 | 0.000289 | 0.000418 | 0.000079 | -0.000181 | 253.850340 | 707.782313 | 0-32753 |
3827 | 0.002913 | 0.003266 | 0.999552 | 0.999456 | 0.000263 | 0.000405 | 0.000202 | -0.000083 | 115.829787 | 343.734043 | ... | 0.003006 | 0.999855 | 0.999793 | 0.000257 | 0.000407 | 0.000215 | -0.000084 | 103.022556 | 313.142857 | 0-32758 |
3828 | 0.003046 | 0.005105 | 1.002357 | 1.002386 | 0.000240 | 0.000379 | 0.000113 | -0.000166 | 132.074919 | 385.429967 | ... | 0.004526 | 1.002281 | 1.002325 | 0.000259 | 0.000380 | 0.000095 | -0.000148 | 120.972603 | 389.273973 | 0-32763 |
3829 | 0.001901 | 0.002541 | 0.999123 | 0.999166 | 0.000229 | 0.000355 | 0.000124 | -0.000127 | 165.754386 | 533.543860 | ... | 0.001823 | 0.998746 | 0.998772 | 0.000236 | 0.000359 | 0.000128 | -0.000118 | 152.146341 | 505.756098 | 0-32767 |
3830 rows × 41 columns
# Peek at the raw trade data for stock 0.
look=pd.read_parquet('../optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0')
look
time_id | seconds_in_bucket | price | size | order_count | |
---|---|---|---|---|---|
0 | 5 | 21 | 1.002301 | 326 | 12 |
1 | 5 | 46 | 1.002778 | 128 | 4 |
2 | 5 | 50 | 1.002818 | 55 | 1 |
3 | 5 | 57 | 1.003155 | 121 | 5 |
4 | 5 | 68 | 1.003646 | 4 | 1 |
... | ... | ... | ... | ... | ... |
123438 | 32767 | 471 | 0.998659 | 200 | 3 |
123439 | 32767 | 517 | 0.998515 | 90 | 1 |
123440 | 32767 | 523 | 0.998563 | 1 | 1 |
123441 | 32767 | 542 | 0.998803 | 90 | 4 |
123442 | 32767 | 567 | 0.998547 | 300 | 3 |
123443 rows × 5 columns
# Build features from trade executions.
def trade_prep(path):
    """Build per-time_id trade features for one stock's trade parquet file.

    Features: realized volatility of trade-price log returns, number of
    distinct trade seconds, total size and mean order count — over the whole
    bucket and over the last 150/300/450 seconds. Columns get a 'trade_'
    prefix; rows are keyed by row_id ("<stock_id>-<time_id>").
    """
    df = pd.read_parquet(path)
    df['log_return'] = df.groupby('time_id')['price'].apply(log_return)
    aggregate_dictionary = {
        'log_return': [realized_volatility],
        'seconds_in_bucket': [count_unique],
        'size': [np.sum],
        'order_count': [np.mean],
    }
    df_feature = df.groupby('time_id').agg(aggregate_dictionary).reset_index()
    # Flatten MultiIndex columns; 'time_id' becomes 'time_id_'.
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]
    last_seconds = [150, 300, 450]
    for second in last_seconds:
        second = 600 - second
        df_feature_sec = df.query(f'seconds_in_bucket >= {second}').groupby('time_id').agg(aggregate_dictionary)
        df_feature_sec = df_feature_sec.reset_index()
        df_feature_sec.columns = ['_'.join(col) for col in df_feature_sec.columns]
        df_feature_sec = df_feature_sec.add_suffix('_' + str(second))
        df_feature = pd.merge(df_feature, df_feature_sec, how='left', left_on='time_id_', right_on=f'time_id__{second}')
        df_feature = df_feature.drop([f'time_id__{second}'], axis=1)
    df_feature = df_feature.add_prefix('trade_')
    stock_id = path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(['trade_time_id_'], axis=1)
    return df_feature
%%time
# Run trade_prep on stock 0 to inspect the output shape and timing.
data_dir = '../optiver-realized-volatility-prediction/'
path = data_dir + "trade_train.parquet/stock_id=0"
trade_prep(path)
CPU times: total: 3.03 s
Wall time: 3.03 s
trade_log_return_realized_volatility | trade_seconds_in_bucket_count_unique | trade_size_sum | trade_order_count_mean | trade_log_return_realized_volatility_450 | trade_seconds_in_bucket_count_unique_450 | trade_size_sum_450 | trade_order_count_mean_450 | trade_log_return_realized_volatility_300 | trade_seconds_in_bucket_count_unique_300 | trade_size_sum_300 | trade_order_count_mean_300 | trade_log_return_realized_volatility_150 | trade_seconds_in_bucket_count_unique_150 | trade_size_sum_150 | trade_order_count_mean_150 | row_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.002006 | 40 | 3179 | 2.750000 | 0.001060 | 14.0 | 1042.0 | 2.642857 | 0.001308 | 21.0 | 1587.0 | 2.571429 | 0.001701 | 30.0 | 2069.0 | 2.433333 | 0-5 |
1 | 0.000901 | 30 | 1289 | 1.900000 | 0.000501 | 10.0 | 828.0 | 2.200000 | 0.000587 | 16.0 | 900.0 | 2.250000 | 0.000813 | 24.0 | 1173.0 | 2.041667 | 0-11 |
2 | 0.001961 | 25 | 2161 | 2.720000 | 0.001048 | 9.0 | 1085.0 | 3.666667 | 0.001137 | 12.0 | 1189.0 | 3.166667 | 0.001621 | 20.0 | 2010.0 | 2.950000 | 0-16 |
3 | 0.001561 | 15 | 1962 | 3.933333 | 0.000802 | 3.0 | 514.0 | 3.666667 | 0.001089 | 9.0 | 1556.0 | 5.111111 | 0.001401 | 11.0 | 1631.0 | 4.545455 | 0-31 |
4 | 0.000871 | 22 | 1791 | 4.045455 | 0.000360 | 4.0 | 43.0 | 3.500000 | 0.000453 | 11.0 | 1219.0 | 4.909091 | 0.000550 | 16.0 | 1570.0 | 4.500000 | 0-62 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3825 | 0.001519 | 52 | 3450 | 3.057692 | 0.000786 | 19.0 | 1159.0 | 2.947368 | 0.001162 | 35.0 | 2365.0 | 3.257143 | 0.001409 | 42.0 | 2957.0 | 3.238095 | 0-32751 |
3826 | 0.001411 | 28 | 4547 | 3.892857 | 0.000750 | 5.0 | 1158.0 | 4.600000 | 0.001066 | 12.0 | 2161.0 | 4.250000 | 0.001284 | 19.0 | 2494.0 | 3.421053 | 0-32753 |
3827 | 0.001521 | 36 | 4250 | 3.500000 | 0.000780 | 8.0 | 416.0 | 2.000000 | 0.001242 | 22.0 | 2294.0 | 3.727273 | 0.001375 | 27.0 | 2736.0 | 3.444444 | 0-32758 |
3828 | 0.001794 | 53 | 3217 | 2.150943 | 0.001012 | 12.0 | 1415.0 | 2.666667 | 0.001404 | 25.0 | 1627.0 | 1.920000 | 0.001650 | 36.0 | 2296.0 | 2.055556 | 0-32763 |
3829 | 0.001197 | 29 | 3679 | 2.413793 | 0.000542 | 8.0 | 987.0 | 2.125000 | 0.000801 | 16.0 | 2650.0 | 3.000000 | 0.000990 | 23.0 | 3346.0 | 2.739130 | 0-32767 |
3830 rows × 17 columns
Joblib
from joblib import Parallel, delayed
def preprocessor(list_stock_ids, is_train=True):
    """Build the merged book+trade feature frame for a list of stock ids,
    one stock per joblib worker.

    Parameters:
        list_stock_ids: iterable of stock ids to process.
        is_train: choose the train or test parquet directories.
    """
    def for_joblib(stock_id):
        # One worker: features for a single stock, joined on row_id.
        if is_train:
            file_path_book = data_dir + "book_train.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "trade_train.parquet/stock_id=" + str(stock_id)
        else:
            file_path_book = data_dir + "book_test.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "trade_test.parquet/stock_id=" + str(stock_id)
        # The original concatenated the result with an always-empty outer
        # frame; returning the merged frame directly is equivalent.
        return pd.merge(book_prep(file_path_book), trade_prep(file_path_trade), on='row_id', how='left')

    results = Parallel(n_jobs=-1, verbose=1)(
        delayed(for_joblib)(stock_id) for stock_id in list_stock_ids
    )
    return pd.concat(results, ignore_index=True)
- Train_data
# Load the training targets and the list of stock ids to process.
train = pd.read_csv(data_dir + 'train.csv')
train_ids = train.stock_id.unique()
# 合并所有训练集
%%time
# Build the full training feature frame (all stocks, in parallel).
df_train = preprocessor(list_stock_ids= train_ids, is_train = True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 48.5s
CPU times: total: 2.05 s
Wall time: 2min 51s
[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed: 2.8min finished
# Attach row_id to the targets and join the engineered features onto them.
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]
df_train = train.merge(df_train, on = ['row_id'], how = 'left')
- Test_data
# Load the test metadata and the list of test stock ids.
test = pd.read_csv(data_dir + 'test.csv')
test_ids = test.stock_id.unique()
%%time
# Build the test feature frame.
df_test = preprocessor(list_stock_ids= test_ids, is_train = False)
CPU times: total: 0 ns
Wall time: 78.8 ms
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 1 out of 1 | elapsed: 0.0s finished
df_test = test.merge(df_test, on=['row_id'], how='left')
from sklearn.model_selection import KFold
# Recover stock_id (as a string) from row_id on both frames.
df_train['stock_id'] = df_train['row_id'].apply(lambda x: x.split('-')[0])
df_test['stock_id'] = df_test['row_id'].apply(lambda x: x.split('-')[0])
# Target-encode stock_id: the test set uses the full-train per-stock mean ...
stock_id_target_mean = df_train.groupby('stock_id')['target'].mean()
df_test['stock_id_target_enc'] = df_test['stock_id'].map(stock_id_target_mean)
# ... while the train set uses out-of-fold means to avoid target leakage.
# NOTE(review): KFold(shuffle=True) without random_state is not reproducible
# between runs — confirm whether that is intended.
tmp = np.repeat(np.nan, df_train.shape[0])
kf = KFold(n_splits=10, shuffle=True)
for idx_1, idx_2 in kf.split(df_train):
    target_mean = df_train.iloc[idx_1].groupby('stock_id')['target'].mean()
    tmp[idx_2] = df_train['stock_id'].iloc[idx_2].map(target_mean)
df_train['stock_id_target_enc'] = tmp
# stock_id back to int so LightGBM can treat it as a categorical feature.
df_train['stock_id'] = df_train['stock_id'].astype(int)
df_test['stock_id'] = df_test['stock_id'].astype(int)
# Feature matrix and target vector for model training.
X=df_train.drop(['row_id','target'],axis=1)
y=df_train['target']
# Define RMSPE (root mean squared percentage error) — the competition metric.
def rmspe(y_true, y_pred):
    """Return the RMSPE of y_pred against y_true."""
    return (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))
def feval_RMSPE(preds, lgbm_train):
    """LightGBM custom eval hook.

    Returns (metric_name, value rounded to 5 dp, is_higher_better=False).
    """
    labels = lgbm_train.get_label()
    score = round(rmspe(y_true=labels, y_pred=preds), 5)
    return 'RMSPE', score, False
# LightGBM hyper-parameters.
# NOTE(review): putting 'early_stopping_rounds' inside params triggers the
# "Found `early_stopping_rounds` in params" UserWarning seen in the logs
# below, but the value is still honoured by lgbm.train().
params = {
"objective": "rmse",
"metric": "rmse",
"boosting_type": "gbdt",
'early_stopping_rounds': 30,
'learning_rate': 0.01,
'lambda_l1': 1,
'lambda_l2': 1,
}
# Cross-validation setup: 4 folds. NOTE(review): the shuffle is unseeded,
# so the splits (and the CV score) differ between runs.
kf = KFold(n_splits=4,shuffle=True)
# oof: placeholder for out-of-fold results; models/scores collected per fold.
oof = pd.DataFrame()
models = []
scores = 0.0
# --- Training loop: one LightGBM model per CV fold ---
for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    print("Fold :", fold+1)
    X_train, y_train = X.loc[trn_idx], y[trn_idx]
    X_valid, y_valid = X.loc[val_idx], y[val_idx]
    # https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/250324
    # 1/y^2 sample weights make the RMSE objective approximate RMSPE.
    weights = 1/np.square(y_train)
    lgbm_train = lgbm.Dataset(X_train,y_train,weight = weights)
    weights = 1/np.square(y_valid)
    lgbm_valid = lgbm.Dataset(X_valid,y_valid,reference = lgbm_train,weight = weights)
    # Train with early stopping (configured in params) on the validation fold.
    model = lgbm.train(params=params,
                       train_set=lgbm_train,
                       valid_sets=[lgbm_train, lgbm_valid],
                       num_boost_round=1000,
                       feval=feval_RMSPE,
                       verbose_eval=200,
                       categorical_feature = ['stock_id']
                       )
    # Out-of-fold prediction at the best iteration for this fold.
    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    # Report this fold's RMSPE.
    RMSPE = round(rmspe(y_true = y_valid, y_pred = y_pred),4)
    print(f'Performance of the prediction: , RMSPE: {RMSPE}')
    # Accumulate the mean CV score. Use kf.get_n_splits() instead of the
    # original hard-coded 4 so a change to n_splits above stays consistent.
    scores += RMSPE / kf.get_n_splits()
    models.append(model)
    print("*" * 100)
Fold : 1
D:\Anaconda\lib\site-packages\lightgbm\engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1705: UserWarning: categorical_feature in Dataset is overridden.
New categorical_feature is ['stock_id']
_log_warning('categorical_feature in Dataset is overridden.\n'
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14543
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 58
[LightGBM] [Info] Start training from score 0.001795
Training until validation scores don't improve for 30 rounds
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1433: UserWarning: Overriding the parameters from Reference Dataset.
_log_warning('Overriding the parameters from Reference Dataset.')
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1245: UserWarning: categorical_column in param dict is overridden.
_log_warning('{} in param dict is overridden.'.format(cat_alias))
[200] training's rmse: 0.000533368 training's RMSPE: 0.24722 valid_1's rmse: 0.0005513 valid_1's RMSPE: 0.254
[400] training's rmse: 0.000499109 training's RMSPE: 0.23134 valid_1's rmse: 0.00052092 valid_1's RMSPE: 0.24
[600] training's rmse: 0.000492161 training's RMSPE: 0.22812 valid_1's rmse: 0.000516319 valid_1's RMSPE: 0.23788
[800] training's rmse: 0.000487324 training's RMSPE: 0.22588 valid_1's rmse: 0.000514602 valid_1's RMSPE: 0.23709
[1000] training's rmse: 0.000483462 training's RMSPE: 0.22409 valid_1's rmse: 0.000513685 valid_1's RMSPE: 0.23667
Did not meet early stopping. Best iteration is:
[1000] training's rmse: 0.000483462 training's RMSPE: 0.22409 valid_1's rmse: 0.000513685 valid_1's RMSPE: 0.23667
Performance of the prediction: , RMSPE: 0.2367
****************************************************************************************************
Fold : 2
D:\Anaconda\lib\site-packages\lightgbm\engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1705: UserWarning: categorical_feature in Dataset is overridden.
New categorical_feature is ['stock_id']
_log_warning('categorical_feature in Dataset is overridden.\n'
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049546 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14543
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 58
[LightGBM] [Info] Start training from score 0.001810
Training until validation scores don't improve for 30 rounds
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1433: UserWarning: Overriding the parameters from Reference Dataset.
_log_warning('Overriding the parameters from Reference Dataset.')
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1245: UserWarning: categorical_column in param dict is overridden.
_log_warning('{} in param dict is overridden.'.format(cat_alias))
[200] training's rmse: 0.000534933 training's RMSPE: 0.24681 valid_1's rmse: 0.000542633 valid_1's RMSPE: 0.25343
[400] training's rmse: 0.000501284 training's RMSPE: 0.23129 valid_1's rmse: 0.000512115 valid_1's RMSPE: 0.23918
[600] training's rmse: 0.000494241 training's RMSPE: 0.22804 valid_1's rmse: 0.000510521 valid_1's RMSPE: 0.23844
Early stopping, best iteration is:
[577] training's rmse: 0.000494887 training's RMSPE: 0.22834 valid_1's rmse: 0.000510407 valid_1's RMSPE: 0.23838
Performance of the prediction: , RMSPE: 0.2384
****************************************************************************************************
Fold : 3
D:\Anaconda\lib\site-packages\lightgbm\engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1705: UserWarning: categorical_feature in Dataset is overridden.
New categorical_feature is ['stock_id']
_log_warning('categorical_feature in Dataset is overridden.\n'
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.050070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14543
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 58
[LightGBM] [Info] Start training from score 0.001799
Training until validation scores don't improve for 30 rounds
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1433: UserWarning: Overriding the parameters from Reference Dataset.
_log_warning('Overriding the parameters from Reference Dataset.')
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1245: UserWarning: categorical_column in param dict is overridden.
_log_warning('{} in param dict is overridden.'.format(cat_alias))
[200] training's rmse: 0.000533172 training's RMSPE: 0.24683 valid_1's rmse: 0.000539364 valid_1's RMSPE: 0.24942
[400] training's rmse: 0.000499037 training's RMSPE: 0.23102 valid_1's rmse: 0.000509534 valid_1's RMSPE: 0.23563
[600] training's rmse: 0.000492018 training's RMSPE: 0.22777 valid_1's rmse: 0.000506017 valid_1's RMSPE: 0.234
[800] training's rmse: 0.000487172 training's RMSPE: 0.22553 valid_1's rmse: 0.000503676 valid_1's RMSPE: 0.23292
[1000] training's rmse: 0.000483288 training's RMSPE: 0.22373 valid_1's rmse: 0.000502225 valid_1's RMSPE: 0.23225
Did not meet early stopping. Best iteration is:
[1000] training's rmse: 0.000483288 training's RMSPE: 0.22373 valid_1's rmse: 0.000502225 valid_1's RMSPE: 0.23225
Performance of the prediction: , RMSPE: 0.2322
****************************************************************************************************
Fold : 4
D:\Anaconda\lib\site-packages\lightgbm\engine.py:153: UserWarning: Found `early_stopping_rounds` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1705: UserWarning: categorical_feature in Dataset is overridden.
New categorical_feature is ['stock_id']
_log_warning('categorical_feature in Dataset is overridden.\n'
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044247 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14544
[LightGBM] [Info] Number of data points in the train set: 321699, number of used features: 58
[LightGBM] [Info] Start training from score 0.001796
Training until validation scores don't improve for 30 rounds
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1433: UserWarning: Overriding the parameters from Reference Dataset.
_log_warning('Overriding the parameters from Reference Dataset.')
D:\Anaconda\lib\site-packages\lightgbm\basic.py:1245: UserWarning: categorical_column in param dict is overridden.
_log_warning('{} in param dict is overridden.'.format(cat_alias))
[200] training's rmse: 0.000533089 training's RMSPE: 0.24704 valid_1's rmse: 0.000543243 valid_1's RMSPE: 0.25045
[400] training's rmse: 0.000498918 training's RMSPE: 0.2312 valid_1's rmse: 0.000512558 valid_1's RMSPE: 0.2363
[600] training's rmse: 0.000492014 training's RMSPE: 0.228 valid_1's rmse: 0.000508456 valid_1's RMSPE: 0.23441
[800] training's rmse: 0.000487206 training's RMSPE: 0.22578 valid_1's rmse: 0.000506189 valid_1's RMSPE: 0.23336
[1000] training's rmse: 0.000483353 training's RMSPE: 0.22399 valid_1's rmse: 0.000504692 valid_1's RMSPE: 0.23267
Did not meet early stopping. Best iteration is:
[1000] training's rmse: 0.000483353 training's RMSPE: 0.22399 valid_1's rmse: 0.000504692 valid_1's RMSPE: 0.23267
Performance of the prediction: , RMSPE: 0.2327
****************************************************************************************************
scores
0.235
# Prepare the test matrix: keep row_id aside for the submission frame,
# drop the non-feature columns before prediction.
y_pred = df_test[['row_id']]
X_test = df_test.drop(['time_id', 'row_id'], axis = 1)
X_test
stock_id | log_return_realized_volatility | log_return2_realized_volatility | wap_mean | wap2_mean | price_spread_mean | price_spread2_mean | bid_spread_mean | ask_spread_mean | volume_imbalance_mean | ... | trade_order_count_mean_450 | trade_log_return_realized_volatility_300 | trade_seconds_in_bucket_count_unique_300 | trade_size_sum_300 | trade_order_count_mean_300 | trade_log_return_realized_volatility_150 | trade_seconds_in_bucket_count_unique_150 | trade_size_sum_150 | trade_order_count_mean_150 | stock_id_target_enc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.000294 | 0.000252 | 1.000405 | 1.00055 | 0.000279 | 0.000533 | 0.000393 | -0.000115 | 164.666667 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.004028 |
1 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.004028 |
2 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.004028 |
3 rows × 58 columns
target = np.zeros(len(X_test))
# Average the predictions of all fold models (simple mean ensemble).
for model in models:
    # Select/order columns via the model's own recorded feature names
    # instead of relying on X_valid leaking out of the training loop.
    pred = model.predict(X_test[model.feature_name()], num_iteration=model.best_iteration)
    target += pred / len(models)
y_pred = y_pred.assign(target = target)
y_pred
y_pred
row_id | target | |
---|---|---|
0 | 0-4 | 0.000970 |
1 | 0-32 | 0.000951 |
2 | 0-34 | 0.000951 |
# Write the final submission file (columns: row_id, target).
y_pred.to_csv('submission.csv',index = False)