本文主要用于分类选股,没有太多特别之处。
1、这里用的是涨跌幅做聚类,特征并不多!
2、时间只是分析了2023年至今的分类情况!
3、这里博主用 K-means 选了10类,可以根据自己的需求更改!
这个有什么用呢?
1、存在对冲关系的股票必定分属两个不同的聚类!
2、K-means 选2类时,大家可以看看是不是分成了大市值和小市值这两类!
3、热门股票们(龙头们)是会有聚类现象的,可以辅助分析!
from jqdata import * from jqfactor import * import jqdata from jqfactor import * import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import jqdata
from jqfactor import *
from tqdm import tqdm
import warnings

# Suppress every warning raised from this point on.
warnings.filterwarnings("ignore")

import pandas as pd
from pandas.tseries.offsets import CustomBusinessDay
# The module (not the class) is imported on purpose: the code below uses
# datetime.datetime and datetime.timedelta.
import datetime
import xgboost as xgb
import talib
from sklearn.preprocessing import StandardScaler

current_date = datetime.datetime.now()
current_time = current_date.strftime("%Y-%m-%d")

# Pool name -> index codes whose constituents make up the pool.
_INDEX_POOLS = {
    'HS300': ('000300.XSHG',),
    'ZZ500': ('399905.XSHE',),
    'ZZ800': ('399906.XSHE',),
    'CYBZ': ('399006.XSHE',),
    'ZXBZ': ('399005.XSHE',),
    'A': ('000002.XSHG', '399107.XSHE'),
    'AA': ('000985.XSHG',),
}
# Pools that additionally drop codes starting with one of the prefixes below
# (presumably ChiNext / STAR Market / BSE listings -- confirm).
_PREFIX_FILTERED_POOLS = ('A', 'AA')
_EXCLUDED_CODE_PREFIXES = ('3', '68', '4', '8')


def delect_stop(stocks, beginDate, n=365):
    """Keep only the stocks listed more than ``n`` days before ``beginDate``.

    Args:
        stocks: iterable of security codes accepted by ``get_security_info``.
        beginDate: reference date as a ``"%Y-%m-%d"`` string.
        n: minimum listing age in calendar days (default 365).

    Returns:
        A list, in input order, of the codes whose ``start_date`` is strictly
        earlier than ``beginDate - n days``.

    Raises:
        ValueError: if ``beginDate`` does not match ``"%Y-%m-%d"``.
    """
    reference = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    # Computed once; it does not depend on the stock being examined.
    cutoff = (reference - datetime.timedelta(days=n)).date()
    return [
        stock for stock in stocks
        if get_security_info(stock).start_date < cutoff
    ]


def get_stock(stockPool, begin_date):
    """Build the stock universe for ``stockPool`` as of ``begin_date``.

    The constituents of the pool's index(es) are fetched, codes with an
    excluded prefix are removed for the 'A' and 'AA' pools, stocks flagged
    ``is_st`` on ``begin_date`` are removed, and finally stocks that have not
    been listed long enough are removed via ``delect_stop``.

    Args:
        stockPool: one of 'HS300', 'ZZ500', 'ZZ800', 'CYBZ', 'ZXBZ', 'A', 'AA'.
        begin_date: reference date as a ``"%Y-%m-%d"`` string.

    Returns:
        A list of security codes.

    Raises:
        ValueError: if ``stockPool`` is not a known pool name.
    """
    if stockPool not in _INDEX_POOLS:
        # Previously an unknown name fell through every branch and crashed
        # later with an UnboundLocalError on ``stockList``.
        raise ValueError(
            "unknown stockPool {!r}; expected one of {}".format(
                stockPool, sorted(_INDEX_POOLS)
            )
        )

    stockList = []
    for index_code in _INDEX_POOLS[stockPool]:
        stockList += get_index_stocks(index_code, begin_date)

    if stockPool in _PREFIX_FILTERED_POOLS:
        stockList = [
            stock for stock in stockList
            if not stock.startswith(_EXCLUDED_CODE_PREFIXES)
        ]

    # Drop ST stocks. ``count=1`` yields a single row per stock; read it by
    # position with .iloc because the row label is a date, not the integer 0.
    st_data = get_extras('is_st', stockList, count=1, end_date=begin_date)
    stockList = [stock for stock in stockList if not st_data[stock].iloc[0]]

    # Drop recently listed stocks (delect_stop only checks the listing date).
    return delect_stop(stockList, begin_date)


stockList = get_stock('AA', current_time)

# The price window ends two calendar days before today.
_end_date = (current_date + datetime.timedelta(-2)).strftime("%Y-%m-%d")
print(_end_date)

# Daily percentage returns per stock since 2023-01-01.
returns_by_stock = {}
for stock in stockList:
    prices = get_price(
        stock,
        frequency='1d',
        start_date='2023-01-01',
        end_date=_end_date,
        fields=['close'],
    )
    prices['returns'] = prices['close'].pct_change() * 100
    # Removes the first row (no previous close) and any row without a price.
    prices = prices.dropna()
    returns_by_stock[stock] = prices['returns'].values

# A rectangular matrix needs series of equal length. Stocks with a shorter
# series (e.g. missing prices for part of the window) are dropped, and
# stockList is narrowed with them so that row k of DATA is stockList[k].
full_length = max((len(r) for r in returns_by_stock.values()), default=0)
complete_stocks = [
    stock for stock in stockList
    if len(returns_by_stock[stock]) == full_length
]
if len(complete_stocks) < len(stockList):
    print("剔除收益率序列不完整的股票数量:", len(stockList) - len(complete_stocks))
stockList = complete_stocks

DATA = np.array([returns_by_stock[stock] for stock in stockList])
print(DATA.shape)
if DATA.size == 0:
    raise ValueError("no return data was collected; check the pool and dates")

max_value = np.max(DATA)
min_value = np.min(DATA)
nan_count = np.isnan(DATA).sum()
print("数组中包含 NaN 的数量:", nan_count)
print("最大值:", max_value)
print("最小值:", min_value)
2023-12-12 (3032, 228) 数组中包含 NaN 的数量: 0 最大值: 10.465116279069765 最小值: -10.416666666666663
import numpy as np import matplotlib.pyplot as plt from sklearn.manifold import TSNE
# Embed the rows of DATA in two dimensions with t-SNE; the fixed seed keeps
# the layout the same from run to run.
tsne = TSNE(random_state=42, n_components=2)
embedded_data = tsne.fit_transform(DATA)
print(embedded_data.shape)

# Scatter plot of the embedding: first component on x, second on y.
plt.scatter(*embedded_data.T, alpha=0.5, marker='.')
plt.title('t-SNE Visualization')
plt.show()
未完,详见下一篇
...