# Shanghai stocks and waveform pattern prediction
"""
date:25/09/2021
author:syb
version:1.0
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Print every row and column, and never truncate cell contents.
for _option in ('display.max_rows', 'display.max_columns', 'max_colwidth'):
    pd.set_option(_option, None)
del _option

# Load the pickled table.
# NOTE: unpickling can execute arbitrary code; only load a trusted file.
df = pd.read_pickle('stock_SH.plk')
# Keep the loaded frame untouched and group a working copy by month.
df1 = df.copy()
df_group = df1.groupby(by=['Date_month'])
def del_nan(lis, lower=-100, upper=100):
    """Return the entries of ``lis`` lying strictly between ``lower`` and ``upper``.

    Each entry is converted with ``float()`` for the comparison only; the
    original object is what ends up in the result.  NaN entries are dropped
    because every comparison against NaN is false.  Input order is kept and
    ``lis`` itself is not modified.

    :param lis: sequence of values convertible to ``float``
    :param lower: exclusive lower bound (default -100)
    :param upper: exclusive upper bound (default 100)
    :return: new list with the entries that passed the range test
    :raises ValueError, TypeError: if an entry cannot be converted to float
    """
    return [value for value in lis if lower < float(value) < upper]
# Price columns only: the first and last columns are skipped (per the
# original notes these hold Date and Date_month).
lis_col = df1.columns.tolist()[1:-1]
res = 100  # starting capital; the report line below prints it in units of 10,000 yuan
for name, group in df_group:
    # ``group`` is the DataFrame holding one month of rows; its first row is
    # treated as the buy day and its last row as the sell day.
    opening_price = group.iloc[0, 1:-1].values.tolist()
    # Stocks without a usable buy-day price (e.g. NaN) are not ranked.
    opening_price = del_nan(opening_price)
    # Number of stocks in the cheapest 10%.
    m = len(opening_price) // 10
    if m > 0:
        opening_price.sort()
        # Highest price that still belongs to the cheapest 10%.
        value_10 = opening_price[m - 1]
        yield_rate = 0
        for i, col in enumerate(lis_col):
            end = group.iloc[-1, i + 1]
            start = group.iloc[0, i + 1]
            # Each stock gets res / m of the capital and returns
            # res / m * (1 + single_stock_rate).
            # NOTE(review): ties at value_10 can select more than m stocks
            # while the sum is still divided by m -- confirm this is intended.
            if start <= value_10:
                yield_rate += (end - start) / start
        res = (1 + yield_rate / m) * res
    # With fewer than 10 priced stocks (m == 0) nothing is bought and the
    # capital carries over unchanged instead of crashing on an empty slice.
    print('%s卖出后剩余金额为%f万元' % (name, res))
# Final wealth reported by the original author: 543.900078
"""
date:26/09/2021
author:syb
version:1.0
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import scipy.stats as st
import os
import re
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)
# Let DataFrame.info() list every column (the original note gives 100 as the default).
pd.set_option("display.max_info_columns", 2000)
# Directory holding the input files.
file_dir = './test1_files/'
file_list = os.listdir(file_dir)
# Order the inputs by the number embedded in their name rather than by the
# arbitrary order os.listdir returns.  Only entries named test<N>.csv are
# kept, so stray files no longer crash int('') or yield made-up file names.
numbered_files = []
c = []
for f in file_list:
    matched = re.fullmatch(r"test(\d+)\.csv", f)
    if matched is None:
        continue
    c.append(matched.group(1))
    numbered_files.append((int(matched.group(1)), f))
numbered_files.sort()
c_1 = [number for number, _ in numbered_files]
new_files = [file_name for _, file_name in numbered_files]
# Per-file results are collected in this dictionary.
result = dict()
# The frames could be merged and processed together; reading them one by one
# avoids dropping missing values row by row.
def select_pattern_len(start, df):
    """Return the number of points in one repetition of the pattern at ``start``.

    Works on row 0 of ``df``.  Candidate lengths 3..19 are tried in increasing
    order: the ``j`` values beginning at ``start`` are compared with the ``j``
    values that follow them, and the first ``j`` whose Pearson correlation
    exceeds 0.9999999 is returned.  If no candidate qualifies, or the row ends
    before two equally long segments fit, the fallback length 2 is returned.

    :param start: column position where the pattern begins
    :param df: DataFrame whose first row holds the waveform
    :return: number of points per repetition
    """
    p_len = 2
    for j in range(3, 20):
        a = df.iloc[0, start: start + j].values
        b = df.iloc[0, start + j: start + 2 * j].values
        if len(a) != len(b):
            # The second segment ran past the end of the row.
            break
        pearson, _ = st.pearsonr(a, b)
        if pearson > 0.9999999:
            p_len = j
            break
    return p_len


class Solution:
    def style_pattern(self, start, p_len, all_style, len1):
        """Count the repetitions of each consecutive pattern in row 0 of ``df``.

        Reads the module-level ``df`` and ``int_dot`` (number of values in the
        row), which the per-file loop below sets before calling.

        Starting at ``start``, the first ``p_len`` values are compared with
        each following segment of ``p_len`` values.  Segments whose Pearson
        correlation with the first one exceeds 0.9999999 count as repetitions.
        The first segment that does not correlate marks the start of the next
        pattern, whose length is found with ``select_pattern_len``.  One count
        per pattern is appended to ``all_style`` (mutated in place).

        :param start: column position where the current pattern begins
        :param p_len: number of points in one repetition of that pattern
        :param all_style: list receiving the repetition count of each pattern
        :param len1: number of values consumed so far; scanning stops once it
            reaches ``int_dot``
        :return: ``(start, p_len, all_style, len1)`` after the last pattern
        """
        while True:
            style_n = 0
            advance = 0  # values consumed before a new pattern was detected
            a = df.iloc[0, start: start + p_len].values
            for i in range(1, int_dot // p_len):
                b = df.iloc[0, (start + i * p_len): (start + (i + 1) * p_len)].values
                if len(a) != len(b):
                    # Reached the end of the row.
                    break
                pearson, _ = st.pearsonr(a, b)
                if pearson > 0.9999999:
                    style_n = i + 1
                else:
                    # Segment i no longer matches: a new pattern starts there.
                    advance = i * p_len
                    break
            # Record the repetition count of the pattern just scanned.
            all_style.append(style_n)
            if not advance:
                # Row exhausted without a new pattern.  Stop here; scanning
                # the same segment again would append a duplicate count.
                len1 += p_len * style_n
                break
            # Move to the next pattern.  len1 grows by the values actually
            # consumed, not by the length of the pattern that comes next.
            start += advance
            len1 += advance
            p_len = select_pattern_len(start, df)
            if len1 >= int_dot:
                break
        return start, p_len, all_style, len1


for idx, file in enumerate(new_files):
    path = os.path.join(file_dir, file)
    df = pd.read_csv(path, header=None)
    # Number of values in the first row.
    int_dot = len(df.iloc[0])
    # Skip the leading natural numbers: the waveform is taken to begin at
    # the first value below 2.  The scan is bounded by the row length so
    # short rows cannot raise an IndexError.
    start = 0
    for i in range(min(100, int_dot)):
        if df.iloc[0, i] < 2:
            start = i
            break
    start_num = start
    # Length of one repetition of the first pattern.
    p_len = select_pattern_len(start, df)
    len1 = start_num
    all_style = []
    re_start, re_p_len, re_all_style, re_len1 = Solution().style_pattern(start, p_len, all_style, len1)
    # Put the number of patterns in front of the per-pattern counts.
    re_all_style.insert(0, len(re_all_style))
    # Store the counts under the file name.
    name = new_files[idx]
    result[name] = re_all_style
print(result)