pearson相关性

最新推荐文章于 2022-03-16 14:19:08 发布

大佬喝可乐

最新推荐文章于 2022-03-16 14:19:08 发布

阅读量227

点赞数

分类专栏：机器学习 / 数据结构 / 数据分析　文章标签：股票预测、波形图分析、收益率优化、Pearson相关性、投资策略

本文链接：https://blog.csdn.net/weixin_44322171/article/details/120492399

版权

数据分析同时被 3 个专栏收录

23 篇文章 1 订阅

订阅专栏

数据结构

19 篇文章 0 订阅

订阅专栏

机器学习

14 篇文章 0 订阅

订阅专栏

上海股票和波形图预测

在这里插入图片描述

"""
    date:25/09/2021
    author:syb
    version:1.0
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Lift pandas' display limits so printed frames show every row, every column
# and the full contents of each cell.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)

# Load the price table.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle('stock_SH.plk')

# Work on a copy so the frame as loaded stays untouched.
df1 = df.copy()
# Group by the bare column label rather than a one-element list: with a list,
# pandas >= 2.0 yields 1-tuples as group keys when iterating, which would make
# the month print as "('...',)" in the report loop.
df_group = df1.groupby('Date_month')


def del_nan(lis):
    """
    Return the entries of ``lis`` whose float value lies strictly between
    -100 and 100.

    NaN fails both comparisons and is therefore dropped, and so is every
    value outside the open interval (-100, 100). The surviving entries keep
    their original order and are returned as the original objects, not as
    their float conversions.
    """
    return [value for value in lis if -100 < float(value) < 100]


# Price columns: every column except the first and the last one.
# NOTE(review): presumably the first column is 'Date' and the last one is
# 'Date_month' -- confirm against the pickled frame.
lis_col = df1.columns.tolist()[1:-1]
res = 100  # starting capital; the report below prints it in units of 10,000 yuan
for name, group in df_group:
    # `group` is the DataFrame of one month. Its first row is used as the buy
    # day and its last row as the sell day.
    # Stocks without a usable buy price (NaN) are ignored when sizing the decile.
    opening_price = del_nan(group.iloc[0, 1:-1].values.tolist())
    # Number of stocks that make up the cheapest 10%.
    m = len(opening_price) // 10
    if m == 0:
        # Fewer than ten usable prices: there is no decile to buy. Carry the
        # capital over unchanged instead of crashing in max([]) / dividing by 0.
        print('%s卖出后剩余金额为%f万元' % (name, res))
        continue
    opening_price.sort()
    # Highest buy price that still belongs to the cheapest decile.
    value_10 = opening_price[m - 1]

    yield_rate = 0
    for col_pos in range(1, len(lis_col) + 1):
        start = group.iloc[0, col_pos]
        end = group.iloc[-1, col_pos]
        # A NaN buy price fails this comparison, so such stocks are skipped.
        if start <= value_10:
            yield_rate += (end - start) / start
    # Capital is split equally: each pick receives res / m and grows by its
    # own return, so the month's growth factor is 1 + (sum of returns) / m.
    # NOTE(review): ties at value_10 can select more than m stocks while the
    # sum is still divided by m, and a NaN sell price turns `res` into NaN --
    # confirm the intended handling against the data.
    res = (1 + yield_rate / m) * res
    print('%s卖出后剩余金额为%f万元' % (name, res))
# Final value recorded by the author: 543.900078 (same unit as the printed amounts).

在这里插入图片描述

"""
    date:26/09/2021
    author:syb
    version:1.0
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import scipy.stats as st
import os
import re

warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)
# Raise the column limit used by DataFrame.info() to 2000.
pd.set_option("display.max_info_columns", 2000)

# Directory that holds the input CSV files.
file_dir = './test1_files/'
file_list = os.listdir(file_dir)
# Order the files numerically by the digits in their names.
# The pattern is a raw string because "\D" in a plain literal is an invalid
# escape sequence. Entries without any digit are skipped: int('') would raise.
file_numbers = sorted(
    int(digits)
    for digits in (re.sub(r"\D", "", entry) for entry in file_list)
    if digits
)
new_files = [('test'+str(number)+'.csv') for number in file_numbers]
# Maps each file name to its list of pattern counts.
result = dict()
# Each file is read and processed on its own rather than merged into one frame.
for idx, file in enumerate(new_files):
    path = os.path.join(file_dir, file)
    df = pd.read_csv(path, header=None)
    # Number of values in the first row.
    int_dot = len(df.iloc[0])
    # Offset of the first value below 2 in the first row; everything before it
    # is skipped. Stays 0 when no such value is found.
    # NOTE(review): the original note says this skips a few leading natural
    # numbers at the start of each row -- confirm against the data format.
    start = 0
    # Never look past the end of the row (the original scanned a fixed 100
    # positions and raised IndexError on shorter rows without a match).
    for i in range(min(100, int_dot)):
        if df.iloc[0, i] < 2:
            start = i
            break
    # Remember the offset of the skipped prefix.
    start_num = start


    def select_pattern_len(start, df, max_len=20):
        """
        Pick the number of elements in the pattern that begins at ``start``.

        For each candidate length ``j`` from 3 up to ``max_len`` (inclusive),
        the window ``[start, start + j)`` of the first row of ``df`` is
        compared with the window directly after it using the Pearson
        correlation. The first ``j`` whose correlation exceeds 0.9999999 is
        returned.

        :start: position in the first row where the pattern begins
        :df: DataFrame whose first row holds the values
        :max_len: largest pattern length to try (default 20)
        :return: the detected length, or 2 when no candidate qualifies or the
                 row is too short to hold two full windows
        """
        p_len = 2
        # The upper bound is inclusive: the original range(3, 20) never tried
        # length 20 although patterns of up to 20 points are expected.
        for j in range(3, max_len + 1):
            a = df.iloc[0, start: start + j].values
            b = df.iloc[0, start + j: start + 2 * j].values
            # Stop once two full windows no longer fit into the row. This also
            # covers two empty windows, on which pearsonr would raise.
            if len(a) < j or len(b) < j:
                break
            pearson, _ = st.pearsonr(a, b)
            if pearson > 0.9999999:
                p_len = j
                break
        return p_len

    # Length of the pattern that begins at `start`. Per the original note, every
    # waveform pattern has at least three points and no more than 20.
    p_len = select_pattern_len(start, df)
    # Running count of row elements consumed so far, seeded with the offset of
    # the skipped prefix.
    len1 = start_num


    class Solution:
        """
        Holder for the recursive pattern walk over the first row of ``df``.

        The method reads ``df``, ``int_dot`` and ``select_pattern_len`` from the
        enclosing scope instead of taking them as arguments.
        """

        def style_pattern(self, start, p_len, all_style, len1):
            """
            Count how often the pattern at ``start`` repeats, then recurse for
            the next pattern.

            :start: position in the first row where the current pattern begins
            :p_len: number of elements in one instance of the current pattern
            :all_style: list collecting the repetition count of each pattern;
                        it is appended to in place
            :len1: number of elements walked so far; the recursion stops once
                   it is no longer smaller than ``int_dot``
            :return: tuple ``(start, p_len, all_style, len1)`` as left by the
                     deepest recursion level
            """
            style_n = 0
            for i in range(1, int_dot // p_len):
                # `a` is the first instance of the pattern, `b` the i-th window after it.
                a = df.iloc[0, start: start + p_len].values
                b = df.iloc[0, (start + i * p_len): (start + (i + 1) * p_len)].values
                # A shorter `b` means the end of the row was reached.
                if len(a) != len(b):
                    break
                pearson, _ = st.pearsonr(a, b)
                if pearson > 0.9999999:
                    # Window i still matches: i + 1 instances seen so far.
                    style_n = max(style_n, i + 1)

                else:
                    # A new pattern starts here: move the start position to it
                    start += i * p_len
                    # and detect how many elements the new pattern has.
                    p_len = select_pattern_len(start, df)
                    break
            # Record the repetition count of the pattern just walked.
            all_style.append(style_n)
            # NOTE(review): when the loop left through the else-branch, p_len is
            # already the NEXT pattern's length here, so this may advance len1 by
            # the wrong amount -- confirm against the expected output.
            len1 += p_len * style_n
            # Keep going while elements remain.
            # NOTE(review): if style_n is 0 and `start` did not move, len1 does
            # not grow and this looks like it would recurse without end -- confirm.
            if len1 < int_dot:
                start, p_len, all_style, len1 = self.style_pattern(start, p_len, all_style, len1)
            return start, p_len, all_style, len1


    all_style = []
    re_start, re_p_len, re_all_style, re_len1 = Solution().style_pattern(start, p_len, all_style, len1)
    # pattern的数量插入到第一个位置
    re_all_style.insert(0, len(re_all_style))
    # 按照文件名，将pattern的样式数量写入字典中
    name = new_files[idx]
    result[name] = re_all_style
    # print(re_all_style)
print(result)

大佬喝可乐

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
pearson相关性

import numpy as np; import pandas as pd; import matplotlib.pyplot as plt; import seaborn as sns; import warnings; import scipy.stats as st; import os; import re; warnings.filterwarnings('ignore'); pd.set_option('display.max_rows', None); pd.set_option('display.max…
复制链接

扫一扫

专栏目录