FDDC2018 Financial Algorithm Challenge: Earnings Prediction for A-Share Listed Companies

This hands-on write-up is a summary of my work over the past while. It still needs optimization, and some of the ideas are a bit unconventional; corrections are welcome.

The data comes from the Tianchi FDDC2018 Financial Algorithm Challenge dataset.

Data Preprocessing

Importing the Data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from operator import itemgetter
from collections import Counter
from scipy.interpolate import lagrange
import copy
from time import sleep
from tqdm import tqdm

Balance_row=pd.read_csv('B.csv')
Cash_row=pd.read_csv('C.csv')
Income_row=pd.read_csv('I.csv')

Handling Missing Values

We handle missing values with two strategies: financial fields where more than 80% of the samples are missing are dropped outright; the remaining missing values are imputed with a random forest.
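Before settling on the 80% cutoff, a quick look at how the per-column missing ratios are distributed can help; a minimal exploratory check on the balance sheet:

# Inspect per-column missing ratios to sanity-check the 80% cutoff
missing_ratio = Balance_row.isnull().mean().sort_values(ascending=False)
print(missing_ratio.head(20))                                   # worst offenders
print((missing_ratio > 0.8).sum(), 'columns exceed the 80% threshold')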
# Strategy 1: drop fields with more than 80% missing values
def delna(data):
    # Share of missing values in each column
    nan_percentage = (data.isnull().sum() / data.shape[0]).to_list()
    len_columns = data.columns.to_list()
    max_err_rate = 0.8
    # Drop every column whose missing ratio exceeds the threshold
    for j, i in zip(nan_percentage, len_columns):
        if j > max_err_rate:
            data.drop(i, axis=1, inplace=True)
    return data

Income_row=delna(Income_row)
Balance_row=delna(Balance_row)
Cash_row=delna(Cash_row)
# Strategy 2: fill the missing values of one column with a random forest
# data_df: the input DataFrame
# obj_column: name of the column whose missing values we want to impute
# missing_other_column: other columns in the data that also contain nulls

def fill_miss_byRandomForest(data_df, obj_column, missing_other_column):
    # First drop the other columns that still contain missing values
    data_df = data_df.drop(missing_other_column, axis=1)
    # Split into rows where the target column is known vs. unknown
    known = data_df[data_df[obj_column].notnull()]
    unknown = data_df[data_df[obj_column].isnull()]
    # y is the column to impute, X the remaining (complete) features
    y_know = known[obj_column]
    X_know = known.drop(obj_column, axis=1)
    from sklearn.ensemble import RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=70, max_depth=3, n_jobs=-1)
    rfr.fit(X_know, y_know)
    # Predict the missing entries with the fitted model and write them back
    X_unknow = unknown.drop(obj_column, axis=1)
    predicted = rfr.predict(X_unknow).round(0)
    data_df.loc[data_df[obj_column].isnull(), obj_column] = predicted
    return data_df
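
A quick self-contained check of the helper on a toy frame (the toy column names are made up for illustration):

# Toy example: impute the partially missing column 'b' from the complete column 'a'
toy = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0, 5.0],
                    'b': [10.0, 20.0, np.nan, 40.0, np.nan]})
filled = fill_miss_byRandomForest(toy, 'b', [])   # no other missing columns to drop
print(filled)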


num_na_back = pd.isna(Balance_row).sum()
obj_column = pd.DataFrame(num_na_back)
# Keep only the columns that still have missing values
obj_column.drop(obj_column[obj_column[0] == 0].index, inplace=True, axis=0)
len_obj_column = obj_column.index.to_list()
# The first nine columns are identifiers/dates; exclude them from the feature matrix
temp = Balance_row.iloc[1, 0:9]
missing_other_column = temp.index.to_list()
len_obj_column_copy = copy.deepcopy(len_obj_column)

# Impute the incomplete columns one at a time: for each target column, drop the
# other still-incomplete columns, fit the random forest on the complete ones,
# then merge the filled column back in
Balance_row_copy = Balance_row
for j in tqdm(len_obj_column):
    len_obj_column_copy.remove(j)
    Balance_row_temp = Balance_row_copy.drop(len_obj_column_copy, axis=1)
    Balance_row_temp_1 = fill_miss_byRandomForest(Balance_row_temp, j, missing_other_column)
    Balance_row_temp_1 = Balance_row_temp_1[j]
    Balance_row_copy = Balance_row_copy.drop(j, axis=1)
    Balance_row_copy = Balance_row_copy.merge(Balance_row_temp_1, how='inner', left_index=True, right_index=True)
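
The cash-flow and income tables go through the same loop below; a small wrapper like the following could avoid repeating it three times (a sketch only, with the original loops kept as-is; like the original it assumes the first nine identifier columns are complete and the remaining columns are numeric):

# Hypothetical helper: impute every incomplete column of a statement table
def fill_table(df, id_cols=9):
    na_counts = df.isnull().sum()
    todo = na_counts[na_counts > 0].index.to_list()     # columns still missing values
    id_columns = df.columns[:id_cols].to_list()         # identifier/date columns to exclude
    remaining = copy.deepcopy(todo)
    out = df
    for col in tqdm(todo):
        remaining.remove(col)
        part = out.drop(remaining, axis=1)
        filled = fill_miss_byRandomForest(part, col, id_columns)[col]
        out = out.drop(col, axis=1).merge(filled, how='inner', left_index=True, right_index=True)
    return out

# e.g. Balance_row_copy = fill_table(Balance_row)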

# Repeat the same procedure for the cash-flow and income tables


# Drop rows where the key operating cash-flow items are missing before imputing the rest
Cash_row=Cash_row[Cash_row['C_FR_SALE_G_S'].notnull()]
Cash_row=Cash_row[Cash_row['C_PAID_TO_FOR_EMPL'].notnull()]
Cash_row=Cash_row[Cash_row['C_PAID_FOR_TAXES'].notnull()]
Cash_row=Cash_row[Cash_row['C_PAID_FOR_OTH_OP_A'].notnull()]
num_na_back  = pd.isna(Cash_row).sum()
obj_column=pd.DataFrame(num_na_back)
obj_column.drop(obj_column[obj_column[0]==0].index,inplace = True,axis = 0)
len_obj_column=obj_column.index.to_list()
temp=Cash_row.iloc[1,0:9]
missing_other_column=temp.index.to_list()
len_obj_column_copy=copy.deepcopy(len_obj_column)




Cash_row_copy = Cash_row
for j in tqdm(len_obj_column):
    len_obj_column_copy.remove(j)
    Cash_row_temp=Cash_row_copy.drop(len_obj_column_copy,axis = 1)
    Cash_row_temp_1=fill_miss_byRandomForest(Cash_row_temp,j,missing_other_column)
    Cash_row_temp_1=Cash_row_temp_1[j]
    Cash_row_copy=Cash_row_copy.drop(j,axis=1)
    Cash_row_copy=Cash_row_copy.merge(Cash_row_temp_1,how='inner',left_index = True,right_index = True)


num_na_back  = pd.isna(Income_row).sum()
obj_column=pd.DataFrame(num_na_back)
obj_column.drop(obj_column[obj_column[0]==0].index,inplace = True,axis = 0)
len_obj_column=obj_column.index.to_list()
temp=Income_row.iloc[1,0:9]
missing_other_column=temp.index.to_list()
len_obj_column_copy=copy.deepcopy(len_obj_column)

Income_row_copy = Income_row
for j in tqdm(len_obj_column):
    len_obj_column_copy.remove(j)
    Income_row_temp=Income_row_copy.drop(len_obj_column_copy,axis = 1)
    Income_row_temp_1=fill_miss_byRandomForest(Income_row_temp,j,missing_other_column)
    Income_row_temp_1=Income_row_temp_1[j]
    Income_row_copy=Income_row_copy.drop(j,axis=1)
    Income_row_copy=Income_row_copy.merge(Income_row_temp_1,how='inner',left_index = True,right_index = True)

Removing Duplicate Records

# Keep the most recently disclosed figures: sort by ticker and period end date
# so duplicates sit next to each other, which makes removing them easier
def dup(df):
    df = df.sort_values(axis=0, ascending=True, by=['TICKER_SYMBOL', 'END_DATE'])
    df = df.drop_duplicates()
    return df
I=dup(Income_row_copy)
B=dup(Balance_row_copy)
C=dup(Cash_row_copy)

# Drop duplicates keyed on ticker, period end date and report type
B=B.drop_duplicates(['TICKER_SYMBOL','END_DATE','REPORT_TYPE'])
C=C.drop_duplicates(['TICKER_SYMBOL','END_DATE','REPORT_TYPE'])
I=I.drop_duplicates(['TICKER_SYMBOL','END_DATE','REPORT_TYPE'])
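
The stated intent is to keep the most recently published figures, but drop_duplicates with the default keep='first' keeps whichever row happens to sort first. If the tables carry a publish-date column (PUBLISH_DATE in the FDDC data, assumed here), the latest disclosure can be kept explicitly, e.g. for the balance sheet:

# Keep the latest disclosure per (ticker, period end date, report type); assumes a PUBLISH_DATE column
B = B.sort_values(by=['TICKER_SYMBOL', 'END_DATE', 'PUBLISH_DATE'])
B = B.drop_duplicates(['TICKER_SYMBOL', 'END_DATE', 'REPORT_TYPE'], keep='last')

The same would apply to C and I.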

B.to_csv('B1.csv')
C.to_csv('C1.csv')
I.to_csv('I1.csv')

# End of part one: data preprocessing

Feature Construction

Constructing statistical-level features

For example, historical operating revenue. The income and cash-flow statements are differenced to obtain the missing periods, while for the balance sheet (a point-in-time statement) the S1 figure simply becomes Q2. Each year already has Q1 (3 months), S1 (6 months), Q3 (3 months) and A (12 months); from these we construct Q2 (3 months), Q4 (3 months) and S2 (6 months), as sketched below.
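A minimal sketch of the differencing for the flow statements, with the figures for a single company-year passed in as scalars (the helper name and the numbers are just illustrative):

# Derive the missing periods from the reported ones for one company-year
def derive_periods(q1, s1, q3, a):
    q2 = s1 - q1        # second quarter = half year minus first quarter
    s2 = a - s1         # second half = full year minus first half
    q4 = s2 - q3        # fourth quarter = second half minus third quarter
    return q2, s2, q4

# e.g. derive_periods(q1=100.0, s1=220.0, q3=130.0, a=480.0) -> (120.0, 260.0, 130.0)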
# First handle the Balance sheet (a static, point-in-time statement): copy the row, change the corresponding REPORT_TYPE field, and append it back
B = pd.read_csv('B1.csv')  # first load the three tables produced in part one
C = pd.read_csv('C1.csv')
I = pd.read_csv('I1.csv')
def main1():
    B_C = B
    B_C2 = B
    l = len(B)
    for i in tqdm(range(l)):
        # Column 7 is REPORT_TYPE
        if B_C.iloc[i, 7] == 'S1':
            # Balance-sheet values are point-in-time, so the S1 row doubles as Q2
            a = B_C.iloc[i]
            d = pd.DataFrame(a).T
            d.iloc[0, 7] = 'Q2'
            B_C2 = B_C2.append([d])   # DataFrame.append is deprecated in newer pandas; pd.concat is the modern equivalent
        elif B_C.iloc[i, 7] == 'Q3':
            a = B_C.iloc[i]
            d = pd.DataFrame(a).T
            d.iloc[0, 7] = 'NQ3'
            B_C2 = B_C2.append([d])
        elif B_C.iloc[i, 7] == 'A':
            # The annual row doubles as both Q4 and S2
            a = B_C.iloc[i]
            d = pd.DataFrame(a).T
            d = d.append([d])
            d.iloc[0, 7] = 'Q4'
            d.iloc[1, 7] = 'S2'
            B_C2 = B_C2.append([d])

    del B_C
    return B_C2
B1 = main1()
B1.to_csv('B2.csv')          # save the expanded balance sheet so it can be reloaded below
B1 = pd.read_csv('B2.csv')   # re-read resets the index of the appended rows
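
A quick sanity check that the new report-type labels were added:

# The expanded table should now contain Q2, NQ3, Q4 and S2 rows alongside the originals
print(B1['REPORT_TYPE'].value_counts())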

# Next handle the income statement and the cash-flow statement (these are dynamic, flow statements)
def main2():
    I_C = copy.deepcopy(I)
    # Fiscal year of each report, taken from the period end date
    I_C['END_DATE_YEAR'] = I_C['END_DATE'].apply(lambda x: x.split('-')[0])
    I_C2 = copy.deepcopy(I_C)
    l = len(I_C)
    # Count the number of reports per ticker and keep only tickers with fewer than 41 records
    TICKER_SYMBOL = I_C['TICKER_SYMBOL'].to_list()
    TICKER_SYMBOL = Counter(TICKER_SYMBOL)
    TICKER_SYMBOL = pd.DataFrame([TICKER_SYMBOL]).T
    TICKER_SYMBOL = TICKER_SYMBOL.reset_index().rename(columns={'index': 'id'})
    TICKER_SYMBOL.columns = ('TICKER_SYMBOL', 'count')
    TICKER_SYMBOL.drop(TICKER_SYMBOL[TICKER_SYMBOL['count'] >= 41].index, inplace=True, axis=0)
    TICKER_SYMBOL = TICKER_SYMBOL.iloc[:, 0]
    TICKER_SYMBOL = TICKER_SYMBOL.to_list()
    I_C2.loc[:,&