FDDC2018金融算法挑战赛01-A股上市公司季度营收预测

天池大赛链接

我所用到的数据

1、income_gb_2代表的是我从天池原有的income_statement中的general business导出的,balance_gb_2和cash_gb_2 

首亦然。

2、 Macro为宏观数据,Market为市场数据

导入相关包,将工作目录改为数据所在目录

from pandas import DataFrame
from numpy import nan as NA
from pandas import Series
import os 
import pandas as pd
import numpy as  np
import random
import time
import threading as td
import multiprocessing as mp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import PCA

#改变工作目录
os.chdir('E://kaggle//tc2')

1、由于资产负债表、利润表和现金流量表内各个数据间在存在等式关系,例如:资产=负债+ 股东权益,利润总额=营业收入-营业成本-各项费用等。根据这些内在逻辑剔除异常值

2、由于内在等式关系, 对于缺失值则直接填充0,不改变原有等式关系

3、数据中有些列是前方列的和,为排除多重共线性,将这些列予以剔除

#根据财务领域知识剔除异常值与线性相关的列
#利润表
#导入数据
income_gb2=pd.read_csv('income_gb_2.csv')
#填充缺失值
income_gb2=income_gb2.fillna(0)
#建立空列表,用于收集需要剔除的观测样本
income_drop_index=[]
#检测异常样本
for i in range(np.shape(income_gb2)[0]):
    if (income_gb2.ix[i,9]-income_gb2.ix[i,10:16].sum()) >1000 or \
       (income_gb2.ix[i,9]-income_gb2.ix[i,10:16].sum()) <-1000 or \
       (income_gb2.ix[i,16]-income_gb2.ix[i,17:32].sum()) >1000 or \
       (income_gb2.ix[i,16]-income_gb2.ix[i,17:32].sum()) <-1000 or \
       (income_gb2.ix[i,10:16].sum()-income_gb2.ix[i,17:32].sum()+income_gb2.ix[i,32:34].sum()+ \
        income_gb2.ix[i,35:40].sum()-income_gb2.ix[i,40]) > 1000 or \
       (income_gb2.ix[i,10:16].sum()-income_gb2.ix[i,17:32].sum()+income_gb2.ix[i,32:34].sum()+ \
        income_gb2.ix[i,35:40].sum()-income_gb2.ix[i,40]) < -1000 :
           income_drop_index.append(i)       
    print((i/np.shape(income_gb2)[0])*100)  

#剔除观测样本  
income_gb2_drop=income_gb2.drop(income_drop_index,axis=0)
#根据业务逻辑剔除数据中线性相关的列,防止多重共线性 
income_gb2_drop=income_gb2.drop(['T_REVENUE','T_COGS','OPERATE_PROFIT','N_INCOME','T_COMPR_INCOME'],axis=1)   
income_gb2_drop.to_csv('income_gb2_drop.csv',index=None)       
#资产负债表
#处理方式同上
balance_gb2=pd.read_csv('balance_gb_2.csv')
balance_gb2=balance_gb2.fillna(0)
balance_gb2_drop=balance_gb2
balance_gb2_drop1=balance_gb2.drop(['T_CA','T_NCA','T_ASSETS','T_CL','T_NCL','T_LIAB',
                                   'PREFERRED_STOCK_E','PREFERRED_STOCK_L','T_EQUITY_ATTR_P',
                                   'T_SH_EQUITY','T_LIAB_EQUITY'],axis=1)
balance_drop_index_total=[]
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop1.ix[i,9:list(balance_gb2_drop1.columns).index('ST_BORR')].sum() - \
        balance_gb2_drop1.ix[i,list(balance_gb2_drop1.columns).index('ST_BORR'):].sum()) >10000 or \
       (balance_gb2_drop1.ix[i,9:list(balance_gb2_drop1.columns).index('ST_BORR')].sum() - \
        balance_gb2_drop1.ix[i,list(balance_gb2_drop1.columns).index('ST_BORR'):].sum()) < -10000 :
        balance_drop_index_total.append(i) 
    print((i+1)/209872)

balance_drop_index_sum=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_ASSETS')] - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_LIAB_EQUITY')]) >10000 or \
       (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_ASSETS')] - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_LIAB_EQUITY')]) < -10000 :
        balance_drop_index_sum.append(i) 
    print((i+1)/209872)     


balance_drop_index_TCA=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,9:list(balance_gb2_drop.columns).index('T_CA')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_CA')]) >10000 or \
       (balance_gb2_drop.ix[i,9:list(balance_gb2_drop.columns).index('T_CA')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_CA')]) < -10000 :
        balance_drop_index_TCA.append(i) 
    print((i+1)/209872)     

balance_drop_index_TNCA=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('DISBUR_LA'):list(balance_gb2_drop.columns).index('T_NCA')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_NCA')]) >10000 or \
       (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('DISBUR_LA'):list(balance_gb2_drop.columns).index('T_NCA')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_NCA')]) < -10000 :
        balance_drop_index_TNCA.append(i) 
    print((i+1)/209872)     
#
balance_drop_index_T_CL=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('ST_BORR'):list(balance_gb2_drop.columns).index('T_CL')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_CL')]) >10000 or \
       (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('ST_BORR'):list(balance_gb2_drop.columns).index('T_CL')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_CL')]) < -10000 :
        balance_drop_index_T_CL.append(i) 
    print((i+1)/209872)      
    
balance_drop_index_T_NCL=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('LT_BORR'):list(balance_gb2_drop.columns).index('T_NCL')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_NCL')] -balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('PREFERRED_STOCK_L')]) >10000 or \
       (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('LT_BORR'):list(balance_gb2_drop.columns).index('T_NCL')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_NCL')]-balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('PREFERRED_STOCK_L')]) < -10000 :
        balance_drop_index_T_NCL.append(i) 
    print((i+1)/209872)    
    
balance_drop_index_T_EQUITY_ATTR_P=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('PAID_IN_CAPITAL'):list(balance_gb2_drop.columns).index('T_EQUITY_ATTR_P')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_EQUITY_ATTR_P')] -balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('PREFERRED_STOCK_E')]) >10000 or \
       (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('PAID_IN_CAPITAL'):list(balance_gb2_drop.columns).index('T_EQUITY_ATTR_P')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_EQUITY_ATTR_P')]-balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('PREFERRED_STOCK_E')]) < -10000 :
        balance_drop_index_T_EQUITY_ATTR_P.append(i) 
    print((i+1)/209872)

balance_drop_index_T_SH_EQUITY=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_EQUITY_ATTR_P'):list(balance_gb2_drop.columns).index('T_SH_EQUITY')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_SH_EQUITY')] ) >10000 or \
       (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_EQUITY_ATTR_P'):list(balance_gb2_drop.columns).index('T_SH_EQUITY')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_SH_EQUITY')]) < -10000 :
        balance_drop_index_T_SH_EQUITY.append(i) 
    print((i+1)/209872)      
    
balance_drop_index_final=balance_drop_index_sum+balance_drop_index_TCA+balance_drop_index_TNCA+balance_drop_index_T_CL+balance_drop_index_T_NCL+balance_drop_index_T_EQUITY_ATTR_P+balance_drop_index_T_SH_EQUITY
balance_drop_index_final=list(set(balance_drop_index_final))
balance_gb2_drop_final=balance_gb2.drop(balance_drop_index_final,axis=0) 
balance_gb2_drop_final=balance_gb2_drop_final.drop(['T_CA','T_NCA','T_ASSETS','T_CL','T_NCL','T_LIAB',
                                   'PREFERRED_STOCK_E','PREFERRED_STOCK_L','T_EQUITY_ATTR_P',
                                   'T_SH_EQUITY','T_LIAB_EQUITY'],axis=1) 
balance_gb2_drop_final.to_csv('balance_gb2_drop.csv',index=None)  

#现金流量表  
#处理方式同上  
cash_gb2=pd.read_csv('cash_gb_2.csv')
cash_gb2=cash_gb2.fillna(0)   

cash_drop_index_OPERATE_A=[]    
for i in range(np.shape(cash_gb2)[0]) :
    if abs(cash_gb2.ix[i,list(cash_gb2.columns).index('C_FR_SALE_G_S'):list(cash_gb2.columns).index('C_INF_FR_OPERATE_A')].sum() - \
        cash_gb2.ix[i,list(cash_gb2.columns).index('C_PAID_G_S'):list(cash_gb2.columns).index('C_OUTF_OPERATE_A')].sum() + \
        cash_gb2.ix[i,list(cash_gb2.columns).index('ANOCF')] - \
        cash_gb2.ix[i,list(cash_gb2.c
  • 0
    点赞
  • 17
    收藏
    觉得还不错? 一键收藏
  • 4
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值