1、base_info、annual_report_info删除空列
# Load base_info and remove the columns that filter_col_by_nan selects for it
# when called with a 0.01 threshold.
base_info = pd.read_csv(PATH + 'base_info.csv')
_base_info_nan_cols = filter_col_by_nan(base_info, 0.01)
base_info = base_info.drop(columns=_base_info_nan_cols)

# Same treatment for annual_report_info.
annual_report_info = pd.read_csv(PATH + 'annual_report_info.csv')
_annual_report_nan_cols = filter_col_by_nan(annual_report_info, 0.01)
annual_report_info = annual_report_info.drop(columns=_annual_report_nan_cols)
2、news_info
2.1、news_info
# Read the raw news table and preview its first five rows.
news_info = pd.read_csv(filepath_or_buffer=PATH + 'news_info.csv')
news_info.head(n=5)

2.2、news_info解析时间特征
# Reload the news table so this step starts from the raw public_date values.
news_info = pd.read_csv(PATH + 'news_info.csv')

# Keep only entries whose text form contains a '-'; everything else becomes NaN.
_dashed_dates = news_info['public_date'].map(
    lambda value: np.nan if '-' not in str(value) else value
)

# Parse the surviving entries as datetimes (NaN turns into NaT).
_parsed_dates = pd.to_datetime(_dashed_dates)

# Replace the column with the whole number of days between now and each date.
# NOTE(review): datetime.now() is evaluated at run time, so the resulting
# values differ from one run to the next.
news_info['public_date'] = (datetime.now() - _parsed_dates).dt.days

2.3、以public_date为主,合并重复id—>news_info_df
# Collapse repeated ids into a single row each: group the rows by 'id' and
# summarise public_date with four statistics (count, max, min, mean).
news_info_df = (
    news_info.groupby('id')['public_date']
    .agg(['count', 'max', 'min', 'mean'])
    .reset_index()
)

# Overwrite the column labels in place; the order follows the statistics above.
news_info_df.columns = ['id', 'public_date_COUNT', 'public_MAX', 'public_MIN', 'public_MEAN']

2.4、以positive_negtive为主,合并重复id—>news_info_df2
# Pivot to one row per id with one column per positive_negtive value; each
# cell is a count (aggfunc='count') of the remaining value columns, not a sum.
news_info_df2 = news_info.pivot_table(
    index='id',
    columns='positive_negtive',
    aggfunc='count',
).reset_index()

# Overwrite the column labels in place.
# NOTE(review): assigning four names assumes the pivot yields exactly three
# count columns besides 'id' - verify against the data.
news_info_df2.columns = ['id', 'news_COUNT1', 'news_COUNT2', 'news_COUNT3']

2.5、合并两个特征——>news_info_df