1、base_info、annual_report_info删除空列
# Load the base-info table, then remove the columns that filter_col_by_nan
# selects with a 0.01 threshold.
# NOTE(review): filter_col_by_nan is defined elsewhere; presumably it returns
# the labels of columns that are (almost) entirely NaN — confirm.
base_info = pd.read_csv(PATH + 'base_info.csv')
base_info = base_info.drop(columns=filter_col_by_nan(base_info, 0.01))

# Same treatment for the annual-report table.
annual_report_info = pd.read_csv(PATH + 'annual_report_info.csv')
annual_report_info = annual_report_info.drop(
    columns=filter_col_by_nan(annual_report_info, 0.01)
)
2、news_info
2.1、news_info
# Load the raw news table from PATH.
news_info = pd.read_csv(PATH + 'news_info.csv')
# Preview the first rows; the result is not assigned, so it is only visible
# when this runs in an interactive / notebook session.
news_info.head()
![在这里插入图片描述](https://img-blog.csdnimg.cn/2020112913572782.png#pic_center)
2.2、news_info解析时间特征
# Reload the news table from disk before deriving the time feature.
news_info = pd.read_csv(PATH + 'news_info.csv')

# Keep a value only when its string form contains '-'; every other entry is
# replaced with NaN before parsing.
news_info['public_date'] = news_info['public_date'].apply(
    lambda raw: raw if '-' in str(raw) else np.nan
)

# Parse the surviving values as datetimes and store the number of whole days
# between the moment this line runs and each publication date.
# NOTE: the result depends on the current clock, so it shifts between runs.
news_info['public_date'] = (
    datetime.now() - pd.to_datetime(news_info['public_date'])
).dt.days
![在这里插入图片描述](https://img-blog.csdnimg.cn/20201129135858366.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L1hpYW9iYWlfcmFiYml0MA==,size_16,color_FFFFFF,t_70#pic_center)
2.3、以public_date为主,合并重复id—>news_info_df
# groupby splits the rows by key, applies a function to each group and
# combines the results, which allows computing statistics per group.
# agg runs one or more reductions per column; here it produces four
# sub-columns (count, max, min, mean) under public_date.
# The net effect is that rows sharing the same id collapse into one row.
news_info_df = news_info.groupby('id').agg(
    {'public_date': ['count', 'max', 'min', 'mean']}
)
# Move id from the index back into a regular column.
news_info_df = news_info_df.reset_index()
![在这里插入图片描述](https://img-blog.csdnimg.cn/20201129140003502.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L1hpYW9iYWlfcmFiYml0MA==,size_16,color_FFFFFF,t_70#pic_center)
# Rename the columns directly by assigning a flat list of names.
# NOTE(review): assumes news_info_df has exactly five columns in the order
# id, count, max, min, mean — confirm against the aggregation step.
news_info_df.columns = ['id', 'public_date_COUNT', 'public_MAX', 'public_MIN', 'public_MEAN']
![在这里插入图片描述](https://img-blog.csdnimg.cn/20201129140108431.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L1hpYW9iYWlfcmFiYml0MA==,size_16,color_FFFFFF,t_70#pic_center)
2.4、以positive_negtive为主,合并重复id—>news_info_df2
# Pivot so each id becomes one row and each positive_negtive value becomes a
# column group; cells hold a count (aggfunc='count'), not a sum, of the
# non-null entries in the remaining column(s).
# NOTE(review): no `values` is given, so every column other than id and
# positive_negtive is aggregated — presumably only public_date; confirm.
news_info_df2 = news_info.pivot_table(
    index='id',
    columns='positive_negtive',
    aggfunc='count',
).reset_index()
![在这里插入图片描述](https://img-blog.csdnimg.cn/20201129141040819.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L1hpYW9iYWlfcmFiYml0MA==,size_16,color_FFFFFF,t_70#pic_center)
# Rename the columns directly by assigning a flat list of names.
# NOTE(review): assumes the pivot produced exactly id plus three count
# columns (one per positive_negtive value) — confirm; a different number of
# columns makes this assignment raise a length-mismatch error.
news_info_df2.columns = ['id', 'news_COUNT1', 'news_COUNT2', 'news_COUNT3']