- constant.py
Defines dataset file locations; imported by the main script.
# Path to the raw Kaggle "Combined_News_DJIA" CSV (columns: Date, Label, Top1..Top25).
raw_text_csv_file = './dataset/Combined_News_DJIA.csv'
# Output path for the cleaned text CSV (presumably written by the main script — confirm).
cln_text_csv_file = './cln_text.csv'
- tools.py
Modularizes the data-processing steps needed by the main script into separate functions, keeping the code clearer.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
def proc_text(raw_line):
    """
    Clean one line of headline text.

    Steps: coerce to str, lowercase, strip a leading b'...' / b"..."
    bytes-literal wrapper (the raw CSV stores some headlines as
    stringified bytes), tokenize on word characters, and drop English
    stopwords.

    :param raw_line: raw cell value (any type; coerced to str)
    :return: cleaned tokens joined by single spaces
    """
    raw_line = str(raw_line)
    # Lowercase first so the b'...' prefix check below matches either case.
    raw_line = raw_line.lower()
    # Strip the b'...' or b"..." wrapper left by str()-ing a bytes object.
    if raw_line[:2] in ("b'", 'b"'):
        raw_line = raw_line[2:-1]
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(raw_line)
    # Build the stopword set once: the original called stopwords.words()
    # (which returns a fresh list) for every token, making each membership
    # test O(corpus size); a set gives O(1) lookups.
    stop_set = set(stopwords.words('english'))
    meaningful_words = [w for w in tokens if w not in stop_set]
    return ' '.join(meaningful_words)
def clean_text(raw_text_df):
    """
    Clean the raw headline DataFrame into a three-column frame.

    :param raw_text_df: DataFrame with 'Date', 'Label' and the 25 headline
                        columns 'Top1'..'Top25'
    :return: new DataFrame with columns 'date', 'label' and a single 'text'
             column containing the 25 cleaned headlines concatenated with
             spaces (note: 'text' starts from '' so it carries one leading
             space, matching the original behavior)
    """
    cln_text_df = pd.DataFrame()
    cln_text_df['date'] = raw_text_df['Date'].values
    cln_text_df['label'] = raw_text_df['Label'].values
    cln_text_df['text'] = ''
    # Process the 25 headline columns ['Top1', ..., 'Top25'].
    col_list = ['Top' + str(i) for i in range(1, 26)]
    for i, col in enumerate(col_list):
        # Clean into a local Series instead of assigning back into
        # raw_text_df: the original mutated the caller's DataFrame in
        # place as a hidden side effect.
        cleaned_col = raw_text_df[col].apply(proc_text)
        # Append this column's cleaned text onto the running 'text' column.
        cln_text_df['text'] = cln_text_df['text'].str.cat(cleaned_col, sep=' ')
        print('已处理{}列.'.format(i + 1))
    return cln_text_df
def split_train_test(data_df):
"""
分割训练集和测试集
"""
# 训练集时间范围 2008-08-08 ~ 2014-12-31
train_text_df = data_df.loc['20080808':'20141231', :]
# 将时间索引替换为整型索引
train_text_df.reset_index(drop=True, inplace=True)
# 测试集时间范