词向量数据预处理参考

预处理
def parse_data(path):
    """Load a UTF-8 CSV and return its source texts and, if present, targets.

    The ``Question`` and ``Dialogue`` columns are concatenated element-wise
    (no separator) to form the source series.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A ``(data_x, data_y)`` pair. ``data_x`` is the concatenated
        Question+Dialogue series; ``data_y`` is the ``Report`` column when the
        file has one, otherwise an empty list.
    """
    frame = pd.read_csv(path, encoding='utf-8')
    sources = frame.Question.str.cat(frame.Dialogue)
    # Files without a Report column carry no targets.
    if 'Report' not in frame.columns:
        return sources, []
    return sources, frame.Report

def _write_segmented(lines, out_path):
    """Write one output line to ``out_path`` for every item in ``lines``.

    String items are stripped, passed to ``segment(..., cut_type='word')`` and
    the resulting tokens are joined with single spaces. Non-string items
    produce an empty line, so the output keeps one line per input item.
    """
    with open(out_path, 'w', encoding='utf-8') as out:
        for line in lines:
            if isinstance(line, str):
                tokens = segment(line.strip(), cut_type='word')
                out.write(' '.join(tokens))
            out.write('\n')


def save_data(data_1, data_2, data_3, data_path_1, data_path_2, data_path_3, stop_words_path=''):
    """Segment three text collections and write each to its own file.

    Each collection is written in order (1, then 2, then 3); every output file
    is opened in write mode as UTF-8 and receives exactly one line per input
    item.

    Args:
        data_1: Iterable of texts written to ``data_path_1``.
        data_2: Iterable of texts written to ``data_path_2``.
        data_3: Iterable of texts written to ``data_path_3``.
        data_path_1: Output path for ``data_1``.
        data_path_2: Output path for ``data_2``.
        data_path_3: Output path for ``data_3``.
        stop_words_path: Path handed to ``read_stopwords``.
    """
    # Read the stop words. Filtering against them is not applied here; the
    # call is kept so behavior for a given stop_words_path is unchanged.
    read_stopwords(stop_words_path)

    _write_segmented(data_1, data_path_1)
    _write_segmented(data_2, data_path_2)
    _write_segmented(data_3, data_path_3)
if __name__ == '__main__':
    # Load the training sources/targets and the test sources, then write
    # their segmented forms to the paths configured in ``config``.
    train_x, train_y = parse_data(config.train_path)
    test_x, _unused = parse_data(config.test_path)
    save_data(
        train_x,
        train_y,
        test_x,
        config.train_seg_path_x,
        config.train_seg_path_y,
        config.test_seg_path_x,
        stop_words_path=config.stop_words_path,
    )
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值