In the previous post, the Chinese Wikipedia corpus was downloaded (see "Downloading and processing the Chinese Wikipedia text corpus, Ubuntu + Python 2.7") and converted to txt format. This post segments that txt file into words; only after segmentation can word2vec be used to train word vectors.
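For context, once the segmented file has been produced it can be handed to gensim's word2vec roughly as follows. This is a minimal sketch, not part of the segmentation script below; the output file name wiki_seg.txt and the vector size are assumptions, and the size parameter applies to gensim versions before 4.0, which matches the Python 2.7 setup used here.

from gensim.models import word2vec
import logging

# Log training progress to the console
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Assumed output of the segmentation step: one segmented article per line, tokens separated by spaces
sentences = word2vec.LineSentence('wiki_seg.txt')
model = word2vec.Word2Vec(sentences, size=250, min_count=5)
model.save('wiki.word2vec.model')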
The segmentation script in Python (using jieba for word segmentation):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')  # Python 2.7: make utf-8 the default codec to avoid UnicodeDecodeError
import logging
import jieba
import os, io
from gensim.models import word2vec
# Load the stop word list (the file is encoded in gb18030)
stop_words_file = "stop_words.txt"
stop_words = list()
with io.open(stop_words_file, 'r', encoding="gb18030") as stop_words_file_object:
    contents = stop_words_file_object.readlines()
    for line in contents:
        line = line.strip()
        stop_words.append(line)

# Segment the wiki corpus that was converted to txt in the previous post
data_file = 'wiki.txt'
i = 1  # line counter, used to report segmentation progress
with io.open(data_file, 'r'