使用Gensim进行词向量训练：实践版
1.语料库准备
输入为纯文本文件，每行一条普通的句子（无需预先分词，分词由代码完成）
2.Code
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Module-level setup: imports, a start-up timestamp, and logging configuration.
import gensim.models
import time
import pandas as pd
from nltk.tokenize import TweetTokenizer
# Wall-clock timestamp taken at start-up, before the remaining imports run.
# NOTE(review): presumably used further down to report elapsed time -- the
# consumer is not visible in this part of the file; confirm.
time1 = time.time()
import logging
import numpy as np
# Configure the root logger to emit "<time> : <level> : <message>" for records
# at INFO level and above.
# NOTE(review): presumably so gensim's progress messages are shown -- confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def loaddata(inputfile):
    """Read a corpus file and tokenize every non-blank line.

    Each line of ``inputfile`` is treated as one sentence: surrounding
    whitespace is stripped and the remaining text is split into tokens with
    NLTK's ``TweetTokenizer``.

    Args:
        inputfile: Path of the text file to read, one sentence per line.

    Returns:
        A list with one entry per non-blank line, in file order; each entry
        is the list of tokens produced for that line.

    Raises:
        IOError/OSError: If ``inputfile`` cannot be opened or read.
    """
    tknzr = TweetTokenizer()
    sentences = []
    # The context manager guarantees the handle is closed, even when
    # tokenizing a line raises.
    with open(inputfile) as infile:
        for raw_line in infile:
            line = raw_line.strip()
            # A blank (or whitespace-only) line is skipped instead of being
            # mistaken for end of file, so the rest of the corpus is kept.
            if not line:
                continue
            sentences.append(tknzr.tokenize(line))
    return sentences
def WordFr