数据:使用 ../input/ 目录下的全部 .txt 文本(例如丘吉尔的人物传记)作为我的学习语料
框架:Keras
import os
import numpy as np
import nltk
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from gensim.models.word2vec import Word2Vec
# Build the training corpus: read every .txt file under ../input/, lowercase
# the text, split it into sentences, then split each sentence into word tokens.
# Results left at module level:
#   raw_text  - the concatenated, lowercased text of all input files
#   sentensor - the NLTK Punkt sentence tokenizer for English
#   sents     - list of sentence strings
#   corpus    - list of token lists, one per sentence
text_parts = []
for file_name in os.listdir("../input/"):
    if file_name.endswith(".txt"):
        # errors="ignore" silently drops bytes that cannot be decoded; the
        # context manager guarantees the file handle is closed.
        with open("../input/" + file_name, errors="ignore") as handle:
            # Append a blank line after each file so consecutive files do not
            # run together.
            text_parts.append(handle.read() + "\n\n")
raw_text = "".join(text_parts)
# To train on a single book instead, read that one file directly,
# e.g. "../input/Winston_Churchil.txt".
raw_text = raw_text.lower()

# Sentence segmentation with the pre-trained English Punkt model.
sentensor = nltk.data.load("tokenizers/punkt/english.pickle")
sents = sentensor.tokenize(raw_text)

# Word-level tokenization of every sentence.
corpus = [nltk.word_tokenize(sen) for sen in sents]

print(len(corpus))
print(corpus[:3])
#结果
91007
[['\ufeffthe', 'project', 'gutenberg', 'ebook', 'of', 'great', 'expectations', ',', 'by', 'charles', 'dickens', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', '.'], ['you', 'may', 'copy', 'it', ',', 'give', 'it', 'away', 'or', 're-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'www.gutenberg.org', 'title', ':', 'great', 'expectations', 'author', ':', 'charles', 'dickens', 'posting', 'date', ':', 'august', '20', ',', '2008', '[', 'ebook', '#'