因为我用的package有bug,有些文档不能处理:当程序读取这些文件的时候会出现math domain error,所以我现在要实现的目的就是跳过这些error,同时删除产生error的文档。我的code如下所示:...
因为我用的package有bug,有些文档不能处理:当程序读取这些文件的时候会出现math domain error(Python中表现为ValueError),所以我现在要实现的目的就是跳过这些error,同时删除产生error的文档。
我的code如下所示:
首先建立一个excel文档,因为我需要把算的结果导出到excel里面,
然后就用了一个for loop直接运行上面所写的三个method,请前辈帮忙在我现有的code基础改一下达到我想要实现的目的。
import os,csv,nltk, math
from nltk.model.ngram import NgramModel
from nltk.probability import LidstoneProbDist
# Create the results CSV and emit its header row. `fout` stays open because
# the processing loop further down writes one row per document and closes it.
fout = open("/Users//WN1.data.csv", "w")
outfilehandle = csv.writer(
    fout,
    delimiter=",",
    quotechar='"',
    quoting=csv.QUOTE_NONNUMERIC,
)
localrow = [
    "File name",
    "Perplexity for unigram",
    "Perplexity for bigram",
    "Perplexity for trigram",
]
outfilehandle.writerow(localrow)
# unigram model
def unigram(file):
    """Return the perplexity of a unigram model built from one text file.

    The file's text is tokenized with ``nltk.word_tokenize``; the same token
    list is used both to train the model and to compute the perplexity.

    Args:
        file: Path of the text file to read.

    Returns:
        The value of ``NgramModel.perplexity`` for the file's tokens.

    Raises:
        ValueError: Reportedly ("math domain error") for some documents,
            raised from inside the NLTK calls; it is not handled here so the
            caller can decide what to do with the offending file.
    """
    # Close the handle deterministically instead of leaking it.
    with open(file) as file_object:
        ln = file_object.read()
    words = nltk.word_tokenize(ln)
    # Lidstone (additive) smoothing with gamma = 0.2.
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    tt = NgramModel(1, words, estimator=estimator)
    return tt.perplexity(words)
# bigram model
def bigram(file):
    """Return the perplexity of an order-2 ``NgramModel`` built from one file.

    The file's text is tokenized with ``nltk.word_tokenize`` and converted to
    bigrams with ``nltk.bigrams``; that bigram sequence is used both to train
    the model and to compute the perplexity.

    Args:
        file: Path of the text file to read.

    Returns:
        The value of ``NgramModel.perplexity`` for the file's bigrams.

    Raises:
        ValueError: Reportedly ("math domain error") for some documents,
            raised from inside the NLTK calls; it is not handled here so the
            caller can decide what to do with the offending file.
    """
    # Close the handle deterministically instead of leaking it.
    with open(file) as file_object:
        ln = file_object.read()
    words = nltk.word_tokenize(ln)
    # Materialize the bigrams: the sequence is traversed twice (training,
    # then scoring), which would silently fail if nltk.bigrams were lazy.
    my_bigrams = list(nltk.bigrams(words))
    # NOTE(review): the model is trained on bigram tuples rather than on the
    # raw tokens; NgramModel(2, words) may be what was intended - confirm.
    # Lidstone (additive) smoothing with gamma = 0.2.
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    tt2 = NgramModel(2, my_bigrams, estimator=estimator)
    return tt2.perplexity(my_bigrams)
# trigram model
def trigram(file):
    """Return the perplexity of an order-3 ``NgramModel`` built from one file.

    The file's text is tokenized with ``nltk.word_tokenize`` and converted to
    trigrams with ``nltk.trigrams``; that trigram sequence is used both to
    train the model and to compute the perplexity.

    Args:
        file: Path of the text file to read.

    Returns:
        The value of ``NgramModel.perplexity`` for the file's trigrams.

    Raises:
        ValueError: Reportedly ("math domain error") for some documents,
            raised from inside the NLTK calls; it is not handled here so the
            caller can decide what to do with the offending file.
    """
    # Close the handle deterministically instead of leaking it.
    with open(file) as file_object:
        ln = file_object.read()
    words = nltk.word_tokenize(ln)
    # Materialize the trigrams: the sequence is traversed twice (training,
    # then scoring), which would silently fail if nltk.trigrams were lazy.
    my_trigrams = list(nltk.trigrams(words))
    # NOTE(review): the model is trained on trigram tuples rather than on the
    # raw tokens; NgramModel(3, words) may be what was intended - confirm.
    # Lidstone (additive) smoothing with gamma = 0.2.
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    tt3 = NgramModel(3, my_trigrams, estimator=estimator)
    return tt3.perplexity(my_trigrams)
# Score every document in the corpus folder and write one CSV row per file.
# Documents whose scoring raises ValueError ("math domain error") are skipped
# and deleted from the folder, so one bad document no longer aborts the batch.
os.chdir("/Users/Documents/A")
s = os.getcwd()
# Names returned by listdir are relative; they resolve because of the chdir.
files = os.listdir(s)
try:
    for filename in files:
        # listdir also returns sub-directories, which cannot be read as text.
        if not os.path.isfile(filename):
            continue
        try:
            uni = unigram(filename)
            bi = bigram(filename)
            tri = trigram(filename)
        except UnicodeError:
            # UnicodeError is a ValueError subclass; an undecodable file is a
            # different problem and must not be deleted by the handler below.
            raise
        except ValueError:
            # WARNING: destructive by design - the offending document is
            # removed from disk and no row is written for it.
            os.remove(filename)
            continue
        outfilehandle.writerow([filename, uni, bi, tri])
finally:
    # Always close the CSV so rows written so far are not lost on a failure.
    fout.close()
展开