N-gram
给段落划词，2-gram 是两个词为一组。(Tokenize a paragraph into n-grams; a 2-gram groups two adjacent words.)
# Read every *.txt file in the folder and concatenate all their lines
# (with newlines replaced by spaces) into one big string `string`.
import glob

path = "text/"
parts = []
for filename in glob.glob(path + "*.txt"):
    # Context manager closes each file even on error; the original closed
    # only the last file, after the loop.
    # NOTE(review): utf-8 assumed for the corpus files — confirm.
    with open(filename, encoding="utf-8") as infile:
        for line in infile:
            parts.append(line.replace('\n', ' '))
# One join instead of repeated `string +=` (which is quadratic).
string = ''.join(parts)
print("done1")
# Count 4-grams over the concatenated text and print the 100 most common.
from nltk import ngrams
import collections

n = 4
# Feed the generator straight into Counter — no intermediate list needed.
# The original accumulated grams into a list named `ngrams`, shadowing the
# imported nltk `ngrams` function (a bug waiting to happen on re-run).
gram_counter = collections.Counter(ngrams(string.split(), n))
sorted_by_count = gram_counter.most_common(100)
print("\n".join(str(i) for i in sorted_by_count))
找出最常见词 (Find the most common n-grams.)
# Print the two most common n-grams and pick pieces out of them.
import collections

# NOTE(review): find_ngrams, remove_stop_words and lines_in_file are not
# defined anywhere in this file — presumably provided by an earlier
# notebook cell; confirm before running.
count_grams = collections.Counter(find_ngrams(3, remove_stop_words(lines_in_file)))
print(count_grams)
# most_common(2) returns [(gram, count), (gram, count)].
most_common_gram = count_grams.most_common(2)

print("Most common")
print(most_common_gram[0])
# Plain indexing instead of the original explicit __getitem__ calls.
print(most_common_gram[0][0])      # the gram tuple itself, without its count
print("Second most common")
print(most_common_gram[1])
print(most_common_gram[1][0][2])   # third token of the second-most-common gram
print("\n")
tag
给词标词性 (Part-of-speech tagging.)
# Tokenize a sample sentence and tag each token with its part of speech.
import nltk

words = nltk.word_tokenize("AUT is in New Zealand")
postags = nltk.pos_tag(words)
print(postags)
做成混淆矩阵计算准确率 (Build a confusion matrix and compute accuracy.)
# Using confusion matrix for evaluation.
# Compare a reference tag sequence against predicted tags with NLTK metrics.
# NOTE(review): wildcard import pulls ConfusionMatrix, precision, recall,
# f_measure and accuracy into scope — consider explicit imports.
from nltk.metrics import *
# Gold-standard POS tags, one per token.
ref = 'CC NNP IN NNP NN NNP NNP VBD NN DT NN MD RB VB DT NN POS NNS . PRP MD VBN “ JJ NN CC NN ” IN NN CC NNS , PRP VBD IN DT NN IN RB .'.split()
# BUG: `list` here is the Python builtin type, not a variable — ''.join(list)
# raises TypeError at runtime. Presumably a variable holding the predicted
# tag string was intended; its definition is not visible in this file.
tagged = ''.join(list).split() #pred
cm = ConfusionMatrix(ref, tagged)
print(cm)
# NOTE(review): precision/recall/f_measure over set(ref)/set(tagged) compare
# the *sets of tag labels used*, not per-token agreement — confirm that is
# the intended evaluation; accuracy() below is the per-position measure.
print("Precision: ",precision(set(ref),set(tagged)))
print("Recall: ",recall(set(ref),set(tagged)))
print("F measure: ",f_measure(set(ref),set(tagged)))
print("Accuracy: ",accuracy(ref,tagged))