N-gram和tag

N-gram

给段落划分词组(N-gram):N-gram 是文本中 N 个连续的词为一组,例如 2-gram 就是每两个相邻的词为一组。

# Read every .txt file in the text/ folder and concatenate their
# contents into one string, replacing newlines with spaces.
import glob
path = "text/"
parts = []
for filename in glob.glob(path + "*.txt"):
    # "with" closes each file even on error; the original only
    # closed the last file it opened.
    with open(filename) as infile:
        for line in infile:
            parts.append(line.replace('\n', ' '))
# Join once at the end — repeated `string +=` is quadratic.
string = ''.join(parts)
print("done1")
# Count all 4-grams (groups of four consecutive words) in `string`
# and print the 100 most common ones, one (gram, count) pair per line.
from nltk import ngrams
import collections
n = 4
# Counter consumes the ngrams generator directly; the original
# rebound the name `ngrams` to a list, shadowing the imported
# nltk.ngrams function (re-running the snippet would then fail).
gram_counter = collections.Counter(ngrams(string.split(), n))
sorted_by_count = gram_counter.most_common(100)
print("\n".join(str(i) for i in sorted_by_count))

找出最常见词

import collections
# Count 3-grams after stop-word removal and report the two most common.
# NOTE(review): find_ngrams, remove_stop_words and lines_in_file are
# defined elsewhere in the article — confirm against the full source.
count_grams = collections.Counter(find_ngrams(3, remove_stop_words(lines_in_file)))
print(count_grams)
# most_common(2) returns a list of (gram, count) pairs.
most_common_gram = count_grams.most_common(2)

# Most common gram and its words — plain subscription instead of the
# original's explicit .__getitem__ calls.
print("Most common")
print(most_common_gram[0])          # (gram, count) pair
print(most_common_gram[0][0])       # the gram itself


print("Second most common")
print(most_common_gram[1])          # (gram, count) pair
print(most_common_gram[1][0][2])    # third word of the second gram
print("\n")

tag

给词标词性

import nltk

# Tokenize a sample sentence, then label each token with its
# part-of-speech tag.
sentence = "AUT is in New Zealand"
words = nltk.word_tokenize(sentence)
print(nltk.pos_tag(words))

做成混淆矩阵计算准确率

# Using confusion matrix for evaluation.    
from nltk.metrics import *                  
# Gold-standard (reference) POS-tag sequence, split into a list of tags.
ref  = 'CC NNP IN NNP NN NNP NNP VBD NN DT NN MD RB VB DT NN POS NNS . PRP MD VBN “ JJ NN CC NN ” IN NN CC NNS , PRP VBD IN DT NN IN RB .'.split()
# NOTE(review): `list` must be a variable of predicted tags defined
# elsewhere in the article; as written it shadows (or is) the builtin
# `list`, and ''.join(...) would crash or fuse adjacent tags with no
# separator — the author likely meant ' '.join(pred).split(). Verify
# against the full source.
tagged = ''.join(list).split()   #pred      
# Tag-by-tag confusion matrix between reference and predicted tags.
cm = ConfusionMatrix(ref, tagged)           
print(cm)                                                                              
# NOTE(review): precision/recall/F on set(ref) vs set(tagged) compare
# the *sets of distinct tags*, not per-token correctness — only
# accuracy() below is a positional comparison.
print("Precision: ",precision(set(ref),set(tagged)))  
print("Recall: ",recall(set(ref),set(tagged)))        
print("F measure: ",f_measure(set(ref),set(tagged)))  
print("Accuracy: ",accuracy(ref,tagged))   

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值