N-gram
给段落划词，2-gram 是两个词为一组。(Tokenize a paragraph into n-grams; a 2-gram groups two adjacent words.)
# Read every *.txt file in the folder and concatenate all their lines
# (with newlines replaced by spaces) into one big string `string`.
import glob

path = "text/"
parts = []
for filename in glob.glob(path + "*.txt"):
    # Context manager closes each file even on error; the original closed
    # only the last file, after the loop.
    # NOTE(review): utf-8 assumed for the corpus files — confirm.
    with open(filename, encoding="utf-8") as infile:
        for line in infile:
            parts.append(line.replace('\n', ' '))
# One join instead of repeated `string +=` (which is quadratic).
string = ''.join(parts)
print("done1")
# Count 4-grams over the concatenated text and print the 100 most common.
from nltk import ngrams
import collections

n = 4
# Feed the generator straight into Counter — no intermediate list needed.
# The original accumulated grams into a list named `ngrams`, shadowing the
# imported nltk `ngrams` function (a bug waiting to happen on re-run).
gram_counter = collections.Counter(ngrams(string.split(), n))
sorted_by_count = gram_counter.most_common(100)
print("\n".join(str(i) for i in sorted_by_count))
找出最常见词 (Find the most common n-grams.)
# Print the two most common n-grams and pick pieces out of them.
import collections

# NOTE(review): find_ngrams, remove_stop_words and lines_in_file are not
# defined anywhere in this file — presumably provided by an earlier
# notebook cell; confirm before running.
count_grams = collections.Counter(find_ngrams(3, remove_stop_words(lines_in_file)))
print(count_grams)
# most_common(2) returns [(gram, count), (gram, count)].
most_common_gram = count_grams.most_common(2)

print("Most common")
print(most_common_gram[0])
# Plain indexing instead of the original explicit __getitem__ calls.
print(most_common_gram[0][0])      # the gram tuple itself, without its count
print("Second most common")
print(most_common_gram[1])
print(most_common_gram[1][0][2])   # third token of the second-most-common gram
print("\n")
tag
给词标词性 (Part-of-speech tagging.)
# Tokenize a sample sentence and tag each token with its part of speech.
import nltk

words = nltk.word_tokenize("AUT is in New Zealand")
postags = nltk.pos_tag(words)
print(postags)
做成混淆矩阵计算准确率 (Build a confusion matrix and compute accuracy.)
# Using confusion matrix for evaluation.
# Compare a reference tag sequence against predicted tags with NLTK metrics.
# NOTE(review): wildcard import pulls ConfusionMatrix, precision, recall,
# f_measure and accuracy into scope — consider explicit imports.
from nltk.metrics import *
# Gold-standard POS tags, one per token.
ref = 'CC NNP IN NNP NN NNP NNP VBD NN DT NN MD RB VB DT NN POS NNS . PRP MD VBN “ JJ NN CC NN ” IN NN CC NNS , PRP VBD IN DT NN IN RB .'.split()
# BUG: `list` here is the Python builtin type, not a variable — ''.join(list)
# raises TypeError at runtime. Presumably a variable holding the predicted
# tag string was intended; its definition is not visible in this file.
tagged = ''.join(list).split() #pred
cm = ConfusionMatrix(ref, tagged)
print(cm)
# NOTE(review): precision/recall/f_measure over set(ref)/set(tagged) compare
# the *sets of tag labels used*, not per-token agreement — confirm that is
# the intended evaluation; accuracy() below is the per-position measure.
print("Precision: ",precision(set(ref),set(tagged)))
print("Recall: ",recall(set(ref),set(tagged)))
print("F measure: ",f_measure(set(ref),set(tagged)))
print("Accuracy: ",accuracy(ref,tagged))