import numpy as np
import pandas as pd
import jieba
import os
import re,string
from zhon.hanzi import punctuation
import matplotlib as m
m.use('TkAgg')
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk import bigrams, FreqDist
from math import log
import thulac
import nltk
nltk.download('punkt')  # fetch the Punkt tokenizer models if not already cached
# Base directory containing the homework data files (machine-specific path).
root_path = "/Users/zongzi/Documents/NLP/h2/"
# Global frequency tables shared by the *_train / *_test functions below:
unigramDist = FreqDist()  # unigram (single-word) counts accumulated over the corpus
bigramsDist = FreqDist()  # bigram (word-pair) counts accumulated over the corpus
w2gram = {} # number of possible 2-gram types that begin with word w (only its key set is used later)
def process_data(file_path):
    """Load a TSV dataset and tokenize each row's 'text_a' column with jieba.

    Args:
        file_path: Path to a tab-separated file containing a 'text_a' column.

    Returns:
        A list of token lists, one per row of the file.
    """
    data_list = []
    data = pd.read_csv(file_path, sep='\t')
    for i in range(len(data)):
        text = data.loc[i, 'text_a']
        # Robustness: an empty cell would make text[-1] raise IndexError.
        if not text:
            data_list.append([])
            continue
        # Replace runs of Chinese punctuation with a space in everything
        # except the final character, then re-append that final character.
        # BUG FIX: the original applied re.sub to text[-1:] (the last
        # character only), which left the sentence body unprocessed and
        # duplicated the last character; text[:-1] is the intended slice.
        text = re.sub(r"[%s]+" % punctuation, " ", text[:-1]) + text[-1]
        # Precise (non-exhaustive) segmentation mode; jieba.cut returns a generator.
        word_cut = jieba.cut(text, cut_all=False)
        data_list.append(list(word_cut))
    return data_list
def unigram_train(data):
    """Accumulate unigram counts from tokenized sentences into the global
    unigramDist frequency table."""
    for sentence in data:
        # FreqDist is a Counter subclass, so a missing key reads as 0 and
        # += works whether or not the token has been seen before.
        for token, count in FreqDist(sentence).items():
            unigramDist[token] += count
def unigram_test(data):
    """Score tokenized test sentences with an add-one-smoothed unigram model
    and print the average per-sentence perplexity.

    Side effect: unseen test tokens are inserted into the global unigramDist
    with count 0 so they enlarge the vocabulary used for smoothing.
    """
    # Register every test token so it contributes to the vocabulary size B().
    for sentence in data:
        for token in FreqDist(sentence):
            if token not in unigramDist:
                unigramDist[token] = 0
    # Add-one (Laplace) smoothing: each word's count is incremented by one,
    # so the denominator is the total token count N() plus the vocabulary
    # size B() (the total amount added across all word types).
    denom = unigramDist.N() + unigramDist.B()
    smoothed = FreqDist()
    for token in unigramDist:
        smoothed[token] = (unigramDist[token] + 1) / denom
    perplexities = []
    for sentence in data:
        log_prob_sum = 0.0
        n_tokens = 0
        for token in sentence:
            if token in smoothed:
                log_prob_sum += log(smoothed[token], 2)
                n_tokens += 1
        if n_tokens > 0:
            # Perplexity = 2^(-average log2 probability) for this sentence.
            perplexities.append(2 ** -(log_prob_sum / n_tokens))
    print("一元语法模型的困惑度:", sum(perplexities) / len(perplexities))
def bigram_train(data):
    """Accumulate bigram counts from tokenized sentences into the global
    bigramsDist table, and bump w2gram for each first word encountered
    (only w2gram's key set — the set of history words — is consumed later)."""
    for sentence in data:
        for pair, count in FreqDist(bigrams(sentence)).items():
            # Counter semantics: a missing pair reads as 0, so += covers
            # both the first sighting and subsequent updates.
            bigramsDist[pair] += count
            first_word = pair[0]
            w2gram[first_word] = w2gram.get(first_word, 0) + 1
def bigram_test(data):
    """Score tokenized test sentences with an add-one-smoothed bigram model
    and print the average per-sentence perplexity.

    Side effects: unseen test bigrams are inserted into the global
    bigramsDist with count 0, and w2gram is updated so its key set covers
    every history word observed.
    """
    # Register unseen test bigrams so they participate in smoothing.
    for sentence in data:
        for pair in bigrams(sentence):
            if pair not in bigramsDist:
                bigramsDist[pair] = 0
            first_word = pair[0]
            w2gram[first_word] = w2gram.get(first_word, 0) + 1
    # Total count of bigrams that share each history word w, i.e. c(w, *).
    history = {}
    for pair in bigramsDist:
        history[pair[0]] = history.get(pair[0], 0) + bigramsDist[pair]
    # Add-one smoothing: P(w2 | w1) = (c(w1, w2) + 1) / (c(w1, *) + |V|),
    # where |V| is the number of distinct history words seen.
    vocab_size = len(w2gram)
    bigramsFreq = FreqDist()
    for pair in bigramsDist:
        bigramsFreq[pair] = (bigramsDist[pair] + 1) / (history[pair[0]] + vocab_size)
    perplexities = []
    for sentence in data:
        log_prob_sum = 0.0
        n_pairs = 0
        for pair in bigrams(sentence):
            if pair in bigramsFreq:
                log_prob_sum += log(bigramsFreq[pair], 2)
                n_pairs += 1
        if n_pairs > 0:
            # Perplexity = 2^(-average log2 probability) for this sentence.
            perplexities.append(2 ** -(log_prob_sum / n_pairs))
    print("二元语法模型的困惑度:", sum(perplexities) / len(perplexities))
if __name__ == '__main__':
    # Tokenize the train/test splits (TSV files with a 'text_a' column).
    train_data_list = process_data(os.path.join(root_path,"data 2/train.tsv"))
    test_data_list = process_data(os.path.join(root_path,"data 2/test.tsv"))
    # Train each n-gram model on the training split, then report its
    # add-one-smoothed perplexity on the test split.
    unigram_train(train_data_list)
    unigram_test(test_data_list)
    bigram_train(train_data_list)
    bigram_test(test_data_list)
# NLP homework2.2
# 最新推荐文章于 2024-09-06 17:41:57 发布