NLP Homework 2.2

import os
import re
import pandas as pd
import jieba
from zhon.hanzi import punctuation
from nltk import bigrams, FreqDist
from math import log
root_path = "/Users/zongzi/Documents/NLP/h2/"
unigramDist = FreqDist()
bigramsDist = FreqDist()
w2gram = {}     # number of distinct bigram types that begin with word w
def process_data(file_path):
    data_list = []                                            # segmented sentences from the dataset
    data = pd.read_csv(file_path,sep='\t')

    for i in range(0,len(data)):
        text = data.loc[i,'text_a']
        text = re.sub(r"[%s]+" % punctuation, " ", text[:-1]) + text[-1]   # strip Chinese punctuation from all but the final character
        word_cut = jieba.cut(text, cut_all=False)            # precise mode; returns an iterable generator
        word_list = list(word_cut)                            # convert the generator to a list

        data_list.append(word_list)

    return data_list
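# Assumed input layout (inferred from the read_csv call and the 'text_a' lookup above,
# not stated in the original post): a tab-separated file with a header row that
# contains a 'text_a' column, for example
#
#   label	text_a
#   1	今天天气真好。
#   0	这部电影太无聊了。
#
# Each row's text is punctuation-stripped, segmented with jieba, and appended as a word list.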

def unigram_train(data):
    for i in data:
        sentence_freq = FreqDist(i)
        for j in sentence_freq:
            if j in unigramDist:
                unigramDist[j]+=sentence_freq[j]
            else:
                unigramDist[j]=sentence_freq[j]

def unigram_test(data):
    for i in data:
        sentence_freq = FreqDist(i)
        for j in sentence_freq:
            if j not in unigramDist:
                unigramDist[j] = 0
    # Convert counts to probabilities with add-one smoothing; unigramDist.N() is the total
    # token count and unigramDist.B() is |V|, the extra mass added when every word's count
    # is incremented by one (formulas recapped after this function)
    s = unigramDist.N() + unigramDist.B()
    unigramsFreq = FreqDist()
    for i in unigramDist:
        unigramsFreq[i] = (unigramDist[i] + 1) / s
    
    pp = []
    for sentence in data:
        logprob = 0
        N = 0
        for word in sentence:
            if word in unigramsFreq:
                logprob += log(unigramsFreq[word],2)
                N += 1
        if N > 0:
            pp.append(pow(2,-(logprob/N)))
    s = 0
    for i in pp:
        s += i
    print("一元语法模型的困惑度:", s/len(pp))

def bigram_train(data):
    for sentence in data:
        sWordFreq = FreqDist(bigrams(sentence))
        for j in sWordFreq:
            if j in bigramsDist:
                bigramsDist[j] += sWordFreq[j]
            else:
                bigramsDist[j] = sWordFreq[j]
                if j[0] in w2gram:
                    w2gram[j[0]] += 1
                else:
                    w2gram[j[0]] = 1

def bigram_test(data):
    for sentence in data:
        word = bigrams(sentence)
        for j in word:
            if j not in bigramsDist:
                bigramsDist[j] = 0
                if j[0] in w2gram:
                    w2gram[j[0]] += 1
                else:
                    w2gram[j[0]] = 1
    
    # Convert counts to probabilities with add-one smoothing (formula recapped after this function)
    history = {}    # total count of bigrams whose history (first word) is w
    for i in bigramsDist:
        if i[0] in history:
            history[i[0]] += bigramsDist[i]
        else:
            history[i[0]] = bigramsDist[i]
    bigramsFreq = FreqDist()
    for i in bigramsDist:
        bigramsFreq[i] = (bigramsDist[i] + 1) / (history[i[0]] + len(w2gram.keys()))
    
    pp = []
    for sentence in data:
        logprob = 0
        N = 0
        for word in bigrams(sentence):
            if word in bigramsFreq:
                logprob += log(bigramsFreq[word],2)
                N += 1
        if N > 0:
            pp.append(pow(2,-(logprob/N)))
    s = 0
    for i in pp:
        s += i
    print("二元语法模型的困惑度:", s/len(pp))

if __name__ == '__main__':
    train_data_list = process_data(os.path.join(root_path,"data 2/train.tsv"))
    test_data_list = process_data(os.path.join(root_path,"data 2/test.tsv"))
    unigram_train(train_data_list)
    unigram_test(test_data_list)
    bigram_train(train_data_list)
    bigram_test(test_data_list)
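    # Optional toy sanity check (my own made-up sentences, not part of the dataset);
    # uncomment to verify that add-one smoothing always yields finite perplexities.
    # The FreqDists above are global, so run this in a fresh interpreter:
    # toy_train = [["我", "喜欢", "自然", "语言", "处理"], ["我", "喜欢", "机器", "学习"]]
    # toy_test = [["我", "喜欢", "学习", "处理"]]
    # unigram_train(toy_train); unigram_test(toy_test)
    # bigram_train(toy_train); bigram_test(toy_test)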