bigram
import os, math, random
# TRAINING_DIR and tokenize() are assumed to be defined earlier in the lab:
# TRAINING_DIR is the corpus directory and tokenize() the word tokenizer used throughout.

class language_model():
    def __init__(self, trainingdir=TRAINING_DIR, files=[]):
        self.training_dir = trainingdir
        self.files = files
        self.train()

    def train(self):
        self.unigram = {}
        self.bigram = {}
        self._processfiles()
        self._make_unknowns()
        self._discount()
        self._convert_to_probs()
    def _processline(self, line):
        tokens = ["__START"] + tokenize(line) + ["__END"]
        previous = "__END"
        for token in tokens:
            self.unigram[token] = self.unigram.get(token, 0) + 1
            # nested dict: bigram[previous][token] counts how often token follows previous
            current = self.bigram.get(previous, {})
            current[token] = current.get(token, 0) + 1
            self.bigram[previous] = current
            previous = token
    def _processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir, afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring rest of file".format(afile))
    def _convert_to_probs(self):
        self.unigram = {k: v / sum(self.unigram.values()) for (k, v) in self.unigram.items()}
        self.bigram = {key: {k: v / sum(adict.values()) for (k, v) in adict.items()} for (key, adict) in self.bigram.items()}
        # self.kn is built in _discount(), which runs before this method
        self.kn = {k: v / sum(self.kn.values()) for (k, v) in self.kn.items()}
    def get_prob(self, token, context="", methodparams={}):
        if methodparams.get("method", "unigram") == "unigram":
            return self.unigram.get(token, self.unigram.get("__UNK", 0))
        else:
            if methodparams.get("smoothing", "kneser-ney") == "kneser-ney":
                unidist = self.kn
            else:
                unidist = self.unigram
            bigram = self.bigram.get(context[-1], self.bigram.get("__UNK", {}))
            big_p = bigram.get(token, bigram.get("__UNK", 0))
            lmbda = bigram.get("__DISCOUNT", 0)
            uni_p = unidist.get(token, unidist.get("__UNK", 0))
            # interpolate: discounted bigram estimate plus the reserved probability
            # mass spread over the unigram (or Kneser-Ney) distribution
            p = big_p + lmbda * uni_p
            return p
    def nextlikely(self, k=1, current="", method="unigram"):
        blacklist = ["__START", "__DISCOUNT"]
        if method == "unigram":
            dist = self.unigram
        else:
            dist = self.bigram.get(current, self.bigram.get("__UNK", {}))
        # filter out any undesirable tokens
        filtered = [(w, p) for (w, p) in dist.items() if w not in blacklist]
        # choose one randomly from the top k, weighted by probability
        # (the original ignored k; sorting and slicing makes the parameter take effect)
        filtered.sort(key=lambda pair: pair[1], reverse=True)
        words, probdist = zip(*filtered[:k])
        res = random.choices(words, probdist)[0]
        return res
    def generate(self, k=1, end="__END", limit=20, method="bigram", methodparams={}):
        if method == "":
            method = methodparams.get("method", "bigram")
        current = "__START"
        tokens = []
        while current != end and len(tokens) < limit:
            current = self.nextlikely(k=k, current=current, method=method)
            tokens.append(current)
        # drop the trailing __END marker only if generation stopped naturally
        if tokens and tokens[-1] == end:
            tokens = tokens[:-1]
        return " ".join(tokens)
    def compute_prob_line(self, line, methodparams={}):
        # add __START to the beginning and __END to the end of the line,
        # compute the log-probability of the line according to the desired model,
        # and return it together with the number of tokens scored
        tokens = ["__START"] + tokenize(line) + ["__END"]
        acc = 0
        for i, token in enumerate(tokens[1:]):
            acc += math.log(self.get_prob(token, tokens[:i + 1], methodparams))
        return acc, len(tokens[1:])
    def compute_probability(self, filenames=[], methodparams={}):
        # computes the probability (and length) of a corpus contained in filenames
        if filenames == []:
            filenames = self.files
        total_p = 0
        total_N = 0
        for i, afile in enumerate(filenames):
            print("Processing file {}:{}".format(i, afile))
            try:
                with open(os.path.join(self.training_dir, afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            p, N = self.compute_prob_line(line, methodparams=methodparams)
                            total_p += p
                            total_N += N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p, total_N
    def compute_perplexity(self, filenames=[], methodparams={"method": "bigram", "smoothing": "kneser-ney"}):
        # compute the log-probability and length of the corpus,
        # then calculate perplexity; lower perplexity means that
        # the model better explains the data
        p, N = self.compute_probability(filenames=filenames, methodparams=methodparams)
        pp = math.exp(-p / N)
        return pp
    def _make_unknowns(self, known=2):
        # pool all words seen fewer than `known` times into an __UNK token
        for (k, v) in list(self.unigram.items()):
            if v < known:
                del self.unigram[k]
                self.unigram["__UNK"] = self.unigram.get("__UNK", 0) + v
        for (k, adict) in list(self.bigram.items()):
            for (kk, v) in list(adict.items()):
                if self.unigram.get(kk, 0) == 0:
                    adict["__UNK"] = adict.get("__UNK", 0) + v
                    del adict[kk]
            if self.unigram.get(k, 0) == 0:
                del self.bigram[k]
                # merge this row into the __UNK row, adding counts for shared
                # keys (a plain dict.update would overwrite them)
                current = self.bigram.get("__UNK", {})
                for (kk, v) in adict.items():
                    current[kk] = current.get(kk, 0) + v
                self.bigram["__UNK"] = current
            else:
                self.bigram[k] = adict
    def _discount(self, discount=0.75):
        # discount each bigram count by a small fixed amount
        self.bigram = {k: {kk: value - discount for (kk, value) in adict.items()} for (k, adict) in self.bigram.items()}
        # for each word, store the total amount of the discount so that the
        # total count is unchanged, i.e. we are reserving this as probability mass
        for k in self.bigram.keys():
            lamb = len(self.bigram[k])
            self.bigram[k]["__DISCOUNT"] = lamb * discount
        # work out Kneser-Ney unigram probabilities:
        # count the number of contexts each word has been seen in
        # (skipping the bookkeeping __DISCOUNT key)
        self.kn = {}
        for (k, adict) in self.bigram.items():
            for kk in adict.keys():
                if kk != "__DISCOUNT":
                    self.kn[kk] = self.kn.get(kk, 0) + 1
The class above is the answer the teacher gave; it is used as follows:
mylm=language_model(files=trainingfiles[:MAX_FILES])
mylm.generate()
mylm.generate(method="bigram")
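Evaluation goes through compute_perplexity in the same way; a usage sketch, where heldoutfiles is a hypothetical list of held-out filenames (not part of the handout):

mylm.compute_perplexity(filenames=heldoutfiles)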
This is genuinely an approach I couldn't have thought of myself. Take the example sentence "When did the cat sit on the mat?".
First add the start and end markers to the sentence, then tokenize:
['__START', 'When', 'did', 'the', 'cat', 'sit', 'on', 'the', 'mat', '?', '__END']
Then iterate over the tokens: the _processline method turns the sentence into bigram and unigram counts.
The unigram part is plain counting: look up each token's key in the unigram dict and add 1 to its value, starting from zero.
For the bigram, previous is initialised to '__END', but iteration starts from the first token. current is the inner dict of counts for the words that follow previous, e.g. {'__START': 1} when previous is '__END', and it is stored back with self.bigram[previous]=current. So bigram is a nested dict of the form {word: {next word: count of next word}}, and repeated word pairs have their counts merged (a standalone sketch of this counting appears after the results below).
The answer is just four short lines, yet during the lab I couldn't write it at all... probably because I'm still not comfortable enough with how dictionaries work.
Result:
unigram: {'?': 1, 'When': 1, '__END': 1, '__START': 1, 'cat': 1, 'did': 1, 'mat': 1, 'on': 1, 'sit': 1, 'the': 2}
bigram: {'?': {'__END': 1}, 'When': {'did': 1}, '__END': {'__START': 1}, '__START': {'When': 1}, 'cat': {'sit': 1}, 'did': {'the': 1}, 'mat': {'?': 1}, 'on': {'the': 1}, 'sit': {'on': 1}, 'the': {'cat': 1, 'mat': 1}}
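As a sanity check, the same counting logic runs standalone; a minimal sketch, with tokenize replaced by a plain split purely for illustration:

tokens = ["__START"] + "When did the cat sit on the mat ?".split() + ["__END"]
unigram, bigram = {}, {}
previous = "__END"
for token in tokens:
    unigram[token] = unigram.get(token, 0) + 1
    current = bigram.get(previous, {})
    current[token] = current.get(token, 0) + 1
    bigram[previous] = current
    previous = token
print(unigram)   # {'__START': 1, 'When': 1, ..., 'the': 2, '__END': 1}
print(bigram)    # {'__END': {'__START': 1}, ..., 'the': {'cat': 1, 'mat': 1}}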
The _convert_to_probs method:
self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
self.bigram={key:{k:v/sum(adict.values()) for (k,v) in adict.items()} for (key,adict) in self.bigram.items()}
This computes the probability of each entry from its count; it is the foundation that the later steps build on.
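To see this and the preceding _discount step with concrete numbers, take the row for 'the' from the toy sentence (a hand-worked sketch, not from the handout):

row = {"cat": 1, "mat": 1}                        # counts of words following "the"
discount = 0.75
row = {k: v - discount for k, v in row.items()}   # {'cat': 0.25, 'mat': 0.25}
row["__DISCOUNT"] = len(row) * discount           # 2 * 0.75 = 1.5 reserved mass
total = sum(row.values())                         # 0.25 + 0.25 + 1.5 = 2.0
row = {k: v / total for k, v in row.items()}      # {'cat': 0.125, 'mat': 0.125, '__DISCOUNT': 0.75}
# get_prob then interpolates: p(cat | the) = 0.125 + 0.75 * uni_p(cat)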
The dict get method, dict.get(key, default=None), returns the value for key, or the default value if the key is not present.
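For example:

d = {"the": 2}
d.get("the", 0)   # 2
d.get("cat", 0)   # 0, because the key is absent and the default is returned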
The probability computation afterwards works line by line, then sums the line log-probabilities to score the whole passage.
The perplexity is also computed from this: PP = exp(−LP/N), where LP is the total log-probability of the corpus and N is its number of tokens.
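A quick numeric check of that formula, with made-up per-token probabilities:

import math
logprobs = [math.log(0.2), math.log(0.1), math.log(0.05)]   # per-token log-probabilities
LP, N = sum(logprobs), len(logprobs)
pp = math.exp(-LP / N)   # 10.0: the inverse geometric mean of the probabilities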
Iterating to find the left and right context:
def get_left_context(self, window=1, target="_____"):
    found = -1
    sent_tokens = self.get_tokens()
    for i, token in enumerate(sent_tokens):
        if token == target:
            found = i
            break
    if found > -1:
        # use found (not the loop variable i) and clamp at 0 so a target
        # near the start of the sentence does not wrap around
        return sent_tokens[max(0, found - window):found]
    else:
        return []

def get_right_context(self, window=1, target="_____"):
    found = -1
    sent_tokens = self.get_tokens()
    for i, token in enumerate(sent_tokens):
        if token == target:
            found = i
            break
    if found > -1:
        return sent_tokens[found + 1:found + window + 1]
    else:
        return []
window is the number of context tokens to fetch on each side.
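These two methods assume a sentence-like class that provides get_tokens(); a minimal sketch of how they might be exercised (the wrapper class here is my own assumption):

class sentence:
    def __init__(self, tokens):
        self.tokens = tokens
    def get_tokens(self):
        return self.tokens
    # plus get_left_context / get_right_context exactly as defined above

s = sentence(["When", "did", "the", "_____", "sit", "on", "the", "mat", "?"])
s.get_left_context(window=2)    # ['did', 'the']
s.get_right_context(window=2)   # ['sit', 'on']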