# 自然语言处理的一些算法研究和实现(NLTK)

## 递归

#### 如果要计算n个词有多少种组合方式？按照阶乘定义：n! = n*(n-1)*…*1

def func(wordlist):
    """Return n! where n == len(wordlist): the number of orderings of n words.

    Handles the empty list as 0! == 1; the original recursed forever on []
    because wordlist[1:] of an empty list is the empty list again.
    """
    length = len(wordlist)
    if length <= 1:  # base case covers both [] and a single word
        return 1
    else:
        return func(wordlist[1:]) * length

#### 如果要寻找word下位词的大小，并且将他们加和。

from nltk.corpus import wordnet as wn

def func(s):
    """Recursively count the synsets in the hyponym tree rooted at *s*.

    *s* is a WordNet synset (any object exposing .hyponyms() works); the
    count includes *s* itself plus all of its transitive hyponyms.
    """
    return 1 + sum(func(child) for child in s.hyponyms())

# Demo: size of the hyponym tree rooted at the 'dog' noun synset.
dog = wn.synset('dog.n.01')
print(func(dog))

#### 构建一个字母查找树

def WordTree(trie, key, value):
    """Insert *key*/*value* into a letter trie built from nested dicts.

    Each character of *key* indexes one level of nesting; once the key is
    exhausted the payload is stored under the literal key 'value'.
    """
    if key:
        first, rest = key[0], key[1:]
        if first not in trie:
            trie[first] = {}  # grow a new branch for an unseen character
        WordTree(trie[first], rest, value)
    else:
        trie['value'] = value

# Build a letter trie containing 'cat' and 'dog', then show the nested dicts.
WordDict = {}
WordTree(WordDict,'cat','cat')
WordTree(WordDict,'dog','dog')
print(WordDict)

## 贪婪算法：不确定边界自然语言的分割问题(退火算法的非确定性搜索)

import nltk
from random import randint

#text = 'doyou'
#segs = '01000'

def segment(text, segs):
    """Split *text* into words according to the 0/1 boundary string *segs*.

    segs[i] == '1' marks a word boundary immediately after text[i].
    Returns the list of words (a trailing '1' yields a final empty word,
    matching the original behaviour).
    """
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':  # a '1' closes the current word at position i
            words.append(text[last:i + 1])
            last = i + 1
    words.append(text[last:])  # remainder after the last boundary
    return words

def evaluate(text, segs):
    """Score a segmentation: word count plus lexicon size (lower is better).

    The lexicon size is the length of the space-joined set of unique words,
    so a word reused many times is only charged once -- this balances
    "split finely" against "split accurately".
    """
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(list(set(words))))
    return text_size + lexicon_size

###################################以下是退火算法的非确定性搜索############################################

def filp(segs, pos):
    """Return *segs* with the bit at *pos* flipped ('0' <-> '1').

    NOTE(review): the name is presumably a typo for 'flip'; kept as-is
    because filp_n()/anneal() call it by this name.
    """
    return segs[:pos] + str(1 - int(segs[pos])) + segs[pos + 1:]

def filp_n(segs, n):
    """Return *segs* after flipping n bits, each at a random position."""
    for _ in range(n):
        segs = filp(segs, randint(0, len(segs) - 1))  # perturb one random bit
    return segs

def anneal(text, segs, iterations, cooling_rate):
    """Simulated-annealing (non-deterministic) search for a good segmentation.

    Starting from boundary string *segs*, each round makes *iterations*
    guesses, where a guess perturbs round(temperature) random bits; the
    best-scoring guess (via evaluate) becomes the new state.  The
    temperature is divided by *cooling_rate* every round until it falls
    below 0.5, so perturbations shrink over time.  Prints the score and
    segmentation after each round and returns the final boundary string.
    """
    temperature = float(len(segs))
    while temperature >= 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):  # number of perturbation trials this round
            guess = filp_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate  # cool down: fewer flips next round
        print(evaluate(text, segs), segment(text, segs))
    print()
    return segs
# Demo: non-deterministic annealing search over the kitty/doggy toy text.
text = 'doyouseethekittyseethedoggydoyoulikethekittylikethedoggy'
seg =  '0000000000000001000000000010000000000000000100000000000'
anneal(text,seg,5000,1.2)

## 动态规划

#### 首先用递归的方式编写一下找到任意音节的函数

def func1(n):
    """Enumerate all fillings of n beats with Short ('S', 1 beat) and Long ('L', 2 beats).

    Plain recursion: every pattern starts with 'S' (n-1 beats remain) or
    'L' (n-2 beats remain).  Exponential time, since the same subproblems
    are recomputed -- compare func2/func3/func4 below.
    """
    if n == 0:
        return [""]
    elif n == 1:
        return ["S"]
    else:
        s = ["S" + item for item in func1(n - 1)]
        l = ["L" + item for item in func1(n - 2)]
        return s + l
print(func1(4))

#### 使用动态规划来实现找到任意音节的函数

def func2(n):
    """Bottom-up dynamic-programming version of the syllable enumeration.

    Fills the table level by level and returns the WHOLE lookup table:
    lookup[k] holds the patterns for k beats, so callers index [n].
    """
    lookup = [[""], ["S"]]  # levels 0 and 1 seed the table
    for i in range(n - 1):
        s = ["S" + item for item in lookup[i + 1]]
        l = ["L" + item for item in lookup[i]]
        lookup.append(s + l)
    return lookup
print(func2(4)[4])
print(func2(4))
def func3(n, lookup={0: [""], 1: ["S"]}):
    """Top-down (memoized) version of the syllable enumeration.

    NOTE(review): the mutable default dict is deliberate -- it is the memo
    table shared across calls.  The function must return lookup[n] (not
    just populate it), otherwise the recursive calls would receive None.
    """
    if n not in lookup:
        s = ["S" + item for item in func3(n - 1)]
        l = ["L" + item for item in func3(n - 2)]
        lookup[n] = s + l
    return lookup[n]
print(func3(4))

#### NLTK自带装饰符:默记

from nltk import memoize
@memoize
def func4(n):
    """Syllable enumeration memoized via NLTK's @memoize decorator.

    Same recursion as func1, but nltk.memoize caches the result for each
    n, so every subproblem is computed only once.
    """
    if n == 0:
        return [""]
    elif n == 1:
        return ["S"]
    else:
        s = ["S" + item for item in func4(n - 1)]
        l = ["L" + item for item in func4(n - 2)]
        return s + l
print(func4(4))

## 其他的应用

#### 词汇多样性

from nltk.corpus import gutenberg
# Per-text statistics over the Gutenberg corpus: average word length,
# average sentence length, and lexical diversity (tokens per unique word,
# case-folded).
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), 'from', fileid)

#### 文体差异性

from nltk.corpus import brown
from nltk import FreqDist,ConditionalFreqDist
# Conditional frequency distribution of (genre, word) pairs over the Brown corpus.
cfd = ConditionalFreqDist(( genere,word) for genere in brown.categories() for word in brown.words(categories=genere))
genres=['news','religion','hobbies']
models = ['can','could','will','may','might','must']
# Tabulate modal-verb counts per genre to contrast writing styles.
cfd.tabulate(conditions = genres,samples=models)

#### 随机语句生成

import nltk
def create_sentence(cfd, word, num=15):
    """Print a chain of *num* words, each the most likely successor of the last.

    *cfd* maps a word to a frequency distribution of following words;
    .max() selects the most frequent successor.  The words are printed
    space-separated on one line; nothing is returned.
    """
    for i in range(num):
        print(word, end=" ")
        word = cfd[word].max()  # most likely continuation of the current word
# Bigram model over the KJV Genesis text; cfd[w] is the frequency
# distribution of words that follow w.
text= nltk.corpus.genesis.words("english-kjv.txt")
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)

# create_sentence prints the chain itself and returns None,
# so this line also prints a trailing 'None'.
print(create_sentence(cfd,'living'))

#### 词谜问题解决

# Word puzzle: dictionary words of length >= 3 that contain 'r' and use
# only the letters of 'egivrvonl', each at most as often as it appears.
puzzle_word = nltk.FreqDist('egivrvonl')
base_word = 'r'
wordlist = nltk.corpus.words.words()
result = [w for w in wordlist if len(w)>=3 and base_word in w and nltk.FreqDist(w)<=puzzle_word]
# FreqDist comparison (<=) compares per-letter counts, enforcing the
# "no letter used more often than it is available" requirement.
print(result)

## 时间和空间权衡:全文检索系统

import nltk
def raw(file):
    """Return the full text content of *file*.

    Fixes the original, which returned str(contents) with 'contents'
    undefined (a NameError) and ignored the *file* argument entirely.
    The context manager also guarantees the file handle is closed.
    """
    with open(file) as f:
        return f.read()

def snippet(doc, term):
    """Return a ~60-character window around the first occurrence of *term* in *doc*.

    Pads the document text with 30 spaces on each side so the slice never
    falls off either end.  Raises ValueError (from str.index) when *term*
    does not occur.
    """
    text = ' ' * 30 + raw(doc) + ' ' * 30
    pos = text.index(term)
    return text[pos - 30:pos + 30]

# Build an inverted index (word -> list of review files containing it),
# then run a small interactive concordance loop until the user enters
# 'quit' or an empty line.  Note the nltk.Index((word, file) ...) format.
files = nltk.corpus.movie_reviews.abspaths()
idx = nltk.Index((w, f) for f in files for w in raw(f).split())

query = 'tem'  # any non-'quit', non-empty seed value enters the loop
while query != 'quit' and query:
    query = input('>>> input the word:')
    if query in idx:
        for doc in idx[query]:
            print(snippet(doc, query))
    else:
        print('Not found')