# to compute the frequency of ngrams in n_grams_AlphaNum
# put this snippet of code after the second snippet of code in Section 7.4
# Count how many times each n-gram occurs in n_grams_AlphaNum.
freq_dict = {}
for ngram in n_grams_AlphaNum:
    # dict.get supplies 0 the first time an n-gram is seen.
    freq_dict[ngram] = freq_dict.get(ngram, 0) + 1

# Report only the n-grams that occur at least twice.  Each key is indexed
# at positions 0-3, so every n-gram is expected to hold four items.
for ngram, count in freq_dict.items():
    if count >= 2:
        print(ngram[0], ngram[1], ngram[2], ngram[3], '\t', count)
# Importing the submodule also binds the top-level `nltk` name used below.
import nltk.collocations

# Sample text to analyse.
string='''I give Pirrip as my father's family name, on the authority of his tombstone and my sister,--Mrs. Joe Gargery, who married the blacksmith. As I never saw my father or my mother, and never saw any likeness of either of them (for their days were long before the days of photographs), my first fancies regarding what they were like were unreasonably derived from their tombstones. The shape of the letters on my father's, gave me an odd idea that he was a square, stout, dark man, with curly black hair.'''

# Lower-case the text and split it into tokens.
string_tokenized = nltk.word_tokenize(string.lower())

# Build a bigram finder from the token sequence.
finder = nltk.collocations.BigramCollocationFinder.from_words(string_tokenized)

# Score each bigram with the likelihood-ratio association measure.
bgm = nltk.collocations.BigramAssocMeasures()
scored = finder.score_ngrams(bgm.likelihood_ratio)

# A bare `scored` expression shows nothing when run as a script, so print it.
print(scored)
上面的代码中,我们首先引入nltk.collocations并定义需要处理的文本,然后用nltk.word_tokenize对文本进行分词处理。接下来,我们通过nltk.collocations中的BigramCollocationFinder.from_words()函数提取分词后的二词词块,并将之赋值给finder。如果我们执行print(finder),返回的结果为<nltk.collocations.BigramCollocationFinder object at 0x919eacc>,也就是说,finder实际上是一个BigramCollocationFinder对象。
import nltk
import nltk.collocations

# Sample text to analyse.
string='''I give Pirrip as my father's family name, on the authority of his tombstone and my sister,--Mrs. Joe Gargery, who married the blacksmith. As I never saw my father or my mother, and never saw any likeness of either of them (for their days were long before the days of photographs), my first fancies regarding what they were like were unreasonably derived from their tombstones. The shape of the letters on my father's, gave me an odd idea that he was a square, stout, dark man, with curly black hair.'''

# Lower-case the text and split it into tokens.
string_tokenized = nltk.word_tokenize(string.lower())

# Build a trigram finder from the token sequence and score each trigram
# with the likelihood-ratio association measure.
bgm = nltk.collocations.TrigramAssocMeasures()
finder = nltk.collocations.TrigramCollocationFinder.from_words(string_tokenized)
scored = finder.score_ngrams(bgm.likelihood_ratio)
print(scored)
import nltk
from nltk.corpus import stopwords

# Load the English stopword list shipped with NLTK.
stopwords_list = stopwords.words('english')
print(stopwords_list)  # print the stopword list

# Sample text to filter.
string='''I give Pirrip as my father's family name, on the authority of his tombstone and my sister,--Mrs. Joe Gargery, who married the blacksmith. As I never saw my father or my mother, and never saw any likeness of either of them (for their days were long before the days of photographs), my first fancies regarding what they were like were unreasonably derived from their tombstones. The shape of the letters on my father's, gave me an odd idea that he was a square, stout, dark man, with curly black hair.'''

# Lower-case and tokenize the text, then print every token that is not
# in the stopword list.
wordlist = nltk.word_tokenize(string.lower())
for word in wordlist:
    if word not in stopwords_list:
        print(word)
我们定义需处理的文本,通过nltk.word_tokenize()函数对该文本进行分词处理,制作文本词表,并赋值给wordlist变量。最后,通过for … in循环遍历wordlist中的单词,如果单词不在停用词表中,则将之打印出来。
7.7 语料检索的KWIC实现
在使用Wordsmith或AntConc软件检索关键词时,经常看到返回结果使用了Key Word in Context(KWIC)的显示方式,即将检索的关键词居中对齐,关键词左右各留出一定数量的单词或字符串作为语境,以方便研究者在一定语境中阅读关键词。NLTK库有concordance()函数可以实现关键词的KWIC检索。