理解文本分析的原理对各种机器学习场景都是非常有用的,而且还可以提高自己利用概率论和算法知识对现实问题进行建模的能力。
1.概括数据
简单修改一下我们在第 8 章用过的 n-gram 模型,就可以用来获得 2-gram 序列的频率数据,并返回一个 2-gram 的 Counter 对象,代码如下:
# -*- coding: GBK -*-
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import Counter
def cleanSentence(sentence):
sentence = sentence.split(' ')
sentence = [word.strip(string.punctuation+string.whitespace) for word in sentence]
sentence = [word for word in sentence if len(word) > 1 or (word.lower() == 'a' or word.lower() == 'i')]
return sentence
def cleanInput(content):
content = content.upper()
content = re.sub('\n', ' ', content)
content = bytes(content, 'UTF-8')
content = content.decode('ascii', 'ignore')
sentences = content.split('. ')
return [cleanSentence(sentence) for sentence in sentences]
def getNgramsFromSentence(content, n):
output = []
for i in range(len(content)-n+1):
output.append(content[i:i+n])
return output
def getNgrams(content, n):
content = cleanInput(content)
ngrams = Counter()
ngrams_list = []
for sentence in content:
newNgrams = [' '.join(ngram) for ngram in getNgramsFromSentence(sentence, 2)]
ngrams_list.extend