Week 9 — Reading and Writing Natural Languages (p. 172): extracting 2-grams / N-grams from sentences with re.sub

#!/usr/bin/python
#encoding:utf-8

"""
@author: LlQ
@contact:LIQINGLIN54951@gmail.com
@file:cp9_p172.py
@time: 5/19/2019 3:40 AM
"""

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string

from collections import Counter

def cleanSentence(sentence):
    """Split *sentence* on single spaces, trim punctuation and whitespace
    from each token, and drop one-character tokens other than 'a'/'I'.

    Returns the surviving tokens as a list of strings, in order.
    """
    # string.punctuation: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
    strip_chars = string.punctuation + string.whitespace
    cleaned = []
    for token in sentence.split(' '):
        token = token.strip(strip_chars)
        # Keep multi-character words, plus the one-letter words 'a' and 'I'.
        if len(token) > 1 or token.lower() in ('a', 'i'):
            cleaned.append(token)
    return cleaned

def cleanInput(content):
    """Normalize raw text and return it as a list of sentences, each a
    list of cleaned words (see cleanSentence).

    Steps: drop newlines and bracketed citations, collapse runs of
    spaces, strip non-ASCII characters, then split into sentences on
    '. '.
    """
    # Replace newlines and citations like [123] or [1] with a space.
    # BUG FIX: the original pattern '\n|[[\d+\]]' was a character class
    # matching any single '[', digit, '+', or ']' -- it stripped every
    # digit and bracket in the text instead of whole citations.
    content = re.sub(r'\n|\[\d+\]', ' ', content)

    # Collapse runs of spaces so words are separated by exactly one space.
    content = re.sub(' +', ' ', content)

    # Drop any non-ASCII characters (escape sequences, curly quotes, ...)
    # by round-tripping through bytes with errors ignored.
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')

    # Naive sentence split; each sentence becomes a cleaned word list.
    sentenceList = content.split('. ')
    return [cleanSentence(sentence) for sentence in sentenceList]

def getNgramsFromSentence(wordList, n):
    """Return every contiguous run of *n* words from *wordList*.

    Result is a list of n-word lists in original order; empty when the
    sentence has fewer than *n* words.
    """
    last_start = len(wordList) - n + 1
    return [wordList[start:start + n] for start in range(last_start)]

def getNgrams(content, n):
    """Upper-case *content*, clean it into sentences, and return a Counter
    mapping each space-joined n-gram to its frequency.

    BUG FIX: the original hard-coded 2 in the getNgramsFromSentence call,
    silently ignoring the *n* parameter. It also carried dead commented-out
    code and an unused ngrams_list accumulator, both removed.
    """
    content = content.upper()
    # cleanInput yields a list of sentences, each a list of words.
    sentences = cleanInput(content)

    ngrams = Counter()
    for wordList in sentences:
        # Join each n-word window into a single string key and count it.
        ngrams.update(' '.join(ngram)
                      for ngram in getNgramsFromSentence(wordList, n))
    return ngrams

def isCommon(ngram):
    """Return True if any word of *ngram* is one of the most common
    English words (upper-case list).

    *ngram* may be a space-joined string (as produced by getNgrams) or a
    sequence of words.

    BUG FIX: the original iterated directly over the argument, so a string
    n-gram was walked character by character -- any n-gram containing the
    letter 'A' or 'I' tested as common. Strings are now split into words.
    The word list is a set (duplicates like 'THAT', 'AS', 'MORE' removed)
    for O(1) membership tests.
    """
    commonWords = {
        'THE', 'BE', 'AND', 'OF', 'A', 'IN', 'TO', 'HAVE', 'IT', 'I',
        'THAT', 'FOR', 'YOU', 'HE', 'WITH', 'ON', 'DO', 'SAY', 'THIS',
        'THEY', 'IS', 'AN', 'AT', 'BUT', 'WE', 'HIS', 'FROM', 'NOT', 'BY',
        'SHE', 'OR', 'AS', 'WHAT', 'GO', 'THEIR', 'CAN', 'WHO', 'GET',
        'IF', 'WOULD', 'HER', 'ALL', 'MY', 'MAKE', 'ABOUT', 'KNOW',
        'WILL', 'UP', 'ONE', 'TIME', 'HAS', 'BEEN', 'THERE', 'YEAR', 'SO',
        'THINK', 'WHEN', 'WHICH', 'THEM', 'SOME', 'ME', 'PEOPLE', 'TAKE',
        'OUT', 'INTO', 'JUST', 'SEE', 'HIM', 'YOUR', 'COME', 'COULD',
        'NOW', 'THAN', 'LIKE', 'OTHER', 'HOW', 'THEN', 'ITS', 'OUR',
        'TWO', 'MORE', 'THESE', 'WANT', 'WAY', 'LOOK', 'FIRST', 'ALSO',
        'NEW', 'BECAUSE', 'DAY', 'USE', 'NO', 'MAN', 'FIND', 'HERE',
        'THING', 'GIVE', 'MANY', 'WELL'}

    words = ngram.split(' ') if isinstance(ngram, str) else ngram
    return any(word in commonWords for word in words)

import operator

# Script entry: fetch the inauguration speech and report the frequent,
# common-word 2-grams it contains.
content = str(urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt').read(), 'utf-8')
ngrams = getNgrams(content, 2)

# Drop 2-grams seen fewer than 3 times or judged not common.
# Iterate a snapshot of the KEYS -- the original iterated
# list(ngrams.elements()), which repeats each key count-many times and
# re-deletes already-removed entries (harmless only because
# Counter.__delitem__ ignores missing keys, but wasteful).
for ngram in list(ngrams):
    if ngrams[ngram] < 3 or not isCommon(ngram):
        del ngrams[ngram]

# Sort ascending by frequency (itemgetter(1) picks the count).
sortedNGrams = sorted(ngrams.items(), key=operator.itemgetter(1))

print(sortedNGrams)
print('2-grams count is: '+str(len(sortedNGrams)))#284 #5632:content = content.upper()

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值