#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import re
import string
from collections import Counter, OrderedDict
from urllib.request import urlopen

from bs4 import BeautifulSoup
def cleanInput(input):
    """Normalize raw text and split it into a list of word tokens.

    The text is processed in this order:

    1. Runs of newlines are replaced by a single space.
    2. Bracketed numeric markers such as ``[12]`` (or ``[]``) are removed.
    3. Runs of spaces are collapsed to a single space.
    4. Every non-ASCII character is dropped.
    5. The text is split on single spaces, leading/trailing ASCII
       punctuation is stripped from each token, and tokens that end up
       empty are discarded.

    Args:
        input: The text to clean (``str``).

    Returns:
        A list of non-empty ``str`` tokens in their original order.
    """
    # Raw strings keep the regex escapes (``\[``, ``\]``) from being parsed
    # as invalid string escape sequences.
    text = re.sub(r'\n+', " ", input)
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)

    # Round-trip through bytes to discard anything outside ASCII.
    text = text.encode("utf-8").decode("ascii", "ignore")

    # Dropping non-ASCII characters can leave adjacent spaces behind, so
    # splitting on " " may yield empty tokens; they are filtered out below.
    stripped = (token.strip(string.punctuation) for token in text.split(' '))

    # Any non-empty token is kept.  (An extra special case for 'a'/'t' was
    # redundant: those tokens are already non-empty.)
    return [token for token in stripped if token]
def ngrams(input, n):
    """Count every n-gram of consecutive words in a piece of text.

    The text is first tokenized with ``cleanInput``.  Each n-gram is
    represented by the ``str()`` of its list of words, for example
    ``"['Python', 'is']"``, which makes it usable as a dictionary key.

    Args:
        input: The raw text to analyse (``str``).
        n: Number of consecutive words per n-gram.

    Returns:
        A list of ``(ngram, count)`` tuples, one per distinct n-gram.
        The list is empty when the text has fewer than ``n`` words.
    """
    words = cleanInput(input)
    # Counter tallies all n-grams in one pass instead of rescanning the
    # full n-gram list once per distinct n-gram.
    grams = (str(words[i:i + n]) for i in range(len(words) - n + 1))
    return list(Counter(grams).items())
# Download the Wikipedia article on Python, count its 2-grams and print them
# ordered from most to least frequent.
# The `with` block closes the HTTP response once the page has been parsed.
with urlopen("https://en.wikipedia.org/wiki/Python_(programming_language)") as html:
    bsObj = BeautifulSoup(html, "html.parser")
content = bsObj.find("div", {"id": "mw-content-text"}).get_text()

# Store the result under its own name so the ngrams() function is not
# overwritten by its return value.
bigram_counts = OrderedDict(
    sorted(ngrams(content, 2), key=lambda t: t[1], reverse=True)
)
print(bigram_counts)
print("2-ngrams count is " + str(len(bigram_counts)))