Markov Models

#!/usr/bin/python
#encoding:utf-8

"""
@author: LlQ
@contact:LIQINGLIN54951@gmail.com
@file:cp9_p178.py
@time: 5/19/2019 7:00 PM
"""

from urllib.request import urlopen
from random import randint

def wordListSum(wordList):
    """Return the sum of all values stored in ``wordList``.

    Args:
        wordList: a dict mapping words to their numeric counts.

    Returns:
        The total of the counts; ``0`` for an empty dict.
    """
    # Only the values matter here, and the builtin ``sum`` avoids shadowing
    # itself with a local accumulator of the same name.
    return sum(wordList.values())

def retrieveRandomWord(wordList):
    """Pick a random word from ``wordList``, weighted by its count.

    This is the transition step of the Markov model: a word whose count is
    twice as large as another's is twice as likely to be returned.

    Args:
        wordList: a dict mapping candidate words to their counts.

    Returns:
        One of the keys of ``wordList``.

    Raises:
        ValueError: if ``wordList`` is empty or its counts do not add up to
            a positive total, so there is nothing to choose from.
    """
    total = sum(wordList.values())
    if total <= 0:
        # randint(1, 0) would raise a ValueError with an unhelpful
        # "empty range" message; fail with the same type but say why.
        raise ValueError('wordList must contain at least one positive count')

    # Draw a position in 1..total, then walk the words, consuming each
    # word's share of the range until the drawn position is used up.
    randIndex = randint(1, total)
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word

def buildWordDict(text):
    """Build the word-transition table used by the Markov chain.

    Double quotes are dropped, and the punctuation marks ``, . ; :`` are
    treated as words of their own so that they take part in the chain.

    Args:
        text: the source text as a single string.

    Returns:
        A dict of dicts: ``result[word][follower]`` is the number of times
        ``follower`` came directly after ``word`` in ``text``. A word that
        is never followed by anything (the last word, if it occurs only
        there) has no entry of its own.
    """
    text = text.replace('"', '')

    # Put spaces around punctuation so each mark is split off as a token.
    punctuation = [',', '.', ';', ':']
    for symbol in punctuation:
        text = text.replace(symbol, ' {} '.format(symbol))

    # split() with no argument breaks on any run of whitespace (newlines,
    # carriage returns, tabs, spaces) and never yields empty strings, so
    # CRLF line endings or tabs do not end up glued to words.
    words = text.split()

    wordDict = {}
    # Walk over consecutive (word, follower) pairs and count each pair.
    for previous, current in zip(words, words[1:]):
        followers = wordDict.setdefault(previous, {})
        followers[current] = followers.get(current, 0) + 1
    return wordDict

# Download the source text. The context manager closes the HTTP response
# even if reading or decoding fails.
with urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt') as response:
    text = response.read().decode('utf-8')

wordDict = buildWordDict(text)

# Generate a Markov chain of up to 100 words following the seed word.
length = 100
chain = ['Called']
for i in range(0, length):
    # A word with no recorded follower (the final word of the text, or a
    # seed that does not occur in it) has no entry in wordDict; stop the
    # chain there instead of failing with a KeyError.
    followers = wordDict.get(chain[-1])
    if not followers:
        break
    newWord = retrieveRandomWord(followers)
    chain.append(newWord)

print(' '.join(chain))

# for i in range(0, length):
#     chain += currentWord+" "
#     currentWord = retrieveRandomWord(wordDict[currentWord])
# print(chain)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值