type在python的用法和短语_在Python中使用NLTK对短语进行语境检索(concordance)

我狼吞虎咽地想出了这个解决方案。。。在def n_concordance_tokenised(text,phrase,left_margin=5,right_margin=5):

#concordance replication via https://simplypython.wordpress.com/2014/03/14/saving-output-of-nltk-text-concordance/

phraseList=phrase.split(' ')

c = nltk.ConcordanceIndex(text.tokens, key = lambda s: s.lower())

#Find the offset for each token in the phrase

offsets=[c.offsets(x) for x in phraseList]

offsets_norm=[]

#For each token in the phraselist, find the offsets and rebase them to the start of the phrase

for i in range(len(phraseList)):

offsets_norm.append([x-i for x in offsets[i]])

#We have found the offset of a phrase if the rebased values intersect

#

# http://stackoverflow.com/a/3852792/454773

#the intersection method takes an arbitrary amount of arguments

#result = set(d[0]).intersection(*d[1:])

#

intersects=set(offsets_norm[0]).intersection(*offsets_norm[1:])

concordance_txt = ([text.tokens[map(lambda x: x-left_margin if (x-left_margin)>0 else 0,[offset])[0]:offset+len(phraseList)+right_margin]

for offset in intersects])

outputs=[''.join([x+' ' for x in con_sub]) for con_sub in concordance_txt]

return outputs

def n_concordance(txt,phrase,left_margin=5,right_margin=5):

tokens = nltk.word_tokenize(txt)

text = nltk.Text(tokens)

return

n_concordance_tokenised(text,phrase,left_margin=left_margin,right_margin=right_margin)

# Example call. NOTE(review): `text1` is not defined in this snippet; presumably an nltk.Text loaded elsewhere - confirm.
n_concordance_tokenised(text1,'monstrous size')

>> [u'one was of a most monstrous size . ... This came towards ',

u'; for Whales of a monstrous size are oftentimes cast up dead ']

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值