# python自然语言处理 第四章基本语法 (Python NLP, chapter 4: basic syntax)

# Assignment: a name is a reference to an object, not a copy of it.
foo = 'Monty'
bar = foo
foo = 'Python'
bar# 'Monty' -- rebinding foo does not affect bar (the string itself never changed)
foo = ['Monty', 'Python']
bar = foo
foo[1] = 'Bodkin'
bar# ['Monty', 'Bodkin'] -- foo and bar alias the SAME list, so mutation shows through both
empty = []
nested = [empty, empty, empty]
nested# [[], [], []]
nested[1].append('Python')
nested#  [['Python'], ['Python'], ['Python']] -- all three slots reference one shared inner list
nested = [[]] * 3
nested[1].append('aaa')
nested# [['aaa'], ['aaa'], ['aaa']] -- list * 3 copies the reference, not the object
nested[1] = ['Monty']
nested# [['aaa'], ['Monty'], ['aaa']] -- slot assignment replaces only that one reference
# Equality (==) compares values; identity (is) asks whether two names
# reference the very same object.
size = 5
python = ['Python']
snake_nest = [python] * size
snake_nest# [['Python'], ['Python'], ['Python'], ['Python'], ['Python']]
snake_nest[0] == snake_nest[1] == snake_nest[2] == snake_nest[3] == snake_nest[4]# True
snake_nest[0] is snake_nest[1] is snake_nest[2] is snake_nest[3] is snake_nest[4]# True -- [python] * size copied five references to one list
import random
position = random.choice(range(size))
snake_nest[position] = ['Python']
snake_nest# [['Python'], ['Python'], ['Python'], ['Python'], ['Python']] -- values look unchanged
snake_nest[0] == snake_nest[1] == snake_nest[2] == snake_nest[3] == snake_nest[4]# True -- still equal by value
snake_nest[0] is snake_nest[1] is snake_nest[2] is snake_nest[3] is snake_nest[4]# False -- the freshly created list is a distinct object
[id(snake) for snake in snake_nest]# e.g. [212336032, 212336032, 212128032, 212336032, 212336032] -- one id differs
# Conditionals: once an `if` branch is taken, the following `elif` branches
# are skipped. (Converted from Python 2 `print x` statements to the
# Python 3 print() function.)
mixed = ['cat', '', ['dog'], []]
for element in mixed:
    # Empty strings and empty lists are falsy, so only truthy items print.
    if element:
        print(element)
animals = ['cat', 'dog']
if 'rabbit' in animals:
    print(1)
elif 'dog' in animals:
    print(2)
sent = ['No', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '.']
all(len(w) > 4 for w in sent)# False -- True only if EVERY word satisfies the predicate
any(len(w) > 4 for w in sent)# True -- True if AT LEAST ONE word satisfies it
# Sequences: a tuple is built by the commas (the parentheses are optional).
t = 'walk', 'fem', 3
t#  ('walk', 'fem', 3)
t[0]# 'walk'
t[1:]# ('fem', 3)
len(t)# 3
# Strings, lists and tuples all support indexing, slicing and len().
raw = 'I turned off the spectroroute'
text = ['I', 'turned', 'off', 'the', 'spectroroute']
pair = (6, 'turned')
raw[2],text[3],pair[1]# ('t', 'the', 'turned')
raw[-3:],text[-3:],pair[-3:]# ('ute', ['off', 'the', 'spectroroute'], (6, 'turned'))
len(raw),len(text),len(pair)# (29, 5, 2) -- character, word and element counts respectively
import nltk
# Count token frequencies with an nltk FreqDist.
# (Converted from Python 2 `print x,` to Python 3 print(..., end=' ').)
raw = 'Red lorry, yellow lorry, red lorry, yellow lorry'
text = nltk.word_tokenize(raw)
fdist = nltk.FreqDist(text)
list(fdist)# ['yellow', 'red', 'lorry', 'Red', ',']
for key in fdist:
    print(fdist[key], end=' ')
# 2 1 4 1 3 -- e.g. 2 x 'yellow', 4 x 'lorry'
words = ['I', 'turned', 'off', 'the', 'spectroroute']
# Tuple assignment rotates three slots in one statement, no temporary needed.
words[2], words[3], words[4] = words[3], words[4], words[2]
words#  ['I', 'turned', 'the', 'spectroroute', 'off']
# The same rotation written out by hand with a temporary variable:
tmp = words[2]
words[2] = words[3]
words[3] = words[4]
words[4] = tmp
words#  ['I', 'turned', 'spectroroute', 'off', 'the']
words = ['I' ,'turned', 'off', 'the', 'spectroroute']
tags = ['noun', 'verb' ,'prep', 'det', 'noun']
zip(words, tags)# zip pairs up items from two or more sequences (in Python 3 it is a lazy iterator; wrap in list() to see the pairs below)
#==============================================================================
# [('I', 'noun'),
#  ('turned', 'verb'),
#  ('off', 'prep'),
#  ('the', 'det'),
#  ('spectroroute', 'noun')]
#==============================================================================
# Splitting a corpus into 90% training / 10% test partitions.
text = nltk.corpus.nps_chat.words()
cut = int(0.9 * len(text))
training_data, test_data = text[:cut], text[cut:]
text == training_data + test_data# True -- concatenating the two slices restores the original
len(training_data) / len(test_data)# 9 under Python 2; NOTE(review): `/` is true division in Python 3, giving 9.0 -- use // for the integer ratio
# Combining different sequence types: pair each word with its length,
# then sort the (length, word) tuples lexicographically.
words = 'I turned off the spectroroute'.split()
wordlens = [(len(word), word) for word in words]
wordlens# [(1, 'I'), (6, 'turned'), (3, 'off'), (3, 'the'), (12, 'spectroroute')]
wordlens.sort()
' '.join(w for (_,w) in wordlens)#  'I off the turned spectroroute' -- words ordered by length, ties broken alphabetically
# Generator expressions
text = """
When I use a word,"Humpty Dumpty said in rather a acornful tone,
"it means just what I choose it to mean - neither more nor less.
"""
[w.lower() for w in nltk.word_tokenize(text)]# tokenize the text, then lowercase every token
max([w.lower() for w in nltk.word_tokenize(text)])# lexicographically largest token
min([w.lower() for w in nltk.word_tokenize(text)])# lexicographically smallest token
# Generate every permutation of a word list.
def permutations(seq):
    """Lazily yield all permutations of *seq*.

    Recursively permutes the tail, then inserts the first element at
    every possible position of each tail permutation. A sequence of
    length 0 or 1 is its own (single) permutation.
    """
    if len(seq) <= 1:
        yield seq
        return
    head = seq[0:1]
    for tail_perm in permutations(seq[1:]):
        for pos in range(len(tail_perm) + 1):
            yield tail_perm[:pos] + head + tail_perm[pos:]
list(permutations(['police', 'fish', 'buffalo']))
# Check whether a word belongs to an open content-word class.
def is_content_word(word):
    """Return True unless *word* (case-insensitively) is one of a few
    closed-class function words or punctuation marks."""
    return word.lower() not in ['a', 'of', 'the', 'and', 'will', ',', '.']
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the', 'sounds', 'will', 'take', 'care',
        'of', 'themeselves', '.']
# In Python 3 filter() returns a lazy iterator, so materialize it with
# list() to actually see the filtered words (the original bare filter()
# call showed nothing useful in a Python 3 transcript).
list(filter(is_content_word, sent))
[w for w in sent if is_content_word(w)]# equivalent list comprehension
nltk.metrics.distance.__file__# show the on-disk path of the nltk.metrics.distance module's source file
def factorial2(n):
    """Return n! computed recursively.

    The base case is n <= 1 rather than n == 1, so factorial2(0)
    correctly returns 1 (0! == 1) instead of recursing forever
    through negative numbers until the recursion limit is hit.
    """
    if n <= 1:
        return 1
    return n * factorial2(n - 1)
factorial2(5)# 120

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值