Natural Language Processing with Python: Chapter 3 Exercise Answers

Chapter 3
Very few answers to this book's exercises can be found online.
This chapter mainly covers string processing, regular expressions, web scraping, and list comprehensions.
1

s = 'colorful'
print(s[:4] + 'u' + s[4:])  # insert 'u' after 'colo' -> 'colourful'

2

s[-9]  # raises IndexError: valid negative indexes for an 8-character string stop at -8
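So yes, it is possible to go too far to the left of a string, and it fails the same way as going off the right edge; a quick check, reusing s from exercise 1:

try:
    s[-9]
except IndexError as e:
    print('IndexError:', e)  # string index out of range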

4

s[::2]  # a step of 2 takes every second character
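The exercise also asks about negative step sizes: they walk the string from right to left, for example:

s[::-2]  # every second character, starting from the last one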

5

s[::-1]  # a step of -1 reverses the string

7

import re
string = "The purpose of this research was to create a framework of indicators that enabled us to measure the classic dimensions of sustainable development (SD): People, Planet, and Profit, in combination with the sustainability of the heritage values and the policy dimension. Methods developed as an approach to sustainable urban planning and that were based on system analysis"
# word boundaries are needed: plain r'a|an|the' also matches every stray 'a' inside words
re.findall(r'\b(?:an?|the)\b', string)
string = "The purpose of this research was to create 2+3*7"
# \d+ after each operator, so multi-digit operands also match
re.search(r'\d+(?:[+*]\d+)*', string)

8

from urllib import request
with request.urlopen('https://news.sina.com.cn/gov/xlxw/2021-03-18/doc-ikknscsi7715675.shtml') as f:
    data = f.read()
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', data.decode('utf-8'))
without_html = re.sub(r'<[^>]+>', '', data.decode('utf-8'))  # strip all HTML tags
without_space = re.sub(r'\s+', ' ', without_html).strip()    # normalize runs of whitespace
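The exercise actually asks for a reusable utility function; a minimal sketch wrapping the steps above (the function name get_text is my own):

def get_text(url):
    with request.urlopen(url) as f:
        html = f.read().decode('utf-8')
    text = re.sub(r'<[^>]+>', '', html)       # drop HTML markup
    return re.sub(r'\s+', ' ', text).strip()  # normalize whitespace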

9

def load(f):
    try:
        with open(f, encoding='utf8') as fp:
            data = fp.read()
    except UnicodeDecodeError:
        with open(f, encoding='gbk') as fp:  # fall back for GBK-encoded files
            data = fp.read()
    return re.findall(r'[。,;“’?():—`-]', data)  # pull out the punctuation

pattern = re.compile(r"""
(?:[A-Z]\.)+                 # abbreviations such as U.S.A.
|\d+-\d+-\d+                 # dates like 2011-2-3 (must precede the number branch)
|[A-Z][a-z]*\s[A-Z][a-z]*    # two adjacent capitalized words
|\$?\d+(?:\.\d+)?%?          # money and percentages, e.g. $12.22, 82%
""", re.X)
text = 'That U.S.A 2011-2-3 soub $12.22'
res = pattern.findall(text)
print(res)

10

sent=['ans','sjs','aaa']
res = [(w,len(w)) for w in sent]
res

11

raw = 'aksjd askjdn eurs snnd'
raw.split('s')

12

for c in raw:
    print(c)  # one character per line

13

# split() treats any run of whitespace (spaces, tabs, newlines) as a single
# delimiter and drops empty strings; split(' ') splits on every single space,
# so consecutive spaces yield empty strings and tabs are kept.
raw = 'skjkaj   sass  \t'
raw.split()     # ['skjkaj', 'sass']
raw.split(' ')  # ['skjkaj', '', '', 'sass', '', '\t']

14

# sort() is a list method: it sorts the list in place and returns None.
# sorted() accepts any iterable and returns a new sorted list, leaving
# the original unchanged, so it applies more broadly than sort().
sent = 'people, Xi extended sincere greetings and best wishes to Bangladeshi President Abdul Hamid, Bangladeshi Prime Minister Sheikh Hasina, and the Bangladeshi government and people. (Xinhua/Li Xueren)'
words = sent.split()
words.sort()
sorted(words)
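A quick check of the difference (note that sort() returns None because it mutates the list):

print(words.sort())       # None
print(sorted(words)[:5])  # a new sorted list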

15

"3"*7
3*7
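Casting converts one form into the other:

int("3") * 7  # 21
str(3) * 7    # '3333333'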

16

%load monty.py  # IPython magic that pastes the file's contents into the session
monty = 'Monty Python'

17

string = 'ashshser alskdn eeee'
# %6s right-aligns in a field at least 6 wide; %-6s left-aligns. Because the
# string is longer than 6 characters, the width has no visible effect here.
print('%6s' % string)
print('%-6s' % string)

18

from nltk.corpus import brown
words = brown.words()
whs = []
for w in words:
    if re.match(r'[wW]h[A-Za-z]+',w):
        whs.append(w)
set(whs)

20

import requests 
import bs4
import re 
url='http://news.weather.com.cn/2021/03/3449612.shtml'
response = requests.get(url)
response.encoding='utf-8'
soup = bs4.BeautifulSoup(response.text,'html.parser')
for i in soup.findAll(name='div',attrs = {'class':'articleBody'}):       
    print(i.get_text())

21

from nltk.corpus import words
nltk_words = set(words.words())  # a set makes the membership test below fast

def unknown(url):
    unknown_words = []
    cont = ''
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    for i in soup.findAll(name='p'):
        cont += i.get_text()
    for w in cont.split(' '):
        w = re.findall(r'[A-Za-z]+', w)
        if w:
            w = w[0].lower()
            if w not in nltk_words:  # not in the Words Corpus -> "unknown"
                unknown_words.append(w)
    return unknown_words

unknown('http://www.chinadaily.com.cn/a/202103/19/WS6053fedba31024ad0bab0310.html')

23

print(re.sub(r"n't", " n't", "don't touch me"))  # don't -> do n't
# The book's pattern «n't|\w+» cannot produce ['do', "n't"]: matching runs left
# to right, so \w+ greedily consumes "don" before the n't branch can ever apply,
# and the tokens come out as ['don', 't'].
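A minimal demonstration of the failure, using nltk's regexp tokenizer:

import nltk
nltk.regexp_tokenize("don't touch me", r"n't|\w+")  # ['don', 't', 'touch', 'me']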

24

def word_token(text):
    # order matters: 'ate' must become '8' before 'e' becomes '3',
    # and it is 'l' (not '1') that maps to '|'
    text = re.sub(r'ate', '8', text)
    text = re.sub(r'e', '3', text)
    text = re.sub(r'i', '1', text)
    text = re.sub(r'o', '0', text)
    text = re.sub(r'l', r'|', text)
    text = re.sub(r'\.', '5w33t!', text)
    text = re.sub(r'^s', '$', text)  # word-initial s becomes $
    text = re.sub(r's', '5', text)   # every other s becomes 5
    return text

word_token('esjioa1s.ppate')
word_token('shs')

25

def word_transfer(string):
    trans_words = []
    for word in string.split(' '):
        m = re.match(r'([^aeiouAEIOU]+)(.+)', word)
        if m:
            # move the initial consonant cluster to the end and append 'ay'
            trans_words.append(m.group(2) + m.group(1) + 'ay')
        else:
            # vowel-initial words such as 'idle' simply get 'ay'
            trans_words.append(word + 'ay')
    return ' '.join(trans_words)

word_transfer('Pig Latin string idle')
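Part (c) of the exercise asks to keep qu together, so that quiet becomes ietquay; a sketch for a single word (the name pig_latin_qu is my own):

def pig_latin_qu(word):
    m = re.match(r'(qu|[^aeiouAEIOU]+)(.+)', word)  # try 'qu' before a plain consonant cluster
    return m.group(2) + m.group(1) + 'ay' if m else word + 'ay'

pig_latin_qu('quiet')  # 'ietquay'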

27

import random
def generate_sents(num):
    words = []
    for i in range(num):
        words.append(random.choice('aehh '))
    return ''.join(words)
generate_sents(500)
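The exercise then says to use split() and join() to normalize the whitespace in the result:

' '.join(generate_sents(500).split())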

29

from nltk.corpus import brown
# Automated Readability Index:
# ARI = 4.71 * (avg letters per word) + 0.5 * (avg words per sentence) - 21.43
for cate in brown.categories():
    words = brown.words(categories=cate)
    avg_word = sum(len(w) for w in words) / len(words)

    sents = brown.sents(categories=cate)
    avg_sent = sum(len(sent) for sent in sents) / len(sents)

    print(cate + ": " + str(4.71 * avg_word + 0.5 * avg_sent - 21.43))

30

import nltk
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
print([porter.stem(w) for w in brown.words(categories='belles_lettres')][:50])
print([lancaster.stem(w) for w in brown.words(categories='belles_lettres')][:50])

33

'inexxpersjsnk'.index('sj')  # str.index returns the offset of a substring
words = brown.words(categories='belles_lettres')[:20]
words.index('They')          # list.index returns the position of an item
words[:words.index('They')]  # slice everything before 'They'

38

word_list = set(nltk.corpus.words.words())  # build the lookup set once

def multi_raw(string):
    n_w = []
    for w in string.split(' '):
        w = re.sub(r'\s', '', w)  # remove the line break inside the word
        if w.replace('-', '') in word_list:
            w = w.replace('-', '')  # 'encyclo-pedia' -> 'encyclopedia'
        n_w.append(w)  # 'long-term' keeps its hyphen: 'longterm' is not a word
    return ' '.join(n_w)

multi_raw("have long-\nterm encyclo-\npedia")

41

words = ''.join([random.choice('abcudefghijklmnop ') for _ in range(100)]).split()
res = [''.join([c for c in word if c in 'aeiou']) for word in words]  # vowel sequence of each word