第三章
这本书网上能找到的答案很少。
这一章主要涉及字符串处理、正则表达式、爬虫、列表推导等内容。
1
# Exercise 1: insert 'u' after the first three characters of 'colorful'.
s = 'colorful'
t = s[:3] + 'u' + s[3:]   # -> 'coluorful' (9 characters)
print(t)
2
# Exercise 2: index -9 is only valid on the 9-character inserted string;
# the original `s[-9]` on the 8-character 'colorful' raised IndexError.
t[-9]
4
# Exercise 4: every second character.
s[::2]
5
# Exercise 5: the string reversed.
s[::-1]
7
import re
string = "The purpose of this research was to create a framework of indicators that enabled us to measure the classic dimensions of sustainable development (SD): People, Planet, and Profit, in combination with the sustainability of the heritage values and the policy dimension. Methods developed as an approach to sustainable urban planning and that were based on system analysis"
# Find the articles a/an/the as whole words.  The original r'a|an|the' matched
# the bare letter 'a' inside every word (and 'an'/'the' could never win the
# alternation); \b anchors restrict matches to whole words, and the
# non-capturing group keeps findall returning the matched text.
articles = re.findall(r'\b(?:a|an|the)\b', string)
string = "The purpose of this research was to create 2+3*7"
# Match a simple arithmetic expression: \d+ generalises the original [+*]\d,
# which only allowed single-digit operands after an operator.
expr = re.search(r'\d+(?:[+*]\d+)*', string)
8
# Exercise 8: download a web page, dump its status/headers/body,
# then strip markup and surrounding whitespace from the HTML.
from urllib import request
with request.urlopen('https://news.sina.com.cn/gov/xlxw/2021-03-18/doc-ikknscsi7715675.shtml') as f:
    data = f.read()
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', data.decode('utf-8'))
# Remove HTML tags (relies on the module-level `re` import), then trim
# leading/trailing whitespace from the remaining text.
without_html = re.sub(r'<(\S*?)[^>]*>.*?|<.*? />','',data.decode('utf-8'))
without_space = re.sub(r'^\s*|\s*$','',without_html)
9
def load(f):
    """Read the text file at path *f* and return every CJK punctuation
    character it contains (fullwidth comma, period, quotes, etc.).

    The file is decoded as UTF-8 first; on a decode failure it is retried
    as GBK.  The original used a bare ``except:`` (which also swallowed
    FileNotFoundError and every other error) and shadowed the path
    parameter with the file handle.
    """
    try:
        with open(f, encoding='utf8') as fh:
            data = fh.read()
    except UnicodeDecodeError:
        with open(f, encoding='gbk') as fh:
            data = fh.read()
    return re.findall(r'[\。\,\;\“\’\?\(\)\:\-\——\`]', data)
# Token pattern for abbreviations, capitalised word pairs, dates and amounts.
# Fixes vs. the original:
#  * groups are non-capturing, so findall returns the matched text instead
#    of tuples of (mostly empty) group captures;
#  * the date alternative comes before the plain-number one, otherwise
#    \$?\d+ consumed '2011' and the date '2011-2-3' could never match;
#  * the redundant inline (?x) flag is dropped (re.X already set).
pattern = re.compile(r"""
    (?:[A-Z]\.)+                   # abbreviations, e.g. U.S.A.
    | [A-Z][a-z]*\s[A-Z][a-z]*     # two capitalised words
    | \d+\-\d+\-\d+                # dates like 2011-2-3 (before plain numbers)
    | \$?\d+(?:\.\d+)?%?           # currency / percentages / plain numbers
    """, re.X)
text='That U.S.A 2011-2-3 soub $12.22'
res=pattern.findall(text)
print(res)
10
# Exercise 10: pair every word with its length.
sent = ['ans', 'sjs', 'aaa']
res = list(zip(sent, (len(item) for item in sent)))
res
11
# Exercise 11: split on the literal character 's'.
raw = 'aksjd askjdn eurs snnd'
raw.split('s')
12
# Exercise 12: print one space-separated token per line.
print('\n'.join(raw.split(' ')))
13
# Exercise 13: split() treats any run of whitespace (spaces, tabs) as one
# separator and drops empty fields; split(' ') cuts on single spaces only,
# so the trailing '\t' survives as its own field.
raw = 'skjkaj sass \t'
raw.split()
raw.split(' ')
14
import jieba
'''
sort()方法是针对列表类型的变量的一种排序方法,使用时:比如列表类型的变量ls, ls.sort()
sorted()方法是针对所有类型的变量,比sort()使用范围更广,使用时:sorted(ls)
'''
# Exercise 14 (cont.): demonstrate in-place sorting vs. sorted().
sent = 'people, Xi extended sincere greetings and best wishes to Bangladeshi President Abdul Hamid, Bangladeshi Prime Minister Sheikh Hasina, and the Bangladeshi government and people. (Xinhua/Li Xueren)'
words = sent.split()
# Sort the token list in place (equivalent to words.sort()).
words[:] = sorted(words)
sorted(words)
15
# Exercise 15: repeating a string vs. multiplying an integer.
"3" * 7
3 * 7
16
# '%load monty.py' was an IPython magic — not valid Python in a plain .py
# file; its effect was just to bring in the assignment below.
monty = 'Monty python'
17
string = 'ashshser alskdn eeee'
# Exercise 17: left-align in a 6-character field.  The original '%a-6s' was a
# typo: %a converts via ascii() and then prints '-6s' literally; the intended
# conversion is '%-6s'.
print('%-6s' % string)
18
from nltk.corpus import brown
words = brown.words()
# Exercise 18: collect every token beginning with 'wh'/'Wh' followed by at
# least one more letter, then view the distinct ones.
whs = [w for w in words if re.match(r'[wW]h[A-Za-z]+', w)]
set(whs)
20
# Exercise 20: scrape and print the article body of a weather-news page.
import requests
import bs4
import re
url='http://news.weather.com.cn/2021/03/3449612.shtml'
response = requests.get(url)
response.encoding='utf-8'   # set before reading response.text so it decodes as UTF-8
soup = bs4.BeautifulSoup(response.text,'html.parser')
# Print the text of every <div class="articleBody"> element.
for i in soup.findAll(name='div',attrs = {'class':'articleBody'}):
    print(i.get_text())
21
from nltk.corpus import words
nltk_words = words.words()
# Build the vocabulary membership set once: the original tested
# `w not in nltk_words` against the full word *list* inside the loop,
# an O(len(list)) scan per token.
_nltk_vocab = set(nltk_words)
def unknown(url):
    """Fetch *url*, concatenate the text of all <p> elements, and return the
    lowercase alphabetic tokens that are not in the NLTK English word list.

    Returns a list (with duplicates) in order of first occurrence.
    """
    unknown_words = []
    cont = ''
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    for i in soup.findAll(name='p'):
        cont += i.get_text()
    for w in cont.split(' '):
        letters = re.findall(r'[A-Za-z]+', w)
        if letters:
            # findall items are always non-empty, so the original extra
            # `w != ''` check was dead code.
            token = letters[0].lower()
            if token not in _nltk_vocab:
                unknown_words.append(token)
    return unknown_words
unknown('http://www.chinadaily.com.cn/a/202103/19/WS6053fedba31024ad0bab0310.html')
23
# Exercise 23: split "don't" into "do n't" with a plain-text substitution.
print(re.sub('don\'t','do n\'t',"don't touch me"))
# NOTE(review): the original comment claimed this second call fails because of
# the escape characters, but the literal pattern "don't" does occur in the
# target string, so the substitution is applied here as well.
print(re.sub('don\'t','do n\'t',"《don't|\w+》"))
24
def word_token(string):
    """Convert *string* to l33t-speak: ate→8, e→3, i→1, o→0, l→|, s→5, .→5w33t!

    Fixes vs. the original: 'ate'→'8' is applied first (after e→3 it could
    never match, since 'ate' had already become 'at3'), and the '|' rule
    replaces the letter 'l' — the original replaced the digit '1', which
    clobbered the 1s just produced by the i→1 rule.
    """
    w = re.sub('ate', '8', string)
    w = re.sub('e', '3', w)
    w = re.sub('i', '1', w)
    w = re.sub('o', '0', w)
    w = re.sub('l', '|', w)
    w = re.sub('s', '5', w)
    w = re.sub('\.', '5w33t!', w)
    return w
word_token('esjioa1s.ppate')
def word_token(string):
    """Variant: a leading 's' becomes '$'; every other 's' becomes '5'.

    The original raised UnboundLocalError for words containing no 's' at all
    (the result variable was never assigned) and IndexError for the empty
    string; such inputs are now returned unchanged.
    """
    if string.startswith('s'):
        return '$' + re.sub('s', '5', string[1:])
    if 's' in string:
        return re.sub('s', '5', string)
    return string
word_token('shs')
25
def word_transfer(string):
    """Pig-Latin-ify a space-separated sentence.

    For each all-lowercase word that starts with a consonant, move the
    leading consonant cluster to the end and append 'ay'.  Capitalised
    words and vowel-initial words pass through unchanged.  A word with no
    vowel at all (e.g. 'shh') is also passed through — the original crashed
    with AttributeError because re.search returned None for it.
    """
    trans_words = []
    for word in string.split(' '):
        onset = re.match(r'[^aeiou]+', word)        # leading consonant cluster
        if onset and re.match(r'[^A-Z]+', word):    # skip capitalised words
            rest = re.search(r'[aeiou](.*)', word)  # vowel and everything after
            if rest:
                trans_words.append(rest.group(0) + onset.group(0) + 'ay')
                continue
        trans_words.append(word)
    return ' '.join(trans_words)
word_transfer('Pig Latin string idle')
27
import random
def generaize_sents(num):
    """Return a pseudo-random 'sentence' of *num* characters, each drawn
    uniformly from the alphabet 'aehh ' (so h is twice as likely)."""
    return ''.join(random.choice('aehh ') for _ in range(num))
generaize_sents(500)
29
from nltk.corpus import brown
# Exercise 29: Automated Readability Index per Brown category:
# 4.71 * (avg letters per word) + 0.5 * (avg words per sentence) - 21.43
for cate in brown.categories():
    words = brown.words(categories=cate)
    avg_word = sum(len(w) for w in words) / len(words)
    sents = brown.sents(categories=cate)
    avg_sent = sum(len(s) for s in sents) / len(sents)
    print(cate + ":" + str(4.71 * avg_word + 0.5 * avg_sent - 21.43))
30
import nltk
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
# Perf fix: the original stemmed every word of the whole category and then
# sliced the result ([stem(w) for w in all_words][:50]); slicing first stems
# only the 50 tokens actually printed, with identical output.
sample = brown.words(categories='belles_lettres')[:50]
print([porter.stem(w) for w in sample])
print([lancaster.stem(w) for w in sample])
33
# Exercise 33: str.index / list.index demos.
'inexxpersjsnk'.index('sj')            # position where the substring 'sj' starts
words = brown.words(categories='belles_lettres')[:20]
words.index('They')                    # position of 'They' among the first 20 tokens
words[:words.index('They')]            # the tokens preceding the first 'They'
38
def multi_raw(string):
    """Clean up words hyphenated across line breaks.

    For every space-separated word: drop internal whitespace (re-joining
    'long-\\nterm' to 'long-term'), and if the dehyphenated form is an
    English word, drop the hyphen too.  Returns the words joined by spaces
    (with a leading space, as the original did).
    """
    # Hoisted out of the loop: the original rebuilt the entire NLTK word
    # list with nltk.corpus.words.words() on every iteration and did a
    # linear-time membership scan; a set built once makes each test O(1).
    vocab = set(nltk.corpus.words.words())
    n_w = ''
    for w in string.split(' '):
        w = ''.join(re.findall('\S', w))
        dehyphenated = re.sub(r'\-', '', w)
        if dehyphenated in vocab:
            w = dehyphenated
        n_w = n_w + ' ' + w
    return n_w
multi_raw("habe long-\nterm encyclo-\npedia")
41
# Exercise 41: build random 'words', then keep only the vowels of each one.
words = ''.join(random.choice('abcudefghijklmnop ') for _ in range(100)).split()
res = [''.join(ch for ch in word if ch in 'aeiou') for word in words]