第三章
这本书网上能找到的答案很少。
这一章主要涉及字符串处理、正则表达式、爬虫、列表推导等内容。
1
# Exercise 1: insert 'u' after the first three characters of 'colorful'.
s = 'colorful'
t = s[:3] + 'u' + s[3:]   # -> 'coluorful' (9 characters)
print(t)
2
# Exercise 2: index -9 is only valid on the 9-character inserted string;
# the original `s[-9]` on the 8-character 'colorful' raised IndexError.
t[-9]
4
# Exercise 4: every second character.
s[::2]
5
# Exercise 5: the string reversed.
s[::-1]
7
import re
string = "The purpose of this research was to create a framework of indicators that enabled us to measure the classic dimensions of sustainable development (SD): People, Planet, and Profit, in combination with the sustainability of the heritage values and the policy dimension. Methods developed as an approach to sustainable urban planning and that were based on system analysis"
# Find the articles a/an/the as whole words.  The original r'a|an|the' matched
# the bare letter 'a' inside every word (and 'an'/'the' could never win the
# alternation); \b anchors restrict matches to whole words, and the
# non-capturing group keeps findall returning the matched text.
articles = re.findall(r'\b(?:a|an|the)\b', string)
string = "The purpose of this research was to create 2+3*7"
# Match a simple arithmetic expression: \d+ generalises the original [+*]\d,
# which only allowed single-digit operands after an operator.
expr = re.search(r'\d+(?:[+*]\d+)*', string)
8
# Exercise 8: download a web page, dump its status/headers/body,
# then strip markup and surrounding whitespace from the HTML.
from urllib import request
with request.urlopen('https://news.sina.com.cn/gov/xlxw/2021-03-18/doc-ikknscsi7715675.shtml') as f:
    data = f.read()
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', data.decode('utf-8'))
# Remove HTML tags (relies on the module-level `re` import), then trim
# leading/trailing whitespace from the remaining text.
without_html = re.sub(r'<(\S*?)[^>]*>.*?|<.*? />','',data.decode('utf-8'))
without_space = re.sub(r'^\s*|\s*$','',without_html)
9
def load(f):
    """Read the text file at path *f* and return every CJK punctuation
    character it contains (fullwidth comma, period, quotes, etc.).

    The file is decoded as UTF-8 first; on a decode failure it is retried
    as GBK.  The original used a bare ``except:`` (which also swallowed
    FileNotFoundError and every other error) and shadowed the path
    parameter with the file handle.
    """
    try:
        with open(f, encoding='utf8') as fh:
            data = fh.read()
    except UnicodeDecodeError:
        with open(f, encoding='gbk') as fh:
            data = fh.read()
    return re.findall(r'[\。\,\;\“\’\?\(\)\:\-\——\`]', data)
# Token pattern for abbreviations, capitalised word pairs, dates and amounts.
# Fixes vs. the original:
#  * groups are non-capturing, so findall returns the matched text instead
#    of tuples of (mostly empty) group captures;
#  * the date alternative comes before the plain-number one, otherwise
#    \$?\d+ consumed '2011' and the date '2011-2-3' could never match;
#  * the redundant inline (?x) flag is dropped (re.X already set).
pattern = re.compile(r"""
    (?:[A-Z]\.)+                   # abbreviations, e.g. U.S.A.
    | [A-Z][a-z]*\s[A-Z][a-z]*     # two capitalised words
    | \d+\-\d+\-\d+                # dates like 2011-2-3 (before plain numbers)
    | \$?\d+(?:\.\d+)?%?           # currency / percentages / plain numbers
    """, re.X)
text='That U.S.A 2011-2-3 soub $12.22'
res=pattern.findall(text)
print(res)
10
# Exercise 10: pair every word with its length.
sent = ['ans', 'sjs', 'aaa']
res = list(zip(sent, (len(item) for item in sent)))
res
11
# Exercise 11: split on the literal character 's'.
raw = 'aksjd askjdn eurs snnd'
raw.split('s')
12
# Exercise 12: print one space-separated token per line.
print('\n'.join(raw.split(' ')))
13
# Exercise 13: split() treats any run of whitespace (spaces, tabs) as one
# separator and drops empty fields; split(' ') cuts on single spaces only,
# so the trailing '\t' survives as its own field.
raw = 'skjkaj sass \t'
raw.split()
raw.split(' ')
14
import jieba
'''
sort()方法是针对列表类型的变量的一种排序方法,使用时:比如列表类型的变量ls, ls.sort()
sorted()方法是针对所有类型的变量,比sort()使用范围更广,使用时:sorted(ls)
'''
# Exercise 14 (cont.): demonstrate in-place sorting vs. sorted().
sent = 'people, Xi extended sincere greetings and best wishes to Bangladeshi President Abdul Hamid, Bangladeshi Prime Minister Sheikh Hasina, and the Bangladeshi government and people. (Xinhua/Li Xueren)'
words = sent.split()
# Sort the token list in place (equivalent to words.sort()).
words[:] = sorted(words)
sorted(words)
15
# Exercise 15: repeating a string vs. multiplying an integer.
"3" * 7
3 * 7
16
# '%load monty.py' was an IPython magic — not valid Python in a plain .py
# file; its effect was just to bring in the assignment below.
monty = 'Monty python'
17
string = 'ashshser alskdn eeee'
# Exercise 17: left-align in a 6-character field.  The original '%a-6s' was a
# typo: %a converts via ascii() and then prints '-6s' literally; the intended
# conversion is '%-6s'.
print('%-6s' % string)
18
from nltk.corpus import brown
words = brown.words()
# Exercise 18: collect every token beginning with 'wh'/'Wh' followed by at
# least one more letter, then view the distinct ones.
whs = [w for w in words if re.match(r'[wW]h[A-Za-z]+', w)]
set(whs)
20
# Exercise 20: scrape and print the article body of a weather-news page.
import requests
import bs4
import re
url='http://news.weather.com.cn/2021/03/3449612.shtml'
response = requests.get(url)
response.encoding='utf-8'   # set before reading response.text so it decodes as UTF-8
soup = bs4.BeautifulSoup(response.text,'html.parser')
# Print the text of every <div class="articleBody"> element.
for i in soup.findAll(name='div',attrs = {'class':'articleBody'}):
    print(i.get_text())
21
from nltk.corpus import words
nltk_words = words.words()
# Build the vocabulary membership set once: the original tested
# `w not in nltk_words` against the full word *list* inside the loop,
# an O(len(list)) scan per token.
_nltk_vocab = set(nltk_words)
def unknown(url):
    """Fetch *url*, concatenate the text of all <p> elements, and return the
    lowercase alphabetic tokens that are not in the NLTK English word list.

    Returns a list (with duplicates) in order of first occurrence.
    """
    unknown_words = []
    cont = ''
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    for i in soup.findAll(name='p'):
        cont += i.get_text()
    for w in cont.split(' '):
        letters = re.findall(r'[A-Za-z]+', w)
        if letters:
            # findall items are always non-empty, so the original extra
            # `w != ''` check was dead code.
            token = letters[0].lower()
            if token not in _nltk_vocab:
                unknown_words.append(token)
    return unknown_words
unknown('http://www.chinadaily.com.cn/a/202103/19/WS6053fedba31024ad0bab0310.html')
23
# Exercise 23: split "don't" into "do n't" with a plain-text substitution.
print(re.sub('don\'t','do n\'t',"don't touch me"))
# NOTE(review): the original comment claimed this second call fails because of
# the escape characters, but the literal pattern "don't" does occur in the
# target string, so the substitution is applied here as well.
print(re.sub('don\'t','do n\'t',"《don't|\w+》"))
24
def word_token(string):
    """Convert *string* to l33t-speak: ate→8, e→3, i→1, o→0, l→|, s→5, .→5w33t!

    Fixes vs. the original: 'ate'→'8' is applied first (after e→3 it could
    never match, since 'ate' had already become 'at3'), and the '|' rule
    replaces the letter 'l' — the original replaced the digit '1', which
    clobbered the 1s just produced by the i→1 rule.
    """
    w = re.sub('ate', '8', string)
    w = re.sub('e', '3', w)
    w = re.sub('i', '1', w)
    w = re.sub('o', '0', w)
    w = re.sub('l', '|', w)
    w = re.sub('s', '5', w)
    w = re.sub('\.', '5w33t!', w)
    return w
word_token('esjioa1s.ppate')
def word_token(string):
    """Variant: a leading 's' becomes '$'; every other 's' becomes '5'.

    The original raised UnboundLocalError for words containing no 's' at all
    (the result variable was never assigned) and IndexError for the empty
    string; such inputs are now returned unchanged.
    """
    if string.startswith('s'):
        return '$' + re.sub('s', '5', string[1:])
    if 's' in string:
        return re.sub('s', '5', string)
    return string
word_token('shs')
25
def word_transfer(string):
    """Pig-Latin-ify a space-separated sentence.

    For each all-lowercase word that starts with a consonant, move the
    leading consonant cluster to the end and append 'ay'.  Capitalised
    words and vowel-initial words pass through unchanged.  A word with no
    vowel at all (e.g. 'shh') is also passed through — the original crashed
    with AttributeError because re.search returned None for it.
    """
    trans_words = []
    for word in string.split(' '):
        onset = re.match(r'[^aeiou]+', word)        # leading consonant cluster
        if onset and re.match(r'[^A-Z]+', word):    # skip capitalised words
            rest = re.search(r'[aeiou](.*)', word)  # vowel and everything after
            if rest:
                trans_words.append(rest.group(0) + onset.group(0) + 'ay')
                continue
        trans_words.append(word)
    return ' '.join(trans_words)
word_transfer('Pig Latin string idle')
27
import random
def generaize_sents(num):
    """Return a pseudo-random 'sentence' of *num* characters, each drawn
    uniformly from the alphabet 'aehh ' (so h is twice as likely)."""
    return ''.join(random.choice('aehh ') for _ in range(num))
generaize_sents(500)
29
from nltk.corpus import brown
# Exercise 29: Automated Readability Index per Brown category:
# 4.71 * (avg letters per word) + 0.5 * (avg words per sentence) - 21.43
for cate in brown.categories():
    words = brown.words(categories=cate)
    avg_word = sum(len(w) for w in words) / len(words)
    sents = brown.sents(categories=cate)
    avg_sent = sum(len(s) for s in sents) / len(sents)
    print(cate + ":" + str(4.71 * avg_word + 0.5 * avg_sent - 21.43))
30
import nltk
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
# Perf fix: the original stemmed every word of the whole category and then
# sliced the result ([stem(w) for w in all_words][:50]); slicing first stems
# only the 50 tokens actually printed, with identical output.
sample = brown.words(categories='belles_lettres')[:50]
print([porter.stem(w) for w in sample])
print([lancaster.stem(w) for w in sample])
33
# Exercise 33: str.index / list.index demos.
'inexxpersjsnk'.index('sj')            # position where the substring 'sj' starts
words = brown.words(categories='belles_lettres')[:20]
words.index('They')                    # position of 'They' among the first 20 tokens
words[:words.index('They')]            # the tokens preceding the first 'They'
38
def multi_raw(string):
    """Clean up words hyphenated across line breaks.

    For every space-separated word: drop internal whitespace (re-joining
    'long-\\nterm' to 'long-term'), and if the dehyphenated form is an
    English word, drop the hyphen too.  Returns the words joined by spaces
    (with a leading space, as the original did).
    """
    # Hoisted out of the loop: the original rebuilt the entire NLTK word
    # list with nltk.corpus.words.words() on every iteration and did a
    # linear-time membership scan; a set built once makes each test O(1).
    vocab = set(nltk.corpus.words.words())
    n_w = ''
    for w in string.split(' '):
        w = ''.join(re.findall('\S', w))
        dehyphenated = re.sub(r'\-', '', w)
        if dehyphenated in vocab:
            w = dehyphenated
        n_w = n_w + ' ' + w
    return n_w
multi_raw("habe long-\nterm encyclo-\npedia")
41
# Exercise 41: build random 'words', then keep only the vowels of each one.
words = ''.join(random.choice('abcudefghijklmnop ') for _ in range(100)).split()
res = [''.join(ch for ch in word if ch in 'aeiou') for word in words]