Python3-Cookbook-Study Notes chap2:字符串和文本

本文链接：https://blog.csdn.net/chenlan_Cynthia/article/details/106544963

1.使用多个界定符分割字符串 —— re.split()

import re
# re.split()为分隔符指定多个正则模式

line = 'asdf fjdk; afed, fjek,asdf, foo'
re.split(r'[;,\s]\s*', line)
# ['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

# 需要特别注意的是正则表达式中是否包含一个括号捕获分组。 如果使用了捕获分组，那么被匹配的文本也将出现在结果列表中
fields = re.split(r'(;|,|\s)\s*', line)
# ['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

values = fields[::2]
delimiters = fields[1::2] + ['']
# values  ['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
# delimiters  [' ', ';', ',', ',', ',', '']
# Reform the line using the same delimiters
''.join(v+d for v,d in zip(values, delimiters))
# 'asdf fjdk;afed,fjek,asdf,foo'

# 如果你不想保留分割字符串到结果列表中去，但仍然需要使用到括号来分组正则表达式的话， 确保你的分组是非捕获分组，形如 (?:...)
re.split(r'(?:,|;|\s)\s*', line)
# ['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

2.字符串开头或结尾匹配

# 检查字符串开头或结尾 ： str.startswith() / str.endswith()
filename = 'spam.txt'
filename.endswith('.txt')
# True

# 检查多种匹配可能，只需要将所有的匹配项放入到一个元组中去
[name for name in filenames if name.endswith(('.c', '.h'))]

3.用Shell通配符匹配字符串

from fnmatch import fnmatch, fnmatchcase

# fnmatch() tests a name against a shell-style wildcard pattern:
# '*' matches any run of characters, '?' a single character, '[0-9]' a character set.
fnmatch('foo.txt', '*.txt')  # True
fnmatch('foo.txt', '?oo.txt')  # True
fnmatch('Dat45.csv', 'Dat[0-9]*')  # True

# Filter a list of names with a wildcard pattern.
names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']
[name for name in names if fnmatch(name, 'Dat*.csv')]  # ['Dat1.csv', 'Dat2.csv']

# fnmatch() 函数使用底层操作系统的大小写敏感规则(不同的系统是不一样的)来匹配模式
# On OS X (Mac)
fnmatch('foo.txt', '*.TXT') # False
# On Windows
fnmatch('foo.txt', '*.TXT') # True

4.字符串匹配和搜索

# 简单匹配
# str.find()、str.endswith()、str.startswith()

# 复杂的匹配：使用正则表达式和 re 模块
# match() 从字符串开始去匹配
# findall() 查找字符串任意部分的模式出现位置

text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'

# The pattern must be compiled before it is used. Without capture groups,
# findall() returns each matched substring as a plain string.
datepat = re.compile(r'\d+/\d+/\d+')
datepat.findall(text)  # ['11/27/2012', '3/13/2013']

# Parentheses are commonly added to a pattern to introduce capture groups.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')
m.group(0)  # '11/27/2012'
m.group(1)  # '11'
month, day, year = m.groups()  # ('11', '27', '2012')

# With capture groups, findall() returns one tuple of groups per match.
datepat.findall(text)  # [('11', '27', '2012'), ('3', '13', '2013')]

# findall() 方法会搜索文本并以列表形式返回所有的匹配
# finditer()以迭代方式返回匹配

5.字符串搜索和替换

# 简单替换 str.replace()
text = 'yeah, but no, but yeah, but no, but yeah'
text.replace('yeah', 'yep')

# 复杂模式：使用 re 模块中的 sub() 函数
# sub() 函数中的第一个参数是被匹配的模式，第二个参数是替换模式
# 如果你打算用相同的模式做多次替换，考虑先编译它来提升性能
import re
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
# Backreferences \1, \2, \3 in the replacement refer to the numbered capture groups.
re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)
# 'Today is 2012-11-27. PyCon starts 2013-3-13.'

# When the pattern uses named groups, refer to them as \g<group_name> in the
# replacement string.
re.sub(r'(?P<month>\d+)/(?P<day>\d+)/(?P<year>\d+)', r'\g<year>-\g<month>-\g<day>', text)
# 'Today is 2012-11-27. PyCon starts 2013-3-13.'

# 对于更加复杂的替换，可以传递一个替换回调函数来代替
from calendar import month_abbr
def change_date(m):
    """Build the replacement text for one date match.

    ``m`` is a match object whose groups 1, 2 and 3 are read as the month
    number, the day and the year. Returns the day, the ``month_abbr`` entry
    for the month number, and the year, separated by single spaces.
    """
    month, day, year = m.group(1, 2, 3)
    return '{} {} {}'.format(day, month_abbr[int(month)], year)
datepat.sub(change_date, text)
# 'Today is 27 Nov 2012. PyCon starts 13 Mar 2013.'

# 如果除了替换后的结果外，你还想知道有多少替换发生了
newtext, n = datepat.subn(r'\3-\1-\2', text)
newtext # 'Today is 2012-11-27. PyCon starts 2013-3-13.'
n # 2

6.字符串忽略大小写的搜索替换

# 使用 re 模块的时候给这些操作提供 re.IGNORECASE 标志参数
text = 'UPPER PYTHON, lower python, Mixed Python'
re.findall('python', text, flags=re.IGNORECASE)  # ['PYTHON', 'python', 'Python']
re.sub('python', 'snake', text, flags=re.IGNORECASE) # 'UPPER snake, lower snake, Mixed snake'

7.最短匹配模式 —— 加上?

str_pat = re.compile(r'"(.*)"')
text1 = 'Computer says "no."'
str_pat.findall(text1) # ['no.']
text2 = 'Computer says "no." Phone says "yes."'
str_pat.findall(text2) # ['no." Phone says "yes.']

str_pat = re.compile(r'"(.*?)"')
str_pat.findall(text2)  # ['no.', 'yes.']

8.多行匹配模式 —— 修改模式字符串，增加对换行的支持

comment = re.compile(r'/\*(.*?)\*/')
comment = re.compile(r'/\*((?:.|\n)*?)\*/')

# re.compile() 函数接受一个标志参数叫 re.DOTALL，可以让正则表达式中的点(.)匹配包括换行符在内的任意字符

9.将Unicode文本标准化

import unicodedata

# Two spellings of the same text: s1 uses the precomposed character U+00F1,
# s2 uses a plain 'n' followed by the combining tilde U+0303.
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'

# normalize() takes the normalization form as its first argument. NFC prefers
# fully composed characters (a single code point where one exists); NFD
# decomposes characters into a base character plus combining characters.
t1 = unicodedata.normalize('NFC', s1)
t2 = unicodedata.normalize('NFC', s2)
t1 == t2  # True

# combining() tests whether a character is a combining character (it returns
# 0 for characters that are not). Together with NFD it strips diacritics.
t3 = unicodedata.normalize('NFD', s1)
''.join(c for c in t3 if not unicodedata.combining(c))  # 'Spicy Jalapeno'

10.在正则式中使用Unicode

import re
# 如果你想在模式中包含指定的Unicode字符，你可以使用Unicode字符对应的转义序列(比如 \uXXXX（4 位十六进制）或者 \UXXXXXXXX（8 位十六进制）)
arabic = re.compile('[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff]+')
pat = re.compile('stra\u00dfe', re.IGNORECASE)

11.删除字符串中不需要的字符

# strip() 方法能用于删除开始或结尾的字符
# lstrip() 和 rstrip() 从左和从右执行删除操作。 
# 默认情况下，这些方法会去除空白字符，但是你也可以指定其他字符
s.strip()

12.审查清理文本字符串 —— str.translate()

s = 'pýtĥöñ\fis\tawesome\r\n'
# Translation table keyed by code point: tab and form feed become a space,
# carriage return maps to None, which deletes it.
remap = str.maketrans({'\t': ' ', '\f': ' ', '\r': None})
a = s.translate(remap)
a  # 'pýtĥöñ is awesome\n'

# Remove all combining characters (diacritical marks).
# NOTE: `a` is the cleaned string produced by the preceding translate() snippet.
import unicodedata
import sys
# dict.fromkeys() builds a dict whose keys are the code points of every
# combining character, each mapped to None, so translate() deletes them.
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode) if unicodedata.combining(chr(c)))
# NFD first splits accented characters into base character + combining marks.
b = unicodedata.normalize('NFD', a)
b.translate(cmb_chrs)

# Alternative using the I/O encoding/decoding functions: do some initial cleanup
# of the text first, then combine encode() and decode() to strip or alter it.
b = unicodedata.normalize('NFD', a)
# The 'ignore' error handler drops every character that cannot be encoded as ASCII.
b.encode('ascii', 'ignore').decode('ascii')

13.字符串对齐 —— ljust() , rjust() 和 center()

text.ljust(20)
text.rjust(20,'=')
text.center(20,'*')

# 使用format()来对齐字符串，可以用来格式化任何值
format(text, '>20')  # 右
format(text, '<20')  # 左
format(text, '^20')  # 居中

format(text, '=>20s')

# 当格式化多个值的时候，这些格式代码也可以被用在 format() 方法中
'{:>10s} {:>10s}'.format('Hello', 'World') 
# '     Hello      World'

14.合并拼接字符串

parts = ['Is', 'Chicago', 'Not', 'Chicago?']
' '.join(parts)  # 'Is Chicago Not Chicago?'
','.join(parts)  # 'Is,Chicago,Not,Chicago?'

15.字符串中插入变量 —— format()

s = '{name} has {n} messages.'
s.format(name='Guido', n=37)  # 'Guido has 37 messages.'

# 要被替换的变量能在变量域中找到,可以结合使用 format_map() 和 vars() 
name = 'Guido'
n = 37
s.format_map(vars())

16.以指定列宽格式化字符串 —— 使用 textwrap 模块

s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."

import textwrap
print(textwrap.fill(s, 70))
print(textwrap.fill(s, 40))
print(textwrap.fill(s, 40, initial_indent='    '))
print(textwrap.fill(s, 40, subsequent_indent='    '))

# 希望输出自动匹配终端大小
# 使用 os.get_terminal_size() 方法来获取终端的大小尺寸
import os
os.get_terminal_size().columns

17.在字符串中处理html和xml

# 使用 html.escape() 函数：替换文本字符串中的 ‘<’ 或者 ‘>’
import html
s = 'Elements are written as "<tag>text</tag>".'
print(html.escape(s)) # Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.
print(html.escape(s, quote=False)) # Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".

s = 'Spicy Jalapeño'
s.encode('ascii', errors='xmlcharrefreplace')  # b'Spicy Jalape&#241;o'

s = 'Spicy &quot;Jalape&#241;o&quot.'
# HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9;
# html.unescape() is the supported way to expand entity and character references.
import html
html.unescape(s)  # 'Spicy "Jalapeño".'

t = 'The prompt is &gt;&gt;&gt;'
from xml.sax.saxutils import unescape
unescape(t) # 'The prompt is >>>'

18～20 待重新看！