Python–cookbook–2.字符串与文本

最新推荐文章于 2024-07-18 15:53:23 发布

柴寺仓

最新推荐文章于 2024-07-18 15:53:23 发布

阅读量125

点赞数

分类专栏：python--相关特性；文章标签：python、开发语言

本文链接：https://blog.csdn.net/weixin_44332995/article/details/127204484

版权

python--相关特性专栏收录该内容

35 篇文章 0 订阅

订阅专栏

Python–cookbook–2.字符串与文本

导入对应模块

import re
import os
from fnmatch import fnmatch, fnmatchcase
from calendar import month_abbr
import unicodedata
import sys
import textwrap
import html
from html.parser import HTMLParser
from collections import namedtuple

字符串分割成多段，但分隔符不固定

# Sample input whose fields are separated by a mix of ';', ',' and whitespace.
line = 'asdf fjdk; afed, fjek,asdf, foo'
print(re.split(r'[;,\s]\s*', line))  # the delimiter may be a comma, a semicolon or whitespace
print(re.split(r'(;|,|\s)\s*', line))  # capture group: the matched delimiters are kept in the result
print(re.split(r'(?:,|;|\s)\s*', line))  # (?:...) is non-capturing, so the delimiters are dropped

字符串开头和结尾匹配

filename = 'spam.txt'
print(filename.endswith('.txt'))
print(filename.startswith('file:'))
# To test several alternatives at once, pass them as a tuple
filenames = os.listdir('.')
print(filenames)
print([name for name in filenames if name.endswith(('.c', '.txt'))])

用Shell通配符匹配字符串

# * matches any run of characters, ? matches a single character,
# [seq] matches any character in seq, [!seq] matches any character not in seq
print(fnmatch('foo.txt', '*.txt'))
print(fnmatch('Dat45.csv', 'Dat[0-9]*'))
print(fnmatch('foo.txt', '?oo.txt'))
addresses = [
    '5412 N CLARK ST',
    '1060 W ADDISON ST',
    '1039 W GRANVILLE AVE',
    '2122 N CLARK ST',
    '4802 N BROADWAY',
]
# fnmatchcase compares case-sensitively
print([addr for addr in addresses if fnmatchcase(addr, '54[0-9][0-9] *CLARK*')])

字符串匹配和搜索

# Simple cases: str.find(), str.endswith(), str.startswith()
# Complex cases: regular expressions
text1 = '11/27/2012'
text2 = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
datepat = re.compile(r'\d+/\d+/\d+')
print(datepat.match(text1))  # match() returns a match object or None
print(datepat.findall(text2))

datepat2 = re.compile(r'(\d+)/(\d+)/(\d+)')  # parentheses create capture groups
m = datepat2.match('11/27/2012')
print(m.groups(), m.group(0), m.group(1), m.group(2), m.group(3))
n = datepat2.findall(text2)
print(n)

字符串搜索和替换

# str.replace()  re.sub()
# Arguments: pattern, replacement template (\N refers to capture group N), input string
print(re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text2))
# For more complex substitutions, pass a callback function as the replacement
def change_date(m):
    """Build the replacement text for one date match.

    ``m`` is a match object with three groups. Group 1 is converted to an
    int and used to index ``calendar.month_abbr``; the result is
    ``'<group 2> <month abbreviation> <group 3>'``.
    """
    month_name = month_abbr[int(m.group(1))]
    day = m.group(2)
    year = m.group(3)
    return f'{day} {month_name} {year}'
# Rewrite every date in text2, using change_date as the replacement callback
print(datepat2.sub(change_date, text2))

字符串忽略大小写的搜索替换

# Case-insensitive matching is enabled with the re.IGNORECASE flag
text = 'UPPER PYTHON, lower python, Mixed Python'
print(re.findall('python', text, flags=re.IGNORECASE))
def matchcase(word):
    """Return a ``re.sub`` callback that inserts ``word`` in the case of the match.

    The callback inspects the matched text: all upper case -> ``word.upper()``,
    all lower case -> ``word.lower()``, leading capital -> ``word.capitalize()``,
    anything else -> ``word`` unchanged.
    """
    def replace(m):
        matched = m.group()
        if matched.isupper():
            return word.upper()
        if matched.islower():
            return word.lower()
        # Mixed case: only the first character decides.
        return word.capitalize() if matched[0].isupper() else word
    return replace
# matchcase('snake') supplies the replacement callback for each case-insensitive match
print(re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE))

最短匹配模式

text2 = 'Computer says "no." Phone says "yes."'
str_pat = re.compile(r'\"(.*)\"')  # greedy: .* takes the longest possible match
str_pat2 = re.compile(r'\"(.*?)\"')  # *? is the non-greedy (shortest-match) form
print(str_pat.findall(text2))
print(str_pat2.findall(text2))

多行匹配模式

text2 = '''/* this is a
multiline comment */
'''
comment = re.compile(r'/\*(.*?)\*/')  # by default . does not match a newline
comment2 = re.compile(r'/\*((?:.|\n)*?)\*/')  # explicitly allow newlines as well
comment3 = re.compile(r'/\*(.*?)\*/', re.DOTALL)  # DOTALL makes . match any character, newline included
print(comment.findall(text2))
print(comment2.findall(text2))
print(comment3.findall(text2))

将Unicode文本标准化

s1 = 'Spicy Jalape\u00f1o'  # precomposed character (U+00F1)
s2 = 'Spicy Jalapen\u0303o'  # 'n' followed by a combining tilde (U+0303)
print(s1)
print(s2)
print(len(s1), len(s2), s1 == s2)
t1 = unicodedata.normalize('NFC', s1)  # NFC: fully composed form
t2 = unicodedata.normalize('NFC', s2)
print(t1 == t2)
print(ascii(t1), ascii(t2))
t3 = unicodedata.normalize('NFD', s1)  # NFD: fully decomposed form
t4 = unicodedata.normalize('NFD', s2)
print(t3 == t4)
print(ascii(t3), ascii(t4))
# The compatibility forms NFKC and NFKD are supported as well.
# combining() is non-zero for combining marks. Filter the decomposed (NFD)
# text: in the NFC form the tilde is folded into a single code point, so
# filtering t1 would remove nothing.
stripped = ''.join(c for c in t3 if not unicodedata.combining(c))
print(stripped)

在正则中使用Unicode

# Raw string, so that \d reaches the regex engine instead of being treated
# as an (invalid) string escape sequence.
num = re.compile(r'\d+')
# For str patterns \d matches any Unicode decimal digit, not only 0-9.
print(num.match('\u0661\u0662\u0663'))

# A slice object can be built explicitly and its fields inspected
aslice = slice(5, 50, 2)
print(aslice.start, aslice.stop, aslice.step)
s = 'HelloWorld'
print(aslice.indices(len(s)))  # indices(size) maps the slice onto a sequence of that exact length
for i in range(*aslice.indices(len(s))):
    print(s[i], end=' ')
print()

删除字符串中不需要的字符

# strip() lstrip() rstrip() remove whitespace (including newlines) by default
t = '-----hello====='
print(t.strip('-='))
s = ' hello   world \n'
s1 = s.replace(' ', '')  # removes every space, including the ones in the middle
print(s1)
# Stripping line endings while reading a file
# with open(filename) as f:
#     lines = (line.strip() for line in f)
#     for line in lines:
#         print(line)

审查清理文本字符串

#  str.translate()
s = 'pýtĥöñ\fis\tawesome\r\n'
# Translation table: code point -> replacement; None deletes the character
remap = {
    ord('\t'): ' ',
    ord('\f'): ' ',
    ord('\r'): None  # Deleted
}
a = s.translate(remap)
# print(a)
# Map every combining code point to None so that translate() removes them
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode)
    if unicodedata.combining(chr(c)))
b = unicodedata.normalize('NFD', a)  # decompose first so accents become separate combining marks
c = b.translate(cmb_chrs)
# print(c)
# print(unicodedata.normalize('NFD', a).encode('ascii', 'ignore').decode('ascii'))

字符串对齐

# ljust() rjust() center() format()
text = 'Hello python'
print(text.center(20))
print(text.ljust(20, '*'))
print(format(text, '^20'))  # centered in a field of width 20
print(format(text, '*<20'))  # left-aligned, padded with *

合并拼接字符串

# join + format
parts = ['Is', 'Chicago', 'Not', 'Chicago?']
print(' '.join(parts))
a = 'Is Chicago'
b = 'Not Chicago?'
print(a + ' ' + b)  # original note: + is costly (memory copies and garbage collection); this mainly matters when joining many strings
print('{} {}'.format(a, b))
# Adjacent string literals are concatenated with no separator inserted between them
d = 'Is Chicago' 'Not Chicago?'
print(d)

字符串中插入变量

s = '{name} has {n} messages.'
name = 'Guido_copy'
n = 37
print(s.format(name='Guido', n=37))
# vars() with no argument returns the current namespace, so {name} and {n}
# are filled from the variables defined above
print(s.format_map(vars()))  # format_map() vars()
class Info:
    """Simple record holding ``name`` and ``n`` as instance attributes."""

    def __init__(self, name, n):
        # Stored as plain attributes so vars(instance) yields {'name': ..., 'n': ...}
        self.name, self.n = name, n
ac = Info('Guido_copy', 38)
print(s.format_map(vars(ac)))  # vars(obj) exposes the instance attributes as a dict
# Handling missing variables
class safesub(dict):
    """dict that answers lookups of missing keys with the placeholder itself."""

    def __missing__(self, key):
        # Re-wrap the key in braces so an unfilled field stays visible in the output
        return ''.join(('{', key, '}'))
del n  # remove n so that the {n} placeholder has no matching variable
print(s.format_map(safesub(vars())))

以指定列宽格式化字符串

# Long sample paragraph for the textwrap.fill() calls below (left commented out)
s = '''Look into my eyes, look into my eyes, the eyes, the eyes, 
the eyes, not around the eyes, don't look around the eyes, 
look into my eyes, you're under.'''
# print(textwrap.fill(s, 40))
# print(textwrap.fill(s, 40, initial_indent='  '))
# print(textwrap.fill(s, 40, subsequent_indent=' '))

在字符串中处理 HTML 和 XML

# Escaping: replace <, > and & (and quotes unless quote=False) with entities
sh = 'Elements are written as "<tag>text</tag>".'
print(html.escape(sh))
print(html.escape(sh, quote=False))
# Emit non-ASCII characters as numeric character references
sa = 'Spicy Jalapeño'
print(sa.encode('ascii', errors='xmlcharrefreplace'))
# Unescaping: turn entities back into characters.
sp = 'Spicy &quot;Jalape&#241;o&quot.'
# Kept for compatibility with the original snippet. HTMLParser.unescape()
# was removed in Python 3.9, so the conversion uses html.unescape() instead.
p = HTMLParser()
unescaped = html.unescape(sp)
print(unescaped)

字符串令牌解析

text = 'foo = 23 + 42 * 10'
# Desired token stream for the text above:
# tokens = [('NAME', 'foo'), ('EQ', '='), ('NUM', '23'), ('PLUS', '+'),
#           ('NUM', '42'), ('TIMES', '*'), ('NUM', '10')]
# One named group per token type
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'
# Combine the alternatives into one pattern
master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))
def generate_tokens(pat, text):
    """Yield ``Token(type, value)`` tuples for successive matches of ``pat`` in ``text``.

    ``pat`` is a compiled pattern made of named groups; each token's ``type``
    is the name of the group that matched (``lastgroup``) and ``value`` is
    the matched text. Generation stops at the first position where the
    scanner's ``match()`` returns ``None``.
    """
    Token = namedtuple('Token', ['type', 'value'])
    scan = pat.scanner(text)
    while True:
        found = scan.match()
        if found is None:
            return
        yield Token(found.lastgroup, found.group())
# Example use: print each token produced for a short assignment expression
for tok in generate_tokens(master_pat, 'foo = 42'):
    print(tok)