import re
import os
from fnmatch import fnmatch, fnmatchcase
from calendar import month_abbr
import unicodedata
import sys
import textwrap
import html
from html.parser import HTMLParser
from collections import namedtuple
# Test the start / end of a string with str.startswith() / str.endswith().
filename = 'spam.txt'
print(filename.endswith('.txt'))
print(filename.startswith('file:'))

# To check several alternatives at once, pass the candidates as a tuple.
filenames = os.listdir('.')
print(filenames)
print([name for name in filenames if name.endswith(('.c', '.txt'))])
# 用Shell通配符匹配字符串 (Matching strings with shell wildcard patterns)
# Shell-style wildcards: * matches any run of characters, ? matches a single
# character, [seq] matches any character in seq, and [!seq] matches any
# character not in seq.
print(fnmatch('foo.txt', '*.txt'))
print(fnmatch('Dat45.csv', 'Dat[0-9]*'))
print(fnmatch('foo.txt', '?oo.txt'))

addresses = [
    '5412 N CLARK ST',
    '1060 W ADDISON ST',
    '1039 W GRANVILLE AVE',
    '2122 N CLARK ST',
    '4802 N BROADWAY',
]
# fnmatchcase() is always case-sensitive.
print([addr for addr in addresses if fnmatchcase(addr, '54[0-9][0-9] *CLARK*')])
# In a str pattern, \d matches any Unicode decimal digit, so the
# Arabic-Indic digits below match as well. The pattern is a raw string so
# that the backslash reaches the regex engine instead of being treated as
# an (invalid) string escape.
num = re.compile(r'\d+')
print(num.match('\u0661\u0662\u0663'))
# A slice object exposes its bounds as attributes.
aslice = slice(5, 50, 2)
print(aslice.start, aslice.stop, aslice.step)

s = 'HelloWorld'
# indices(size) maps the slice onto a sequence of the given size, clipping
# the bounds so that indexing with the result cannot raise IndexError.
print(aslice.indices(len(s)))
for i in range(*aslice.indices(len(s))):
    print(s[i], end=' ')
print()
# 删除字符串中不需要的字符 (Stripping unwanted characters from strings)
# strip() / lstrip() / rstrip() remove whitespace (including newlines) by
# default; pass a string of characters to strip those instead.
t = '-----hello====='
print(t.strip('-='))

# strip() only touches the ends of a string; use replace() to remove
# characters from the middle.
s = ' hello world \n'
s1 = s.replace(' ', '')
print(s1)

# Stripping newlines while reading a file:
# with open(filename) as f:
#     lines = (line.strip() for line in f)
#     for line in lines:
#         print(line)
# 审查清理文本字符串 (Sanitizing and cleaning up text)
# str.translate(): remap or delete characters through a code-point table.
s = 'pýtĥöñ\fis\tawesome\r\n'
remap = {
    ord('\t'): ' ',
    ord('\f'): ' ',
    ord('\r'): None,  # Deleted
}
a = s.translate(remap)
# print(a)

# Table mapping every combining character to None, so that translate()
# removes it.
cmb_chrs = dict.fromkeys(
    c for c in range(sys.maxunicode) if unicodedata.combining(chr(c))
)
# NFD splits accented letters into base letter + combining mark; the
# translate() call then drops the marks.
b = unicodedata.normalize('NFD', a)
c = b.translate(cmb_chrs)
# print(c)
# print(unicodedata.normalize('NFD', a).encode('ascii', 'ignore').decode('ascii'))
# 字符串对齐 (Aligning text strings)
# Alignment helpers: ljust(), rjust(), center() and format().
text = 'Hello python'
print(text.center(20))
print(text.ljust(20, '*'))
print(format(text, '^20'))   # centered
print(format(text, '*<20'))  # left-aligned, padded with '*'
# 合并拼接字符串 (Combining and concatenating strings)
# Combining strings: str.join(), the + operator and str.format().
parts = ['Is', 'Chicago', 'Not', 'Chicago?']
print(' '.join(parts))

a = 'Is Chicago'
b = 'Not Chicago?'
# Joining many strings with + is very slow: it involves memory copies and
# garbage collection.
print(a + ' ' + b)
print('{} {}'.format(a, b))

# Adjacent string literals are concatenated by the parser with no
# separator inserted.
d = 'Is Chicago' 'Not Chicago?'
print(d)
# 字符串中插入变量 (Interpolating variables in strings)
s = '{name} has {n} messages.'
name = 'Guido_copy'
n = 37
print(s.format(name='Guido', n=37))
# format_map() + vars(): take the substitution values from the variables
# in the current (module) namespace.
print(s.format_map(vars()))


class Info:
    """Plain attribute holder used to show vars() on an instance."""

    def __init__(self, name, n):
        self.name = name
        self.n = n


ac = Info('Guido_copy', 38)
print(s.format_map(vars(ac)))


# Handling missing variables.
class safesub(dict):
    """dict that leaves the placeholder in place when a key is missing."""

    def __missing__(self, key):
        # Re-emit the original '{key}' field instead of raising KeyError.
        return '{' + key + '}'


del n  # make 'n' undefined so the next line exercises __missing__
print(s.format_map(safesub(vars())))
# 以指定列宽格式化字符串 (Reformatting text to a fixed number of columns)
# Sample paragraph (three lines, newline-separated) for the textwrap
# examples below.
s = (
    "Look into my eyes, look into my eyes, the eyes, the eyes,\n"
    "the eyes, not around the eyes, don't look around the eyes,\n"
    "look into my eyes, you're under."
)
# print(textwrap.fill(s, 40))
# print(textwrap.fill(s, 40, initial_indent=' '))
# print(textwrap.fill(s, 40, subsequent_indent=' '))
# 在字符串处理html和xml (Handling HTML and XML entities in text)
# html.escape() replaces &, < and > with entities; quotes are escaped as
# well unless quote=False is passed.
sh = 'Elements are written as "<tag>text</tag>".'
print(html.escape(sh))
print(html.escape(sh, quote=False))

# Emit non-ASCII characters as numeric character references.
sa = 'Spicy Jalapeño'
print(sa.encode('ascii', errors='xmlcharrefreplace'))

sp = 'Spicy "Jalapeño".'
# NOTE(review): the parser is created but never used in the visible code —
# confirm whether a follow-up statement was lost.
p = HTMLParser()
# 字符串令牌解析 (Tokenizing text)
text = 'foo = 23 + 42 * 10'
# Goal: turn the text into a stream of (type, value) pairs such as
# tokens = [('NAME', 'foo'), ('EQ', '='), ('NUM', '23'), ('PLUS', '+'),
#           ('NUM', '42'), ('TIMES', '*'), ('NUM', '10')]

# One named capture group per token type; the group name becomes the
# token's type.
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))

# Created once at module level so that every call yields the same Token
# type instead of building a new namedtuple class per call.
Token = namedtuple('Token', ['type', 'value'])


def generate_tokens(pat, text):
    """Yield a Token(type, value) for each consecutive match of *pat* in *text*.

    *pat* is a compiled regex whose alternatives are named groups; the name
    of the group that matched is used as the token type. Scanning stops at
    the first position where no alternative matches, so any unrecognized
    tail of *text* is silently left unconsumed.
    """
    scanner = pat.scanner(text)
    # iter(callable, sentinel): keep calling scanner.match() until it
    # returns None.
    for m in iter(scanner.match, None):
        yield Token(m.lastgroup, m.group())


# Example use
for tok in generate_tokens(master_pat, 'foo = 42'):
    print(tok)