Python 3 日记 - 字符串（二）

最新推荐文章于 2022-09-20 21:32:34 发布

孤舟钓客

最新推荐文章于 2022-09-20 21:32:34 发布

阅读量2k

点赞数 1

分类专栏： Python 文章标签： cookbook python Python 学习日记

本文链接：https://blog.csdn.net/guzhou_diaoke/article/details/8279904

版权

Python 专栏收录该内容

10 篇文章 2 订阅

订阅专栏

注：以下内容为学习笔记，多数是从书本、资料中得来，只为加深印象，及日后参考。然而本人表达能力较差，写的不好。因非翻译、非转载，只好选原创，但多数乃摘抄，实为惭愧。但若能帮助一二访客，幸甚！

2012-12-10 星期一

1.控制大小写和访问子字符串

1）控制大小写

org_str = 'heLLo PyTHon, Hello worLd'

# Convert to upper case
big_str = org_str.upper()
print(big_str)

# Convert to lower case
little_str = org_str.lower()
print(little_str)

# Upper-case the first character and lower-case the rest, written by hand as
# s[:1].upper() + s[1:].lower()
capitalize_str = org_str[:1].upper() + org_str[1:].lower()
print(capitalize_str)

# The built-in method that does the same thing as the expression above
capitalize_str = org_str.capitalize()
print(capitalize_str)

# Upper-case the first letter of every word
title_str = org_str.title()
print(title_str)

输出：

HELLO PYTHON, HELLO WORLD
hello python, hello world
Hello python, hello world
Hello python, hello world
Hello Python, Hello World
2）访问子字符串

# Accessing substrings (the sample is a bytes object so struct can unpack it)
the_line = b'hello keyan python hi hello world!'

# Slicing
print(the_line[12:18])

# unpack
import struct

# Take a 6-byte field, skip 6 bytes, take a 7-byte field, skip 3 bytes, then
# take another 6-byte field; the bytes left over are handled separately below
base_format = '6s 6x 7s 3x 6s'

# Number of bytes in the line beyond what base_format consumes
num_remain = len(the_line) - struct.calcsize(base_format)
print(num_remain)

# Complete the format with a trailing s (keep) or x (skip) field, then unpack
the_format = '%s %ds' % (base_format, num_remain)
print(the_format)
l, s1, s2, t = struct.unpack(the_format, the_line)
print(l, s1, s2, t)
print(l + s1 + s2 + t)

# To ignore the leftover bytes, unpack only the prefix base_format covers
l, s1, s2 = struct.unpack(base_format, the_line[:struct.calcsize(base_format)])
print(l, s1, s2)
print(l + s1 + s2)

# Split the data into consecutive 5-byte groups
print('**************************************************************')
fivers = [the_line[k : k+5] for k in range(0, len(the_line), 5)]
print(fivers)

# Break the first 5 bytes into individual items
# (for a bytes object each item is an int, as the recorded output shows)
chars = list(the_line[:5])
print(chars)

# Cut the data into columns at the given offsets
print('**************************************************************')
cuts = [6, 12, 19, 22, 28]
# zip pairs each start offset with the next end offset: the first pair is
# (0, cuts[0]), the middle pairs are (cuts[k], cuts[k+1]), and the last pair
# is (cuts[len(cuts)-1], None), which slices through to the end of the data
pieces = [the_line[i:j] for i, j in zip([0] + cuts, cuts + [None])]
print(pieces)

输出：

b'python'
6
6s 6x 7s 3x 6s 6s
b'hello ' b'python ' b'hello ' b'world!'
b'hello python hello world!'
b'hello ' b'python ' b'hello '
b'hello python hello '
**************************************************************
[b'hello', b' keya', b'n pyt', b'hon h', b'i hel', b'lo wo', b'rld!']
[104, 101, 108, 108, 111]
**************************************************************
[b'hello ', b'keyan ', b'python ', b'hi ', b'hello ', b'world!']
封装成函数：

# Wrap the struct-based splitting in a function
print('**************************************************************')
def fields(base_format, the_line, last_field = False):
    """Unpack the_line with base_format, keeping or skipping the leftover bytes.

    The bytes of the_line that base_format does not consume are appended to
    the format as one extra field: an 's' field (returned as the last item)
    when last_field is true, otherwise an 'x' pad field (skipped).

    Returns the tuple produced by struct.unpack.
    """
    tail_code = 's' if last_field else 'x'
    tail_len = len(the_line) - struct.calcsize(base_format)
    full_format = ' '.join([base_format, '%d%s' % (tail_len, tail_code)])
    return struct.unpack(full_format, the_line)

# Demo: first skip the leftover bytes, then keep them as a final field
print(fields(base_format, the_line, False))
print(fields(base_format, the_line, True))

# A memoizing version of fields,
# intended for being called inside a loop
print('**************************************************************')
def fields_mem(base_format, the_line, last_field = False, _cache = {}):
    """Like fields, but remembers the completed format string between calls.

    The completed format depends only on base_format, the length of the_line
    and last_field, so that triple is the cache key. _cache is the memo
    dictionary; its default is shared by every call that does not pass one.

    Returns the tuple produced by struct.unpack.
    """
    cache_key = (base_format, len(the_line), last_field)
    cached_format = _cache.get(cache_key)
    if cached_format is not None:
        return struct.unpack(cached_format, the_line)

    # Cache miss: build the completed format once and remember it.
    tail_len = len(the_line) - struct.calcsize(base_format)
    tail_code = 's' if last_field else 'x'
    full_format = '%s %d%s' % (base_format, tail_len, tail_code)
    _cache[cache_key] = full_format
    return struct.unpack(full_format, the_line)

# Demo of the memoizing variant; it prints the same tuples as fields
print(fields_mem(base_format, the_line, False))
print(fields_mem(base_format, the_line, True))

# Wrapper for splitting into groups of a fixed number of items
def split_by(the_line, n, last_field = False):
    """Split the_line into consecutive slices of length n.

    When last_field is false, a final slice shorter than n is dropped;
    when it is true, the short final slice is kept.

    Returns a list of slices; an empty the_line gives an empty list.
    """
    pieces = [the_line[k:k + n] for k in range(0, len(the_line), n)]
    # Check that pieces is non-empty first: indexing pieces[-1] on an empty
    # list would raise IndexError for empty input.
    if pieces and not last_field and len(pieces[-1]) < n:
        pieces.pop()
    return pieces

# Demo: drop, then keep, the short final group
print(split_by(the_line, 5, False))
print(split_by(the_line, 5, True))

# Wrapper for cutting the data into columns at given offsets
def split_at(the_line, cuts, last_field = False):
    """Cut the_line at every offset in cuts.

    cuts may be any iterable of offsets (list, tuple, range, ...). The
    slices run from 0 to the first cut, between consecutive cuts, and from
    the last cut to the end of the_line. That final slice is only included
    when last_field is true.

    Returns a list of slices.
    """
    # Copy into a list so the concatenations below also work for tuples,
    # ranges and other iterables, not only for lists.
    cuts = list(cuts)
    pieces = [the_line[i:j] for i, j in zip([0] + cuts, cuts + [None])]
    if not last_field:
        pieces.pop()
    return pieces

# Demo: without, then with, the slice that follows the last cut
print(split_at(the_line, cuts, False))
print(split_at(the_line, cuts, True))


# The same splitting implemented as a generator
print('**************************************************************')
def split_at_yield(the_line, cuts, last_field = False):
    """Yield the slices of the_line delimited by the offsets in cuts.

    Slices run from 0 to the first cut and between consecutive cuts. The
    slice from the last cut to the end of the_line is yielded only when
    last_field is true. cuts is iterated once, so any iterable works.
    """
    start = 0
    for stop in cuts:
        piece = the_line[start:stop]
        start = stop
        yield piece
    if not last_field:
        return
    yield the_line[start:]
        
# Printing a generator only shows the generator object, so materialise it
# with list() to display the pieces (this matches the recorded output)
print(list(split_at_yield(the_line, cuts, False)))
print(list(split_at_yield(the_line, cuts, True)))
        
def split_by_yield(the_line, n, last_field = False):
    """Yield consecutive slices of the_line of length n, via split_at_yield.

    When last_field is false only full-length slices are produced, so a
    short trailing slice is dropped. When it is true the trailing slice is
    produced as well. This matches what split_by returns.
    """
    # With last_field false, let the cut offsets run up to and including
    # len(the_line): a final slice that is exactly n long is then emitted
    # as a regular slice instead of being dropped as "the last field".
    # With last_field true, stop before len(the_line) so the generator's
    # final the_line[last:] slice is never empty.
    stop = len(the_line) if last_field else len(the_line) + 1
    return split_at_yield(the_line, range(n, stop, n), last_field)

# Demo: list() consumes the generator so the pieces can be printed
print(list(split_by_yield(the_line, 5, False)))
print(list(split_by_yield(the_line, 5, True)))

输出：

**************************************************************
(b'hello ', b'python ', b'hello ')
(b'hello ', b'python ', b'hello ', b'world!')
**************************************************************
(b'hello ', b'python ', b'hello ')
(b'hello ', b'python ', b'hello ', b'world!')
[b'hello', b' keya', b'n pyt', b'hon h', b'i hel', b'lo wo']
[b'hello', b' keya', b'n pyt', b'hon h', b'i hel', b'lo wo', b'rld!']
[b'hello ', b'keyan ', b'python ', b'hi ', b'hello ']
[b'hello ', b'keyan ', b'python ', b'hi ', b'hello ', b'world!']
**************************************************************
[b'hello ', b'keyan ', b'python ', b'hi ', b'hello ']
[b'hello ', b'keyan ', b'python ', b'hi ', b'hello ', b'world!']
[b'hello', b' keya', b'n pyt', b'hon h', b'i hel', b'lo wo']
[b'hello', b' keya', b'n pyt', b'hon h', b'i hel', b'lo wo', b'rld!']

2012-12-11 星期二

2.改变多行文本字符串的缩进

'''
Created on Dec 11, 2012
改变多行文本字符串的缩进
@author: liury_lab
'''

# For a multi-line string, add or remove spaces at the start of each line so
# that every line ends up indented by the requested number of spaces
def reindent(s, num_space):
    """Return s with every line re-indented to exactly num_space spaces.

    Each line is stripped of leading and trailing whitespace before the new
    indent is prepended. The lines are joined with '\\n' and no trailing
    newline is added.
    """
    prefix = ' ' * num_space
    return '\n'.join(prefix + text.strip() for text in s.splitlines())

# Sample text whose four lines are indented by 8, 2, 4 and 6 spaces
org_str = """        hello python
  hello python
    hello python
      hello python"""
      

print(org_str)
print('**************************************************')


# Give every line the same 4-space indent
print(reindent(org_str, 4))

# Add spaces
def add_spaces(s, num_add):
    """Return s with num_add spaces inserted at the start of every line.

    Line endings are preserved because the lines are split with
    keepends=True. The indent is always emitted once up front, so an empty
    s yields just the indent.
    """
    indent = ' ' * num_add
    kept_lines = s.splitlines(True)
    return indent + indent.join(kept_lines)

print('**************************************************')

# Count the leading whitespace of each line
def num_space(s):
    """Return a list with the number of leading whitespace characters per line.

    The count is the difference in length before and after lstrip(), so any
    leading whitespace character counts, not only spaces.
    """
    counts = []
    for line in s.splitlines():
        stripped = line.lstrip()
        counts.append(len(line) - len(stripped))
    return counts

# Remove spaces
def del_space(s, num_del):
    """Return s with the first num_del characters of every line removed.

    num_del must not be negative and must not exceed the smallest leading
    whitespace count among the lines that contain text. Blank and
    whitespace-only lines are ignored for that check (as textwrap.dedent
    does) and are simply shortened.

    Raises ValueError if num_del is negative or larger than the smallest
    indentation. An empty s returns ''.
    """
    if num_del < 0:
        # A negative slice start would keep only the tail of each line.
        raise ValueError("cannot remove a negative number of spaces!")
    lines = s.splitlines()
    indents = [len(line) - len(line.lstrip()) for line in lines if line.strip()]
    # min() of an empty list raises, so only compare when there is a line
    # with text to measure.
    if indents and num_del > min(indents):
        raise ValueError("removing more spaces than there are!")
    return '\n'.join(line[num_del:] for line in lines)

# Demo: show the per-line indentation, then remove one space from every line
nums = num_space(org_str)
print(nums)
print(del_space(org_str, 1))

# Remove the largest indent that all lines share
print('**************************************************')
print(del_space(org_str, min(num_space(org_str))))

输出：

        hello python
  hello python
    hello python
      hello python
**************************************************
    hello python
    hello python
    hello python
    hello python
**************************************************
[8, 2, 4, 6]
       hello python
 hello python
   hello python
     hello python
**************************************************
      hello python
hello python
  hello python
    hello python

2012-12-12 星期三

3.扩展和压缩制表符

'''
Created on Dec 12, 2012
扩展和压缩制表符
@author: liury_lab
'''

org_str = '\thello\t'

# Convert tabs into the corresponding number of spaces
org_list = list(org_str)
print(org_str, org_list)

# expandtabs() is called without an argument here, so the default tab size
# applies (the recorded output shows 8 leading spaces)
exp_str = org_str.expandtabs()
exp_list = list(exp_str)
print(exp_str, exp_list)


# Convert spaces back into tabs
def un_expand(astring, tab_len = 4):
    """Replace runs of spaces in astring with tabs where a tab stop is reached.

    astring is first expanded with expandtabs(tab_len). Each run of spaces
    that reaches or crosses at least one tab stop is rewritten as the needed
    tabs followed by the spaces left over after the last tab stop. A run
    that stays short of the next tab stop is left unchanged. Expanding the
    result with the same tab_len reproduces the expanded input.

    The two print calls are tutorial tracing and are kept so the recorded
    example output stays the same.

    Returns the compressed string.
    """
    import re
    # Because the pattern has a capturing group, re.split also returns the
    # separators: even indexes hold the text between space runs (possibly
    # empty), odd indexes hold the space runs themselves.
    pieces = re.split(r'( +)', astring.expandtabs(tab_len))
    print('pieces = %s' % pieces)

    # Column reached so far on the current line
    len_sofar = 0
    for i, piece in enumerate(pieces):
        this_len = len(piece)
        if i % 2:
            # A run of spaces. Test the index rather than piece.isspace():
            # a text piece consisting only of '\n' is also "space" and must
            # not be rewritten.
            start = len_sofar
            len_sofar += this_len
            # Number of tab stops the run reaches or crosses
            num_tabs = len_sofar // tab_len - start // tab_len
            if num_tabs:
                # Spaces still needed after the last tab stop
                num_blanks = len_sofar % tab_len
            else:
                # No tab stop reached: the run must keep its exact length
                num_blanks = this_len
            print(len_sofar, num_blanks, num_tabs)
            pieces[i] = '\t' * num_tabs + ' ' * num_blanks
        else:
            # Ordinary text. expandtabs restarts its column count after a
            # line break, so do the same here.
            last_break = max(piece.rfind('\n'), piece.rfind('\r'))
            if last_break >= 0:
                len_sofar = this_len - last_break - 1
            else:
                len_sofar += this_len

    return ''.join(pieces)

# Demo input: 2 leading spaces, the word, then 7 trailing spaces
exp_str = '  hello       '
exp_list = list(exp_str)
print(exp_str, exp_list)

# Compress the space runs back into tabs using a tab size of 4
org_str = un_expand(exp_str, 4)
org_list = list(org_str)
print(org_str, org_list)

输出：
hello ['\t', 'h', 'e', 'l', 'l', 'o', '\t']
hello [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'h', 'e', 'l', 'l', 'o', ' ', ' ', ' ']
hello [' ', ' ', 'h', 'e', 'l', 'l', 'o', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
pieces = ['', '  ', 'hello', '       ', '']
2 2 0
14 2 2
hello [' ', ' ', 'h', 'e', 'l', 'l', 'o', '\t', '\t', ' ', ' ']

2012-12-13 星期四

4.替换字符串中的子串

'''
Created on Dec 13, 2012
替换字符串中的子串
@author: liury_lab
'''

from string import Template

# Build a template from a string; placeholders are marked with $
temp = Template('this is $thing')

# Call the template's substitute method with a dictionary argument
print(temp.substitute({'thing' : 5}))
print(temp.substitute({'thing' : 'test'}))

# The same substitutions, supplied as keyword arguments instead
print(temp.substitute(thing = 5))
print(temp.substitute(thing = 'test'))

输出：
this is 5
this is test
this is 5

this is test

# Sometimes the easiest way to prepare a dictionary for substitute is to set
# some local variables and hand them all over via locals() (which returns a
# dictionary whose keys are the local variable names and whose values are the
# variables' values)
temp = Template('the square of $num is $square')

# Both values passed as keyword arguments
for i in range(3):
    print(temp.substitute(num = i, square = i*i))
    
# Both values picked up from the variables num and square through locals()
for num in range(3):
    square = num * num
    print(temp.substitute(locals()))
    
# locals() passed as the mapping, with square given as a keyword argument
for num in range(3):
    print(temp.substitute(locals(), square = num * num))

输出：
the square of 0 is 0
the square of 1 is 1
the square of 2 is 4
the square of 0 is 0
the square of 1 is 1
the square of 2 is 4
the square of 0 is 0
the square of 1 is 1
the square of 2 is 4

2012-12-14 星期五雪

5.使用正则表达式一次完成多个替换

'''
Created on Dec 14, 2012
一次完成多个替换
@author: liury_lab
'''

# Replace several substrings of a string.
# The sub method of compiled regular expressions makes this kind of
# match-and-replace efficient.
adict = {'hello':'hi', 'perl':'python'}
text = 'hello world, hello perl, hello python, hello ppperlppp'

import re

# Return a copy of the input string in which every substring found as a key
# of the given dictionary is replaced by the corresponding value
def multiple_replace(text, adict):
    """Replace every occurrence of each key of adict in text by its value.

    All replacements happen in one pass over text, so replaced text is not
    scanned again. Keys are matched literally. When several keys match at
    the same position, the longest one wins.

    Returns the new string; an empty adict returns text unchanged.
    """
    # An empty mapping would compile to the empty pattern, which matches at
    # every position and then fails to look up '' in adict.
    if not adict:
        return text

    # Build a regular expression of the form a1|a2|...|an from the keys.
    # re.escape makes each key match literally. Alternatives are tried left
    # to right, so longer keys go first; otherwise a key that is a prefix of
    # another key would always shadow it.
    keys = sorted(adict, key=len, reverse=True)
    rx = re.compile('|'.join(map(re.escape, keys)))

    # Called by sub for every match; group(0) is the whole matched text,
    # which is always one of the keys.
    def one_xlat(match):
        return adict[match.group(0)]

    return rx.sub(one_xlat, text)

# Demo: apply both replacements to the sample text in one call
re_text = multiple_replace(text, adict)
print(re_text)


# When one fixed translation table is used for many texts, the preparation
# only needs to be done once
def make_xlat(*args, **kwds):
    """Build a function that applies a fixed set of replacements to a text.

    The arguments are passed to dict() to form the translation table. The
    returned function takes a string and returns a copy in which every key
    of the table is replaced by its value. Keys are matched literally, and
    the longest key wins when several match at the same position.

    With an empty table the returned function returns its input unchanged.
    """
    adict = dict(*args, **kwds)

    if not adict:
        # The empty pattern would match everywhere and then fail to look up
        # '' in the table, so hand back a no-op translator instead.
        def xlat(text):
            return text
        return xlat

    # Longest keys first, so a key that is a prefix of another key cannot
    # shadow it in the alternation.
    keys = sorted(adict, key=len, reverse=True)
    rx = re.compile('|'.join(map(re.escape, keys)))

    def one_xlat(match):
        return adict[match.group(0)]

    def xlat(text):
        return rx.sub(one_xlat, text)

    return xlat

# Demo: build the translator once, then apply it to the sample text
translate = make_xlat(adict)
re_text = translate(text)
print(re_text)


# A class with similar functionality
class make_xlat_class:
    """Callable that applies a fixed set of literal replacements to a text.

    The constructor arguments are passed to dict() to form the translation
    table. Calling the instance with a string returns a copy in which every
    key of the table is replaced by its value. Subclasses can override
    make_rx to change how the keys are matched.
    """
    def __init__(self, *args, **kwds):
        self.adict = dict(*args, **kwds)
        self.rx = self.make_rx()
    def make_rx(self):
        """Return the compiled pattern that matches any key of the table."""
        # Longest keys first, so a key that is a prefix of another key
        # cannot shadow it in the alternation.
        keys = sorted(self.adict, key=len, reverse=True)
        return re.compile('|'.join(map(re.escape, keys)))
    def one_xlat(self, match):
        """Return the replacement for one match (its whole matched text)."""
        return self.adict[match.group(0)]
    def __call__(self, text):
        """Return text with every key of the table replaced by its value."""
        # With an empty table there is nothing to replace, and the pattern
        # built from no keys would match the empty string and fail the
        # table lookup.
        if not self.adict:
            return text
        return self.rx.sub(self.one_xlat, text)
    
# Demo: the class instance is used exactly like the closure version
translate = make_xlat_class(adict)
re_text = translate(text)
print(re_text)

# A subclass that only matches keys at word boundaries
class make_xlat_by_whole_words_class(make_xlat_class):
    """Variant of make_xlat_class that only replaces whole words.

    Every key must be delimited by word boundaries (\\b) on both sides to
    match, so a key inside a longer word is left alone.
    """
    def make_rx(self):
        """Return the compiled pattern matching any key as a whole word."""
        if not self.adict:
            # With no keys, r'\b\b' would match the empty string at every
            # word boundary and the table lookup would fail. (?!) is a
            # pattern that never matches.
            return re.compile(r'(?!)')
        # Longest keys first, so a key that is a prefix of another key
        # cannot shadow it. \b(?:a|b)\b is the same as \ba\b|\bb\b.
        keys = sorted(self.adict, key=len, reverse=True)
        return re.compile(r'\b(?:%s)\b' % '|'.join(map(re.escape, keys)))
    
# Demo: 'perl' inside 'ppperlppp' is left alone by the whole-word variant
translate = make_xlat_by_whole_words_class(adict)
re_text = translate(text)
print(re_text)

输出：
hi world, hi python, hi python, hi pppythonppp
hi world, hi python, hi python, hi pppythonppp
hi world, hi python, hi python, hi pppythonppp
hi world, hi python, hi python, hi ppperlppp

2012-12-15 星期六

6.检查字符串中的结束标记

def any_true(predicate, sequence):
    """Return True if predicate(item) is truthy for any item of sequence.

    Evaluation stops at the first truthy result. An empty sequence gives
    False.
    """
    # any() tests truthiness. The earlier `True in map(...)` form compared
    # each result with True, so truthy results such as 2 or 'x' were missed.
    return any(map(predicate, sequence))

def ends_with(s, *endings):
    """Return True if s ends with any of the given endings.

    With no endings at all the result is False.
    """
    # endswith accepts a tuple of suffixes and tests them all in one call;
    # *endings already arrives as a tuple.
    return s.endswith(endings)

import os

# Print the names in the given directory that end with an image extension
for filename in os.listdir('d:\\下载\\埙'):
    if ends_with(filename, '.jpg', '.jpeg', '.gif', '.bmp'):
        print(filename)

输出：
22f6362ac65c1038d469f62db2119313b17e8974.jpg
28fb1f4fbfad45dd841b55b79dbb0d77.jpg
8ad4b31c8701a18bd90491f29e2f07082938fea4.jpg
埙追梦.jpg

2012-12-16 星期天

7.在Unicode和普通字符串之间转换

'''
Created on Dec 14, 2012
在Unicode和普通字符串之间转换
@author: liury_lab
'''

# In Python 3 a str literal is Unicode text
unicode_str = 'hello python'

# Convert Unicode text to bytes with "encode"
utf8_str = unicode_str.encode("utf-8")
print(utf8_str)

ascii_str = unicode_str.encode("ascii")
print(ascii_str)

iso_str = unicode_str.encode("ISO-8859-1")
print(iso_str)

utf16_str = unicode_str.encode("utf-16")
print(utf16_str)

# Convert bytes back to Unicode text ("decode"); here this is done by
# calling str() with the bytes and their encoding
print(str(utf8_str, "utf-8"))
print(str(ascii_str, "ascii"))
print(str(iso_str, "ISO-8859-1"))
print(str(utf16_str, "utf-16"))

输出：

b'hello python'
b'hello python'
b'hello python'
b'\xff\xfeh\x00e\x00l\x00l\x00o\x00 \x00p\x00y\x00t\x00h\x00o\x00n\x00'
hello python
hello python
hello python
hello python

注：Python 3 中的字符串（str）默认都是 Unicode，输出中前缀 b 表示字节串（bytes）。

孤舟钓客

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
Python 3 日记 - 字符串（二）

注：以下内容为学习笔记，多数是从书本、资料中得来，只为加深印象，及日后参考。然而本人表达能力较差，写的不好。因非翻译、非转载，只好选原创，但多数乃摘抄，实为惭愧。但若能帮助一二访客，幸甚！2012-12-10 星期一1.控制大小写和访问子字符串1）控制大小写org_str = 'heLLo PyTHon, Hello worLd'# 转成大写big_str =
复制链接

扫一扫