【脚本语言系列】关于Python操作数据文本字符串,你需要知道的事

如何使用文本字符串

Unicode

# -*- coding:utf-8 -*-
import unicodedata
# only for Python 3.*
def unicode_test(value):
    """Round-trip one character through its Unicode name and print the result.

    Looks up the official Unicode name of ``value``, resolves that name back
    to a character, and prints the name, the original character, and the
    character recovered from the name.

    Args:
        value: A one-character text string (``str`` on Python 3,
            ``unicode`` on Python 2).

    Raises:
        TypeError: If ``value`` is not a text string, e.g. a Python 2
            byte ``str``.
        ValueError: If the character has no Unicode name.
    """
    name = unicodedata.name(value)
    value2 = unicodedata.lookup(name)
    # Single-argument print(...) is valid on both Python 2 and Python 3,
    # unlike the bare print statement.
    print("name %s" % name)
    print("value %s" % value)
    print("value2 %s" % value2)

# NOTE: "\u00a2" is a text (unicode) literal only on Python 3. On Python 2 a
# plain "..." literal is a byte string, so unicodedata.name() rejects it with
# the TypeError recorded in the traceback below; a u"..." literal is needed
# there.
unicode_test("\u00a2")
unicode_test("\u20ac")
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-42-9a9ee43694fd> in <module>()
      9     print "value2 %s" %value2
     10 
---> 11 unicode_test("\u00a2")
     12 unicode_test("\u20ac")


<ipython-input-42-9a9ee43694fd> in unicode_test(value)
      3 # only for Python 3.*
      4 def unicode_test(value):
----> 5     name = unicodedata.name(value)
      6     value2 = unicodedata.lookup(name)
      7     print "name %s" %name


TypeError: name() argument 1 must be unicode, not str
# -*- coding:utf-8 -*-
# only for Python 3.*
# (On Python 2 the plain "..." literals below are byte strings, so "\u2603"
# is six ASCII characters and every step merely echoes the escape text.)
# Single-argument print(...) is used so the script parses on both versions.

# encode: text -> bytes
snowman = "\u2603"
print(len(snowman))
ds = snowman.encode("utf-8")
print(ds)
# ASCII cannot represent the snowman; each error handler copes differently.
ds = snowman.encode("ascii", "ignore")  # drop the character
print(ds)
ds = snowman.encode("ascii", "replace")  # substitute "?"
print(ds)
ds = snowman.encode("ascii", "backslashreplace")  # keep a backslash escape
print(ds)
ds = snowman.encode("ascii", "xmlcharrefreplace")  # keep an XML entity
print(ds)

# decode: bytes -> text
place = "caf\u00e9"
print(place)
print(type(place))
# Text must be *encoded* to obtain bytes; calling .decode() on text was a
# bug (str has no .decode on Python 3).
place_bytes = place.encode("utf-8")
print(place_bytes)
place2 = place_bytes.decode("utf-8")
print(place2)
# The UTF-8 bytes of "é" are outside ASCII, so this decode fails on
# Python 3; report the error instead of aborting the rest of the demo.
try:
    place3 = place_bytes.decode("ascii")
except UnicodeDecodeError as err:
    place3 = None
    print(err)
else:
    print(place3)
# Single-byte codecs accept any byte, so these succeed but yield mojibake.
place4 = place_bytes.decode("latin-1")
print(place4)
place5 = place_bytes.decode("windows-1252")
print(place5)
6
\u2603
\u2603
\u2603
\u2603
caf\u00e9
<type 'str'>
caf\u00e9
caf\u00e9
caf\u00e9
caf\u00e9
caf\u00e9

格式化

  • 旧式格式化
# -*- coding:utf-8 -*-
# Old-style (%-operator) string formatting.
# Single-argument print(...) is used so the script runs on both Python 2 and
# Python 3; the printed text is the same on either version.

# format int as str, decimal, hexadecimal, octal
print("format int.")
int_num = 123
print("%s" % int_num)
print("%d" % int_num)
print("%x" % int_num)
print("%o" % int_num)

# format float as str, fixed-point, exponent, general
print("\nformat float.")
float_num = 1.23
print("%s" % float_num)
print("%f" % float_num)
print("%e" % float_num)
print("%g" % float_num)

# interpolation: one value needs no tuple, several values do
print("\nformat interpolation.")
name = "Allen Moore"
job = "System Designer"
weight = "128"
print("My job is %s" % job)
print("My name is %s. His weight is %s" % (name, weight))

# format style: width, alignment and precision
print("\nformat style.")
int_num = 123
float_num = 1.23
str_str = "str"
# defaults
print("%d %f %s" % (int_num, float_num, str_str))
# minimum width 10, right-aligned
print("%10d %10f %10s" % (int_num, float_num, str_str))
# minimum width 10, left-aligned
print("%-10d %-10f %-10s" % (int_num, float_num, str_str))
# width 10 plus precision 4 (zero-pads ints, fixes float decimals,
# truncates strings)
print("%10.4d %10.4f %10.4s" % (int_num, float_num, str_str))
# precision 4 only
print("%.4d %.4f %.4s" % (int_num, float_num, str_str))
# "*" takes width and precision from the argument tuple
print("%*.*d %*.*f %*.*s" % (10, 4, int_num, 10, 4, float_num, 10, 4, str_str))
format int.
123
123
7b
173

format float.
1.23
1.230000
1.230000e+00
1.23

format interpolation.
My job is System Designer
My name is Allen Moore. His weight is 128

format style.
123 1.230000 str
       123   1.230000        str
123        1.230000   str       
      0123     1.2300        str
0123 1.2300 str
      0123     1.2300        str
  • 新式格式化
# -*- coding:utf-8 -*-
# New-style (str.format) string formatting.
# Single-argument print(...) is used so the script runs on both Python 2 and
# Python 3; the printed text is the same on either version.
int_num = 123
float_num = 1.23
str_str = "str"
# positional fields, in order and explicitly numbered
print("{} {} {}".format(int_num, float_num, str_str))
print("{2} {0} {1}".format(int_num, float_num, str_str))
# named fields
print("{int_num} {float_num} {str_str}".format(int_num=123, float_num=1.23, str_str="str"))
# fields indexed into a dict argument, mixed with a second positional argument
one = {'int_num':123, 'float_num':1.23, 'str_str':"str"}
print("{0[int_num]} {0[float_num]} {0[str_str]}{1}".format(one,' other'))
# explicit type codes
print("{0:d} {1:f} {2:s}".format(int_num, float_num, str_str))
# format style
print("\nHere is format style")
print("{int_num:d} {float_num:f} {str_str:s}".format(int_num=123, float_num=1.23, str_str="str"))
# width 10 with default alignment (numbers right, strings left)
print("{0:10d} {1:10f} {2:10s}".format(int_num, float_num, str_str))
# ">" right-align, "<" left-align, "^" center
print("{0:>10d} {1:>10f} {2:>10s}".format(int_num, float_num, str_str))
print("{0:<10d} {1:<10f} {2:<10s}".format(int_num, float_num, str_str))
print("{0:^10d} {1:^10f} {2:^10s}".format(int_num, float_num, str_str))
# precision: float decimals, string truncation
print("{0:>10d} {1:>10.4f} {2:10.2s}".format(int_num, float_num, str_str))
# custom fill character placed before the alignment flag
print("{0:!^25s}".format("Allen Moore"))
123 1.23 str
str 123 1.23
123 1.23 str
123 1.23 str other
123 1.230000 str

Here is format style
123 1.230000 str
       123   1.230000 str       
       123   1.230000        str
123        1.230000   str       
   123      1.230000     str    
       123     1.2300 st        
!!!!!!!Allen Moore!!!!!!!

使用正则表达式匹配

# -*- coding:utf-8 -*-
import re
# Regular-expression walkthrough.
# Single-argument print(...) is used so the script runs on both Python 2 and
# Python 3. Patterns containing backslashes are raw strings so the regex
# engine, not the string literal, interprets the escape.

# match
# match is anchored at the beginning of the string
source = "Allen Moore"
result = re.match("^All", source)
if result:
    print(result.group())
# "Moo" is not at the beginning, so match() returns None and nothing prints
result = re.match("Moo", source)
if result:
    print(result.group())
# a leading ".*" lets match() reach text at any position
result = re.match(".*Moo", source)
if result:
    print(result.group())

# search
# search returns the first occurrence anywhere in the string
print("\n")
result = re.search("Moo", source)
if result:
    print(result.group())

# findall
# findall returns every non-overlapping occurrence
print("\n")
result = re.findall("o", source)
print(result)

# split
# split cuts at every occurrence
print("\n")
result = re.split("e", source)
print(result)

# sub
# sub replaces every occurrence
print("\n")
result = re.sub("e", "?", source)
print(result)

# pattern: special character classes
print("\n")
import string
printable = string.printable
print(len(printable))
print(printable[0:50])
print(printable[50:])
print(re.findall(r'\d', printable))
print(re.findall(r'\w', printable))
print(re.findall(r'\s', printable))

# On Python 3 the \u escapes are real letters and \w matches them; on
# Python 2 they are literal backslash sequences in a byte string, so only
# their ASCII letters and digits match.
x = 'abc' + '-/*' + '\u00ea' + '\u0115'
print(re.findall(r'\w', x))

# pattern: literals, alternation, anchors, sets, lookaround
print("\n")
src = '''Hello Allen Moore, Welcome to Python World.'''
print(re.findall('Welcome', src))
print(re.findall('Welcome|Python', src))
print(re.findall('Hello', src))
print(re.findall('^Hello', src))
# unescaped "." matches any character; r"\." matches only a literal dot
print(re.findall('World.$', src))
print(re.findall(r'World\.$', src))
print(re.findall('[oW]or', src))
print(re.findall('[o]+', src))
print(re.findall('Allen (?=Moore)', src))
print(re.findall('(?<=Allen) Moore', src))
# Deliberately NOT raw: '\b' here is a backspace character, so nothing
# matches. The raw string on the next line is the word-boundary form.
print(re.findall('\bAllen', src))
print(re.findall(r'\bAllen', src))

# pattern: numbered and named groups
print("\n")
result = re.search(r'(. Allen\b).*(\bPython)', src)
print(result.group())
print(result.groups())
result = re.search(r'(?P<Name>. Allen\b).*(?P<Lang>\bPython)', src)
print(result.group())
print(result.groups())
print(result.group('Name'))
print(result.group('Lang'))
All
Allen Moo


Moo


['o', 'o']


['All', 'n Moor', '']


All?n Moor?


100
0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN
OPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~  

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_']
[' ', '\t', '\n', '\r', '\x0b', '\x0c']
['a', 'b', 'c', 'u', '0', '0', 'e', 'a', 'u', '0', '1', '1', '5']


['Welcome']
['Welcome', 'Python']
['Hello']
['Hello']
['World.']
['World.']
['oor', 'Wor']
['o', 'oo', 'o', 'o', 'o', 'o']
['Allen ']
[' Moore']
[]
['Allen']


o Allen Moore, Welcome to Python
('o Allen', 'Python')
o Allen Moore, Welcome to Python
('o Allen', 'Python')
o Allen
Python
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值