- 1.使用正则re查找文本中特定中文字符串
# coding: utf-8
import re

# Text to search. Declared as a unicode literal so that it has the same type
# as the unicode pattern below on Python 2 as well as on Python 3.
temp = u"s2f程序员杂志一2d3程序员杂志二2d3程序员杂志三2d3程序员杂志四2d3"
# The Chinese string to look for. NOTE: "+" applies only to the character
# right before it, so this matches "杂" followed by one or more "志".
findword = u"(杂志+)"
pattern = re.compile(findword)
# findall returns the text captured by the single group for every match.
results = pattern.findall(temp)
for result in results:
    print(result)
输出如下:
- 2.在字符串line中查找“OUSER.LAST_LAST_POST=xxxxxx”,即取出该字段名及其等号后面的10位数字。
import re

# Sample record: "|"-separated KEY=VALUE fields. The adjacent literals are
# joined by Python into one string with nothing inserted between them.
line = (
    "OTOPIC.TGNUM=1.000|OUSER.FANS=7034.000|OUSER.LAST_LAST_POST=1536738413.000|OUSER.LEVEL=1.000|"
    "OUSER.SCF=1.000|OUSER.URANK=33.000|OUSER.USTATE=524288.000|OUSER.USUD=1552277017.000|"
    "OUSER.UTYPE=16448.000(adc_user)|OUSER.VALID_FANS=1297.000|OUSER.VTYPE=2.000|OUSER.ZCF=3.000|"
    "PIC.EWADS=327.000(327.000000)|PIC.EWADS_WEIGHT=327.000|PIC.FIGS=1.000(001000082216010170003018078190193003)|"
    "PIC.MINUP=2.000|PIC.MWADS_RADIO=1.000|PIC.NUM=1.000(4011677ely1g0yym6fib3j20u0140wia)|PIC.PWS=2.203|"
    "PIC.WADS=327.000(整体厨房|定制|定制|卧室)|PIC.WADS_RADIO=1.000|PIC.WAMAX=327.000|PIC.WAMIN=327.000|"
    "PROBA.CHEAT=0.000|PROBA.LOWQ=0.000|QI.NEW=304.000|SOURCE.SCORE=78.000"
)
# Raw string so that "\d" reaches the regex engine unchanged.
# "\." matches a literal dot (a bare "." would match any character),
# "=" matches exactly one equals sign, and "\d{10}" the ten digits after it.
findword = r"(OUSER\.LAST_LAST_POST=\d{10})"
pattern = re.compile(findword)
# One list entry per occurrence of the field in the line.
result = pattern.findall(line)
print(result)
运行结果如下:
如果有多个OUSER.LAST_LAST_POST,则会全部输出
- 3.在目标文件logyzm.txt中查找“您的验证码是……”,取出该字段及其后面同一行的所有内容
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re

# Read the log as raw bytes and decode it once. Opening in binary mode makes
# read() return bytes on both Python 2 and Python 3, so decode() is always
# available, and the with-block closes the file even if reading fails.
with open("/root/python/dinpay/logyzm.txt", "rb") as log_file:
    logyzm = log_file.read()
temp = logyzm.decode("utf8")
# ".+" matches one or more characters other than a newline, i.e. everything
# that follows the marker text up to the end of that line.
# To capture the whole line containing the marker instead, use
# u"(.+您的验证码是.+)".
findword = u"(您的验证码是.+)"
pattern = re.compile(findword)
results = pattern.findall(temp)
for result in results:
    print(result)
- 4.去除重复的数据,显示为一个列表
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re

# Read the log as raw bytes and decode it once. Opening in binary mode makes
# read() return bytes on both Python 2 and Python 3, so decode() is always
# available, and the with-block closes the file even if reading fails.
with open("/usr/local/tomcat_corp/logs/catalina.out", "rb") as log_file:
    logyzm = log_file.read()
temp = logyzm.decode("utf8")
# "bankCode=" plus the 100 characters that follow it. "." does not match a
# newline, so all 100 characters must be on the same line as the marker.
findword = r'bankCode=.{100}'
pattern = re.compile(findword)
results = pattern.findall(temp)
# Drop duplicates while keeping first-seen order, so the output follows the
# order of the log instead of the arbitrary iteration order of a set.
seen = set()
lastlist = []
for result in results:
    if result not in seen:
        seen.add(result)
        lastlist.append(result)
for entry in lastlist:
    print(entry)
- 5.取出某个特定字符串及其前面、后面若干个字符的数据
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re

# Read the log as raw bytes and decode it once. Opening in binary mode makes
# read() return bytes on both Python 2 and Python 3, so decode() is always
# available, and the with-block closes the file even if reading fails.
with open("/usr/local/tomcat_corp/logs/catalina.out", "rb") as log_file:
    logyzm = log_file.read()
temp = logyzm.decode("utf8")
# The 75 characters before "bankCode=", the marker itself, and the 100
# characters after it. "." does not match a newline, so the whole excerpt
# must lie on a single line.
findword = r'.{75}bankCode=.{100}'
pattern = re.compile(findword)
results = pattern.findall(temp)
# Drop duplicates while keeping first-seen order, so the output follows the
# order of the log instead of the arbitrary iteration order of a set.
seen = set()
lastlist = []
for result in results:
    if result not in seen:
        seen.add(result)
        lastlist.append(result)
for entry in lastlist:
    print(entry)