python——使用re正则表达式筛选数据,去重,列表,取特定行数据(适用于web的html回包数据提取)
环境配置:对目标服务器的日志文件进行筛选特定数据(192.168.4.27)
/usr/local/tomcat_corp/logs/catalina.out
python脚本必须在该服务器上运行
1、筛选银行卡字段bankCode=
python代码:
[root@cdn tmp]# ls
findbankid_back_before.py findbankid.py findemail.py findidno.py findmobile.py findreadlname.py
[root@cdn tmp]#
[root@cdn tmp]# cat findbankid_back_before.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the distinct log snippets surrounding every ``bankCode=`` field.

The Tomcat log is decoded as UTF-8 and scanned with a fixed-width regular
expression; duplicate snippets are collapsed before printing.
"""
import re

LOG_PATH = "/usr/local/tomcat_corp/logs/catalina.out"

# Capture the 75 characters before ``bankCode=`` and the 100 characters after
# it.  ``.`` does not match a newline, so a snippet never spans two log lines.
PATTERN = re.compile(r'.{75}bankCode=.{100}')


def find_unique_snippets(text, pattern=PATTERN):
    """Return the set of distinct substrings of *text* matched by *pattern*.

    Args:
        text: Decoded (unicode) log content to scan.
        pattern: Compiled regular expression; defaults to the ``bankCode=``
            context pattern.

    Returns:
        A set of matched snippets; empty when nothing matches.
    """
    return set(pattern.findall(text))


def main(log_path=LOG_PATH):
    """Read *log_path*, then print each distinct matching snippet once.

    Raises:
        IOError/OSError: if the log file cannot be opened.
        UnicodeDecodeError: if the log is not valid UTF-8.
    """
    # Binary read + explicit decode behaves the same on Python 2 and 3, and
    # the ``with`` block guarantees the file handle is closed.
    with open(log_path, "rb") as log_file:
        text = log_file.read().decode("utf8")
    # Sorted so repeated runs over the same log give identical, diffable output.
    for snippet in sorted(find_unique_snippets(text)):
        print(snippet)


if __name__ == "__main__":
    main()
[root@cdn tmp]#
脚本运行情况:
[root@cdn tmp]# python findbankid_back_before.py
..............................
.............................
bjectDTO [t=[com.dinpay.dpp.domain.system.config.BankGateway@*****[id=3,bankCode=CCB,bankAccount=62148502********,rate=0.0,name=建设银行,status=1,remark=<null>,defaultFlag=0,maxLimitAmo
uency=0], com.dinpay.dpp.domain.system.config.BankGateway@*****[id=1002,bankCode=SPABANK,bankAccount=01120004********,rate=0.0,name=深圳平安银企直连代付,status=1,remark=<null>,defaultFlag=0,
tDTO [t=[com.dinpay.dpp.domain.system.config.PayChannel@*****[id=<null>,bankCode=GDB,chargeType=<null>,rate=<null>,dinpayRate=<null>,name=广东发展银行,status=<null>,remark=<null>,remark2=
2、筛选email邮箱地址
python代码:
[root@cdn tmp]# cat findemail.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the distinct log snippets surrounding every ``bindEmail`` field.

The Tomcat log is decoded as UTF-8 and scanned with a fixed-width regular
expression; duplicate snippets are collapsed before printing.
"""
import re

LOG_PATH = "/usr/local/tomcat_corp/logs/catalina.out"

# Capture the 100 characters before ``bindEmail`` and the 90 characters after
# it.  ``.`` does not match a newline, so a snippet never spans two log lines.
PATTERN = re.compile(r'.{100}bindEmail.{90}')


def find_unique_snippets(text, pattern=PATTERN):
    """Return the set of distinct substrings of *text* matched by *pattern*.

    Args:
        text: Decoded (unicode) log content to scan.
        pattern: Compiled regular expression; defaults to the ``bindEmail``
            context pattern.

    Returns:
        A set of matched snippets; empty when nothing matches.
    """
    return set(pattern.findall(text))


def main(log_path=LOG_PATH):
    """Read *log_path*, then print each distinct matching snippet once.

    Raises:
        IOError/OSError: if the log file cannot be opened.
        UnicodeDecodeError: if the log is not valid UTF-8.
    """
    # Binary read + explicit decode behaves the same on Python 2 and 3, and
    # the ``with`` block guarantees the file handle is closed.
    with open(log_path, "rb") as log_file:
        text = log_file.read().decode("utf8")
    # Sorted so repeated runs over the same log give identical, diffable output.
    for snippet in sorted(find_unique_snippets(text)):
        print(snippet)


if __name__ == "__main__":
    main()
[root@cdn tmp]#
代码运行情况:
[root@cdn tmp]# python findemail.py
anageController toFindPayPwdByCard memberObjectResponse:MemberDetailResponse [memberId=137****1580, bindEmail=null, bindMobile=137*****1580, companyName=李*, certificationType=1, createDate=Tue Dec 19
ankCardController toBankCardManage memberObjectResponse:MemberDetailResponse [memberId=186****3214, bindEmail=null, bindMobile=186*****3214, companyName=聂*平, certificationType=1, createDate=Thu May 0
eController toAccountManage memberObjectResponse:MemberDetailResponse [memberId=*****@163.com, bindEmail=ssh*****.com, bindMobile=137*****4764, companyName=沈*, certificationType=1, createDate=Tu
3、筛选身份证号码
python代码:
[root@cdn tmp]# cat findidno.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the distinct log snippets surrounding every ``certNum`` field.

The Tomcat log is decoded as UTF-8 and scanned with a fixed-width regular
expression; duplicate snippets are collapsed before printing.
"""
import re

LOG_PATH = "/usr/local/tomcat_corp/logs/catalina.out"

# Capture the 100 characters before ``certNum`` and the 20 characters after
# it.  ``.`` does not match a newline, so a snippet never spans two log lines.
PATTERN = re.compile(r'.{100}certNum.{20}')


def find_unique_snippets(text, pattern=PATTERN):
    """Return the set of distinct substrings of *text* matched by *pattern*.

    Args:
        text: Decoded (unicode) log content to scan.
        pattern: Compiled regular expression; defaults to the ``certNum``
            context pattern.

    Returns:
        A set of matched snippets; empty when nothing matches.
    """
    return set(pattern.findall(text))


def main(log_path=LOG_PATH):
    """Read *log_path*, then print each distinct matching snippet once.

    Raises:
        IOError/OSError: if the log file cannot be opened.
        UnicodeDecodeError: if the log is not valid UTF-8.
    """
    # Binary read + explicit decode behaves the same on Python 2 and 3, and
    # the ``with`` block guarantees the file handle is closed.
    with open(log_path, "rb") as log_file:
        text = log_file.read().decode("utf8")
    # Sorted so repeated runs over the same log give identical, diffable output.
    for snippet in sorted(find_unique_snippets(text)):
        print(snippet)


if __name__ == "__main__":
    main()
[root@cdn tmp]#
代码运行情况:
[root@cdn tmp]# python findidno.py
l, address=null, supportBalance=1, bankCode=CCB, auditStatus=null, authStatus=null, isEnterprise=1, certNum=4*****************3
l, address=null, supportBalance=1, bankCode=ABC, auditStatus=null, authStatus=null, isEnterprise=1, certNum=4****************2]
l, address=null, supportBalance=1, bankCode=ABC, auditStatus=null, authStatus=null, isEnterprise=1, certNum=4*****************3
, address=null, supportBalance=1, bankCode=ICBC, auditStatus=null, authStatus=null, isEnterprise=0, certNum=4******************
l, address=null, supportBalance=1, bankCode=CMB, auditStatus=null, authStatus=null, isEnterprise=1, certNum=4****************X]
, address=null, supportBalance=1, bankCode=ICBC, auditStatus=null, authStatus=null, isEnterprise=1, certNum=4****************X]
4、筛选手机号码
python代码:
[root@cdn tmp]# cat findmobile.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the distinct log snippets surrounding every ``bindMobile`` field.

The Tomcat log is decoded as UTF-8 and scanned with a fixed-width regular
expression; duplicate snippets are collapsed before printing.
"""
import re

LOG_PATH = "/usr/local/tomcat_corp/logs/catalina.out"

# Capture the 100 characters before ``bindMobile`` and the 65 characters after
# it.  ``.`` does not match a newline, so a snippet never spans two log lines.
PATTERN = re.compile(r'.{100}bindMobile.{65}')


def find_unique_snippets(text, pattern=PATTERN):
    """Return the set of distinct substrings of *text* matched by *pattern*.

    Args:
        text: Decoded (unicode) log content to scan.
        pattern: Compiled regular expression; defaults to the ``bindMobile``
            context pattern.

    Returns:
        A set of matched snippets; empty when nothing matches.
    """
    return set(pattern.findall(text))


def main(log_path=LOG_PATH):
    """Read *log_path*, then print each distinct matching snippet once.

    Raises:
        IOError/OSError: if the log file cannot be opened.
        UnicodeDecodeError: if the log is not valid UTF-8.
    """
    # Binary read + explicit decode behaves the same on Python 2 and 3, and
    # the ``with`` block guarantees the file handle is closed.
    with open(log_path, "rb") as log_file:
        text = log_file.read().decode("utf8")
    # Sorted so repeated runs over the same log give identical, diffable output.
    for snippet in sorted(find_unique_snippets(text)):
        print(snippet)


if __name__ == "__main__":
    main()
[root@cdn tmp]#
代码运行情况:
[root@cdn tmp]# python findmobile.py
oller setMemberExtInfo:MemberDetailResponse [memberId=*****@163.com, bindEmail=464*****.com, bindMobile=null, companyName=聂*平, certificationType=1, createDate=Thu Jun 2
er toAccountManage memberObjectResponse:MemberDetailResponse [memberId=131****8888, bindEmail=null, bindMobile=131*****8888, companyName=陈*荣2, certificationType=1, createDate=
-MemberLoginController setMemberExtInfo:MemberDetailResponse [memberId=131****8888, bindEmail=null, bindMobile=861*****1066, companyName=陈*荣, certificationType=1, createDate=S
-MemberLoginController setMemberExtInfo:MemberDetailResponse [memberId=131****8888, bindEmail=null, bindMobile=153*****6761, companyName=陈*荣, certificationType=0, createDate=S
5、筛选姓名
python代码:
[root@cdn tmp]# cat findreadlname.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the distinct log snippets surrounding every ``realName=`` field.

The Tomcat log is decoded as UTF-8 and scanned with a fixed-width regular
expression; duplicate snippets are collapsed before printing.
"""
import re

LOG_PATH = "/usr/local/tomcat_corp/logs/catalina.out"

# Capture the 100 characters before ``realName=`` and the 90 characters after
# it.  ``.`` does not match a newline, so a snippet never spans two log lines.
PATTERN = re.compile(r'.{100}realName=.{90}')


def find_unique_snippets(text, pattern=PATTERN):
    """Return the set of distinct substrings of *text* matched by *pattern*.

    Args:
        text: Decoded (unicode) log content to scan.
        pattern: Compiled regular expression; defaults to the ``realName=``
            context pattern.

    Returns:
        A set of matched snippets; empty when nothing matches.
    """
    return set(pattern.findall(text))


def main(log_path=LOG_PATH):
    """Read *log_path*, then print each distinct matching snippet once.

    Raises:
        IOError/OSError: if the log file cannot be opened.
        UnicodeDecodeError: if the log is not valid UTF-8.
    """
    # Binary read + explicit decode behaves the same on Python 2 and 3, and
    # the ``with`` block guarantees the file handle is closed.
    with open(log_path, "rb") as log_file:
        text = log_file.read().decode("utf8")
    # Sorted so repeated runs over the same log give identical, diffable output.
    for snippet in sorted(find_unique_snippets(text)):
        print(snippet)


if __name__ == "__main__":
    main()
[root@cdn tmp]#
代码运行情况:
[root@cdn tmp]# python findreadlname.py
,rgeRecordVO [rechargeDateStr=2017-11-20 16:35:18, dealDateStr=2017-11-20 16:35:18, transferType=充值, realName=陈*荣, memberId=q******q@sina.com, getSerialno()=21686, getAccountId()=35700*****, getRechar
rgeRecordVO [rechargeDateStr=2018-01-17 11:53:41, dealDateStr=2018-01-17 11:53:41, transferType=充值, realName=聂*平, memberId=j**********6@163.com, getSerialno()=22012, getAccountId()=25800*****, getRec
rgeRecordVO [rechargeDateStr=2018-04-23 15:39:57, dealDateStr=2018-04-23 15:39:57, transferType=充值, realName=徐*波, memberId=b***********n@163.com, getSerialno()=22191, getAccountId()=10000000*****,
rgeRecordVO [rechargeDateStr=2017-04-26 16:54:14, dealDateStr=2017-04-26 16:54:14, transferType=充值, realName=田*君, memberId=b******3@163.com, getSerialno()=19996, getAccountId()=10100*****, getRecharg
rgeRecordVO [rechargeDateStr=2017-11-17 09:39:10, dealDateStr=2017-11-17 09:39:10, transferType=充值, realName=深*店, memberId=5*******5@qq.com, getSerialno()=21616, getAccountId()=10000000*****, getRec
ordVO [rechargeDateStr=2017-09-19 17:15:32, dealDateStr=2017-09-19 17:15:32, transferType=Recharge, realName=聂*平, memberId=j**********2@163.com, getSerialno()=21239, getAccountId()=100000000*****, g
ordVO [rechargeDateStr=2017-11-20 16:17:49, dealDateStr=2017-11-20 16:17:49, transferType=Recharge, realName=深*店, memberId=q******q@sina.com, getSerialno()=21683, getAccountId()=35700*****, getRechar