</pre>做数据清洗的时候,难免会碰到有些客户的电话、邮箱是乱填的,一看就明显不正确。用Python写个正则表达式,就可以快速地进行清洗。</p><p>废话不多说,上代码:</p><p><pre name="code" class="python">#encoding:utf-8
'''
Created on 2015年9月18日
@author: ZHOUMEIXU204
'''
import MySQLdb
import re
import pandas as pd
# Open the MySQL connection used for the extraction below.
# NOTE(review): host and credentials are hard-coded in source -- consider
# moving them to configuration that is not checked in.
con = MySQLdb.connect(
    host="10.10.109.62",
    port=1333,
    user="zhoumeixu204",
    passwd="zhoumeixu204@123456!",
    db="ubs",
    use_unicode=True,
    charset="utf8",
)
# Load the customer id, mobile and e-mail columns into a DataFrame.
result = pd.read_sql(
    sql="SELECT cust_id , USR_MP,USR_EMAIL FROM ubs.usr_oper_info;",
    con=con,
)
# Accepted shape: "1", then one of 3/4/5/7/8, then exactly nine digits and
# nothing else. The original class "[3,5,4,7,8]" also admitted a literal
# comma, and the missing end anchor let over-long numbers through.
# Compiled once here instead of on every call.
_MOBILE_PATTERN = re.compile(r'1[34578][0-9]{9}\Z')


def mobile_match(x):
    """Return True if *x* is a well-formed 11-digit mobile number.

    Args:
        x: The raw cell value to check, normally a string.

    Returns:
        True when the whole of *x* is "1", a second digit in 3/4/5/7/8 and
        nine further digits; False otherwise, including when *x* is not a
        string (for example None).
    """
    try:
        return _MOBILE_PATTERN.match(x) is not None
    except TypeError:
        # re raises TypeError for non-string input such as None or NaN;
        # such a value is simply not a valid number.
        return False
# Apply mobile_match to every USR_MP value and store the outcome in a new
# 'mobile_judge' column.
result['mobile_judge']=result['USR_MP'].map(mobile_match)
# Addresses ending in pingan.com.cn must be 1-15 letters followed by exactly
# three digits before the "@". Anchored with \Z so nothing may trail the domain.
_PINGAN_EMAIL_PATTERN = re.compile(
    r'[a-zA-Z]{1,15}[0-9]{3}\@pingan\.com\.cn\Z')
# Generic shape for every other address (same expression as before, written
# as a raw string).
_GENERIC_EMAIL_PATTERN = re.compile(
    r"^.+\@(\[?)[a-zA-Z0-9\-\.]+\.([a-zA-Z]{2,3}|[0-9]{1,3})(\]?)$")


def email_match(x):
    """Return True if *x* looks like a valid e-mail address.

    Surrounding whitespace is removed once and the trimmed value is used for
    every check, so the suffix test, the length test and the regular
    expressions all see the same text.

    Args:
        x: The raw cell value to check, normally a string.

    Returns:
        True when the trimmed value either
          * ends with 'pingan.com.cn' and is exactly 1-15 letters plus three
            digits followed by '@pingan.com.cn', or
          * does not end with 'pingan.com.cn', is longer than 7 characters
            and matches the generic address pattern.
        False otherwise, including when *x* is not a string (for example
        None).
    """
    try:
        address = x.strip()
    except AttributeError:
        # Non-string cells (None, NaN, numbers) have no .strip() and cannot
        # be valid addresses.
        return False

    if address.endswith('pingan.com.cn'):
        return _PINGAN_EMAIL_PATTERN.match(address) is not None

    if len(address) <= 7:
        return False
    return _GENERIC_EMAIL_PATTERN.match(address) is not None
# Apply email_match to every USR_EMAIL value and store the outcome in a new
# 'usr_email_judge' column.
result['usr_email_judge']=result['USR_EMAIL'].map(email_match)
# Write the flagged table to an Excel file, leaving out the DataFrame index.
result.to_excel(u"D:\\匹配结果.xls",index=False)
# Print the first rows as a quick visual check.
print(result.head())
# Close the database connection opened at the top of the script.
con.close()
这些其实都是一些小技巧,在这里记录下来,免得以后要用的时候又要重新查找。