# -*- coding: utf-8 -*-
"""
Created on Thu Dec 5 17:50:21 2019
@author: zhanggl21
"""
'''
直聊聊天记录-客户留电探究
'''
# Load the chat-record workbook into a DataFrame.
import pandas as pd
chat_log_path = r'd:\Users\zhanggl21\Desktop\2019年11月直聊总表.xlsx'
zldf = pd.read_excel(chat_log_path)
# Compile the regular expressions used by the matching passes.
import re

# The WeChat keyword: the literal word "微信" or "vx". This must be a group;
# written as a character class ("[微信|vx]{2,}") it would accept any two of
# the characters 微, 信, |, v, x (for example "xx" or "||").
_wx_word = r'(?:微信|vx)'

# A run of 11 digits starting with 1, taken to be a mobile phone number.
shouji_re = re.compile(r'1[0-9]{10}', re.I)

# A phone number on the same line as the WeChat keyword, in either order.
shoujiweixin_re = re.compile(
    _wx_word + r'.*1[0-9]{10}' + '|' + r'1[0-9]{10}.*' + _wx_word,
    re.I,
)

# The WeChat keyword on the same line as either a run of 3+ letters or a run
# of 4-10 digits, in either order. re.I makes both the keyword and the
# [a-z] letter run case-insensitive.
weixinfeishouji_re = re.compile(
    '|'.join([
        _wx_word + r'.*[a-z]{3,}',
        r'[a-z]{3,}.*' + _wx_word,
        _wx_word + r'.*[0-9]{4,10}',
        r'[0-9]{4,10}.*' + _wx_word,
    ]),
    re.I,
)
# Count the chat records that contain a phone number (duplicates included)
# and keep, per row, the first number found or None when there is none.
# Every cell goes through str() so that non-string cells can be searched too.
sj_hits = (shouji_re.search(str(record)) for record in zldf.聊天记录)
sj_lst = [hit.group() if hit is not None else None for hit in sj_hits]
sj_shu = sum(number is not None for number in sj_lst)
print('匹配到的手机号总个数(不去重)为:', sj_shu)
# Count the records where the phone number is also the WeChat ID.
# Criterion: the chat text mentions the WeChat keyword alongside the number.
# Per row, keep the matched text, or None when the pattern does not match.
sjw_lst = []
for record in zldf.聊天记录:
    hit = shoujiweixin_re.search(str(record))
    sjw_lst.append(None if hit is None else hit.group())
sjw_shu = len(sjw_lst) - sjw_lst.count(None)
print('匹配到的手机号是微信的个数为:', sjw_shu)
# Count the records whose WeChat ID is not a phone number (letters, or a
# digit run that is not a phone number) and keep the matched text per row.
# Raw string: "\d" inside a plain literal is an invalid escape sequence.
_phone_like_re = re.compile(r'1\d{10}')

wxsznotsj_lst = []
for record in zldf.聊天记录:
    text = str(record)
    # Search once and reuse the result for every check below.
    hit = weixinfeishouji_re.search(text)
    kept = None
    if hit is not None:
        candidate = hit.group()
        # Exclude matches that are really links or app mentions, and any
        # record that also contains a phone-like number anywhere in its text.
        is_link = 'http' in candidate
        # Case-insensitive so that every casing of "app" is excluded.
        # NOTE(review): this is a substring test, so a match containing a
        # word such as "happy" is dropped as well.
        is_app = 'app' in candidate.lower()
        has_phone = _phone_like_re.search(text) is not None
        if not (is_link or is_app or has_phone):
            kept = candidate
    wxsznotsj_lst.append(kept)

wxsznotsj_shu = sum(item is not None for item in wxsznotsj_lst)
print('微信不是手机号(字母或数字但非手机)的个数', wxsznotsj_shu)
# Attach the three per-row result lists to the DataFrame as new columns.
result_columns = {
    '手机号类': sj_lst,
    '手机号是微信': sjw_lst,
    '微信非手机类': wxsznotsj_lst,
}
for column_name, column_values in result_columns.items():
    zldf[column_name] = column_values
# Export the enriched table without the index column.
report_path = r'd:\Users\zhanggl21\Desktop\客户留电探究20191205.xlsx'
zldf.to_excel(report_path, index=False)