import jieba
# Text-feature preprocessing: register domain terms with jieba, load the
# stop-word list, then segment the free-text column into space-joined tokens.
print('开始处理表1中的文本特征...')

# Domain-specific terms added to jieba's dictionary before segmentation.
mywords = [
    '户号', '分时', '抄表', '抄表示数', '工单', '单号',
    '工单号', '空气开关', '脉冲灯', '计量表', '来电', '报修',
]
for word in mywords:
    jieba.add_word(word)

# Stop words: one entry per line of the file, surrounding whitespace stripped.
with open(r'..\电费敏感预测\stopwords.txt', encoding='utf-8') as f:
    stops = {entry.strip() for entry in f}


def fenci(line):
    """Segment *line* with jieba and drop stop words.

    Args:
        line: Text to segment.

    Returns:
        The tokens produced by ``jieba.cut`` that are not in the module-level
        ``stops`` set, joined by single spaces.
    """
    return ' '.join(token for token in jieba.cut(line) if token not in stops)


print('分词ing...')
# NOTE(review): `jobinfo` is not defined in the visible part of the file;
# presumably a pandas DataFrame with an ACCEPT_CONTENT text column - confirm.
jobinfo['contents'] = jobinfo.ACCEPT_CONTENT.apply(fenci)
# 4.2 处理手机号,户号等后面连接的号码 (handle the numbers that follow phone numbers, account numbers, etc.)
import re
defhash_number(x):
shouji_pattern = re.compile('\s1\d{10}\s|\s1\d{10}\Z')if shouji_pattern.findall(x):
x = re.sub(shouji_pattern,' 手机number ', x)
huhao_pattern = re.compile('\s\d{10}\s|\s\d{10}\Z')if huhao_pattern.findall(x):
x = re.sub(huhao_pattern,' 户号number ', x)
tuiding_pattern = re.