import jieba
import pandas as pd
import re
# Load the raw GBK-encoded CSV; header=None because the file has no header row.
csv_path = r'/Users/atsushi/Desktop/python/data_analysis/data/data.csv'
data = pd.read_csv(csv_path, header=None, encoding='gbk')
# Give the two columns their names.
data.columns = ['foreign_id', 'text']
jieba分词
1.自定义添加少数词和字典
jieba.add_word("长江黄河")
2.自定义添加很多词和字典
jieba.load_userdict('user_dict.txt')函数
user_dict.txt格式如下:一个词占一行,每行依次为"词语 词频 词性",用空格隔开,其中词频和词性可以省略
jieba.cut方法接收三个输入参数:需要分词的字符串;cut_all参数用来控制是否采用全模式;HMM参数用来控制是否使用HMM模型。
# Keywords to look for in each sentence. They are registered with jieba's
# dictionary via add_word before any segmentation is performed.
key_words = ['柜员机', '分期', '有效期']
for word in key_words:
    jieba.add_word(word)
将csv文件的数据按照一定规则放入dataframe中
# Expand every CSV row into one DataFrame row per "(...)" group found in its
# text, carrying the row's foreign_id along with each extracted sentence.
#
# .*? is a non-greedy match, so each "(...)" group is matched separately.
# re.findall(r'\(.*?\)', data['text'][0]) returns a list such as:
# ["('您好,请问有什么可以帮你'- '坐席'- ''- ''- '0.18'- '1.32')","('你好,请问是**公司吗'- '客户'- ''- ''- '2.00'- '5.32')","('是的,请说明你的问题,很高兴为你服务'- '坐席'- ''- ''- '6.76'- '10.32')"]
sentence_pattern = re.compile(r'\(.*?\)')
records = []
for foreign_id, text in zip(data['foreign_id'], data['text']):
    if not isinstance(text, str):
        # Empty cells are read by pandas as NaN (a float); there is nothing
        # to extract from them and re.findall would raise TypeError.
        continue
    records.extend(
        {'foreign_id': foreign_id, 'sentence': sentence}
        for sentence in sentence_pattern.findall(text)
    )
# Build the frame once from all records. The remaining columns start out
# empty (NaN) and are filled in from the sentence column afterwards.
df = pd.DataFrame(
    records,
    columns=['foreign_id', 'sentence', 'role', 'key_word', 'begin_time', 'end_time'],
)
提取role、key_word、end_time、begin_time列值
def get_role(sentence: str) -> str:
    """Return the role field of a raw sentence string.

    ``sentence`` is one "(...)" match such as
    ``"('text'- 'role'- ''- ''- '0.18'- '1.32')"``. Single quotes and
    parentheses are removed, the remainder is split on ``-`` and the second
    field is returned with surrounding whitespace stripped.

    Raises:
        IndexError: if the string has fewer than two ``-``-separated fields.
    """
    fields = re.sub(r"['\(\)]", '', sentence).split('-')
    return fields[1].strip()
def get_keyword(sentence: str) -> str:
    """Return the keywords found in a raw sentence string, comma-joined.

    Single quotes and parentheses are removed from ``sentence``, the first
    ``-``-separated field is segmented with ``jieba.lcut``, and every token
    that appears in the module-level ``key_words`` list is kept, in order of
    appearance (repeated tokens are kept as repeats). The kept tokens are
    joined with ``,``; the result is an empty string when none match.
    """
    text = re.sub(r"['\(\)]", '', sentence).split('-')[0].strip()
    return ','.join(token for token in jieba.lcut(text) if token in key_words)
def get_begin_time(sentence: str) -> str:
    """Return the begin-time field of a raw sentence string.

    Single quotes and parentheses are removed from ``sentence``, the
    remainder is split on ``-`` and the fifth field (index 4) is returned,
    stripped of surrounding whitespace. The value is returned as a string.

    Raises:
        IndexError: if the string has fewer than five ``-``-separated fields.
    """
    fields = re.sub(r"['\(\)]", '', sentence).split('-')
    return fields[4].strip()
def get_end_time(sentence: str) -> str:
    """Return the end-time field of a raw sentence string.

    Single quotes and parentheses are removed from ``sentence``, the
    remainder is split on ``-`` and the sixth field (index 5) is returned,
    stripped of surrounding whitespace. The value is returned as a string.

    Raises:
        IndexError: if the string has fewer than six ``-``-separated fields.
    """
    fields = re.sub(r"['\(\)]", '', sentence).split('-')
    # The end time is the last of the six fields; index 4 is the begin time.
    return fields[5].strip()
# Fill each derived column by applying its extractor to the raw sentence.
extractors = {
    'role': get_role,
    'key_word': get_keyword,
    'begin_time': get_begin_time,
    'end_time': get_end_time,
}
for column, extractor in extractors.items():
    df[column] = df['sentence'].apply(extractor)

# Drop the rows in which no keyword was found, then renumber the index from 0.
df = df[df['key_word'] != ''].reset_index(drop=True)

# Rows holding several comma-separated keywords become one row per keyword:
# split into columns, stack them into a single Series keyed by the original
# row index, and join that Series back onto the other columns.
keyword_parts = df['key_word'].str.split(',', expand=True)
keyword_rows = keyword_parts.stack().reset_index(level=1, drop=True).rename('key_word')
result = df.drop('key_word', axis=1).join(keyword_rows)