import pandas as pd
import os
import numpy as np
import re
import random
from tqdm import tqdm
def read_data():
    """Load the test CSV and return the raw article texts as a list.

    os.path.join builds the path portably — no hand-written separators.
    """
    frame = pd.read_csv(os.path.join("data", "test_data.csv"))
    # Convert the "content" column to a plain Python list: repeated
    # element access is much faster than going through pandas.
    return frame["content"].tolist()
def resplit_text(text_list):
#句子短了就拼接
result = []
#初始化sentence为空
sentence = ""
for text in text_list:
#加随机策略训练很短和很长的句子
if sentence == "" :
if random.random() < 0.2:
result.append(text + "。")
if len(sentence) < 30 or random.random() < 0.2:
#拼接时加,
sentence += text + ","
else:
result.append(sentence[:-1]+"。")
#把句子里原来的分隔符去掉加句号
sentence = ""
return result
def split_text(text):
    """Split one article on Chinese punctuation, then re-merge.

    The regex splits on any single delimiter character; resplit_text
    then glues the fragments back into sentences of more uniform length.
    """
    delimiters = r"[,、:;。?]"
    fragments = re.split(delimiters, text)
    return resplit_text(fragments)
def build_neg_pos_data(text_list):
    """Build next-sentence-prediction pairs from an ordered sentence list.

    For every sentence except the last, emit two pairs:
      * positive (label 1): the sentence and its true successor;
      * negative (label 0): the sentence and a uniformly random
        non-adjacent sentence (neither itself nor its successor).

    Requires len(text_list) >= 3 so a negative candidate always exists
    (the caller skips shorter inputs).

    Returns:
        (all_text1, all_text2, all_label) — three parallel lists.

    Improvement over the original: the negative index is drawn by
    rejection sampling instead of materialising an O(n) candidate list
    for every sentence, turning the pass from O(n^2) into expected O(n)
    while keeping the same uniform distribution.
    """
    all_text1, all_text2 = [], []
    all_label = []
    total = len(text_list)
    for tidx, text in enumerate(text_list):
        if tidx == total - 1:
            break  # the last sentence has no successor
        # Positive pair: adjacent sentences.
        all_text1.append(text)
        all_text2.append(text_list[tidx + 1])
        all_label.append(1)
        # Negative pair: any index other than tidx and tidx + 1.
        other_idx = tidx
        while other_idx in (tidx, tidx + 1):
            other_idx = random.randrange(total)
        all_text1.append(text)
        all_text2.append(text_list[other_idx])
        all_label.append(0)
    return all_text1, all_text2, all_label
def build_task2_dataset(text_list):
    """Build the task-2 sentence-pair dataset and write task2_hw.csv.

    Article lengths vary enormously, so each article is first split
    into sentences by split_text.  Articles yielding two or fewer
    sentences are skipped: they cannot provide both a positive and a
    negative pair.
    """
    text1_all = []
    text2_all = []
    label_all = []
    for article in tqdm(text_list):  # tqdm shows loop progress
        sentences = split_text(article)
        # Too few sentences — no negative sample can be drawn.
        if len(sentences) <= 2:
            continue
        pos1, pos2, labels = build_neg_pos_data(sentences)
        text1_all.extend(pos1)
        text2_all.extend(pos2)
        label_all.extend(labels)
    frame = pd.DataFrame(
        {"text1": text1_all, "text2": text2_all, "label": label_all}
    )
    frame.to_csv("task2_hw.csv", index=False)
def build_word_2_index(all_text):
    """Map every distinct character to a consecutive integer id.

    Ids are assigned in first-appearance order, starting at 0.
    """
    word_2_index = {}
    for text in all_text:
        for ch in text:
            # len(word_2_index) is the next unused id, so each new
            # character receives the next consecutive index.
            word_2_index.setdefault(ch, len(word_2_index))
    return word_2_index
#先写主函数
if __name__ == "__main__":
    all_text = read_data()
    # Corpus length stats (min 14 chars, max 50523) showed a huge
    # spread, which motivated sentence-level splitting in the builder.
    build_task2_dataset(all_text)
    word_2_index = build_word_2_index(all_text)
    print("")
# (removed: web-page footer residue — dates, view counts, and a feedback
#  widget — accidentally pasted after the code; it was not valid Python)