import pandas as pd
import os
import numpy as np
import re
import random
from tqdm import tqdm
def read_data():
    """Load the test CSV and return the raw article texts as a list.

    os.path.join builds the path portably — no hand-written separators.
    """
    frame = pd.read_csv(os.path.join("data", "test_data.csv"))
    # Convert the "content" column to a plain Python list: repeated
    # element access is much faster than going through pandas.
    return frame["content"].tolist()
def resplit_text(text_list):
#句子短了就拼接
result = []
#初始化sentence为空
sentence = ""
for text in text_list:
#加随机策略训练很短和很长的句子
if sentence == "" :
if random.random() < 0.2:
result.append(text + "。")
if len(sentence) < 30 or random.random() < 0.2:
#拼接时加,
sentence += text + ","
else:
result.append(sentence[:-1]+"。")
#把句子里原来的分隔符去掉加句号
sentence = ""
return result
def split_text(text):
    """Split one article on Chinese punctuation, then re-merge.

    The regex splits on any single delimiter character; resplit_text
    then glues the fragments back into sentences of more uniform length.
    """
    delimiters = r"[,、:;。?]"
    fragments = re.split(delimiters, text)
    return resplit_text(fragments)
def build_neg_pos_data(text_list):
    """Build next-sentence-prediction pairs from an ordered sentence list.

    For every sentence except the last, emit two pairs:
      * positive (label 1): the sentence and its true successor;
      * negative (label 0): the sentence and a uniformly random
        non-adjacent sentence (neither itself nor its successor).

    Requires len(text_list) >= 3 so a negative candidate always exists
    (the caller skips shorter inputs).

    Returns:
        (all_text1, all_text2, all_label) — three parallel lists.

    Improvement over the original: the negative index is drawn by
    rejection sampling instead of materialising an O(n) candidate list
    for every sentence, turning the pass from O(n^2) into expected O(n)
    while keeping the same uniform distribution.
    """
    all_text1, all_text2 = [], []
    all_label = []
    total = len(text_list)
    for tidx, text in enumerate(text_list):
        if tidx == total - 1:
            break  # the last sentence has no successor
        # Positive pair: adjacent sentences.
        all_text1.append(text)
        all_text2.append(text_list[tidx + 1])
        all_label.append(1)
        # Negative pair: any index other than tidx and tidx + 1.
        other_idx = tidx
        while other_idx in (tidx, tidx + 1):
            other_idx = random.randrange(total)
        all_text1.append(text)
        all_text2.append(text_list[other_idx])
        all_label.append(0)
    return all_text1, all_text2, all_label
def build_task2_dataset(text_list):
    """Build the task-2 sentence-pair dataset and write task2_hw.csv.

    Article lengths vary enormously, so each article is first split
    into sentences by split_text.  Articles yielding two or fewer
    sentences are skipped: they cannot provide both a positive and a
    negative pair.
    """
    text1_all = []
    text2_all = []
    label_all = []
    for article in tqdm(text_list):  # tqdm shows loop progress
        sentences = split_text(article)
        # Too few sentences — no negative sample can be drawn.
        if len(sentences) <= 2:
            continue
        pos1, pos2, labels = build_neg_pos_data(sentences)
        text1_all.extend(pos1)
        text2_all.extend(pos2)
        label_all.extend(labels)
    frame = pd.DataFrame(
        {"text1": text1_all, "text2": text2_all, "label": label_all}
    )
    frame.to_csv("task2_hw.csv", index=False)
def build_word_2_index(all_text):
    """Map every distinct character to a consecutive integer id.

    Ids are assigned in first-appearance order, starting at 0.
    """
    word_2_index = {}
    for text in all_text:
        for ch in text:
            # len(word_2_index) is the next unused id, so each new
            # character receives the next consecutive index.
            word_2_index.setdefault(ch, len(word_2_index))
    return word_2_index
#先写主函数
if __name__ == "__main__":
    all_text = read_data()
    # Corpus length stats (min 14 chars, max 50523) showed a huge
    # spread, which motivated sentence-level splitting in the builder.
    build_task2_dataset(all_text)
    word_2_index = build_word_2_index(all_text)
    print("")
# (removed: web-page footer residue — dates, view counts, and a feedback
#  widget — accidentally pasted after the code; it was not valid Python)