2023-02-21干活记录

妈耶今天考研出分,妈耶今天还有组会。

先浅读一篇论文吧

On Analyzing the Role of Image for Visual-enhanced Relation Extraction:
把图文对打乱,发现性能没降低,离谱!
就这么多吧
做了下午组会的ppt
学一下然后准备开会吧:
BERT task2(下一句预测,NSP)的数据构建代码:
import pandas as pd
import numpy as np
import re
import math
import random
from tqdm import tqdm


def read_data(file_path):
    """Load a CSV file and return its ``content`` column as a Python list.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the
            CSV file to load. The file must contain a ``content`` column.

    Returns:
        A list holding the values of the ``content`` column, in file order.
    """
    frame = pd.read_csv(file_path)
    return frame["content"].tolist()

def split_text(text):
    """Split a document into sentences.

    The text is first cut at every punctuation mark listed in the regex
    character class, and the resulting fragments are then regrouped by
    ``resplit_text``.

    Args:
        text: The raw document string.

    Returns:
        The list of sentences produced by ``resplit_text``.
    """
    fragments = re.split(r"[,。?!;:、]", text)
    return resplit_text(fragments)

def resplit_text(text_list):
    """Regroup punctuation-split fragments into "。"-terminated sentences.

    Fragments are joined with "," into a running sentence:

    * Empty fragments (produced by consecutive or trailing punctuation) are
      ignored so they cannot introduce doubled commas.
    * While no sentence is in progress, fragments shorter than 3 characters
      are dropped, and with probability 0.1 a fragment is emitted on its own
      as a one-fragment sentence.
    * A fragment is appended to the running sentence while that sentence is
      shorter than 30 characters, or with probability 0.1 once it is longer.
      Otherwise the running sentence is emitted and the fragment starts the
      next one.
    * Whatever is still accumulated when the input ends is emitted as the
      final sentence, so no trailing text is lost.

    Args:
        text_list: Iterable of string fragments.

    Returns:
        A list of sentences, each ending with "。".
    """
    result = []
    # Invariant: `sentence` is either "" or ends with the "," separator, so
    # `sentence[:-1]` always strips exactly that separator.
    sentence = ""

    for text in text_list:
        if not text:
            continue
        if sentence == "":
            if len(text) < 3:
                continue
            if random.random() < 0.1:
                result.append(text + "。")
                continue
        if len(sentence) < 30 or random.random() < 0.1:
            sentence += text + ","
        else:
            result.append(sentence[:-1] + "。")
            # Keep the trailing separator so the invariant above holds.
            sentence = text + ","

    # Flush the sentence that was still being built when the input ended.
    if sentence:
        result.append(sentence[:-1] + "。")
    return result

def build_neg_pos_text(text_list):
    """Build positive and negative sentence pairs from consecutive sentences.

    For every sentence except the last, a positive pair
    ``(sentence, next sentence)`` with label 1 is produced, followed by a
    negative pair ``(sentence, random other sentence)`` with label 0. The
    random sentence is drawn uniformly from all positions other than the
    sentence itself and its direct successor.

    When the list has fewer than three sentences there is no valid negative
    candidate; in that case only the positive pair (if any) is produced
    instead of raising ``IndexError``.

    Args:
        text_list: Sequence of sentences in document order.

    Returns:
        A tuple ``(all_text1, all_text2, all_label)`` of three parallel lists.
    """
    all_text1, all_text2 = [], []
    all_label = []
    # Number of positions that are neither the anchor nor its successor.
    candidate_count = len(text_list) - 2

    for text_id, (text, next_text) in enumerate(zip(text_list, text_list[1:])):
        all_text1.append(text)
        all_text2.append(next_text)
        all_label.append(1)

        if candidate_count <= 0:
            continue

        # Draw from the candidate positions without materialising them:
        # values at or past the anchor are shifted over the two excluded slots.
        other_id = random.randrange(candidate_count)
        if other_id >= text_id:
            other_id += 2
        all_text1.append(text)
        all_text2.append(text_list[other_id])
        all_label.append(0)
    return all_text1, all_text2, all_label

def build_task2_dataset(text_list, save_path="..//self_bert//data//my_task2.csv"):
    """Build the sentence-pair dataset and write it to a CSV file.

    Each document is split into sentences with ``split_text``; documents that
    yield two or fewer sentences are skipped. The remaining ones are turned
    into labelled sentence pairs by ``build_neg_pos_text`` and all pairs are
    written to ``save_path`` with the columns ``text1``, ``text2`` and
    ``label``.

    Args:
        text_list: Iterable of raw document strings.
        save_path: Destination CSV path. Defaults to the location that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        None. The result is written to disk.
    """
    all_text1 = []
    all_text2 = []
    all_label = []

    for text in tqdm(text_list):
        sp_text = split_text(text)
        if len(sp_text) <= 2:
            continue

        t1, t2, labels = build_neg_pos_text(sp_text)
        all_text1.extend(t1)
        all_text2.extend(t2)
        all_label.extend(labels)

    dataset = pd.DataFrame({"text1": all_text1, "text2": all_text2, "label": all_label})
    dataset.to_csv(save_path, index=False)

def build_word_2_index(all_text):
    """Build a character-level vocabulary from a collection of texts.

    The vocabulary starts with the special tokens ``[PAD]``, ``[CLS]``,
    ``[SEP]``, ``[MASK]`` and ``[UNK]`` (ids 0-4). Every distinct character
    is then assigned the next free id in order of first appearance.

    Args:
        all_text: Iterable of strings.

    Returns:
        A tuple ``(word_2_index, index_2_word)`` where ``word_2_index`` maps
        token to id and ``index_2_word`` is the list of tokens ordered by id.
    """
    # The padding token was previously misspelled as "{[PAD]".
    word_2_index = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[MASK]": 3, "[UNK]": 4}
    for text in all_text:
        for w in text:
            if w not in word_2_index:
                word_2_index[w] = len(word_2_index)
    # Dicts preserve insertion order, so list position equals the token id.
    index_2_word = list(word_2_index)

    return word_2_index, index_2_word




# Script entry point: load the raw texts and build the character vocabulary.
if __name__ == "__main__":
    # Read the "content" column of the test CSV.
    all_texts = read_data("..//self_bert//data//test_data.csv")
    # Disabled step: regenerate the sentence-pair CSV.
    #build_task2_dataset(all_texts)
    # Build the token -> id mapping and the id-ordered token list.
    word_2_index, index_2_word = build_word_2_index(all_texts)
    # Disabled step: dump the vocabulary, one token per line.
    #with open("..//self_bert//data//index_2_word.txt", "w", encoding="utf-8") as f:
        #f.write("\n".join(index_2_word))

鱼书:

 

回家学一下深度学习花书:

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值