1010深度学习理解and数据的裁剪及填充

这个代码就是将数据取出来,后面我们对取出来的数据进行一个操作。

import random
import numpy as np

class MyDataset:
    """In-memory collection of texts and labels that is iterated in batches.

    Attributes:
        all_text: numpy array built from the given texts.
        all_label: numpy array built from the given labels.
        batch_size: number of samples each batch should contain.
        shuffle: whether a pass over the data visits samples in random order.
    """

    def __init__(self, all_text, all_label, batch_size, shuffle=True):
        # Arrays (rather than lists) so a loader can select a whole batch
        # with one integer index array.
        self.all_text = texts = np.array(all_text)
        self.all_label = labels = np.array(all_label)

        # Every sample must have exactly one label.
        assert len(texts) == len(labels), "数据和标签长度不等!"

        self.batch_size, self.shuffle = batch_size, shuffle

    def __len__(self):
        """Return the number of samples."""
        return len(self.all_text)

    def __iter__(self):
        """Start a new pass by building a fresh DataLoader over this dataset."""
        return DataLoader(self)


class DataLoader:
    """Single-pass batch iterator over a dataset.

    The dataset must provide ``all_text`` and ``all_label`` (indexable with an
    integer index array), ``batch_size``, ``shuffle`` and ``__len__``.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        # Position inside shuffle_index of the first sample of the next batch.
        self.cursor = 0
        # Order in which sample indices are visited; permuted when shuffling.
        self.shuffle_index = np.arange(len(self.dataset))

        if dataset.shuffle:
            np.random.shuffle(self.shuffle_index)

    def __iter__(self):
        """Return self, as the iterator protocol requires.

        Without this method the loader only works when handed out by the
        dataset's ``__iter__``; ``iter(loader)`` or a direct ``for`` loop over
        the loader would raise TypeError.
        """
        return self

    def __next__(self):
        """Return the next ``(text, label)`` batch.

        The last batch is shorter than ``batch_size`` when the dataset size is
        not a multiple of it.

        Raises:
            StopIteration: every sample has already been returned.
        """
        if self.cursor >= len(self.dataset):
            raise StopIteration

        stop = self.cursor + self.dataset.batch_size
        index = self.shuffle_index[self.cursor:stop]
        text = self.dataset.all_text[index]
        label = self.dataset.all_label[index]

        self.cursor = stop
        return text, label





def get_data():
    """Return the hard-coded toy corpus as two parallel lists.

    Returns:
        (all_text, all_label): five sentences and their 0/1 labels, in the
        same order.
    """
    samples = [
        ("今天天气正好", 1),
        ("晚上的麻辣烫很难吃", 0),
        ("这件衣服很难看", 0),
        ("早上空腹吃早饭不健康", 0),
        ("晚上早点睡觉很健康", 1),
    ]
    all_text = [sentence for sentence, _ in samples]
    all_label = [tag for _, tag in samples]
    return all_text, all_label

if __name__ == "__main__":
    # Demo: run several epochs over the toy corpus and print every batch.
    epoch, batch_size = 10, 2
    all_text, all_label = get_data()

    dataset = MyDataset(all_text, all_label, batch_size, shuffle=True)

    separator = "*" * 20
    for e in range(epoch):
        print(separator)
        # Each `for` over the dataset calls its __iter__ again, so every
        # epoch gets its own loader.
        for batch_text, batch_label in dataset:
            print(e, batch_text, batch_label)
# data:
# 1 * 8  :晚上麻辣烫很难吃 ————> 0 1 2 3 4 5 6 7 --->  0   (1 * 1) |0-100|
# 1 * 6  :这衣服很好看    ————> 8 9 10 5 11 12  ---->  1   (1 * 1)

# P  0 ~ 1

# 1*8 x 8*100 x 100*100 x 100 * 1 = 100 ---> sigmoid --> 0~1  (1*1)  p>0.5 -->1     0<p<0.5 -->0  # 推理 (预测)  p > 0 ---> 1   p<0 --> 0
# 1*6 x 6 * 1 = 1*1
# 激活函数,就是为了增加非线性因素,之前的乘乘加加都是线性运算,加了非线性运算能使模型拟合更多的点。
# model:
# 6 * 6

# 结果
# 0 : 消极
# 1 : 积极
# 一个数字就可以表示类别

# 分类任务,回归任务

将数据变成index,然后进行裁剪和填充

这里首先是对文件夹的一个读取数据的操作,然后去dataset里面一条一条的读取数据,在dataloader中读取数据我们还要做三件事,裁剪,word_to_index,填充,最后输出的是一个矩阵,然后我们将其扔到模型中去

import random
import numpy as np
import os

def read_data(file):
    """Read a corpus file with one "<text> <label>" sample per line.

    A line is kept only when splitting it on a single space gives exactly two
    fields; every other line (blank lines included) is skipped.

    Args:
        file: path of a UTF-8 encoded text file.

    Returns:
        (all_text, all_label): parallel lists; labels are converted to int.

    Raises:
        ValueError: the label field of a kept line is not an integer.
    """
    with open(file, "r", encoding="utf-8") as f:
        raw = f.read()

    rows = (line.split(" ") for line in raw.split("\n"))
    # The label arrives as a string, so convert it to int here.
    samples = [(parts[0], int(parts[1])) for parts in rows if len(parts) == 2]

    all_text = [text for text, _ in samples]
    all_label = [label for _, label in samples]
    return all_text, all_label

def build_word_2_index(train_text):
    """Build a token -> integer id vocabulary from the training texts.

    Id 0 is reserved for "<PAD>"; every other token gets the next free id in
    order of first appearance.

    Args:
        train_text: iterable of texts; each text is iterated element by
            element (character by character for a str).

    Returns:
        dict mapping each token to its id.
    """
    word_2_index = {"<PAD>": 0}
    for sentence in train_text:
        for token in sentence:
            # setdefault leaves known tokens alone; for a new token the
            # current dict size is the next unused id.
            word_2_index.setdefault(token, len(word_2_index))
    return word_2_index


class MyDataset:
    """Holds the texts and labels of a corpus and hands out batch iterators.

    Attributes:
        all_text: numpy array built from the given texts.
        all_label: numpy array built from the given labels.
        batch_size: number of samples each batch should contain.
        shuffle: whether a pass over the data visits samples in random order.
    """

    def __init__(self, all_text, all_label, batch_size, shuffle=True):
        self.all_text = np.array(all_text)
        self.all_label = np.array(all_label)

        # Texts and labels are parallel sequences, so their sizes must match.
        n_text, n_label = len(self.all_text), len(self.all_label)
        assert n_text == n_label, "数据和标签长度不等!"

        self.batch_size = batch_size
        self.shuffle = shuffle

    def __len__(self):
        """Return the number of samples."""
        return len(self.all_text)

    def __iter__(self):
        """Return a new DataLoader, i.e. a fresh pass over the data."""
        return DataLoader(self)


class DataLoader:
    """Single-pass batch iterator that clips, indexes and pads every text.

    The dataset must provide ``all_text``, ``all_label``, ``batch_size``,
    ``shuffle`` and ``__len__``.

    NOTE: ``__getitem__`` reads the module-level globals ``max_len`` and
    ``word_2_index``; both must be defined before the first batch is requested.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        # Position inside shuffle_index of the first sample of the next batch.
        self.cursor = 0
        # Order in which sample indices are visited; permuted when shuffling.
        self.shuffle_index = np.arange(len(self.dataset))

        if dataset.shuffle:
            np.random.shuffle(self.shuffle_index)

    def __iter__(self):
        """Return self, as the iterator protocol requires.

        Without this method ``iter(loader)`` or a direct ``for`` loop over the
        loader would raise TypeError.
        """
        return self

    def __getitem__(self, index):
        """Return ``(text, text_idx, label)`` for one sample.

        The order of the steps matters: clip first, then map tokens to ids,
        then pad.

        Args:
            index: position of the sample in the dataset.

        Returns:
            text: the sample text clipped to at most ``max_len`` tokens.
            text_idx: list of exactly ``max_len`` ids, right-padded with 0.
            label: the sample's label.

        Raises:
            KeyError: a token of the text is missing from ``word_2_index``.
        """
        global max_len, word_2_index  # read-only; assigned by the script section

        text = self.dataset.all_text[index][:max_len]
        label = self.dataset.all_label[index]

        text_idx = [word_2_index[token] for token in text]
        # 0 is the id that build_word_2_index assigns to "<PAD>".
        text_idx += [0] * (max_len - len(text_idx))

        return text, text_idx, label

    def __next__(self):
        """Return the next ``(batch_text, batch_text_idx, batch_label)``.

        ``batch_text`` is a list; the other two are converted to numpy arrays
        so they can be used directly in matrix operations.

        Raises:
            StopIteration: every sample has already been returned.
        """
        if self.cursor >= len(self.dataset):
            raise StopIteration

        stop = self.cursor + self.dataset.batch_size
        samples = [self[i] for i in self.shuffle_index[self.cursor:stop]]

        batch_text = [text for text, _, _ in samples]
        batch_text_idx = [text_idx for _, text_idx, _ in samples]
        batch_label = [label for _, _, label in samples]

        # Advance only after the whole batch was built successfully.
        self.cursor = stop
        return batch_text, np.array(batch_text_idx), np.array(batch_label)
#在这里要将数据转化为numpy形式

class MyModel:
    """Minimal linear model: one weight matrix of shape (input_dim, 1), no bias.

    Args:
        input_dim: number of features per sample, i.e. the width of the
            batches passed to ``forward``. The default 20 equals the padded
            sequence length (``max_len``) used by the script section; pass a
            different value whenever that length changes.
    """

    def __init__(self, input_dim=20):
        # Weights are drawn uniformly from [-1, 1).
        self.model = np.random.uniform(low=-1.0, high=1.0, size=(input_dim, 1))

    def forward(self, batch_idx):
        """Return ``batch_idx @ weights``.

        Args:
            batch_idx: array of shape (batch, input_dim).

        Returns:
            Array of shape (batch, 1), one raw score per sample.
        """
        return batch_idx @ self.model


# Script entry: load the corpus, build the vocabulary, then push every batch
# through the model and print the raw predictions.
if __name__ == "__main__":

    train_text,train_label = read_data(os.path.join("data","train.txt"))
    assert len(train_text) == len(train_label) , "文本数量和标签数量不等"
    print(f"加载数据成功,长度为:{len(train_text)}")

    # word_2_index and max_len must stay module-level names: DataLoader.__getitem__
    # reads both of them as globals.
    word_2_index = build_word_2_index(train_text)
    epoch = 3
    batch_size = 2
    # Padded/clipped sequence length; MyModel's weight matrix has 20 rows, so
    # the two values have to agree for the matrix product in forward().
    max_len = 20

    train_dataset = MyDataset(train_text,train_label,batch_size,shuffle=True)
    model = MyModel()
    for e in range(epoch):
        print(e,"*" * 20)

        # Each `for` over the dataset calls its __iter__ again, which builds a
        # new DataLoader for this epoch.
        for batch_text,batch_text_idx,batch_label in train_dataset:
            pre = model.forward(batch_text_idx)
            print(pre)
            print("*"*4)
            # print(batch_text,batch_text_idx.shape,batch_label)

yield的利用,

def fun1(list1):
    """Generator demo: yield the elements of ``list1`` one at a time.

    Nothing runs when the function is called; the body starts on the first
    ``next()``. "hello" is printed once at that point, and "welcome" is
    printed before each element is yielded.
    """
    print("hello")
    elements = iter(list1)
    for element in elements:
        print("welcome")
        yield element

if __name__ == "__main__":
    list1 = [1, 2, 3, 4]
    # Calling fun1 only creates the generator object; its body has not started.
    r = fun1(list1)
    # Four explicit next() calls, one per element. Each call resumes the
    # generator until it reaches the next yield.
    for _ in range(4):
        print(next(r))
# Python在编译函数时看到函数体中有yield,于是调用fun1不会立刻执行函数体,而是返回一个生成器r
通过yield,r就变成了生成器,我们可以对它调用next了

yield用的不多,可以看一哈

import random
import numpy as np
import os

def read_data(file):
    """Read a corpus file with one "<text> <label>" sample per line.

    Only lines containing exactly one space (i.e. exactly two space-separated
    fields) are kept; all other lines, blank ones included, are skipped.

    Args:
        file: path of a UTF-8 encoded text file.

    Returns:
        (all_text, all_label): parallel lists; labels are converted to int.

    Raises:
        ValueError: the label field of a kept line is not an integer.
    """
    all_text, all_label = [], []
    with open(file, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if line.count(" ") != 1:
                continue
            text, _, label = line.partition(" ")
            all_text.append(text)
            all_label.append(int(label))
    return all_text, all_label

def build_word_2_index(train_text):
    """Build a token -> integer id vocabulary from the training texts.

    "<PAD>" always has id 0; every other token receives the next free id the
    first time it is seen.

    Args:
        train_text: iterable of texts; each text is iterated element by
            element (character by character for a str).

    Returns:
        dict mapping each token to its id.
    """
    vocab = {"<PAD>": 0}
    for sentence in train_text:
        for token in sentence:
            if token in vocab:
                continue
            # Ids are dense, so the current size is the next unused id.
            vocab[token] = len(vocab)
    return vocab



def get_dataset(all_text, all_label, batch_size):
    """Yield consecutive ``(batch_text, batch_label)`` slices of the data.

    This is a generator: it can be consumed only once, so create a new one for
    every epoch. Samples are yielded in their original order (no shuffling).

    Args:
        all_text: sliceable sequence of texts.
        all_label: sliceable sequence of labels, parallel to ``all_text``.
        batch_size: number of samples per batch; the last batch is shorter
            when ``len(all_text)`` is not a multiple of it.

    Yields:
        (batch_text, batch_label): slices of the two input sequences.
    """
    # Integer ceiling division, so a trailing partial batch is still emitted.
    batch_num = -(-len(all_text) // batch_size)
    for i in range(batch_num):
        start = i * batch_size
        stop = start + batch_size
        yield all_text[start:stop], all_label[start:stop]


if __name__ == "__main__":
    # Script entry: load the corpus and print every batch for a few epochs.
    train_text, train_label = read_data(os.path.join("data", "train.txt"))
    assert len(train_text) == len(train_label), "文本数量和标签数量不等"
    print(f"加载数据成功,长度为:{len(train_text)}")

    word_2_index = build_word_2_index(train_text)
    epoch, batch_size, max_len = 3, 2, 20

    banner = "*" * 100
    for e in range(epoch):
        print(banner)
        # A generator can be consumed only once, so build a new one per epoch.
        dataset = get_dataset(train_text, train_label, batch_size)
        for batch_text, batch_label in dataset:
            print(batch_text, batch_label)

介绍一下pandas的一些基本用法

import pandas
import pandas as pd

# Load the result table from CSV into a DataFrame.
df = pandas.read_csv("data\\m_result.csv")
print(df)
# pd.isna(df)
# Replace every missing cell in df with an empty string.
df = df.fillna("")
# Two equivalent ways to turn a column into a Python list:
ids = list(df["id"])  # index by column name and wrap in list()
ids = df.id.values.tolist()  # or use attribute access (df.<column>) plus .values.tolist()
# NOTE(review): "quesiton" looks like a misspelling of "question"; it is kept
# because it has to match the CSV header exactly. Confirm against the file.
questions = df["quesiton"].values.tolist()
preds = list(df["pred"])
rations = list(df["ration"])
# Join the four fields of every row with tabs. The f-string also formats
# non-string cells, which plain "+" concatenation would reject.
result = [
    f"{row_id}\t{question}\t{pred}\t{ration}"
    for row_id, question, pred, ration in zip(ids, questions, preds, rations)
]
with open("data\\m_result.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(result))

# index , text , answer, ration

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值