这个代码就是将数据取出来,后面我们对取出来的数据进行一个操作。
import random
import numpy as np
class MyDataset:
    """Pairs texts with labels; iterating over it yields batches via DataLoader."""

    def __init__(self, all_text, all_label, batch_size, shuffle=True):
        # numpy arrays so the loader can fancy-index with an index array
        self.all_text = np.array(all_text)
        self.all_label = np.array(all_label)
        assert len(self.all_text) == len(self.all_label), "数据和标签长度不等!"
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self):
        # a fresh DataLoader per epoch resets the cursor and reshuffles
        return DataLoader(self)

    def __len__(self):
        return len(self.all_text)
class DataLoader:
    """One-epoch iterator over a MyDataset.

    Each __next__ returns a (texts, labels) pair of numpy arrays of up to
    dataset.batch_size samples; raises StopIteration when exhausted.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.cursor = 0
        # np.arange replaces the hand-rolled list comprehension;
        # this is the epoch-local permutation of sample positions.
        self.shuffle_index = np.arange(len(dataset))
        # plain truthiness instead of the `== True` anti-pattern (PEP 8 / E712)
        if dataset.shuffle:
            np.random.shuffle(self.shuffle_index)

    def __next__(self):
        if self.cursor >= len(self.dataset):
            raise StopIteration
        batch_idx = self.shuffle_index[self.cursor:self.cursor + self.dataset.batch_size]
        text = self.dataset.all_text[batch_idx]
        label = self.dataset.all_label[batch_idx]
        self.cursor += self.dataset.batch_size
        return text, label
def get_data():
    """Return a tiny hard-coded (texts, labels) corpus for the demo (1=positive, 0=negative)."""
    texts = ["今天天气正好", "晚上的麻辣烫很难吃", "这件衣服很难看", "早上空腹吃早饭不健康", "晚上早点睡觉很健康"]
    labels = [1, 0, 0, 0, 1]
    return texts, labels
if __name__ == "__main__":
    # Demo: iterate the dataset for several epochs, printing each batch.
    all_text, all_label = get_data()
    epoch = 10
    batch_size = 2
    dataset = MyDataset(all_text, all_label, batch_size, shuffle=True)
    for epoch_idx in range(epoch):
        print("*" * 20)
        for batch_text, batch_label in dataset:
            print(epoch_idx, batch_text, batch_label)
# data:
# 1 * 8 :晚上麻辣烫很难吃 ————> 0 1 2 3 4 5 6 7 ---> 0 (1 * 1) |0-100|
# 1 * 6 :这衣服很好看 ————> 8 9 10 5 11 12 ----> 1 (1 * 1)
# P 0 ~ 1
# 1*8 x 8*100 x 100*100 x 100 * 1 = 100 ---> sigmoid --> 0~1 (1*1) p>0.5 -->1 0<p<0.5 -->1 # 推理 (预测) p > 0 ---> 1 p<0 --> 0
# 1*6 x 6 * 1 = 1*1
# 激活函数,就是为了增加非线性因素,之前的层层计算都是线性的,加了非线性运算才能拟合更多的数据点。
# model:
# 6 * 6
# 结果
# 0 : 消极
# 1 : 积极
# 一个数字就可以表示类别
# 分类任务,回归任务
将数据变成 index,然后进行填充和裁剪。
这里首先是对文件夹的一个读取数据的操作,然后去dataset里面一条一条的读取数据,在dataloader中读取数据我们还要做三件事,裁剪,word_to_index,填充,最后输出的是一个矩阵,然后我们将其扔到模型中去
import random
import numpy as np
import os
def read_data(file):
    """Read a UTF-8 file of "text<space>label" lines; return (texts, labels) with int labels.

    Lines that do not split into exactly two fields (e.g. blank lines) are skipped.
    """
    all_text, all_label = [], []
    with open(file, "r", encoding="utf-8") as f:
        lines = f.read().split("\n")
    for line in lines:
        parts = line.split(" ")
        if len(parts) == 2:
            text, label = parts
            all_text.append(text)
            all_label.append(int(label))  # label is stored as an int
    return all_text, all_label
def build_word_2_index(train_text):
    """Map each distinct character to a unique index in order of first appearance; 0 is <PAD>."""
    word_2_index = {"<PAD>": 0}
    for sentence in train_text:
        for ch in sentence:
            # setdefault computes the next id only when ch is unseen
            word_2_index.setdefault(ch, len(word_2_index))
    return word_2_index
class MyDataset:
    """Holds parallel text/label arrays and hands out batches through DataLoader."""

    def __init__(self, all_text, all_label, batch_size, shuffle=True):
        # stored as numpy arrays so positions can be selected with an index array
        self.all_text = np.array(all_text)
        self.all_label = np.array(all_label)
        assert len(self.all_text) == len(self.all_label), "数据和标签长度不等!"
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self):
        # every epoch starts a new loader: cursor back to 0, order reshuffled
        return DataLoader(self)

    def __len__(self):
        return len(self.all_text)
class DataLoader:
    """Iterator over MyDataset that crops, maps characters to indices, and pads each sample.

    __next__ returns (batch_text, batch_text_idx, batch_label) where the
    last two are numpy arrays ready for matrix operations.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.cursor = 0
        # epoch-local permutation of sample positions (np.arange over a manual list)
        self.shuffle_index = np.arange(len(dataset))
        # plain truthiness instead of `== True` (PEP 8 / E712)
        if dataset.shuffle:
            np.random.shuffle(self.shuffle_index)

    def __getitem__(self, index):
        # NOTE(review): relies on module-level max_len / word_2_index being set
        # before iteration starts — consider passing them in explicitly.
        global max_len, word_2_index
        # order matters: crop first, then char -> index, then pad up to max_len
        text = self.dataset.all_text[index][:max_len]
        label = self.dataset.all_label[index]
        text_idx = [word_2_index[ch] for ch in text]
        # pad with the dedicated <PAD> id instead of a hard-coded 0
        text_idx += [word_2_index["<PAD>"]] * (max_len - len(text_idx))
        return text, text_idx, label

    def __next__(self):
        if self.cursor >= len(self.dataset):
            raise StopIteration
        batch_idx = self.shuffle_index[self.cursor:self.cursor + self.dataset.batch_size]
        batch_text, batch_text_idx, batch_label = [], [], []
        for i in batch_idx:
            text, text_idx, label = self[i]
            batch_text.append(text)
            batch_text_idx.append(text_idx)
            batch_label.append(label)
        self.cursor += self.dataset.batch_size
        # index/label batches go out as numpy arrays so the model can do `@`
        return batch_text, np.array(batch_text_idx), np.array(batch_label)
#在这里要将数据转化为numpy形式
class MyModel:
    """Single linear layer: (batch, 20) @ (20, 1) -> (batch, 1) scores."""

    def __init__(self):
        # weights drawn uniformly from [-1, 1); input width fixed at 20 (= max_len)
        self.model = np.random.uniform(low=-1.0, high=1.0, size=(20, 1))

    def forward(self, batch_idx):
        """Project a (batch, 20) index matrix to one score per sample."""
        return batch_idx @ self.model
if __name__ == "__main__":
    # Demo: load data, build the vocabulary, then run batches through the model.
    train_text, train_label = read_data(os.path.join("data", "train.txt"))
    assert len(train_text) == len(train_label), "文本数量和标签数量不等"
    print(f"加载数据成功,长度为:{len(train_text)}")
    word_2_index = build_word_2_index(train_text)

    epoch = 3
    batch_size = 2
    max_len = 20

    train_dataset = MyDataset(train_text, train_label, batch_size, shuffle=True)
    model = MyModel()

    for e in range(epoch):
        print(e, "*" * 20)
        for batch_text, batch_text_idx, batch_label in train_dataset:
            pre = model.forward(batch_text_idx)
            print(pre)
            print("*" * 4)
yield的利用,
def fun1(list1):
    """Generator demo: nothing runs until next() is called; one element is yielded per step."""
    print("hello")
    for item in list1:
        print("welcome")
        yield item
if __name__ == "__main__":
    # Calling fun1 returns a generator; each next() resumes it up to the next yield.
    list1 = [1, 2, 3, 4]
    r = fun1(list1)
    for _ in range(4):
        print(next(r))
# 因为函数体中出现了 yield,调用 fun1 得到的返回值 r 就变成了生成器
通过 yield,r 就变成了生成器,我们可以对它调用 next 了
yield 用得不多,可以了解一下
import random
import numpy as np
import os
def read_data(file):
    """Load a UTF-8 file of "text<space>label" lines into parallel (texts, labels) lists."""
    texts, labels = [], []
    with open(file, "r", encoding="utf-8") as f:
        for raw in f.read().split("\n"):
            fields = raw.split(" ")
            if len(fields) != 2:  # skip blank or malformed lines
                continue
            texts.append(fields[0])
            labels.append(int(fields[1]))  # labels are stored as ints
    return texts, labels
def build_word_2_index(train_text):
    """Assign consecutive indices to characters by first appearance; index 0 is reserved for <PAD>."""
    word_2_index = {"<PAD>": 0}
    for sentence in train_text:
        for character in sentence:
            if character not in word_2_index:
                # next free id is exactly the current table size
                word_2_index[character] = len(word_2_index)
    return word_2_index
def get_dataset(all_text, all_label, batch_size):
    """Yield (batch_text, batch_label) slices of size batch_size; the last batch may be smaller.

    A stepped range replaces the original ceil(len/batch_size) count and the
    unused `cursor` variable — same batches, no numpy needed.
    """
    for start in range(0, len(all_text), batch_size):
        yield all_text[start:start + batch_size], all_label[start:start + batch_size]
if __name__ == "__main__":
    # Demo: batch the training data with the generator-based get_dataset.
    train_text, train_label = read_data(os.path.join("data", "train.txt"))
    assert len(train_text) == len(train_label), "文本数量和标签数量不等"
    print(f"加载数据成功,长度为:{len(train_text)}")
    word_2_index = build_word_2_index(train_text)

    epoch = 3
    batch_size = 2
    max_len = 20

    for e in range(epoch):
        print("*" * 100)
        # a generator is exhausted after one pass, so rebuild it each epoch
        dataset = get_dataset(train_text, train_label, batch_size)
        for batch_text, batch_label in dataset:
            print(batch_text, batch_label)
介绍一下pandas的一些基本用法
import pandas
import pandas as pd

# Read the model-result CSV and export it as a tab-separated text file.
df = pandas.read_csv("data\\m_result.csv")
print(df)
# Missing cells become empty strings so the "\t"-joins below never see NaN.
df = df.fillna("")
# Two equivalent ways to pull a column out as a plain Python list:
ids = list(df["id"])
ids = df.id.values.tolist()  # attribute access works too when the name is a valid identifier
# NOTE(review): column is literally spelled "quesiton" here — confirm against the CSV header.
questions = df["quesiton"].values.tolist()
preds = list(df["pred"])      # fixed: was `list(df["pred")` (unbalanced brackets)
rations = list(df["ration"])  # fixed: was `df["rati]on"]` (garbled key)

result = []
for id, question, pred, ration in zip(ids, questions, preds, rations):
    # one tab-joined record per line: id, question, prediction, rationale
    data = str(id) + "\t" + question + "\t" + pred + "\t" + ration
    result.append(data)

with open("data\\m_result.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(result))
# output columns: index, text, answer, ration