Recently I've been locked down in my dorm and my own model training hasn't been going well, so here's a course project I helped a friend with~
Reference:
MRR vs MAP vs NDCG: Rank-Aware Evaluation Metrics And When To Use Them
Word Embedding: Word2Vec With Gensim, NLTK, and t-SNE Visualization
I finished this a few weeks ago; to avoid any issues I meant to post it only after the deadline, but then I saved the draft and forgot about it.
Now that my own model has finally come out of the oven, it's time to prepare for the postgraduate entrance exam.
task1
Run BM25 on the dataset and evaluate it with MAP and NDCG.
Key point: implement mAP and NDCG with numpy, and get as high a score as possible.
BM25
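The class below implements the standard BM25 scoring function, with an extra k2 saturation term on the query-side term frequency; roughly,
score(D, Q) = \sum_{q \in Q} \mathrm{IDF}(q) \cdot \frac{f(q,D)\,(k_1+1)}{f(q,D) + k_1\,(1 - b + b\,|D|/\mathrm{avgdl})} \cdot \frac{qf(q)\,(k_2+1)}{qf(q) + k_2}, \qquad \mathrm{IDF}(q) = \log\frac{N - df(q) + 0.5}{df(q) + 0.5}
where f(q,D) is the frequency of term q in passage D, qf(q) its frequency in the query, N the number of passages, and df(q) the number of passages containing q.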
from collections import Counter
import numpy as np
class BM25_Model(object):
def __init__(self, documents_list, k1=1.2, k2=100, b=0.75):
self.documents_list = documents_list
self.documents_number = len(documents_list)
self.avg_documents_len = sum([len(document) for document in documents_list]) / self.documents_number
self.f = []
self.idf = {}
self.k1 = k1
self.k2 = k2
self.b = b
self.init()
def init(self):
df = {}
for document in self.documents_list:
temp = {}
for word in document:
temp[word] = temp.get(word, 0) + 1
self.f.append(temp) # each doc's words counts
for key in temp.keys():
df[key] = df.get(key, 0) + 1
for key, value in df.items():
self.idf[key] = np.log((self.documents_number - value + 0.5) / (value + 0.5))
def get_score(self, index, query):
score = 0.0
        document_len = len(self.documents_list[index])  # document length in tokens, not the number of unique terms
qf = Counter(query)
for q in query:
if q not in self.f[index]:
continue
W = self.idf[q]
K = self.k1 * (1 - self.b + self.b * document_len / self.avg_documents_len)
R = (self.f[index][q] * (self.k1 + 1) / (self.f[index][q] + K)) * (qf[q] * (self.k2 + 1) / (qf[q] + self.k2))
score += W * R
return score
def get_documents_score(self, query, top_k=None):
score_list = []
for i in range(self.documents_number):
score_list.append(self.get_score(i, query))
if top_k is None:
return score_list
else:
score_list.sort(reverse=True)
return score_list[:top_k]
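A tiny usage sketch on a hypothetical toy corpus, just to show the interface (the real pipeline is in the main script below):
toy_docs = [['cat', 'sat', 'mat'], ['dog', 'barked'], ['cat', 'chased', 'dog']]
bm25 = BM25_Model(toy_docs)
print(bm25.get_documents_score(['cat', 'dog']))  # one score per document, higher means more relevant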
main
import pandas as pd
from metrics import NDCG, mean_average_precision
from model import BM25_Model
from utils import tokenize_stem_removalstop
def main():
validation_data = pd.read_csv('../2022IRDMdata_part2/part2/validation_data.tsv', delimiter=' ')
qid_list = validation_data['qid'].unique().tolist()
for qid in qid_list:
ww = validation_data[validation_data['qid'] == qid]
ww = ww.reset_index(drop=True)
ww['passage'] = [tokenize_stem_removalstop(text) for text in ww['passage']]
query = tokenize_stem_removalstop(ww['queries'][0])
bm25_model = BM25_Model(ww['passage'])
ww['scores'] = bm25_model.get_documents_score(query)
ww = ww.sort_values(by='scores', ascending=False)
ndcg_score = NDCG(ww['relevancy'].values, ww['scores'].values, 100)
map_score = mean_average_precision(ww['scores'].values, ww['relevancy'].values, top_k=100)
print("qid : {}, NDCG score : {}, mAP score : {}".format(qid, ndcg_score, map_score))
if __name__ == '__main__':
main()
NDCG and MAP
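For reference, these are the definitions the two functions below implement, with rel_i the true relevance of the passage the model ranks at position i:
DCG@n = \sum_{i=1}^{n} \frac{2^{rel_i} - 1}{\log_2(i + 1)}, \qquad NDCG@n = \frac{DCG@n}{IDCG@n}
AP = \frac{1}{R} \sum_{i:\, rel_i > 0} \mathrm{Precision}@i, \qquad MAP = \text{mean of AP over all queries}
where IDCG@n is the DCG of the ideal ordering and R is the number of relevant passages found in the top k.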
import numpy as np
def NDCG(golden, current, n=-1):
    """NDCG@n. golden: true relevance labels, current: predicted scores for the same passages."""
    golden = np.asfarray(golden)
    current = np.asfarray(current)
    if n <= 0 or n > len(golden):
        n = len(golden)
    log2_table = np.log2(np.arange(2, n + 2))

    def dcg_at_n(rel, n):
        rel = np.asfarray(rel)[:n]
        return np.sum((np.power(2, rel) - 1) / log2_table[:rel.shape[0]])

    # DCG over the true labels in predicted-score order, IDCG over the ideal ordering
    dcg = dcg_at_n(golden[np.argsort(current)[::-1]], n)
    idcg = dcg_at_n(np.sort(golden)[::-1], n)
    return dcg / idcg if idcg > 0 else 0.0
def mean_average_precision(scores, relevancy, top_k=-1):
    """Average precision for one query. scores: predicted scores, relevancy: true labels."""
    scores = np.asarray(scores, dtype=float)
    relevancy = np.asarray(relevancy, dtype=float)
    if top_k <= 0 or top_k > len(scores):
        top_k = len(scores)
    # walk down the ranking induced by the predicted scores
    ranked_rel = relevancy[np.argsort(scores)[::-1]][:top_k]
    hits = 0
    precisions = []
    for idx, rel in enumerate(ranked_rel):
        if rel > 0:
            hits += 1
            precisions.append(hits / (idx + 1))
    if len(precisions) == 0:
        return 0.0
    return sum(precisions) / len(precisions)
task2
Train a logistic regression on word embeddings and analyse how the learning rate affects the training loss.
Key points: implement logistic regression with numpy, load the word embeddings, and use the query's average word vector as the feature.
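One step worth spelling out before the code: with a sigmoid output a = \sigma(Xw) and binary cross-entropy loss, the chain rule collapses to
\frac{\partial L}{\partial w} = X^{\top}(a - y)
which is what the gradients method below computes, just written out factor by factor (loss gradient times sigmoid derivative times input).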
import numpy as np
class logistic_regression:
def __init__(self, hidden_size, epochs, batch_size, learning_rate=0.01):
self.hidden_size = hidden_size
self.w = None
self.learning_rate = learning_rate
self.num_classes = 1
self.epochs = epochs
self.batch_size = batch_size
self.init()
    def init(self):
        # one weight per input feature (here the feature dimension equals hidden_size)
        self.w = np.random.randn(self.hidden_size, self.num_classes)
def forward(self, X, w, y_label):
z = np.dot(X, w)
a = sigmoid(z)
loss_list = cross_entropy(a, y_label)
loss = sum(loss_list)
return z, a, loss_list, loss
def gradients(self, z, a, loss_list, X, y_label):
grad1 = np.ones(len(loss_list))
grad2 = cross_entropy_grad(a, y_label)
grad_sigmoid = sigmoid(z) * (1 - sigmoid(z))
grad_w = X
return np.dot(grad1, np.multiply(np.multiply(grad2, grad_sigmoid), grad_w))
def train(self, X, y):
for epoch in range(self.epochs):
z, a, loss_list, loss = self.forward(X, self.w, y)
g_w = self.gradients(z, a, loss_list, X, y)
self.w -= self.learning_rate * g_w.reshape([-1, 1])
# if epoch % 10 == 0:
# print("y : {}, y_label : {}".format(a, y_label))
# print("loss:", loss / self.batch_size)
    def predict(self, X, y):
        z, a, loss_list, loss = self.forward(X, self.w, y_label=y)
        return a, loss
def get_features(query, passage):
    # features = dot products between the mean query vectors and the mean passage vectors
    return np.matmul(query, passage.T)

def cross_entropy(y_pred, y_true):
    # binary cross-entropy; clip predictions to avoid log(0) / division by zero
    y_pred = np.clip(y_pred, 1e-12, 1 - 1e-12)
    return np.multiply(-y_true, np.log(y_pred)) - np.multiply((1 - y_true), np.log(1 - y_pred))

def cross_entropy_grad(y_pred, y_true):
    y_pred = np.clip(y_pred, 1e-12, 1 - 1e-12)
    return (-y_true) / y_pred + (1 - y_true) / (1 - y_pred)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))
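A minimal smoke test of the class above on random data (hypothetical feature values, not the assignment data):
np.random.seed(0)
X_toy = np.random.randn(5, 5)  # 5 samples, 5-dimensional features
y_toy = np.random.randint(0, 2, size=(5, 1)).astype(float)
lr_toy = logistic_regression(hidden_size=5, epochs=50, batch_size=5, learning_rate=0.01)
lr_toy.train(X_toy, y_toy)
probs, loss = lr_toy.predict(X_toy, y_toy)
print(probs.ravel(), float(np.sum(loss)))  # probabilities in (0, 1) and the final batch loss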
main
import pandas as pd
from model import *
pd.options.mode.chained_assignment = None
import numpy as np
import nltk
import re
from nltk import word_tokenize
from tqdm import tqdm
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
from metrics import NDCG
from metrics import mean_average_precision
filename = '../test/glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
def tokenize_stem_removalstop(text):
text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
tokenized_text = word_tokenize(text.lower())
tokenized_text = [w for w in tokenized_text if w not in stopwords.words('english')]
return tokenized_text
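# Example (hypothetical; exact tokens depend on the installed NLTK data):
#   tokenize_stem_removalstop("The cat sat on the mat!") -> ['cat', 'sat', 'mat']
# Note: despite the name, no stemming is applied, only lowercasing, tokenization and stopword removal.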
def get_embedding(sentence):
    # mean of the GloVe vectors of the in-vocabulary tokens
    res1 = np.zeros_like(model['queen'])
    count = 0
    for qq in sentence:
        if qq in model.key_to_index:  # dict lookup instead of scanning index_to_key
            res1 += model[qq]
            count += 1
    if count > 0:  # avoid dividing by zero for sentences with no in-vocabulary token
        res1 /= count
    return res1
def main():
validation_data = pd.read_csv('../2022IRDMdata_part2/part2/validation_data.tsv', delimiter=' ')
train_data = pd.read_csv('../2022IRDMdata_part2/part2/train_data.tsv', delimiter=' ')
STOP_WORDS = nltk.corpus.stopwords.words()
train_qid_list = train_data['qid'].unique().tolist()
valid_qid_list = validation_data['qid'].unique().tolist()
score_list = []
ad = None
np.random.seed(2022)
batch_size = 5
logistic_model = logistic_regression(hidden_size=batch_size, epochs=5, batch_size=batch_size, learning_rate=0.0003)
for qid in train_qid_list:
ww = train_data[train_data['qid'] == qid]
ww = ww.reset_index(drop=True)
query = tokenize_stem_removalstop(ww['queries'][0])
ww['passage'] = [tokenize_stem_removalstop(text) for text in ww['passage']]
q_embedding = get_embedding(query)
# p_embedding = get_embedding(ww['passage'][0])
aa = q_embedding
for i in tqdm(range(0, len(ww), batch_size)):
p_embedding = get_embedding(ww['passage'][i])
q_embedding = get_embedding(query)
if i + batch_size < len(ww):
for j in range(i + 1, i + batch_size):
now_embeding = get_embedding(ww['passage'][j])
p_embedding = np.vstack((p_embedding, now_embeding))
q_embedding = np.vstack((q_embedding, aa))
else:
for j in range(i + 1, len(ww)):
now_embeding = get_embedding(ww['passage'][j])
p_embedding = np.vstack((p_embedding, now_embeding))
q_embedding = np.vstack((q_embedding, aa))
X = get_features(q_embedding, p_embedding)
if X.shape == ():
continue
if X.shape[0] < batch_size:
continue
if i + batch_size <= len(ww):
y_label = np.array(ww['relevancy'][i:i + batch_size]).reshape(-1, 1)
else:
y_label = np.array(ww['relevancy'][i:len(ww)]).reshape(-1, 1)
logistic_model.train(X, y_label)
    # rank every validation query; `first` accumulates the per-query top-100 tables
    first = None
    for qid in valid_qid_list:
# qid = 995825
        ww = validation_data[validation_data['qid'] == qid]
        ad = ww.copy()
        ww = ww.reset_index(drop=True)
        score_list = []  # predicted scores for this query, one list per batch
query = tokenize_stem_removalstop(ww['queries'][0])
ww['passage'] = [tokenize_stem_removalstop(text) for text in ww['passage']]
q_embedding = get_embedding(query)
# p_embedding = get_embedding(ww['passage'][0])
aa = q_embedding
for i in tqdm(range(0, len(ww), batch_size)):
p_embedding = get_embedding(ww['passage'][i])
q_embedding = get_embedding(query)
if i + batch_size < len(ww):
for j in range(i + 1, i + batch_size):
now_embeding = get_embedding(ww['passage'][j])
p_embedding = np.vstack((p_embedding, now_embeding))
q_embedding = np.vstack((q_embedding, aa))
else:
for j in range(i + 1, len(ww)):
now_embeding = get_embedding(ww['passage'][j])
p_embedding = np.vstack((p_embedding, now_embeding))
q_embedding = np.vstack((q_embedding, aa))
X = get_features(q_embedding, p_embedding)
if X.shape == ():
continue
if X.shape[0] < batch_size:
continue
if i + batch_size <= len(ww):
y_label = np.array(ww['relevancy'][i:i + batch_size]).reshape(-1, 1)
else:
y_label = np.array(ww['relevancy'][i:len(ww)]).reshape(-1, 1)
            score, loss = logistic_model.predict(X, y_label)
            score_list.append([float(s) for s in np.ravel(score)])
        # flatten the per-batch scores; batches skipped above leave the last few passages unscored, pad those with 0
        scores_flat = [s for batch in score_list for s in batch]
        scores_flat += [0.0] * (len(ad) - len(scores_flat))
        ad['score'] = scores_flat[:len(ad)]
ad = ad.sort_values(by='score', ascending=False)
ad = ad.fillna(0)
ad = ad[:100] # select top 100
ad['rank'] = [i + 1 for i in range(len(ad))]
ad['algoname'] = 'logistic_regression'
        first = ad if first is None else pd.concat([first, ad])
    for qid in valid_qid_list:
        now = first[first['qid'] == qid]
        print("qid : {}, ndcg score : {}, map score : {}".format(
            qid, NDCG(now['relevancy'].values, now['score'].values),
            mean_average_precision(now['score'].values, now['relevancy'].values)))
first = first.drop('queries', axis=1)
first = first.drop('passage', axis=1)
    first['A2'] = 'A2'
first = first.drop('relevancy', axis=1)
new = first.reindex(columns=['qid', 'A2', 'pid', 'score', 'rank', 'algoname'])
    new.to_csv('LR.txt', sep=' ', index=False)
if __name__ == '__main__':
main()
task3
Use the LambdaMART model from the XGBoost library to re-rank the passages.
Key point: describe how the input is processed and which representations/features are fed to the model.
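One caveat up front: the script below actually fits an XGBRFRegressor pointwise on the relevance labels. XGBoost's LambdaMART proper is its learning-to-rank objective; a sketch of what that would look like (assuming the feature matrix X, labels y, per-query group sizes and validation features X_valid have already been built):
from xgboost import XGBRanker
# group[i] = number of passages belonging to the i-th query, in the same order as the rows of X
ranker = XGBRanker(objective='rank:ndcg', learning_rate=0.1, n_estimators=200, max_depth=6)
ranker.fit(X, y, group=group)
scores = ranker.predict(X_valid)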
import re
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
from xgboost import XGBRFRegressor
from metrics import NDCG
from metrics import mean_average_precision
pd.options.mode.chained_assignment = None
train_data = pd.read_csv('../2022IRDMdata_part2/part2/train_data.tsv', delimiter=' ')
filename = '../test/glove.6B.100d.txt.word2vec'
glove = KeyedVectors.load_word2vec_format(filename, binary=False)
def get_embedding(sentence):
    # mean of the GloVe vectors of the in-vocabulary tokens
    res1 = np.zeros_like(glove['queen'])
    count = 0
    for qq in sentence:
        if qq in glove.key_to_index:
            res1 += glove[qq]
            count += 1
    if count > 0:  # avoid dividing by zero for sentences with no in-vocabulary token
        res1 /= count
    return res1
def tokenize_stem_removalstop(text):
text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
tokenized_text = word_tokenize(text.lower())
tokenized_text = [w for w in tokenized_text if w not in stopwords.words('english')]
return tokenized_text
def main():
train_qid_list = train_data['qid'].unique().tolist()
xgb_model = XGBRFRegressor(learning_rate=0.01, max_depth=3, tree_method='gpu_hist')
    # pointwise features: [mean query embedding ; mean passage embedding], one row per query-passage pair
    feats, labels = [], []
    for i in tqdm(range(len(train_data))):
        q = get_embedding(tokenize_stem_removalstop(train_data['queries'][i]))
        p = get_embedding(tokenize_stem_removalstop(train_data['passage'][i]))
        feats.append(np.hstack([q, p]))
        labels.append(train_data['relevancy'][i])
    X = np.vstack(feats)
    Y = np.array(labels).reshape(-1, 1)
    xgb_model.fit(X=X, y=Y)
validation_data = pd.read_csv('../2022IRDMdata_part2/part2/validation_data.tsv', delimiter=' ')
    # score the validation set in chunks; index with j + offset so every chunk scores its own rows
    res = []
    chunk = 50000
    for j in tqdm(range(0, len(validation_data), chunk)):
        feats = []
        for i in range(j, min(j + chunk, len(validation_data))):
            q = get_embedding(tokenize_stem_removalstop(validation_data['queries'][i]))
            p = get_embedding(tokenize_stem_removalstop(validation_data['passage'][i]))
            feats.append(np.hstack([q, p]))
        res.append(xgb_model.predict(np.vstack(feats)))
print(len(res))
    new = np.concatenate(res, axis=0)
validation_data['score'] = new
validation_data['algoname'] = 'LambdaMART'
qid_list = validation_data['qid'].unique().tolist()
validation_data['rank'] = 0
kk = []
for qid in qid_list:
mid = validation_data[validation_data['qid'] == qid]
mid = mid.sort_values(by='score', ascending=False)
mid = mid[:100]
lens = len(mid)
mid['rank'] = [i + 1 for i in range(lens)]
kk.append(mid)
now = validation_data[validation_data['qid'] == qid]
print("qid : {}, ndcg score : {}, map score : {}".format(qid, NDCG(now['relevancy'], now['score']),
mean_average_precision(now['score'],
now['relevancy'])))
    out = pd.concat(kk, axis=0)
out['A2'] = 'A2'
out = out.drop(labels='queries', axis=1)
out = out.drop(labels='passage', axis=1)
out = out.reindex(columns=['qid', 'A2', 'pid', 'rank', 'score', 'algoname'])
    out = out.reset_index(drop=True)
    out.to_csv('LM.txt', sep=' ', index=False)
if __name__ == '__main__':
main()
task4
Re-rank the passages with a neural network model.
Key point: use PyTorch; the marks for this part depend on whether the chosen model is appropriate for the task and on the representations/features used during training.
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models import KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from metrics import NDCG
from metrics import mean_average_precision
filename = '../test/glove.6B.100d.txt.word2vec'
glove = KeyedVectors.load_word2vec_format(filename, binary=False)
batch_size = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hidden_size = 50
dropout = 0.2
embedding_size = 100
epochs = 5
learning_rate = 0.001
class my_dataset(Dataset):
def __init__(self, dataset):
self.queries = dataset['queries'].values
self.passage = dataset['passage'].values
self.label = dataset['relevancy'].values
def __getitem__(self, item):
queries = self.queries[item]
passage = self.passage[item]
label = self.label[item]
        q_embedding = get_embedding(tokenize_stem_removalstop(queries))  # tokenize first, otherwise get_embedding iterates characters
        p_embedding = get_embedding(tokenize_stem_removalstop(passage))
q_embedding = torch.from_numpy(q_embedding)
p_embedding = torch.from_numpy(p_embedding)
label = torch.tensor(label, dtype=torch.float32)
return q_embedding.to(device), p_embedding.to(device), label.to(device)
def __len__(self):
return len(self.label)
class attention(nn.Module):
def __init__(self, embedding_size, hidden_size, dropout):
super().__init__()
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.matrix_q = nn.Linear(embedding_size, hidden_size)
self.matrix_k = nn.Linear(embedding_size, hidden_size)
self.matrix_v = nn.Linear(embedding_size, hidden_size)
self.fc1 = nn.Linear(hidden_size, 1)
self.dropout = nn.Dropout(dropout)
def forward(self, q, p):
'''
:param q: batch_size * embedding_size
:param p: batch_size * embedding_size
:return: score : batch_size * 1
'''
Q = self.matrix_q(q) # batch_size * hidden_size
K = self.matrix_k(p)
V = self.matrix_v(p)
scores = torch.matmul(Q, K.transpose(1, 0)) # batch_size * batch_size
        attn = F.softmax(scores, dim=1)  # batch_size * batch_size attention weights over the passages in the batch
        now = torch.matmul(attn, V)
        now = torch.relu(now)
        now = self.dropout(now)
        result = self.fc1(now)
        score = torch.sigmoid(result)
        # return the tensor directly; wrapping it with torch.tensor() would detach it from the autograd graph
        return score
def tokenize_stem_removalstop(text):
text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
tokenized_text = word_tokenize(text.lower())
tokenized_text = [w for w in tokenized_text if w not in stopwords.words('english')]
return tokenized_text
def get_embedding(sentence):
    # mean of the GloVe vectors of the in-vocabulary tokens
    res1 = np.zeros_like(glove['queen'])
    count = 0
    for qq in sentence:
        if qq in glove.key_to_index:
            res1 += glove[qq]
            count += 1
    if count > 0:  # avoid dividing by zero for sentences with no in-vocabulary token
        res1 /= count
    return res1
if __name__ == '__main__':
train_data = pd.read_csv('../2022IRDMdata_part2/part2/train_data.tsv', delimiter=' ')
validation_data = pd.read_csv('../2022IRDMdata_part2/part2/validation_data.tsv', delimiter=' ')
# Compute embedding for both lists
train_dataset = my_dataset(train_data[:1000])
validation_dataset = my_dataset(validation_data[:1000])
model = attention(embedding_size, hidden_size, dropout)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_func = torch.nn.MSELoss()
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    validation_dataloader = DataLoader(validation_dataset, shuffle=False, batch_size=1)  # keep the original row order for scoring
loss_list = []
model.train()
for epoch in range(epochs):
loss = 0
        for idx, (q, p, label) in tqdm(enumerate(train_dataloader)):
            model.zero_grad()
            score = model(q, p)
            label = label.unsqueeze(-1)
            loss = loss_func(score, label)
            loss.backward()
            optimizer.step()
loss_list.append(loss.item())
if idx % 100 == 0:
print("epoch : {}, loss : {}".format(epoch, loss))
    score_list = []
    model.eval()
    with torch.no_grad():
        for idx, (q, p, label) in tqdm(enumerate(validation_dataloader)):
            score = model(q, p)
            score_list.append(score.item())
    scores = np.array(score_list)
    # only the first len(validation_dataset) rows were scored, so restrict the frame to that slice
    validation_data = validation_data[:len(scores)].reset_index(drop=True)
    validation_data['score'] = scores
validation_data['algoname'] = 'attention'
qid_list = validation_data['qid'].unique().tolist()
validation_data['rank'] = 0
kk = []
for qid in qid_list:
mid = validation_data[validation_data['qid'] == qid]
mid = mid.sort_values(by='score', ascending=False)
mid = mid[:100]
lens = len(mid)
mid['rank'] = [i + 1 for i in range(lens)]
kk.append(mid)
now = validation_data[validation_data['qid'] == qid]
print("qid : {}, ndcg score : {}, map score : {}".format(qid, NDCG(now['relevancy'], now['score']),
mean_average_precision(now['score'],
now['relevancy'])))
    out = pd.concat(kk, axis=0)  # use the per-query top-100 tables collected above
out['A2'] = 'A2'
out = out.drop(labels='queries', axis=1)
out = out.drop(labels='passage', axis=1)
out = out.reindex(columns=['qid', 'A2', 'pid', 'rank', 'score', 'algoname'])
    out = out.reset_index(drop=True)
    out.to_csv('NN.txt', sep=' ', index=False)
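A quick shape check of the attention module above on random tensors (hypothetical values), just to confirm that a batch of query/passage embedding pairs comes out as one score per passage:
toy_model = attention(embedding_size=100, hidden_size=50, dropout=0.2)
q_toy = torch.randn(8, 100)
p_toy = torch.randn(8, 100)
print(toy_model(q_toy, p_toy).shape)  # expected: torch.Size([8, 1])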
Summary and comments
First, I had never worked on an information retrieval task myself, so this took several days of constantly looking things up and reworking the code.
A small complaint: the feature representation the instructor asked for is the average of the word vectors of every word in a sentence, which throws away most of what makes word vectors useful in the first place. On top of that, the dataset is very messy, with roughly a 1:1000 positive-to-negative ratio, so a model can do well simply by predicting everything as negative, and the metrics end up looking pretty ugly.
To wrap up: through this assignment I learned a few retrieval models and algorithms, got more familiar with several libraries, and overall it was a small but useful workout for my coding skills.