对句子对进行二分类,检测两个句子是否表达同一个意思。模型数据来自天池全球人工智能技术创新大赛【赛道三】(详见比赛详情页)。
模型测试集准确率高达99.5%,线上准确率为75%左右,略低于baseline。
数据格式:TSV文件,无表头;训练集每行三列(q1、q2、label),测试集每行两列(q1、q2)。
fasttext使用方法可查看:fasttext官网
import random

import fasttext
import pandas as pd
# Numeric label written into the fastText files for each class name.
cate_dic = dict(same=1, different=0)

# Load the data; no validation split is built.
train_file = r'G:\chromeDownload\预测是否属于同一语义\baseline_tfidf_lr\oppo_breeno_round1_data\gaiic_track3_round1_train_20210228.tsv'
test_file = r'G:\chromeDownload\预测是否属于同一语义\baseline_tfidf_lr\oppo_breeno_round1_data\gaiic_track3_round1_testA_20210228.tsv'

# The files have no header row, so column names are supplied explicitly.
# Missing cells are replaced by the string "0" so that the string
# concatenations below never see NaN.
df_train = pd.read_table(train_file, names=['q1', 'q2', 'label'])
df_train = df_train.fillna("0")  # shape noted by the original author: (100000, 3)
df_test = pd.read_table(test_file, names=['q1', 'q2'])
df_test = df_test.fillna("0")  # shape noted by the original author: (25000, 2)

label = df_train['label'].values

# Stack train and test rows, then add a combined "q1 q2" text column.
df = pd.concat([df_train, df_test], ignore_index=True)
df['text'] = df['q1'] + " " + df['q2']  # shape noted by the original author: (125000, 4)
# Convert texts to the fastText supervised-learning line format
def preprocess_text(content_lines, sentences, category):
    """Append one labelled fastText training line per input text.

    Each produced line has the form ``__label__<category> , <text>``.

    Args:
        content_lines: Iterable of text strings to label.
        sentences: List that receives the formatted lines; it is mutated
            in place.
        category: Label value; it is converted with ``str()`` and placed
            after the ``__label__`` prefix.

    Returns:
        None. Results are delivered through ``sentences``.

    Entries that cannot be concatenated to a string (for example ``None``
    or a float) are printed and skipped instead of aborting the run.
    """
    # NOTE(review): the " , " separator puts a literal "," between the label
    # and the text of every training line; it is kept unchanged so the
    # generated training file stays identical.
    prefix = "__label__" + str(category) + " , "
    for line in content_lines:
        try:
            sentences.append(prefix + line)
        except TypeError:
            # Only the expected failure (non-string entry) is tolerated;
            # anything else, including KeyboardInterrupt, now propagates.
            print(line)
# Build the training data
# Collect the "q1 q2" strings of each class, tag them with their fastText
# label, then shuffle so the two classes are interleaved in the file.
sentences = []
same_sentences = df_train[df_train.label == 1]
same_sentences = (same_sentences['q1'] + " " + same_sentences['q2']).values.tolist()
diffent_sentences = df_train[df_train.label == 0]
diffent_sentences = (diffent_sentences['q1'] + " " + diffent_sentences['q2']).values.tolist()
preprocess_text(same_sentences, sentences, cate_dic['same'])
preprocess_text(diffent_sentences, sentences, cate_dic['different'])
random.shuffle(sentences)

# Write to a text file
# The context manager closes (and therefore flushes) the file as soon as the
# last line is written, so the complete data is on disk before anything
# reads it back.
# NOTE(review): this writes to an absolute path, while the training step
# reads 'train_data.txt' relative to the working directory -- confirm the
# script is run from this directory.
with open(r'G:\chromeDownload\预测是否属于同一语义\baseline_tfidf_lr\train_data.txt', 'w', encoding='utf-8') as out:
    out.writelines(sentence + "\n" for sentence in sentences)
# Train
# NOTE(review): 'train_data.txt' is resolved against the current working
# directory -- confirm it is the same file the previous step wrote.
classifier = fasttext.train_supervised(
    input='train_data.txt',
    lr=1.0,
    epoch=25,
    wordNgrams=3,
    bucket=200000,
    dim=50,
    loss='hs',
)

# Evaluate
# The model is scored on the very file it was trained on (no held-out set
# exists), so these numbers say nothing about generalisation.
# test() returns (number of samples, precision@1, recall@1); the result is
# printed because a bare expression shows nothing outside a notebook.
test_result = classifier.test('train_data.txt')
print(test_result)
# Figures recorded by the original author: 100000, 0.9671, 0.9671
# Predict
# For every test pair, record the probability the model assigns to
# "__label__0".
lr_0_predictions = []
lr_1_predictions = []  # never filled in this section; kept so the name stays defined
test_sentences_list = (df_test['q1'] + " " + df_test['q2']).values.tolist()
for texts in test_sentences_list:
    # k=2 asks for both labels together with their probabilities.
    labels, probabilities = classifier.predict(texts, k=2)
    print(labels, '--', probabilities)
    # Look the probability up by label name instead of by position, so the
    # value is right whichever label is ranked first.
    prob_by_label = dict(zip(labels, probabilities))
    if '__label__0' not in prob_by_label:
        # Skipping the row would shift every later line of result.csv
        # against the test set, so fail loudly instead.
        raise ValueError(
            "prediction for %r has no '__label__0' entry: %r" % (texts, labels)
        )
    lr_0_predictions.append(prob_by_label['__label__0'])

# The single output column is the probability of label 0; write it to file
# without index or header, one row per test pair.
pd.DataFrame(lr_0_predictions).to_csv("result.csv", index=False, header=False)