fasttext版本
版本 0.9.1。参考官方文本分类教程:https://fasttext.cc/docs/en/supervised-tutorial.html
数据集格式
" ".join(["__label__classId"]+["我们","中国"]),标签用特定的__label__连接,和分词共同使用空格分割拼接在一起。官方给的参考数据下载地址:https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz
代码
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import time
import jieba
import logging
import fasttext
import pandas as pd
import codecs
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import fasttext
# Merge the training features with their labels, clean special characters,
# and tokenize the text content with jieba.
def merge_feature_label(feature_name, label_name):
    """Join the feature CSV with the label CSV on ``id`` and build a
    cleaned, space-tokenized text column.

    Parameters
    ----------
    feature_name : str
        Path to a CSV with at least ``id``, ``title`` and ``content`` columns.
    label_name : str
        Path to a CSV with at least ``id`` and ``label`` columns.

    Returns
    -------
    pandas.DataFrame
        Rows with no missing values, plus two new columns:
        ``X`` (title + content, cleaned) and ``X_split`` (``X`` tokenized
        by jieba, tokens joined with single spaces).
    """
    feature = pd.read_csv(feature_name, sep=",")
    label = pd.read_csv(label_name, sep=",")
    data = feature.merge(label, on="id")
    # Concatenate title and content into one text field per row.
    data["X"] = data[["title", "content"]].apply(
        lambda x: "".join([str(x["title"]), str(x["content"])]), axis=1)
    # .copy() so the column assignments below do not hit pandas'
    # SettingWithCopyWarning on a dropna() view.
    dataDropNa = data.dropna(axis=0, how="any").copy()
    print(dataDropNa.info())
    # Strip literal "\n" sequences, real newlines and a few noisy symbols.
    dataDropNa["X"] = dataDropNa["X"].apply(
        lambda x: str(x).replace("\\n", "").replace(".", "").replace("\n", "")
        .replace(" ", "").replace("↓", "").replace("/", "").replace("|", "")
        .replace(" ", ""))
    dataDropNa["X_split"] = dataDropNa["X"].apply(lambda x: " ".join(jieba.cut(x)))
    return dataDropNa
# Convert the merged training set into fastText's supervised input format:
# "__label__<label> tok1 tok2 ..." — one example per line.
def data_to_fasttext(dataset):
    """Write ``dataset`` to ``ft_train.csv`` in fastText supervised format.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``X_split`` (space-joined tokens) and ``label`` columns.
    """
    ft_train = dataset[["X_split", "label"]].apply(
        # BUG FIX: the original replaced the literal "\ n"
        # (backslash-space-n), which never occurs in the tokenized text;
        # the cleaning step targets real newlines "\n".
        lambda x: "__label__" + str(x["label"]) + " "
        + " ".join(x["X_split"].replace("\n", "").replace(".", "").split(" ")),
        axis=1)
    ft_train.to_csv("ft_train.csv", index=False, header=False)
    print("done to make fasttext train and valication")
# Load and clean the raw test set (same cleaning pipeline as the training
# data, but without labels and without dropping missing rows).
def process_test(test_name):
    """Read the test CSV, clean the text and tokenize it with jieba.

    Parameters
    ----------
    test_name : str
        Path to a CSV with at least ``id``, ``title`` and ``content`` columns.

    Returns
    -------
    pandas.DataFrame
        The test frame with added ``X`` (cleaned text) and ``X_split``
        (jieba tokens joined with spaces) columns.
    """
    test = pd.read_csv(test_name, sep=",")
    test["X"] = test[["title", "content"]].apply(
        lambda x: "".join([str(x["title"]), str(x["content"])]), axis=1)
    # Same character cleaning as merge_feature_label(), kept in sync.
    test["X"] = test["X"].apply(
        lambda x: str(x).replace("\\n", "").replace(".", "").replace("\n", "")
        .replace(" ", "").replace("↓", "").replace("/", "").replace("|", "")
        .replace(" ", ""))
    test["X_split"] = test["X"].apply(lambda x: " ".join(jieba.cut(x)))
    return test
#将测试数据转换为fasttext的输入格式
def test_to_fasttext(testset):
test=testset["X_split"]
#print(test.info())
test.to_csv("ft_test.csv",index=0,header=0)
# Join the training text with its labels, clean the test set, and convert
# the training data to fastText input format. `dataDropNa` and `testData`
# are reused by the prediction/submission code further below.
dataDropNa=merge_feature_label("Train_DataSet.csv","Train_DataSet_Label.csv")
testData=process_test("Test_DataSet.csv")
data_to_fasttext(dataDropNa)
#按照比例将训练数据切分为训练集和验证集
import random
# Randomly split the fastText training file into a train part and a
# validation part according to `radio` (the train fraction).
def train_test_split(train_name, radio):
    """Shuffle the lines of ``train_name`` and split them into
    ``ft_train_train.csv`` (roughly ``radio`` of the lines) and
    ``ft_train_val.csv`` (the rest).

    Parameters
    ----------
    train_name : str
        Path of the fastText-formatted training file.
    radio : float
        Fraction in [0, 1] of lines sent to the training split.
    """
    # BUG FIX: the original never closed the input file handle;
    # `with` guarantees all three files are closed.
    with open(train_name, encoding="utf-8") as ft_train_data:
        lines = ft_train_data.readlines()
    random.shuffle(lines)
    cut = len(lines) * radio  # hoisted loop-invariant split point
    with open("ft_train_train.csv", "w+", encoding="utf-8") as f_train, \
            open("ft_train_val.csv", "w+", encoding="utf-8") as f_val:
        for i, line in enumerate(lines):
            # strip('"') removes surrounding quote characters that
            # pandas' CSV quoting may have added.
            if i <= cut:
                f_train.write(line.strip("\""))
            else:
                f_val.write(line.strip("\""))
# Randomly split the data set (70% train) and train a fastText supervised
# classifier with default hyper-parameters on the training part.
train_test_split("ft_train.csv",0.7)
classifier=fasttext.train_supervised("ft_train_train.csv")
# Simplest approach to producing a prediction file and an accuracy figure:
# read the held-out split back, separating each line into label and text.
validate_texts = []
labels_origin = []
# BUG FIX: the original opened "0.7ft_train_val.csv", but train_test_split()
# always writes "ft_train_val.csv" (the classifier.test() call below uses
# the same name).
with open("ft_train_val.csv", "r", encoding="utf-8") as infile:
    for line in infile:
        # Each line is "__label__<id> tok1 tok2 ..."; split once on the
        # first space to separate the label from the text.
        parts = line.strip().split(" ", 1)
        if len(parts) == 2:
            validate_texts.append(parts[1])
            labels_origin.append(parts[0])
# Predict the validation texts; predict() returns (labels, probabilities).
print(validate_texts[0][:10] + "...", len(validate_texts))
print(labels_origin[0], len(labels_origin))
labels = classifier.predict(validate_texts)
print(len(labels))
print(labels[0][1])
# Result file: one "<text>\t<true label>\t<predicted label>" row per
# validation sample, then a simple accuracy over the same predictions.
# (codecs.open replaced by the built-in open, which handles encodings and,
# via `with`, guarantees the file is flushed and closed.)
with open("result.txt", "w+", encoding="utf-8") as result_file:
    for index in range(len(validate_texts)):
        # labels is (list of label-lists, list of prob-arrays); the top
        # prediction for sample i is labels[0][i][0].
        outline = (validate_texts[index] + "\t" + labels_origin[index]
                   + "\t" + labels[0][index][0] + "\n")
        result_file.write(outline)
# Accuracy = fraction of samples whose top prediction matches the label.
count = 0.0
count_all = len(validate_texts)
for index in range(len(validate_texts)):
    if labels_origin[index] == labels[0][index][0]:
        count += 1
print(count / count_all)
# Built-in evaluation on the validation file: returns
# (sample count, precision@1, recall@1).
result = classifier.test('ft_train_val.csv')
print(result)
# Predict the test set and write the submission file (id,label rows,
# no header/index — the original used the equivalent 0/0 flags).
test_result = classifier.predict(list(testData["X_split"]))
# NOTE(review): [-1] keeps only the last character of "__label__<id>",
# so this assumes single-digit class ids — confirm against the label set.
test_label = [test_result[0][i][0][-1] for i in range(len(test_result[0]))]
test_label_df = pd.DataFrame(test_label, columns=["label"])
testData_label = testData.join(test_label_df)
testData_label[["id", "label"]].to_csv("fasttext_submit.csv", index=False, header=False)
# fastText overfits easily on small data sets, so retrain on a 50/50 split
# with explicit regularizing hyper-parameters.
train_test_split("ft_train.csv", 0.5)
# BUG FIX: the official fasttext 0.9.x Python API spells these keyword
# arguments `wordNgrams` and `minCount`; `word_ngrams`/`min_count` come
# from the older unofficial wrapper and are rejected by train_supervised.
classifier_param = fasttext.train_supervised(
    input="ft_train_train.csv", lr=0.8, dim=50, wordNgrams=1, minCount=8)
result_param = classifier_param.test('ft_train_val.csv')
print(result_param)
# Build a submittable result file from the tuned classifier (overwrites
# the earlier fasttext_submit.csv).
test_result = classifier_param.predict(list(testData["X_split"]))
# NOTE(review): [-1] keeps only the last character of "__label__<id>",
# assuming single-digit class ids — confirm against the label set.
test_label = [test_result[0][i][0][-1] for i in range(len(test_result[0]))]
test_label_df = pd.DataFrame(test_label, columns=["label"])
testData_label = testData.join(test_label_df)
# Unlike the earlier submission, this one intentionally keeps the header
# row (the original passed only index=0).
testData_label[["id", "label"]].to_csv("fasttext_submit.csv", index=False)
结果
分析
看了一些博客介绍,fasttext在小样本上容易过拟合,所以在该数据集上的效果不如baseline;在调参的过程中也发现,dim增大时效果反而下降,学习率调小时结果同样下降。