DataFountain-互联网新闻情感分析fasttext篇

fasttext版本

0.9.1  参考文本分类官方教程:https://fasttext.cc/docs/en/supervised-tutorial.html 

数据集格式

" ".join(["__label__classId"]+["我们","中国"]),标签加上特定的 __label__ 前缀,并与分词结果用空格分隔拼接在同一行。官方给的参考数据下载地址:https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz

代码

import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import time
import jieba
import logging
import fasttext
import pandas as pd
import codecs
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import fasttext
# Prepare the training set: merge text with labels, clean noise characters,
# and word-segment the text with jieba.
def merge_feature_label(feature_name, label_name):
    """Join the feature and label CSVs on ``id`` and prepare the text.

    Parameters
    ----------
    feature_name : str
        Path to the CSV with ``id``, ``title`` and ``content`` columns.
    label_name : str
        Path to the CSV with ``id`` and ``label`` columns.

    Returns
    -------
    pandas.DataFrame
        Merged frame (NaN rows dropped) with two extra columns:
        ``X`` (title + content with noise characters removed) and
        ``X_split`` (space-joined jieba tokens, fastText-ready).
    """
    feature = pd.read_csv(feature_name, sep=",")
    label = pd.read_csv(label_name, sep=",")
    data = feature.merge(label, on='id')
    # Concatenate title and content into one text field per row.
    data["X"] = data[["title", "content"]].apply(
        lambda x: "".join([str(x[0]), str(x[1])]), axis=1)
    # .copy() so the column assignments below operate on an independent
    # frame instead of a view (avoids SettingWithCopyWarning).
    dataDropNa = data.dropna(axis=0, how='any').copy()
    print(dataDropNa.info())
    # Strip literal "\n" sequences, real newlines and a few noise characters.
    # (The original chained a second, redundant .replace(" ", ""); dropped.)
    dataDropNa["X"] = dataDropNa["X"].apply(
        lambda x: str(x).replace("\\n", "").replace(".", "").replace("\n", "")
        .replace(" ", "").replace("↓", "").replace("/", "").replace("|", ""))
    # Word-segment with jieba; fastText expects space-separated tokens.
    dataDropNa["X_split"] = dataDropNa["X"].apply(lambda x: " ".join(jieba.cut(x)))
    return dataDropNa


# Convert the prepared training frame into fastText's supervised format:
# one line per sample, "__label__<label> tok tok ...".
def data_to_fasttext(dataset, out_name="ft_train.csv"):
    """Write ``dataset`` to ``out_name`` in fastText training format.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``X_split`` (space-separated tokens) and ``label``.
    out_name : str, optional
        Output path; default keeps the original hard-coded name.
    """
    ft_train = dataset[["X_split", "label"]].apply(
        # Fixed: the original replaced the literal "\ n" (backslash, space,
        # 'n'), an apparent typo for the newline escape "\n".
        lambda x: '__label__' + str(x[1]) + " "
        + " ".join(x[0].replace("\n", "").replace(".", "").split(" ")),
        axis=1)
    # Explicit booleans instead of the original falsy 0s.
    ft_train.to_csv(out_name, index=False, header=False)
    print("done to make fasttext train and validation")


# Prepare the test set: concatenate title+content, strip noise characters,
# and segment with jieba (mirrors merge_feature_label, minus the labels).
def process_test(test_name):
    """Read the test CSV and add cleaned ``X`` / segmented ``X_split`` columns.

    Parameters
    ----------
    test_name : str
        Path to the test CSV with ``id``, ``title`` and ``content`` columns.

    Returns
    -------
    pandas.DataFrame
        The test frame with ``X`` and ``X_split`` columns added.
    """
    test = pd.read_csv(test_name, sep=",")
    test["X"] = test[["title", "content"]].apply(
        lambda x: "".join([str(x[0]), str(x[1])]), axis=1)
    #print(test.info())
    # Same cleaning chain as the training side; the original repeated
    # .replace(" ", "") twice -- the duplicate is dropped here.
    test["X"] = test["X"].apply(
        lambda x: str(x).replace("\\n", "").replace(".", "").replace("\n", "")
        .replace(" ", "").replace("↓", "").replace("/", "").replace("|", ""))
    test["X_split"] = test["X"].apply(lambda x: " ".join(jieba.cut(x)))
    return test


#将测试数据转换为fasttext的输入格式
def test_to_fasttext(testset):
    test=testset["X_split"]
    #print(test.info())
    test.to_csv("ft_test.csv",index=0,header=0)
# Merge the training text with its labels, prepare the test set, and convert
# the training data to fastText's "__label__<id> tok tok ..." input format.
dataDropNa=merge_feature_label("Train_DataSet.csv","Train_DataSet_Label.csv")
testData=process_test("Test_DataSet.csv")
data_to_fasttext(dataDropNa)
#按照比例将训练数据切分为训练集和验证集
import random
def train_test_split(train_name, radio, train_out="ft_train_train.csv", val_out="ft_train_val.csv"):
    """Randomly split a fastText-format file into train and validation files.

    Parameters
    ----------
    train_name : str
        Path of the combined fastText-format training file.
    radio : float
        Fraction (0..1) of lines that go to the training output.
    train_out, val_out : str, optional
        Output paths; defaults keep the original hard-coded names.
    """
    # Context managers fix the original's leaked file handles (the input
    # file was never closed; the outputs only closed on the happy path).
    with open(train_name, encoding="utf-8") as src:
        lines = src.readlines()
    random.shuffle(lines)
    cutoff = len(lines) * radio
    with open(train_out, "w", encoding="utf-8") as f_train, \
         open(val_out, "w", encoding="utf-8") as f_val:
        for i, line in enumerate(lines):
            # Same split rule as the original: index <= len*radio -> train.
            # strip('"') drops quote chars pandas' CSV writer may have added;
            # the trailing newline is preserved (it is not a quote).
            out = f_train if i <= cutoff else f_val
            out.write(line.strip("\""))
# Randomly split the prepared data (70% train) and train a fastText
# classifier with default hyper-parameters.
train_test_split("ft_train.csv",0.7)
classifier=fasttext.train_supervised("ft_train_train.csv")

# Simplest evaluation path: read the held-out file back, predict every line,
# dump per-sample results to a file and print overall accuracy.
validate_texts = []
labels_origin=[]
# NOTE(review): train_test_split writes "ft_train_val.csv"; the "0.7" prefix
# here looks like a manually renamed copy -- confirm this file actually exists.
with open("0.7ft_train_val.csv", 'r', encoding='utf-8') as infile:
    for line in infile:
        #print(line)
        # Each line is "__label__<id> tok tok ..."; split once on the first
        # space to separate the label from the text.
        line=line.strip().split(" ",1)
        if len(line)==2:
            validate_texts += [line[1]]
            labels_origin+=[line[0]]

# Predict all validation texts at once; predict() returns a pair of
# parallel lists: (labels, probabilities).
print(validate_texts[0][:10]+"...",len(validate_texts))
print(labels_origin[0],len(labels_origin))
labels = classifier.predict(validate_texts)
print(len(labels))
print(labels[0][1])
# Result file: one "text \t gold_label \t predicted_label" row per sample.
result_file = codecs.open("result.txt", 'w+', 'utf-8')

# validate_data = pd.read_table("ft_train_val.csv", header=None, error_bad_lines=False)
# validate_data.columns = ['id', 'content']

# Write per-sample predictions
for index in range(len(validate_texts)):
    outline =validate_texts[index]  + '\t' + labels_origin[index]+ "\t"+labels[0][index][0]+'\n'
    result_file.write(outline)
    result_file.flush()

result_file.close()
# Manual accuracy: fraction of samples whose top predicted label string
# matches the original label string.
count=0.0
count_all=len(validate_texts)
for index in range(len(validate_texts)):
    if labels_origin[index]==labels[0][index][0]:
        count+=1

print(count/count_all)
count,count_all
# Cross-check with fastText's built-in test(): prints (N, precision@1, recall@1).
result = classifier.test('ft_train_val.csv')
print(result)

# Predict the test set and write the submission file; the last character of
# the "__label__<id>" string is taken as the class id.
test_result=classifier.predict(list(testData["X_split"]))
test_label=[test_result[0][i][0][-1] for i in range(len(test_result[0]))]
test_label_df=pd.DataFrame(test_label,columns=["label"])

testData_label=testData.join(test_label_df)
testData_label[["id","label"]].to_csv("fasttext_submit.csv",index=0,header=0)
# fastText overfits easily on small samples, so re-split (50/50) and retrain
# with explicit hyper-parameters (smaller dim, min_count filter) to curb it.
train_test_split("ft_train.csv",0.5)
classifier_param=fasttext.train_supervised(input="ft_train_train.csv",lr=0.8,dim=50,word_ngrams=1,min_count=8)
result_param = classifier_param.test('ft_train_val.csv')
print(result_param)

# Build the final submission from the tuned model (same label extraction as
# above: last character of the "__label__<id>" string).
test_result=classifier_param.predict(list(testData["X_split"]))
test_label=[test_result[0][i][0][-1] for i in range(len(test_result[0]))]
test_label_df=pd.DataFrame(test_label,columns=["label"])
testData_label=testData.join(test_label_df)
testData_label[["id","label"]].to_csv("fasttext_submit.csv",index=0)

结果

分析

看了一些博客介绍,fasttext在小样本上容易过拟合,所以在数据上的效果不如baseline的效果,在调参的过程中也发现,dim大效果反而下降,学习率小同样结果下降。

  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值