用fasttext 训练

最新推荐文章于 2024-08-01 09:53:29 发布

ithinking110

最新推荐文章于 2024-08-01 09:53:29 发布

阅读量1.6k

点赞数

分类专栏： nlp 文章标签： fasttext pytorch

本文链接：https://blog.csdn.net/ithinking110/article/details/105437268

版权

nlp 专栏收录该内容

19 篇文章 0 订阅

订阅专栏

用fasttext 训练

下载fasttext

官网： https://github.com/facebookresearch/fastText

编译和安装

wget https://github.com/facebookresearch/fastText/archive/v0.9.1.zip
unzip v0.9.1.zip
cd fastText-0.9.1
 make
 pip install .

使用fasttext

格式：

__label__Sociology , 年轻男子疑因感情受挫旅馆内跳楼身亡新快报
__label__Technology , 18 倍光变 27mm 广角尼康 P80 套装 2800 元作

喂给 fasttext 的文档必须是这样的格式：前缀+标签 + 逗号 + 分词

__label__  这个是前缀
Sociology： 这个是标签
年轻 男子 疑因 感情   ：这是中文分词 中间用空格隔开

变换数据

1，要将数据集分成训练集和测试集，第一步先将数据转换成 csv，

方便后面的切割操作

2，将CSV数据切割成 train.txt 和 test.txt

import fasttext

import re
from types import MethodType, FunctionType

import jieba

from collections import defaultdict

import  pandas  as pd

import  random
import  os

class TransformData(object):
    """Convert a fastText-style labelled corpus to CSV and split it into train/test files.

    Each input line is expected to look like
    ``__label__<name> , <space separated tokens>``.
    """

    # Prefix that marks a label in the corpus and in the generated files.
    LABEL_PREFIX = '__label__'

    def to_csv(self, inPath, outPath, index=False):
        """Group the lines of ``inPath`` by label and write one CSV column per label.

        @param inPath:   path of the labelled corpus (read as UTF-8)
        @param outPath:  path of the CSV file to write
        @param index:    forwarded to ``DataFrame.to_csv``
        @return          whatever ``DataFrame.to_csv`` returns
        @raise ValueError: when a non-blank line contains no comma
        """
        grouped = {}
        with open(inPath, encoding='utf-8') as handler:
            for lineNo, line in enumerate(handler, 1):
                if not line.strip():
                    # Blank lines carry no sample; skip them instead of crashing.
                    continue
                label, sep, content = line.partition(',')
                if not sep:
                    raise ValueError(
                        f"{inPath}:{lineNo}: expected '<label> , <text>', got {line!r}")
                key = label.strip()
                # Remove the prefix as a prefix. str.strip('__label__') would
                # treat it as a character set and also eat leading/trailing
                # '_', 'l', 'a', 'b', 'e' characters of the label name itself.
                if key.startswith(self.LABEL_PREFIX):
                    key = key[len(self.LABEL_PREFIX):]
                grouped.setdefault(key.strip(), []).append(content.strip())

        # Build the frame in one go; columns of different length are padded
        # with NaN, exactly as repeated pd.concat(axis=1) would do.
        df = pd.DataFrame({key: pd.Series(values) for key, values in grouped.items()})
        return df.to_csv(outPath, index=index, encoding='utf-8')

    def SplitTrainTest(self, inPath, splitRate=0.8):
        """Split the CSV produced by ``to_csv`` into ``<base>_Train.txt`` / ``<base>_Test.txt``.

        For every column (label) the first ``splitRate`` share of the rows goes
        to the training file and the rest to the test file; both sets are then
        shuffled before being written as UTF-8.

        @param inPath:     path of the CSV file; its extension is replaced
        @param splitRate:  fraction of each label's rows used for training
        """
        # splitext only touches a real file extension, so a path such as
        # "./Data/Out" (no extension, dot in the directory part) keeps its name.
        baseName = os.path.splitext(inPath)[0]
        trainFile = baseName + '_Train.txt'
        testFile = baseName + "_Test.txt"
        # dtype=str keeps purely numeric texts as text so .strip() is safe.
        frame = pd.read_csv(inPath, index_col=False, low_memory=False, dtype=str)
        trainDataSet = []
        testDataSet = []
        for head in frame.columns:
            column = frame[head].dropna()  # drop the NaN padding of short columns
            print("head==", head, column)
            trainNub = int(len(column) * splitRate)
            subList = [f"{self.LABEL_PREFIX}{head} , {item.strip()}\n" for item in column.tolist()]
            trainDataSet.extend(subList[:trainNub])
            testDataSet.extend(subList[trainNub:])
            print("subList=", subList)

        random.shuffle(trainDataSet)
        random.shuffle(testDataSet)
        with open(trainFile, 'w', encoding='utf-8') as trainf, \
                open(testFile, 'w', encoding='utf-8') as testf:
            trainf.writelines(trainDataSet)
            testf.writelines(testDataSet)

使用和测试

使用面向对象编程思想写一下使用方法。网上的各种面向过程写法看得眼花，本人有代码强迫症。


下面这三个方法其它地方也都可以使用，所以写成面向过程的独立函数
# Keep only ASCII letters, digits and characters in \u4e00-\u9fa5;
# everything else is collapsed into spaces.
def ClearTxt(raw):
    """Return ``raw`` with every run of disallowed characters replaced by one space."""
    return re.sub(r"[^0-9a-zA-Z\u4e00-\u9fa5]+", ' ', raw)

# Load the stop-word list from disk.
def StopWords(stopPath= "./Data/stopwords.txt"):
    """Read ``stopPath`` as UTF-8 and return its lines, each stripped of surrounding whitespace."""
    words = []
    with open(stopPath, 'r', encoding='utf-8') as handle:
        for rawLine in handle:
            words.append(rawLine.strip())
    return words


# Segment a sentence with jieba into space-separated tokens.
def SegSentence(sentence,stopWord):
    """Clean ``sentence`` with ClearTxt, cut it with jieba and join the kept tokens with spaces.

    Tokens that are empty after stripping, or that are contained in
    ``stopWord``, are dropped. The result is printed and returned.
    """
    cleaned = ClearTxt(sentence)
    kept = []
    for token in jieba.cut(cleaned):
        if not token.strip():
            continue
        if token in stopWord:
            continue
        kept.append(token)
    result = ' '.join(kept)

    print("seg==",result)
    return result

封装一下  面向对象编程
class UsingText:
    """
        Train, cache and evaluate a fastText supervised classifier.

        The constructor only stores the settings; ``Train`` forwards them to
        ``fasttext.train_supervised`` (or loads a previously saved model).

        @param inFilePath:   path of the training data file
        @param dim:          vector dimension
        @param lr:           learning rate
        @param epoch:        number of training epochs
        @param loss:         loss function name passed to fastText
        @param wordNgrams:   maximum length of word n-grams
        @param prefixLabel:  label prefix used in the data files
    """
    def __init__(self, inFilePath, dim=100, lr=0.1, epoch=5, loss="softmax", wordNgrams=2, prefixLabel="__label__"):
        self.ipt = inFilePath
        self.loss = loss
        self.wordNgrams = wordNgrams
        self.dim = dim
        self.lr = lr
        self.epoch = epoch
        # The file name must depend on the settings only. Interpolating the
        # instance itself would embed the object repr (memory address), so a
        # saved model could never be found again on the next run.
        self.modePath = f"dim{str(self.dim)}_lr{str(self.lr)}_iter{str(self.epoch)}.model"
        self.prefixLable = prefixLabel

    def _strip_prefix(self, label):
        """Return ``label`` without surrounding whitespace and without the label prefix."""
        name = label.strip()
        # Remove the prefix as a prefix; str.strip(prefix) would treat it as a
        # set of characters and damage label names made of those characters.
        if self.prefixLable and name.startswith(self.prefixLable):
            name = name[len(self.prefixLable):]
        return name.strip()

    def Train(self):
        """Load the model from ``self.modePath`` if it exists, otherwise train and save it."""
        if os.path.exists(self.modePath):
            self.classify = fasttext.load_model(self.modePath)
        else:
            self.classify = fasttext.train_supervised(self.ipt,
                                                      label=self.prefixLable, dim=self.dim,
                                                      epoch=self.epoch, lr=self.lr,
                                                      wordNgrams=self.wordNgrams,
                                                      loss=self.loss)
            self.classify.save_model(self.modePath)

    def Test(self, testFilePath):
        """Run the model's own ``test`` on ``testFilePath`` and print the result."""
        result = self.classify.test(testFilePath)

        print("result==", result)

    def CalPrecisionRecall(self, file='data_test.txt'):
        """Print per-label precision, recall and F1 for the labelled file ``file``.

        Each non-blank line must look like ``<prefix><label> , <text>``. The
        text is re-segmented with SegSentence before prediction.
        precision = correct / predicted-as-label, recall = correct / true-label.
        """
        correct = defaultdict(int)    # label -> number of correct predictions
        predicted = defaultdict(int)  # label -> number of times it was predicted
        total = defaultdict(int)      # label -> number of samples truly of that label
        stopWord = StopWords()
        with open(file, encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                label, content = line.split(',', 1)
                trueName = self._strip_prefix(label)
                total[trueName] += 1

                contentList = [SegSentence(content.strip(), stopWord)]

                print("contentList==", contentList)

                labels2 = self.classify.predict(contentList)

                print("label2==", labels2)

                # predict() on a list returns (labels per text, probabilities
                # per text); take the top label of the single text.
                pre_label = labels2[0][0][0]
                predicted[self._strip_prefix(pre_label)] += 1

                if label.strip() == pre_label.strip():
                    correct[trueName] += 1

        print('precision', correct.keys())
        print('recall', predicted.keys())
        print('total', total.keys())
        # Report every label that was seen or predicted, including those with
        # no correct prediction at all.
        labels = list(total) + [name for name in predicted if name not in total]
        for sub in labels:
            hits = correct.get(sub, 0)
            predictedCount = predicted.get(sub, 0)
            trueCount = total.get(sub, 0)
            pre = hits / predictedCount if predictedCount else 0.0
            rec = hits / trueCount if trueCount else 0.0
            F1 = (2 * pre * rec) / (pre + rec) if (pre + rec) else 0.0
            print(f"{sub}  precision: {str(pre)}  recall: {str(rec)}  F1: {str(F1)}")



if __name__ == '__main__':

    # Regenerate the train/test files unless both are already on disk.
    splitReady = os.path.exists("./Data/Out_Train.txt") and os.path.exists('./Data/Out_Test.txt')
    if not splitReady:
        converter = TransformData()
        converter.to_csv("./Data/data.txt", "./Data/Out.csv")
        converter.SplitTrainTest("./Data/Out.csv")

    # Train (or load) the classifier.
    classifier = UsingText("./Data/Out_Train.txt")
    classifier.Train()

    # Optional built-in evaluation, disabled in the original:
    # classifier.Test("./Data/Out_Test.txt")

    # Evaluate per-label precision / recall on the test split.
    classifier.CalPrecisionRecall("./Data/Out_Test.txt")

    print("finish")