利用skift实现fasttext模型

skift: 用于Python fastText的scikit-learn 包装器

什么是 skift?

skift包括几个scikit-learn兼容包装器,里面封装了fastText模型。fastText原理类似于word2vec,主要用于文本快速分类。其优势在于分类速度快,使用n-gram特征容易获得文本句子的局部信息、构造新词;缺点是随着语料的增长,内存需求也会增长。那么如何解决内存问题呢?fastText提出了三种解决方法,包括:

  1. 过滤掉出现次数少的词;
  2. 使用Hash存储
  3. 采用word粒度,而非char粒度
    例如句子: 我喜欢去中国, 如果采用char粒度,使用2-gram的话,产生的特征为
    我喜 喜欢 欢去 去中 中国
    如果采用word粒度(分词结果为 我/喜欢/去/中国),使用2-gram产生的特征为
    我喜欢 喜欢去 去中国

关于fastText原理,比较好的参考资料有《FastText文本分类算法学习笔记》和《FastText的内部机制》,这里不再详细阐述。

下面使用skift实现fastText,作为细粒度情感分析的模型模板:

from tqdm import tqdm
from skift import FirstColFtClassifier
from sklearn.model_selection import KFold
import numpy as np
import os
import pickle

class BasicModel(object):
    """Base class offering batching and F1-scoring helpers.

    ``create_model`` is a hook that returns ``None`` here; subclasses are
    expected to override it with something that builds a real model.
    """

    def __init__(self):
        """Create the base object; it holds no state of its own."""
        pass

    def create_model(self, kfold_X_train=None, y_train=None, kfold_X_test=None,
                     y_test=None, test=None):
        """Model-construction hook; the base implementation returns ``None``.

        Every parameter is optional and ignored here. They default to
        ``None`` so that the hook can be invoked both with the historical
        five positional arguments and with no arguments at all, which is how
        the training loop in this file calls it.
        """
        pass

    # Generate batches
    def batch_iter(self, data, batch_size, num_epochs=1, shuffle=True):
        """Yield successive slices of ``data`` for a number of epochs.

        Args:
            data: Any sequence accepted by ``np.array``.
            batch_size: Maximum number of items per yielded batch.
            num_epochs: How many full passes to make over ``data``.
            shuffle: When true, a new random permutation (drawn from the
                global ``np.random`` state) is applied at each epoch.

        Yields:
            ``numpy.ndarray`` slices of at most ``batch_size`` items; the
            last slice of an epoch may be shorter. Nothing is yielded when
            ``data`` is empty.
        """
        data = np.array(data)
        data_size = len(data)
        # Integer ceiling division. Unlike int((n - 1) / batch_size) + 1 it
        # gives 0 batches for empty data instead of one empty batch, and it
        # avoids float rounding altogether.
        num_batches_per_epoch = -(-data_size // batch_size)
        for _ in range(num_epochs):
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min(start_index + batch_size, data_size)
                yield shuffled_data[start_index:end_index]

    def get_f1_score(self, x, y, verbose=False):
        """Return an F1 score in which label ``0`` acts as the negative class.

        Args:
            x: Predicted labels (array-like of numbers).
            y: Reference labels, same shape as ``x``.
            verbose: When true, print precision, recall and F1.

        Returns:
            The F1 value as a float. A ``1e-8`` term is added to every
            denominator, so the result is ``0.0`` rather than an error when
            there are no positives, and marginally below ``1.0`` for a
            perfect prediction.
        """
        # Accept plain lists/tuples as well as arrays.
        x = np.asarray(x)
        y = np.asarray(y)

        # True positive: a non-zero reference label predicted exactly.
        tp = np.sum(np.logical_and(y > 0, x == y))
        # False positive: predicted non-zero where the reference is zero
        # (over-prediction), or both non-zero but different (wrong label).
        fp = np.sum(np.logical_and(x > 0, y == 0)) + np.sum(np.logical_and(x * y > 0, y != x))
        # False negative: a non-zero reference label predicted as zero (miss).
        fn = np.sum(np.logical_and(y > 0, x == 0))

        P = float(tp) / (float(tp + fp) + 1e-8)
        R = float(tp) / (float(tp + fn) + 1e-8)
        F = 2 * P * R / (P + R + 1e-8)

        if verbose:
            print('P->', P)
            print('R->', R)
            print('F->', F)
        return F


class BasicStaticModel(BasicModel):
    """K-fold training loop that fits one classifier per label column.

    For every fold, ``create_model()`` is called once per label column; each
    model is fitted on the training split, scored on the held-out split and
    applied to the test set. Out-of-fold and fold-averaged test probabilities
    are pickled under ``../data/result-ml``.
    """

    # Number of label columns in ``train_y`` and number of probability
    # columns each model's ``predict_proba`` returns. Subclasses may override.
    n_labels = 10
    n_classes = 4

    def __init__(self, config=None, n_folds=5, name='BasicStaticModel'):
        """Store the settings and build the shuffled ``KFold`` splitter.

        Args:
            config: Optional settings object. Only its ``is_debug`` attribute
                is read (if present); ``None`` is allowed.
            n_folds: Number of cross-validation folds.
            name: Prefix used in the names of the pickled result files.
        """
        self.n_folds = n_folds
        self.name = name
        self.config = config
        # Fixed seed so the fold assignment is reproducible between runs.
        self.kf = KFold(n_splits=n_folds, shuffle=True, random_state=10)

    def train_predict(self, train, train_y, test, option=None):
        """Run cross-validated training and save the predictions.

        Args:
            train: Training inputs; must expose ``shape[0]`` and support
                indexing with an integer index array.
            train_y: 2-D label array with at least ``n_labels`` columns,
                indexable the same way as ``train``.
            test: Test inputs; must expose ``shape[0]``.
            option: Unused; kept for interface compatibility.

        Returns:
            None. Results are written to ``../data/result-ml`` as pickles.
        """
        name = self.name
        n_labels = self.n_labels

        predict = np.zeros((test.shape[0], n_labels, self.n_classes))
        oof_predict = np.zeros((train.shape[0], n_labels, self.n_classes))
        scores_f1 = []

        for train_index, dev_index in self.kf.split(train):
            kfold_X_train, kfold_X_val = train[train_index], train[dev_index]
            y_train, y_dev = train_y[train_index], train_y[dev_index]

            model_dict = {}
            print('start train model:')
            for idx in tqdm(range(n_labels)):
                label = y_train[:, idx]
                model = self.create_model()
                model.fit(kfold_X_train, label)
                model_dict[idx] = model
            print('complete train model')
            print('start validate model')
            f1_scores = []
            for idx in tqdm(range(n_labels)):
                label_dev = y_dev[:, idx]
                model = model_dict[idx]
                dev_prob = model.predict_proba(kfold_X_val)
                test_prob = model.predict_proba(test)

                oof_predict[dev_index, idx] = dev_prob
                # NOTE: always divided by n_folds, so if the debug break
                # below fires the stored test probabilities are scaled down
                # (their argmax is unaffected).
                predict[:, idx] += test_prob / self.n_folds

                dev_predict = np.argmax(dev_prob, 1)
                f1_scores.append(self.get_f1_score(dev_predict, label_dev))
            f1_score = np.mean(f1_scores)
            scores_f1.append(f1_score)
            print('f1_scores-> ', f1_scores)
            print('f1_score: ', f1_score)
            # ``config`` defaults to None and may lack ``is_debug``; treat
            # both cases as "not debugging" instead of raising
            # AttributeError. The ``== True`` comparison is kept so the set
            # of values that trigger debug mode is unchanged.
            if getattr(self.config, 'is_debug', False) == True:  # noqa: E712
                # Debug mode: stop after the first fold.
                break

        mean_f1 = np.mean(scores_f1)
        print('Total f1->', scores_f1)
        print("Total f1'mean is ", mean_f1)

        # Save the results.
        os.makedirs('../data/result-ml', exist_ok=True)

        with open('../data/result-ml/{}_oof_f1_{}.pkl'.format(name, str(mean_f1)), 'wb') as f:
            pickle.dump(oof_predict, f)

        with open('../data/result-ml/{}_pre_f1_{}.pkl'.format(name, str(mean_f1)), 'wb') as f:
            pickle.dump(predict, f)

        print('done')


class Fasttext(BasicStaticModel):
    """Static k-fold model whose per-label classifier is a skift wrapper."""

    def __init__(self, name='basicModel', n_folds=5, config=None):
        """Pass ``name``, ``n_folds`` and ``config`` straight to the parent."""
        super().__init__(config=config, n_folds=n_folds, name=name)

    def create_model(self):
        """Override: return a new ``FirstColFtClassifier`` with fixed settings."""
        hyper_params = dict(
            lr=1.0,
            epoch=10,
            wordNgrams=1,
            minCount=5,
            verbose=2,
        )
        return FirstColFtClassifier(**hyper_params)
展开阅读全文

没有更多推荐了,返回首页