监督分类空白处也被分类了_文本分类语料构建——半监督分类4

最新推荐文章于 2020-11-28 09:33:57 发布

weixin_39574287

最新推荐文章于 2020-11-28 09:33:57 发布

阅读量117

点赞数

文章标签：监督分类空白处也被分类了

利用类关联词约束候选类目，这篇文章说明类关联词提取。我们把之前自动标注错误较多的三个类目（经济、民生、政治）的标注错误数据整理出来。

1. 特征选择法抽取类关联词

这里使用了专栏里前边的文章https://zhuanlan.zhihu.com/p/57969179的代码，稍做了一些修改，抽取出的特征词如下：

1. “政治”、“文教”两个类目的数据杂乱，我们通过聚类后tf-idf法整理出这两个类目的关联词。这里需要再次指出，所有类目badcase中“政治”类标题中的单个词语都无法体现出“政治”类文本的特征。“民生”类中“文教”badcase中确实含有较多民生类词语比如“民生#灾害”类的词语，“民生#人力”类的词语。

实现代码

#coding:utf-8
"""
互信息、卡方、频率特征选择示例
"""

import codecs
import re
from math import log2
from sklearn.datasets import load_files
from pyhanlp import *

# Load the HanLP notional-word (content-word) tokenizer classes; see https://github.com/hankcs/pyhanlp/blob/master/tests/demos/demo_notional_tokenizer.py
Term =JClass("com.hankcs.hanlp.seg.common.Term")
NotionalTokenizer = JClass("com.hankcs.hanlp.tokenizer.NotionalTokenizer")


def getDocuments(root_path, file_path_li):
    """
    Load the raw corpus with sklearn's load_files and preprocess every document.

    :param root_path: container directory handed to load_files
    :param file_path_li: category names handed to load_files as `categories`
    :return: list of (category name, set of preprocessed tokens) tuples,
             one per loaded document
    """
    corpus = load_files(
        container_path=root_path,
        categories=file_path_li,
        encoding="utf-8",
        decode_error="ignore",
    )
    category_names = corpus.target_names
    # Each document becomes a set, so only presence of a term is kept.
    return [
        (category_names[target_idx], set(preprocess(raw_text)))
        for target_idx, raw_text in zip(corpus.target, corpus.data)
    ]


def preprocess(raw_text):
    """
    Clean one raw document and split it into tokens.

    :param raw_text: document text
    :return: list of stripped tokens that are longer than one character
    """
    # Replace carriage returns / line feeds with a space. The previous
    # pattern 'r|n' had lost its backslashes and matched the letters r and n.
    raw_text = re.sub(r'[\r\n]', ' ', raw_text)
    # Remove digits, ASCII letters and dots. The previous range 'A-z' also
    # covered the punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
    raw_text = re.sub(r'[0-9a-zA-Z.]+', '', raw_text)
    # Tokenize with the notional-word tokenizer.
    raw_words = [term.word for term in NotionalTokenizer.segment(raw_text)]
    # Strip surrounding whitespace, then drop empty and single-character
    # tokens (a token longer than one character is necessarily non-empty).
    stripped_words = (word.strip() for word in raw_words)
    return [word for word in stripped_words if len(word) > 1]


def getVocabulary(all_text):
    """
    Merge the token sets of all documents into the module-level `vocabulary`.

    :param all_text: iterable of (label, token set) pairs
    :return: None; the global `vocabulary` set is extended in place
    """
    global vocabulary
    for _label, doc_words in all_text:
        vocabulary |= doc_words


def multual_infomation(N_10, N_11, N_00, N_01):
    """
    Mutual information (in bits) of a 2x2 term/category contingency table.

    The first index says whether the term occurs in the document, the second
    whether the document belongs to the category (this is how selectFeatures
    fills the counts).

    :param N_10: documents containing the term, outside the category
    :param N_11: documents containing the term, inside the category
    :param N_00: documents without the term, outside the category
    :param N_01: documents without the term, inside the category
    :return: mutual information of the term and the category; 0.0 for an
             empty table
    """
    N = N_11 + N_10 + N_01 + N_00
    if N == 0:
        return 0.0

    def _cell(n_cell, margin_a, margin_b):
        # An empty cell contributes nothing (0 * log 0 is taken as 0); this
        # also avoids log2(0) and a division by an empty marginal.
        if n_cell == 0:
            return 0.0
        return (n_cell * 1.0 / N) * log2((n_cell * N * 1.0) / (margin_a * margin_b))

    # The sum is parenthesised so it can span several lines; the previous
    # version ended each line with a dangling '+', which is a syntax error.
    I_UC = (
        _cell(N_11, N_11 + N_10, N_11 + N_01)
        + _cell(N_01, N_01 + N_00, N_01 + N_11)
        + _cell(N_10, N_10 + N_11, N_10 + N_00)
        + _cell(N_00, N_00 + N_10, N_00 + N_01)
    )
    return I_UC


def chi_square(N_10, N_11, N_00, N_01):
    """
    Chi-square statistic of a 2x2 term/category contingency table.

    :param N_10: count for cell (1, 0)
    :param N_11: count for cell (1, 1)
    :param N_00: count for cell (0, 0)
    :param N_01: count for cell (0, 1)
    :return: N * (N_11*N_00 - N_10*N_01)^2 divided by the product of the
             four marginal totals
    """
    total = N_11 + N_10 + N_01 + N_00
    cross_diff = N_11 * N_00 - N_10 * N_01
    numerator = total * cross_diff * cross_diff
    denominator = (N_11 + N_01) * (N_11 + N_10) * (N_10 + N_00) * (N_01 + N_00)
    return numerator * 1.0 / denominator


def freq_select(t_doc_cnt, doc_cnt):
    """
    Document-frequency score of a term inside one category.

    :param t_doc_cnt: number of documents of the category containing the term
    :param doc_cnt: total number of documents of the category
    :return: t_doc_cnt / doc_cnt as a float; 0.0 when the category has no
             documents
    """
    # An empty category used to raise ZeroDivisionError; no document can
    # contain the term there, so its frequency is reported as 0.
    if doc_cnt == 0:
        return 0.0
    return t_doc_cnt * 1.0 / doc_cnt


def selectFeatures(documents, category_name, top_k, select_type="chi"):
    """
    Rank the terms of the global `vocabulary` for one category.

    :param documents: preprocessed corpus, an iterable of
                      (label, token collection) pairs
    :param category_name: name of the category to score terms against
    :param top_k: number of best-scoring terms to return
    :param select_type: scoring method: "chi" (chi-square, default),
                        "mi" (mutual information) or "freq" (document
                        frequency inside the category)
    :return: list of at most top_k (term, score) tuples sorted by descending
             score; an empty list (after printing a message) when
             select_type is unknown
    """
    if select_type not in ("chi", "mi", "freq"):
        print("error param select_type")
        return []

    # Single pass over the corpus: count the documents inside / outside the
    # category and, for each side, in how many documents every term occurs.
    # Previously the whole corpus was rescanned once per vocabulary term.
    in_cat_docs = 0
    out_cat_docs = 0
    in_cat_df = {}
    out_cat_df = {}
    for label, word_set in documents:
        if category_name == label:
            in_cat_docs += 1
            doc_freq = in_cat_df
        else:
            out_cat_docs += 1
            doc_freq = out_cat_df
        # set() so that a term is counted once per document.
        for term in set(word_set):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    L = []
    for t in vocabulary:
        # Term present, document in category.
        N_11 = in_cat_df.get(t, 0)
        if select_type == "freq":
            L.append((t, freq_select(N_11, in_cat_docs)))
            continue
        # Term present, document outside category.
        N_10 = out_cat_df.get(t, 0)
        # Term absent: remaining documents on each side.
        N_01 = in_cat_docs - N_11
        N_00 = out_cat_docs - N_10
        # Terms with an empty contingency cell are skipped.
        if N_00 == 0 or N_01 == 0 or N_10 == 0 or N_11 == 0:
            continue
        if select_type == "mi":
            A_tc = multual_infomation(N_10, N_11, N_00, N_01)
        else:
            A_tc = chi_square(N_10, N_11, N_00, N_01)
        L.append((t, A_tc))
    return sorted(L, key=lambda x: x[1], reverse=True)[:top_k]


# Vocabulary shared by getVocabulary (which fills it) and selectFeatures
# (which iterates over it).
vocabulary = set()

if __name__ == "__main__":
    # Categories to load (adjust to the actual category directory names).
    category_name_li = [u"农业", u"工业", u"政治",
                        u"文教", "民生", "经济"]
    # Load the corpus (adjust the root directory to the actual data location).
    all_text = getDocuments(r"../data/经济_民生_政治badcase", category_name_li)
    print("all_text len = ", len(all_text))
    # Build the vocabulary.
    getVocabulary(all_text)
    print("vocabulary len = ", len(vocabulary))
    # Run the three selection methods in turn. The report used to be three
    # copy-pasted loops whose banners printed a literal 'n' where a newline
    # escape was intended.
    selection_methods = (
        ("chi", "  卡方特征选择  \n"),
        ("mi", "  互信息特征选择  \n"),
        ("freq", "  频率特征选择  \n"),
    )
    for feature_select_type, banner in selection_methods:
        print("="*20, '\n', banner, "="*20)
        for category_name in category_name_li:
            # Top 10 terms of this category under the current method.
            feature_li = selectFeatures(all_text, category_name, 10, feature_select_type)
            print(category_name)
            for t, i_uc in feature_li:
                print(t, i_uc)
            print("="*10)
    print("program finished")

2. 聚类后TF-IDF抽取簇关键词

对“政治”badcase文本聚类后做2grams词组的tf-idf关键词抽取，抽取结果如下：

这里只贴出了部分结果，标黄部分为人工挑出的。

“文教”类badcase文本聚类后做词语级tf-idf关键词抽取，这里不再展示结果。

实现代码

#coding:utf-8
"""
关键词抽取tf-idf法
要求：python3，NLTK，PyHanLP，fastText
输入：默认一篇文本占一行
"""
from time import time
from fastText import load_model
import numpy as np
from pyhanlp import *
import sys
import codecs
from py.类目关键词抽取_tfidf import extract_keyword
from nltk import ngrams

# Notional-word tokenizer handles (assigned in init())
Term = None
NotionalTokenizer = None
# fastText model
fasttext_model = None
# Matrix holding the vectors of all texts
text_vec = None
# Topic number of each text
topic_serial = None
# Number of topics that currently exist
topic_cnt = None
# Number of texts in each topic
topic_cnt_dict = None
# List of preprocessed texts
preprocessed_data = None


def init():
    """Initialise the module-level state used by the clustering script.

    Loads the fastText model, creates an empty text-vector matrix with one
    column per model dimension, resets the topic bookkeeping and loads the
    HanLP tokenizer classes.
    """
    global fasttext_model, text_vec
    global topic_serial, topic_cnt, topic_cnt_dict
    global Term, NotionalTokenizer
    global preprocessed_data

    # fastText word-vector model
    fasttext_model = read_fasttext_data('../dictionary/cc.zh.300.bin')

    # Text-vector matrix: zero rows, one column per model dimension
    text_vec = np.array([])
    text_vec.resize((0, fasttext_model.get_dimension()))

    # Topic bookkeeping: per-text topic numbers, topic count, texts per topic
    topic_serial = list()
    topic_cnt = 0
    topic_cnt_dict = {}

    # Notional-word tokenizer; see
    # https://github.com/hankcs/pyhanlp/blob/master/tests/demos/demo_notional_tokenizer.py
    Term = JClass("com.hankcs.hanlp.seg.common.Term")
    NotionalTokenizer = JClass("com.hankcs.hanlp.tokenizer.NotionalTokenizer")

    # Preprocessed texts collected while reading the input
    preprocessed_data = list()


def preprocess(text, n_grams=1):
    """Tokenize a text (shared by training and prediction corpora).

    The text is lower-cased and segmented with NotionalTokenizer. Tokens
    tagged 't' (time words) are dropped; when n_grams is 1, single-character
    tokens are dropped as well. When n_grams is 2 the remaining tokens are
    turned into '_'-joined bigrams; any other value returns the plain tokens.
    """
    lowered = text.lower()
    kept_words = []

    # Per the original note, NotionalTokenizer.segment also removes stop words.
    for term in NotionalTokenizer.segment(lowered):
        word, pos = str(term.word), str(term.nature)
        is_time_word = pos == u't'
        # Single-character tokens may be artefacts of out-of-vocabulary words.
        is_single_char_unigram = n_grams == 1 and len(word) == 1
        if is_time_word or is_single_char_unigram:
            continue
        kept_words.append(word)

    if n_grams != 2:
        return kept_words
    return [u'_'.join(gram) for gram in ngrams(kept_words, n_grams)]


def read_fasttext_data(file_path):
    """Load the fastText model at file_path, print the load time, return the model."""
    started = time()
    model = load_model(file_path)
    finished = time()
    print("加载fastText向量库时间%.2fs" % (finished - started))
    return model


def compute_sentence_vector(word_li):
    """Compute the unit-length mean word vector of a tokenized sentence.

    :param word_li: list of tokens
    :return: (sen_vec, has_vec_word_li) where sen_vec has shape
             (1, model dimension) and has_vec_word_li lists the tokens for
             which the fastText model returned a word id other than -1.
             sen_vec stays all-zero when no token is known or when the
             known vectors cancel each other out.
    """
    global fasttext_model

    # Sentence vector, zero-filled by resize()
    sen_vec = np.array([])
    sen_vec.resize((1, fasttext_model.get_dimension()))
    # Tokens that are present in the fastText vocabulary
    has_vec_word_li = []
    for word in word_li:
        if fasttext_model.get_word_id(word) != -1:
            has_vec_word_li.append(word)
            sen_vec += fasttext_model.get_word_vector(word)
    if has_vec_word_li:
        sen_vec /= len(has_vec_word_li)
        # Normalise to unit length. A zero-norm mean used to be divided by 0,
        # producing NaNs that break later similarity comparisons.
        norm = np.linalg.norm(sen_vec)
        if norm > 0:
            sen_vec /= norm
    return sen_vec, has_vec_word_li


def single_pass(sen_vec, sim_threshold=0.6, max_text_number=100):
    """Single-pass clustering step: assign one text vector to a topic.

    The vector is compared (dot product) with every stored text vector. It
    joins the topic of the most similar stored text when that similarity is
    at least sim_threshold and that topic currently holds no more than
    max_text_number texts; otherwise a new topic is opened. Topic numbers
    start at 1. The vector is always appended to the global text_vec.
    """
    global text_vec
    global topic_serial
    global topic_cnt

    best_topic = None
    best_sim = None
    if topic_cnt != 0:
        # Not the first text: find the most similar stored text and its topic.
        similarities = np.dot(sen_vec, text_vec.T)
        best_sim = np.max(similarities)
        best_topic = topic_serial[np.argmax(similarities)]
        print("最相似文本的话题编号", best_topic, "相似度值", best_sim)

    # Store the vector of the incoming text.
    text_vec = np.vstack([text_vec, sen_vec])

    joins_existing = (
        best_topic is not None
        and best_sim >= sim_threshold
        and topic_cnt_dict[best_topic] <= max_text_number
    )
    if joins_existing:
        # Attach the text to the most similar topic.
        topic_serial.append(best_topic)
        topic_cnt_dict[best_topic] += 1
        return

    # Open a new topic; its number equals the new topic count.
    topic_cnt += 1
    topic_serial.append(topic_cnt)
    topic_cnt_dict[topic_cnt] = 1


def main():
    """Cluster the lines of the input file and print keywords per cluster.

    Reads the file named by the first command-line argument (one text per
    line), clusters the texts incrementally with single_pass, writes the
    texts of every topic holding at least 5 texts to 'Cluster_<input name>'
    as '<topic number>TAB<text>' lines, and prints the keywords that
    extract_keyword returns for each of those topics.
    """
    global preprocessed_data
    global topic_serial

    # Input file name
    file_name = sys.argv[1]
    # Load the model / tokenizer and reset the clustering state
    init()
    # Read the texts and cluster them incrementally
    with codecs.open(file_name, 'rb', 'utf-8') as infile:
        for line in infile:
            line = line.strip()
            if not line:
                continue
            word_li = preprocess(line, 1)
            sen_vec, has_vec_word_li = compute_sentence_vector(word_li)
            # Texts without any known word have no usable vector
            if has_vec_word_li:
                preprocessed_data.append(u' '.join(word_li))
                single_pass(sen_vec)

    # Group the texts by topic once instead of rescanning every text for
    # every topic.
    texts_by_topic = {}
    for topic_ser, text in zip(topic_serial, preprocessed_data):
        texts_by_topic.setdefault(topic_ser, []).append(text)

    # Write the clustering result, largest topics first
    cluster_text_li = []
    outfile_name = file_name.split(u'/')[-1]
    outfile_name = u'Cluster_%s' % outfile_name
    with open(outfile_name, 'wb') as outfile:
        sorted_topic_cnt_li = sorted(topic_cnt_dict.items(), key=lambda x: x[1], reverse=True)
        for out_topic_ser, text_cnt in sorted_topic_cnt_li:
            if text_cnt < 5:
                continue
            topic_texts = texts_by_topic.get(out_topic_ser, [])
            for text in topic_texts:
                # Tab-separated, newline-terminated record. The previous
                # format string had lost its backslashes and emitted the
                # literal letters 't' and 'n'.
                out_str = u'%d\t%s\n' % (out_topic_ser, text)
                outfile.write(out_str.encode('utf-8', 'ignore'))
            cluster_text = u''.join(u'%s ' % text for text in topic_texts)
            if cluster_text:
                cluster_text_li.append(cluster_text)

    # No topic reached 5 texts: there is nothing to extract keywords from.
    if not cluster_text_li:
        return
    # Extract keywords for every cluster
    category_keywords_li = extract_keyword(cluster_text_li)
    for key_word_li in category_keywords_li:
        print(u','.join(key_word_li))


if __name__ == "__main__":
    main()

类目关键词抽取_tfidf.py文件

#coding:utf-8
"""
关键词抽取tf-idf法
用法：python 类目关键词抽取_tfidf.py 文件名 每个类目最大关键词数量
要求：python3，sklearn，PyHanLP
说明：输入文件中每一行存储一个类目的所有文本。
程序会统计每个词项的tf-idf值，这里的idf指的逆类目频率，
并输出每个类目的按tf-idf值降序的topx个词语，x由第2个参数决定默认为10
"""

import codecs
from pyhanlp import *
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import ngrams

# Load the HanLP notional-word (content-word) tokenizer classes; see https://github.com/hankcs/pyhanlp/blob/master/tests/demos/demo_notional_tokenizer.py
Term = JClass("com.hankcs.hanlp.seg.common.Term")
NotionalTokenizer = JClass("com.hankcs.hanlp.tokenizer.NotionalTokenizer")


def preprocess(text, n_grams=2):
    """Tokenize a text into n-gram features (used as the TF-IDF analyzer).

    :param text: raw text
    :param n_grams: size of the n-grams to produce; 1 returns plain tokens
                    (single-character tokens removed), 2 or more returns
                    '_'-joined n-grams. Defaults to 2, the value that used
                    to be hard-coded.
    :return: list of feature strings
    """
    # Lower-case all letters
    text = text.lower()
    word_li = []

    # Per the original note, NotionalTokenizer.segment also removes stop words.
    for term in NotionalTokenizer.segment(text):
        word = str(term.word)
        pos = str(term.nature)
        # Drop time words
        if pos == u't':
            continue
        # Drop single-character tokens in unigram mode (they may be artefacts
        # of out-of-vocabulary words)
        if n_grams == 1 and len(word) == 1:
            continue
        # Append each token exactly once. A stray second append used to
        # duplicate every token and produce bogus 'w_w' bigrams.
        word_li.append(word)

    if n_grams >= 2:
        word_li = [u'_'.join(gram) for gram in ngrams(word_li, n_grams)]
    return word_li


def extract_keyword(text_li, topx=10):
    """
    Extract the top TF-IDF keywords of every category.

    :param text_li: list of category texts; each element is the concatenated
                    text of one category
    :param topx: number of keywords to return per category
    :return: list with one keyword list per element of text_li, each sorted
             by descending TF-IDF weight; an empty list for empty input
    """
    # Nothing to vectorize; fitting on an empty corpus would raise.
    if not text_li:
        return []
    tv = TfidfVectorizer(analyzer=preprocess)
    tv_fit = tv.fit_transform(text_li)
    vsm = tv_fit.toarray()
    # Fetch the feature names once instead of once per category.
    # get_feature_names() no longer exists in recent scikit-learn releases,
    # so prefer get_feature_names_out() when it is available.
    if hasattr(tv, "get_feature_names_out"):
        feature_names = list(tv.get_feature_names_out())
    else:
        feature_names = tv.get_feature_names()
    category_keywords_li = []
    for weights in vsm:
        sorted_keyword = sorted(zip(feature_names, weights), key=lambda x: x[1], reverse=True)
        category_keywords_li.append([w[0] for w in sorted_keyword[:topx]])
    return category_keywords_li


def main():
    """Command-line entry point.

    argv[1] is the input file (one category per line); an optional argv[2]
    is the number of keywords per category (default 10). Prints the keyword
    lists returned by extract_keyword.
    """
    # `sys` is not imported at the top of this file, so import it here.
    import sys

    input_file_name = sys.argv[1]
    topx = int(sys.argv[2]) if len(sys.argv) == 3 else 10
    # 'ignore' was misspelled 'igonre', which raises LookupError as soon as
    # an undecodable byte is met.
    with codecs.open(input_file_name, 'rb', 'utf-8', 'ignore') as infile:
        text_li = infile.readlines()
    category_keywords_li = extract_keyword(text_li, topx)
    print(category_keywords_li)


if __name__ == "__main__":
    main()

明天实现预测前加入类关联词约束限制，然后再使用基于向量相似度的方法自动标注文本。

weixin_39574287

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
监督分类空白处也被分类了_文本分类语料构建——半监督分类4

利用类关联词约束候选类目，这篇文章说明类关联词提取。我们把之前自动标注错误较多的三个类目（经济、民生、政治）的标注错误数据整理出来。1. 特征选择法抽取类关联词这里使用了专栏里前边的文章https://zhuanlan.zhihu.com/p/57969179的代码，稍做了一些修改，抽取出的特征词如下：1. “政治”、“文教”两个类目的数据杂乱，我们通过聚类后tf-idf法整理出这两个类目的关联词...
复制链接

扫一扫

监督分类空白处也被分类了_文本分类语料构建——半监督分类4

1. 特征选择法抽取类关联词

2. 聚类后TF-IDF抽取簇关键词

“相关推荐”对你有帮助么？