Two-Level News Label Classification with a Keras Deep Learning Model

A deep learning model for first- and second-level news label classification
The model analyzes a news article's title, body, and source. It reaches roughly 90% accuracy on first-level labels and 72% on second-level labels, and it prevents inconsistent first/second-level predictions (for example, a second-level label such as "Football" should only appear together with its parent first-level label "Sports").

Import the required packages

import tensorflow as tf
import pandas as pd
import numpy as np
from keras_bert import Tokenizer
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras import backend as K
from scipy import sparse
from tensorflow.keras.optimizers import Adam
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import json
import argparse

"""
Model purpose: first- and second-level news label classification
Date: 2020/12
Description: built on a TextCNN deep learning architecture
"""

def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(usage="news label classification training script",
                                     description="predict news type")
    parser.add_argument("--label-sample-num", default=10000, type=int,
                        help="number of samples per label")
    parser.add_argument("--embedding-size", default=128, type=int,
                        help="character embedding size")
    parser.add_argument("--max-title-size", default=32, type=int,
                        help="maximum number of characters in the title")
    parser.add_argument("--max-content-size", default=512, type=int,
                        help="maximum number of characters in the article body")
    parser.add_argument("--max-source-size", default=32, type=int,
                        help="maximum number of characters in the article source")
    parser.add_argument("--batch-size", default=16, type=int,
                        help="number of samples per training batch")
    parser.add_argument("--epochs", default=5, type=int,
                        help="number of training epochs")
    parser.add_argument("--model-save-path", type=str, help="path to save the model")
    parser.add_argument("--model-desc-save-path", type=str,
                        help="path to save the model description file")
    parser.add_argument("--con1-size", default=128, type=int,
                        help="number of filters in the first convolution layer")
    parser.add_argument("--con2-size", default=64, type=int,
                        help="number of filters in the second convolution layer")
    parser.add_argument("--dense-size", default=128, type=int,
                        help="size of the fully connected layer")
    parser.add_argument("--learning-rate", default=0.001, type=float,
                        help="model learning rate")
    return parser.parse_args()
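
For a quick sanity check of the parser, you can simulate a command line directly in Python; this is only a minimal sketch, and the script name and HDFS paths below are placeholders, not values from the original article.

import sys

# Pretend the script was launched with a few overrides, then inspect the parsed values.
sys.argv = ["news_classify.py",
            "--epochs", "3",
            "--model-save-path", "hdfs:///tmp/news_label_model",
            "--model-desc-save-path", "hdfs:///tmp/news_label_model_desc.json"]
args = parse_args()
print(args.epochs, args.batch_size, args.learning_rate)  # 3 16 0.001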

Define the model class

class TextCNN(object):
    """Deep learning text classification model for news labels.

    Takes the title, source, and body of a news article as input, encodes the text,
    runs it through the network, and outputs the article's first- and second-level labels.

    Args:
        vocab: path to the character vocabulary file
        embedding_size: size of the character embeddings
        max_title: maximum title length
        max_content: maximum body length
        max_source: maximum source length
        first_class_num: number of first-level labels
        second_class_num: number of second-level labels

    Returns:
        model1, model2: the training and inference models (see get_model)
    """

    FIRST_CLASS = "first_class"
    SECOND_CLASS = "second_class"

    def __init__(self, vocab, embedding_size, max_title, max_content, max_source, first_class_num, second_class_num):
        self.vocab = vocab
        self.max_title = max_title
        self.max_content = max_content
        self.max_source = max_source
        self.tokenizer, self.vocab_size = self.__get_tokenizer()
        self.embedding_size = embedding_size
        self.first_class_num = first_class_num
        self.second_class_num = second_class_num

    def __get_tokenizer(self):
        """Build the character-to-index dictionary and return a tokenizer plus the vocabulary size."""
        token_dict = {}
        with open(self.vocab, 'r', encoding='utf-8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        return Tokenizer(token_dict), len(token_dict)

    def get_tokenizer(self):
        return self.tokenizer

    def encode(self, text, max_len):
        """Encode a text into character indices, truncated or zero-padded to max_len."""
        return self.tokenizer.encode(first=text, max_len=max_len)[0]

    def get_model(self, class_map, con1_size, con2_size, dense_size, learning_rate):
        """Build the TextCNN over title/body/source and return the training and inference models."""
        title = Input(shape=(self.max_title,), name='title_ids', dtype=tf.float32)
        content = Input(shape=(self.max_content,), name='content_ids', dtype=tf.float32)
        source = Input(shape=(self.max_source,), name='source_ids', dtype=tf.float32)
        embedding_layer = Embedding(self.vocab_size + 1, self.embedding_size)
        mask_layer = Embedding(self.first_class_num, self.second_class_num, weights=[class_map],
                               trainable=False)  # fixed lookup that enforces first/second-level consistency
        embedding_title = embedding_layer(title)
        embedding_content = embedding_layer(content)
        embedding_source = embedding_layer(source)
        flat_layers = []
        for embedded_input in [embedding_title, embedding_content, embedding_source]:
            layers = []
            for i in [3, 5, 7]:
                conv = Conv1D(con1_size, i, padding='same', strides=1, activation='relu')(embedded_input)
                pool = MaxPooling1D(pool_size=3, padding='same')(conv)
                conv = Conv1D(con2_size, i, padding='same', strides=1, activation='relu')(pool)
                pool = MaxPooling1D(pool_size=3, padding='same')(conv)
                layers += [pool]
            flat = Flatten()(concatenate(layers, axis=-1))
            flat = Activation("relu")(BatchNormalization()(Dense(dense_size, activation=None)(flat)))
            flat_layers += [flat]
        flat_concat = concatenate(flat_layers, axis=-1)
        dense = Activation("relu")(BatchNormalization()(Dense(dense_size, activation=None)(flat_concat)))
        output_first = Dense(self.first_class_num, activation='softmax')(dense)
        first_class_value = Lambda(lambda x: K.argmax(x), name=self.FIRST_CLASS)(output_first)
        mask = mask_layer(first_class_value)
        second = Dense(self.second_class_num, activation=None)(dense)
        second = Multiply()([second, mask])
        output_second = Activation("softmax")(second)
        second_class_value = Lambda(lambda x: K.argmax(x), name=self.SECOND_CLASS)(output_second)
        model1 = Model(inputs=[title, content, source], outputs=[output_first, output_second])
        model1.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=learning_rate),
                       metrics=['accuracy'])
        model2 = Model(inputs=[title, content, source], outputs=[first_class_value, second_class_value])
        model2.summary()
        return model1, model2
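
The mask_layer is what keeps the two heads consistent: class_map is a 0/1 matrix of shape (first_class_num, second_class_num), and looking up the row for the predicted first-level class zeroes out the logits of every second-level class that does not belong to it. Below is a minimal NumPy sketch of that lookup; the tiny two-by-four taxonomy is purely illustrative.

import numpy as np
from scipy import sparse

# Illustrative taxonomy: first-level 0 owns second-level {0, 1}; first-level 1 owns {2, 3}.
sub_to_first = {0: 0, 1: 0, 2: 1, 3: 1}        # second-level index -> first-level index
data = np.ones(len(sub_to_first))
rows = list(sub_to_first.values())              # first-level indices
cols = list(sub_to_first.keys())                # second-level indices
class_map = sparse.csr_matrix((data, (rows, cols))).todense()
# class_map == [[1, 1, 0, 0],
#               [0, 0, 1, 1]]

predicted_first = 1                             # argmax of the first-level head
second_logits = np.array([0.9, 0.4, 0.3, 0.7])  # raw second-level logits
masked = np.multiply(second_logits, np.asarray(class_map)[predicted_first])
print(masked)                                   # [0.  0.  0.3 0.7] -> only children of class 1 survive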

Define the model description class

class ModelDescription(object):
    """Description file for the classification model.

    Describes the model's input and output fields.

    Args:
        dim: field dimension
        map_key: name of the raw input field mapped to the model input
        tensor_name: name of the model input tensor
        data_type: data type of the model input field
        handler: preprocessing handler
        fill_value: padding value used when a field is too short

    Returns:
        model: the model description dictionary
    """

    def __init__(self):
        # Keep the description as an instance attribute so each instance gets its own dictionary.
        self.model = {}
        self.model['model_desc'] = {}
        self.model['model_desc']['signature_name'] = ""
        self.model['model_desc']['inputs'] = {}
        self.model['model_desc']['outputs'] = []

    def build_context_field(self, dim, map_key, tensor_name, data_type="int", handler="tokenizer", fill_value=0):
        field = {'dim': dim, 'map_key': map_key, 'tensor_name': tensor_name, 'data_type': data_type,
                 'handler': handler, 'fill_value': fill_value}
        return field

    def build_source(self, length, tensor_name):
        return self.build_context_field(length, "source", tensor_name)

    def build_title(self, length, tensor_name):
        return self.build_context_field(length, "title", tensor_name)

    def build_content(self, length, tensor_name):
        return self.build_context_field(length, "content", tensor_name)

    def set_context(self, source_len, source_tensor_name, title_len, title_tensor_name, content_len,
                    content_tensor_name):
        source = self.build_source(source_len, source_tensor_name)
        title = self.build_title(title_len, title_tensor_name)
        content = self.build_content(content_len, content_tensor_name)
        self.model['model_desc']['inputs']['context'] = [source, title, content]

    def add_out_put(self, map_key, tensor_name, tag_name):
        output = {"map_key": map_key, "tensor_name": tensor_name, "data_type": "int", "handler": "tags",
                  "tag_name": tag_name, "fill_value": "0", "dim": -1}
        self.model['model_desc']['outputs'] = self.model['model_desc']['outputs'] + [output]

    def to_json(self):
        return json.dumps(self.model, ensure_ascii=False)
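
For reference, here is a minimal sketch of the JSON this class produces; the lengths, tensor names, and the two label lists are only illustrative, not the real taxonomy.

# Build a toy description and inspect the resulting JSON.
desc = ModelDescription()
desc.set_context(32, 'source_ids', 32, 'title_ids', 512, 'content_ids')
desc.add_out_put('first_level_label', 'first_class', ['Sports', 'Finance'])
desc.add_out_put('second_level_label', 'second_class', ['Football', 'Stocks'])
print(desc.to_json())
# {"model_desc": {"signature_name": "", "inputs": {"context": [...]}, "outputs": [...]}}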

def news_classify_algo():
    # Start the Spark session used to pull the sample data
    spark = SparkSession \
        .builder \
        .config("spark.sql.broadcastTimeout", "3000") \
        .master("yarn") \
        .enableHiveSupport() \
        .getOrCreate()

    args = parse_args()

    # Read the sample data from the Hive table
    sql = '''select news_id,title,content,type,source,content_type,first_label,second_label
    from dp_dm.content_center_news_classify_sample_data
    where rank <={}'''.format(args.label_sample_num)
    news_sample = spark.sql(sql).toPandas()

    # Drop rows with missing values and duplicates
    news = news_sample[
        ['news_id', 'title', 'content', 'first_label', 'source', 'second_label']].dropna().drop_duplicates()

    # Build label-to-index mappings
    category = sorted(np.unique(news['first_label'].dropna().values))
    sub_category = sorted(np.unique(news['second_label'].dropna().values))
    category_map = dict(zip(category, np.arange(len(category))))
    sub_category_map = dict(zip(sub_category, np.arange(len(sub_category))))
    # Initialize the model
    text_cnn = TextCNN(vocab="vocab.txt", embedding_size=args.embedding_size, max_title=args.max_title_size,
                       max_content=args.max_content_size, max_source=args.max_source_size,
                       first_class_num=len(category), second_class_num=len(sub_category))

    # Encode the labels and the input features
    news['category'] = news['first_label'].map(category_map)
    news['sub_category'] = news['second_label'].map(sub_category_map)
    news['title_ids'] = news['title'].apply(lambda x: text_cnn.encode(x, text_cnn.max_title))
    news['content_ids'] = news['content'].apply(lambda x: text_cnn.encode(x, text_cnn.max_content))
    news['source_ids'] = news['source'].apply(lambda x: text_cnn.encode(x, text_cnn.max_source))

    # Build the second-level to first-level label mapping
    category_level_reverse_map = dict(zip(news['sub_category'], news['category']))

    # Split the data into training and validation sets
    train_x, test_x, train_y, test_y = train_test_split(news[['title_ids', 'content_ids', 'source_ids']],
                                                        news[['category', 'sub_category']])

    # Build the first-level/second-level class matrix
    def get_class_matrix(class_dict):
        """Return a dense 0/1 matrix of shape (first_class_num, second_class_num);
        entry (i, j) is 1 when second-level class j belongs to first-level class i."""
        data = np.ones(len(class_dict))
        rows = list(class_dict.values())   # first-level class indices
        cols = list(class_dict.keys())     # second-level class indices
        map_mat = sparse.csr_matrix((data, (rows, cols))).todense()
        return map_mat

    # Assemble the x and y arrays for the training and validation sets
    tx_title = np.array(train_x['title_ids'].values.tolist()).astype(np.float32)
    tx_content = np.array(train_x['content_ids'].values.tolist()).astype(np.float32)
    tx_source = np.array(train_x['source_ids'].values.tolist()).astype(np.float32)
    tx = [tx_title, tx_content, tx_source]
    ty_cate = np.array(train_y['category'].values.tolist()).astype(np.float32)
    ty_subcate = np.array(train_y['sub_category'].values.tolist()).astype(np.float32)
    ty = [ty_cate, ty_subcate]
    ex_title = np.array(test_x['title_ids'].values.tolist()).astype(np.float32)
    ex_content = np.array(test_x['content_ids'].values.tolist()).astype(np.float32)
    ex_source = np.array(test_x['source_ids'].values.tolist()).astype(np.float32)
    ex = [ex_title, ex_content, ex_source]
    ey_cate = np.array(test_y['category'].values.tolist()).astype(np.float32)
    ey_subcate = np.array(test_y['sub_category'].values.tolist()).astype(np.float32)
    ey = [ey_cate, ey_subcate]

    model1, model2 = text_cnn.get_model(get_class_matrix(category_level_reverse_map),
                                        args.con1_size,
                                        args.con2_size,
                                        args.dense_size,
                                        args.learning_rate)

    # Train the model
    model1.fit(x=tx, y=ty, batch_size=args.batch_size, validation_data=(ex, ey), epochs=args.epochs)

    # Save the inference model to HDFS
    model2.save(args.model_save_path)

    # Build the model description file
    news_model = ModelDescription()
    news_model.set_context(args.max_source_size, 'source_ids', args.max_title_size, 'title_ids',
                           args.max_content_size, 'content_ids')
    news_model.add_out_put('一级标签', text_cnn.FIRST_CLASS, list(category_map.keys()))
    news_model.add_out_put('二级标签', text_cnn.SECOND_CLASS, list(sub_category_map.keys()))
    sc = spark.sparkContext
    fs_class = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
    conf_class = sc._gateway.jvm.org.apache.hadoop.conf.Configuration
    fs = fs_class.get(conf_class())
    path_class = sc._gateway.jvm.org.apache.hadoop.fs.Path

    def save_file(path: str, data: str):
        """Save a file to HDFS.
        Args:
            path(str): path on HDFS
            data(str): data to write
        """
        output = fs.create(path_class(path))
        output.write(data.encode())
        output.flush()
        output.close()

    # Save the description file to HDFS
    data = news_model.to_json()
    save_file(args.model_desc_save_path, data)

Main entry point

if __name__ == '__main__':
    news_classify_algo()
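
Once training finishes, the saved inference model (model2) outputs the label indices directly. The following is a minimal sketch of how it might be loaded and queried elsewhere, assuming the same vocab.txt and length settings used at training time; the model path, label lists, and sample strings are placeholders only.

import numpy as np
import tensorflow as tf

first_labels = ['Finance', 'Sports']        # placeholder: use list(category_map.keys()) from training
second_labels = ['Stocks', 'Football']      # placeholder: use list(sub_category_map.keys()) from training

# Re-create the encoder with the same vocabulary and length settings used at training time.
text_cnn = TextCNN(vocab="vocab.txt", embedding_size=128, max_title=32, max_content=512,
                   max_source=32, first_class_num=len(first_labels), second_class_num=len(second_labels))

model = tf.keras.models.load_model("hdfs:///tmp/news_label_model")  # placeholder path

title_ids = np.array([text_cnn.encode("sample title", text_cnn.max_title)], dtype=np.float32)
content_ids = np.array([text_cnn.encode("sample body text", text_cnn.max_content)], dtype=np.float32)
source_ids = np.array([text_cnn.encode("sample source", text_cnn.max_source)], dtype=np.float32)

# The inference model returns the argmax indices of the two heads.
first_idx, second_idx = model.predict([title_ids, content_ids, source_ids])
print(first_labels[int(first_idx[0])], second_labels[int(second_idx[0])])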
