ML.NET 决策树二元分类模型 文本特征分析

3 篇文章 0 订阅

前置工作

本次,需要通过NuGet包管理器,引入如下NuGet包:
在这里插入图片描述
由上到下依次为
中文分词组件 jieba 的 .NET 版本(jieba.NET,源自 Python 的 jieba)
ML组件
ML决策树组件

一、预设问题

本次假设对会议记录使用机器学习进行判断,判断其填写内容是否合格。这里就要涉及到文本特征提取。
由于符号文字序列不能直接传递给算法,计算机程序只能对固定长度的数字矩阵特征向量(float或float数组)进行处理,因此我们需要对文本特征进行提取。

二、具体思路

我们使用Jieba进行处理,对句子进行分割,得到词语的数组,最后以空格为间隔进行拼接,如下图:
在这里插入图片描述
在建立学习管道时,引用该数据处理管道,进行分词:
在这里插入图片描述
执行方式参考上一篇对决策树二元分类的文章,结果如下:
在这里插入图片描述
以上!
完整代码:

using Microsoft.ML;
using Microsoft.ML.Data;
using System;
using System.Collections.Generic;
using System.IO;
using JiebaNet;


namespace BinaryClassification_TextFeaturize
{
    /// <summary>
    /// Console entry point: trains a FastTree binary classifier on Jieba-segmented meeting
    /// text, evaluates it on a held-out split, and prints predictions for a few sample texts.
    /// </summary>
    class Program
    {
        // Training data: a header-less CSV with the label in column 0 and the raw text in
        // column 1 (see the LoadColumn attributes on MeetingInfo).
        static readonly string DataPath = Path.Combine(Environment.CurrentDirectory, "Data", "meeting_data_full.csv");

        // Meeting notes that are scored one by one after training.
        static readonly string[] SampleTexts =
        {
            "支委会。",
            "进行新时代中国特色社会主义思想专讲的党员答题活动。",
            "优秀党员评选文件的宣读。",
        };

        /// <summary>
        /// Loads the data, trains and evaluates the model, prints sample predictions and
        /// then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            MLContext mlContext = new MLContext();
            var fulldata = mlContext.Data.LoadFromTextFile<MeetingInfo>(DataPath, separatorChar: ',', hasHeader: false);

            // Hold back 15% of the rows for evaluation.
            var trainTestData = mlContext.Data.TrainTestSplit(fulldata, testFraction: 0.15);
            var trainData = trainTestData.TrainSet;
            var testData = trainTestData.TestSet;

            // Build the model.
            // Text featurizer: turns the space-separated "JiebaText" column into the numeric "Features" vector.
            IEstimator<ITransformer> dataProcessPipeline = mlContext.Transforms.Text.FeaturizeText(outputColumnName: "Features", inputColumnName: "JiebaText");
            // FastTree binary classification trainer.
            IEstimator<ITransformer> trainer = mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features");
            // Custom mapping JiebaLambdaInput -> JiebaLambdaOutput: word-segments "Text" into "JiebaText".
            IEstimator<ITransformer> customMapping = mlContext.Transforms.CustomMapping<JiebaLambdaInput, JiebaLambdaOutput>(mapAction: JiebaLambda.MyAction, contractName: "JiebaLambda");
            // Order matters: segmentation must run first because the featurizer reads "JiebaText".
            IEstimator<ITransformer> trainingPipeline = customMapping.Append(dataProcessPipeline.Append(trainer));
            ITransformer trainedModel = trainingPipeline.Fit(trainData);

            // Evaluate on the held-out rows.
            var predictions = trainedModel.Transform(testData);
            DebugData(mlContext, predictions);
            var metrics = mlContext.BinaryClassification.Evaluate(data: predictions, labelColumnName: "Label");
            Console.WriteLine($"评估精度: {metrics.Accuracy:P2}");

            // Saving/loading the model is intentionally left out: the original author notes that
            // a saved model could not be loaded back because the segmentation step was not restored.
            // NOTE(review): ML.NET presumably requires a CustomMappingFactory for the contract name
            // "JiebaLambda", registered through mlContext.ComponentCatalog.RegisterAssembly, before
            // mlContext.Model.Load can rebuild the custom mapping — confirm against the ML.NET docs.

            // Create the prediction engine from the in-memory model.
            var predEngine = mlContext.Model.CreatePredictionEngine<MeetingInfo, PredictionResult>(trainedModel);

            // Score every sample text and print the result.
            foreach (string text in SampleTexts)
            {
                MeetingInfo sampleStatement = new MeetingInfo { Text = text };
                var predictionResult = predEngine.Predict(sampleStatement);
                Console.WriteLine($"{sampleStatement.Text}:{predictionResult.PredictedLabel}");
                predictionResult.PrintToConsole();
            }

            Console.WriteLine("回车退出!");
            Console.ReadKey();
        }

        /// <summary>
        /// Prints every row of <paramref name="predictions"/> to the console for inspection.
        /// </summary>
        /// <param name="mlContext">Context used to convert the data view into objects.</param>
        /// <param name="predictions">Transformed data view whose rows are printed.</param>
        private static void DebugData(MLContext mlContext, IDataView predictions)
        {
            // Converting an IDataView to an IEnumerable is one of the quickest ways to inspect it.
            // reuseRowObject is false so each row is a distinct object; ignoreMissingColumns is true
            // so members of PredictionResult without a matching column are tolerated.
            var rows = mlContext.Data.CreateEnumerable<PredictionResult>(predictions, reuseRowObject: false, ignoreMissingColumns: true);

            foreach (var dataline in rows)
            {
                dataline.PrintToConsole();
            }
        }
    }

    /// <summary>
    /// One row of the meeting data set: a label plus the raw meeting-record text.
    /// </summary>
    public class MeetingInfo
    {
        /// <summary>Label loaded from column 0 of the input file.</summary>
        [LoadColumn(0)]
        public bool Label { get; set; }

        /// <summary>Raw, unsegmented meeting text loaded from column 1 of the input file.</summary>
        [LoadColumn(1)]
        public string Text { get; set; }
    }

    /// <summary>
    /// A scored row: the input columns inherited from <see cref="MeetingInfo"/> plus the
    /// segmented text, the feature vector and the classifier outputs.
    /// </summary>
    public class PredictionResult : MeetingInfo
    {
        /// <summary>Space-separated words produced by the segmentation step.</summary>
        public string JiebaText { get; set; }

        /// <summary>Numeric feature vector; may be null when the row was created without a "Features" column.</summary>
        public float[] Features { get; set; }

        // Classifier outputs. These stay public fields so the type's member shape is unchanged.
        public bool PredictedLabel;
        public float Score;
        public float Probability;

        /// <summary>
        /// Writes the segmented text, the prediction values and the feature-vector length to the console.
        /// </summary>
        public void PrintToConsole()
        {
            // Rows can be materialised with missing columns ignored, which leaves Features null;
            // report a length of 0 in that case instead of throwing a NullReferenceException.
            int featureCount = Features?.Length ?? 0;

            Console.WriteLine($"分词={JiebaText}");
            Console.WriteLine($"预测标签:{PredictedLabel},得分:{Score},可能性:{Probability}");
            Console.WriteLine($"文本特征长度:{featureCount}");
            Console.WriteLine();
        }
    }

    /// <summary>
    /// Source row type of the Jieba custom mapping (the TSrc of CustomMapping in Program.Main).
    /// </summary>
    public class JiebaLambdaInput
    {
        /// <summary>
        /// Raw text to segment. NOTE(review): presumably bound by name to the "Text" column
        /// declared on MeetingInfo — confirm against the ML.NET schema rules.
        /// </summary>
        public string Text { get; set; }
    }

    /// <summary>
    /// Destination row type of the Jieba custom mapping (the TDst of CustomMapping in Program.Main).
    /// </summary>
    public class JiebaLambdaOutput
    {
        /// <summary>
        /// Segmented text written by JiebaLambda.MyAction: the words joined by single spaces.
        /// The pipeline's FeaturizeText step reads a column named "JiebaText".
        /// </summary>
        public string JiebaText { get; set; }
    }

    /// <summary>
    /// Holds the mapping action that word-segments Chinese text with jieba.NET so the
    /// result can be consumed by a whitespace-based text featurizer.
    /// </summary>
    public class JiebaLambda
    {
        /// <summary>
        /// Segments <c>input.Text</c> into words and stores them, joined by single spaces,
        /// in <c>output.JiebaText</c>.
        /// </summary>
        /// <param name="input">Row providing the raw text.</param>
        /// <param name="output">Row that receives the segmented text.</param>
        public static void MyAction(JiebaLambdaInput input, JiebaLambdaOutput output)
        {
            // A fresh segmenter per call keeps this action free of shared segmenter state.
            // NOTE(review): a single shared instance would avoid the allocation, but only if
            // JiebaSegmenter.Cut is safe to call concurrently — confirm before hoisting.
            JiebaNet.Segmenter.JiebaSegmenter jiebaSegmenter = new JiebaNet.Segmenter.JiebaSegmenter();

            // A missing text is treated as empty so a row without text does not make the segmenter throw.
            output.JiebaText = string.Join(" ", jiebaSegmenter.Cut(input.Text ?? string.Empty));

            // Debug counter of processed rows. The mapping action may be invoked from several
            // threads, so the increment must be atomic rather than a plain Count++.
            System.Threading.Interlocked.Increment(ref Count);
        }

        // Number of rows processed so far; only useful for debugging.
        static int Count = 0;
    }
}

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值