前置工作
本次,需要通过NuGet包管理器,引入如下NuGet包:
由上到下依次为
Python 中文分词组件jieba
ML组件
ML决策树组件
一、预设问题
本次假设对会议记录,使用机器学习进行判断,其填写内容是否合格。这里就要涉及到文本特征提取。
由于符号文字序列不能直接传递给算法.计算机程序只能对固定长度的数字矩阵特征向量(float或float数组)进行处理,我们需要对文本特征进行提取.
二、具体思路
我们使用Jieba进行处理,对句子进行分割,得到词语的数组,最后以空格为间隔进行拼接,如下图:
在建立学习管道时,引用该数据处理管道,进行分词:
执行方式参考上一篇对决策树二元分类的文章,结果如下:
以上!
完整代码:
using Microsoft.ML;
using Microsoft.ML.Data;
using System;
using System.Collections.Generic;
using System.IO;
using JiebaNet;
namespace BinaryClassification_TextFeaturize
{
/// <summary>
/// Console entry point: trains a FastTree binary classifier on meeting-record text
/// that is first word-segmented by <c>JiebaLambda.MyAction</c>, evaluates it on a
/// held-out split and prints predictions for a few sample sentences.
/// </summary>
internal class Program
{
    /// <summary>Location of the CSV data set, resolved relative to the current directory.</summary>
    private static readonly string DataPath = Path.Combine(Environment.CurrentDirectory, "Data", "meeting_data_full.csv");

    /// <summary>Builds, trains, evaluates and exercises the text-classification pipeline.</summary>
    /// <param name="args">Command-line arguments (not used).</param>
    private static void Main(string[] args)
    {
        MLContext mlContext = new MLContext();

        // Load the comma-separated file (no header row) and hold back 15% of the rows for testing.
        var fulldata = mlContext.Data.LoadFromTextFile<MeetingInfo>(DataPath, separatorChar: ',', hasHeader: false);
        var trainTestData = mlContext.Data.TrainTestSplit(fulldata, testFraction: 0.15);
        var trainData = trainTestData.TrainSet;
        var testData = trainTestData.TestSet;

        // Build the model.
        // TextFeaturizingEstimator turns the space-separated "JiebaText" column into a
        // float feature vector of normalized n-gram / char-gram counts.
        IEstimator<ITransformer> dataProcessPipeline = mlContext.Transforms.Text.FeaturizeText(outputColumnName: "Features", inputColumnName: "JiebaText");

        // FastTree (boosted decision trees) binary trainer.
        IEstimator<ITransformer> trainer = mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features");

        // Custom mapping JiebaLambdaInput -> JiebaLambdaOutput: produces "JiebaText" from "Text".
        IEstimator<ITransformer> customMapping = mlContext.Transforms.CustomMapping<JiebaLambdaInput, JiebaLambdaOutput>(mapAction: JiebaLambda.MyAction, contractName: "JiebaLambda");

        // Segmentation must run first so that "JiebaText" exists when the featurizer reads it.
        IEstimator<ITransformer> trainingPipeline = customMapping.Append(dataProcessPipeline.Append(trainer));
        ITransformer trainedModel = trainingPipeline.Fit(trainData);

        // Evaluate on the held-out rows.
        var predictions = trainedModel.Transform(testData);
        DebugData(mlContext, predictions);
        var metrics = mlContext.BinaryClassification.Evaluate(data: predictions, labelColumnName: "Label");
        Console.WriteLine($"评估精度: {metrics.Accuracy:P2}");

        // Model persistence (mlContext.Model.Save(trainedModel, trainData.Schema, path) into
        // Data/meeting_data_full.zip, then mlContext.Model.Load) is intentionally not performed:
        // the original author found that the saved model could not be loaded back because the
        // word-segmentation step was not carried into the model.
        // NOTE(review): ML.NET documents that a save-able custom mapping needs a
        // CustomMappingFactory with a matching contract name and a registered assembly - confirm.

        // The prediction engine is disposable, so release it deterministically.
        using (var predEngine = mlContext.Model.CreatePredictionEngine<MeetingInfo, PredictionResult>(trainedModel))
        {
            string[] sampleTexts =
            {
                "支委会。",
                "进行新时代中国特色社会主义思想专讲的党员答题活动。",
                "优秀党员评选文件的宣读。",
            };

            foreach (string sampleText in sampleTexts)
            {
                MeetingInfo sampleStatement = new MeetingInfo { Text = sampleText };
                PredictionResult predictionResult = predEngine.Predict(sampleStatement);
                Console.WriteLine($"{sampleStatement.Text}:{predictionResult.PredictedLabel}");
                predictionResult.PrintToConsole();
            }
        }

        Console.WriteLine("回车退出!");
        // The prompt asks for Enter; ReadLine honours that and, unlike ReadKey,
        // does not throw when standard input is redirected.
        Console.ReadLine();
    }

    /// <summary>Prints every row of a scored data view to the console for inspection.</summary>
    /// <param name="mlContext">Context used to enumerate the data view.</param>
    /// <param name="predictions">Scored rows produced by the trained model.</param>
    private static void DebugData(MLContext mlContext, IDataView predictions)
    {
        // Converting an IDataView to an IEnumerable is one of the quickest ways to inspect it.
        // Columns missing from the view are ignored, so some members may keep their defaults.
        var rows = mlContext.Data.CreateEnumerable<PredictionResult>(predictions, reuseRowObject: false, ignoreMissingColumns: true);
        foreach (var dataline in rows)
        {
            dataline.PrintToConsole();
        }
    }
}
/// <summary>
/// One input row of the meeting-record data set: a boolean label and the raw record text.
/// </summary>
public class MeetingInfo
{
    /// <summary>
    /// Label read from column 0.
    /// NOTE(review): presumably true when the record content is acceptable - confirm against the data file.
    /// </summary>
    [LoadColumn(0)]
    public bool Label { get; set; }

    /// <summary>Raw, unsegmented record text read from column 1.</summary>
    [LoadColumn(1)]
    public string Text { get; set; }
}
/// <summary>
/// A scored row: the input columns inherited from <see cref="MeetingInfo"/> plus the
/// intermediate and output columns whose names match the members below.
/// </summary>
public class PredictionResult : MeetingInfo
{
    /// <summary>Space-separated word segmentation of the text.</summary>
    public string JiebaText { get; set; }

    /// <summary>Feature vector; may be null when the column was not populated.</summary>
    public float[] Features { get; set; }

    /// <summary>Predicted class.</summary>
    public bool PredictedLabel;

    /// <summary>Score reported alongside the prediction.</summary>
    public float Score;

    /// <summary>Probability reported alongside the prediction.</summary>
    public float Probability;

    /// <summary>
    /// Writes the segmentation, the prediction values and the feature-vector length to the console.
    /// </summary>
    public void PrintToConsole()
    {
        Console.WriteLine($"分词={JiebaText}");
        Console.WriteLine($"预测标签:{PredictedLabel},得分:{Score},可能性:{Probability}");

        // Features stays null when the column is absent from the enumerated view;
        // report a length of 0 instead of throwing NullReferenceException.
        int featureCount = Features?.Length ?? 0;
        Console.WriteLine($"文本特征长度:{featureCount}");
        Console.WriteLine();
    }
}
/// <summary>
/// Input side of the segmentation custom mapping: carries the raw text to be split into words.
/// </summary>
public class JiebaLambdaInput
{
    /// <summary>Raw text to segment.</summary>
    public string Text
    {
        get;
        set;
    }
}
/// <summary>
/// Output side of the segmentation custom mapping: receives the segmented text.
/// </summary>
public class JiebaLambdaOutput
{
    /// <summary>Segmented words joined into a single string.</summary>
    public string JiebaText
    {
        get;
        set;
    }
}
/// <summary>
/// Hosts the row-level mapping that word-segments Chinese text with the jieba segmenter.
/// </summary>
/// <remarks>
/// NOTE(review): to make a model containing this mapping save-able and loadable, ML.NET
/// documents deriving from CustomMappingFactory with a matching contract name - confirm.
/// </remarks>
public class JiebaLambda
{
    // Number of rows processed so far; kept purely as a debugging aid.
    private static int Count = 0;

    /// <summary>
    /// Splits <c>input.Text</c> into words and stores them, joined by single spaces,
    /// in <c>output.JiebaText</c>.
    /// </summary>
    /// <param name="input">Row holding the raw text; a null text yields an empty result.</param>
    /// <param name="output">Row that receives the segmented text.</param>
    public static void MyAction(JiebaLambdaInput input, JiebaLambdaOutput output)
    {
        if (input.Text == null)
        {
            // Nothing to segment; avoid handing null to the segmenter.
            output.JiebaText = string.Empty;
        }
        else
        {
            // A segmenter instance per call keeps the mapping free of shared mutable state.
            var segmenter = new JiebaNet.Segmenter.JiebaSegmenter();
            output.JiebaText = string.Join(" ", segmenter.Cut(input.Text));
        }

        // The mapping may be invoked from several threads, so count atomically.
        System.Threading.Interlocked.Increment(ref Count);
        //Console.WriteLine($"JiebaLambda.MyAction Debug:{Count}");
    }
}
}