基于Kubernetes、Docker的机器学习微服务系统设计 | |||||
---|---|---|---|---|---|
实践篇 | (一)概念与构想 | (二)架构与部署 | (三)微服务框架 | (四)中文分词 | (五)预处理 |
(六)特征选择 | (七)分类器 | (八)部署配置 | (九)应用服务 | (十)数据可视化 | |
研究篇 | RS中文分词 | MP特征选择 | NLV文本分类 | 快速kNN | 文本分类 |
文本分类任务的预处理阶段一般包括中文分词。本系列已将中文分词单独提出来作为一个微服务,因此这里预处理阶段的主要任务是停用词去除、索引词典的构建,以及词文档矩阵化,即构建向量空间模型(VSM,Vector Space Model)。
实现代码
预处理Action实现类
package com.robin.pretreatment.action;
import com.robin.loader.MircoServiceAction;
import com.robin.log.RobinLogger;
import com.robin.pretreatment.DicIndex;
import com.robin.pretreatment.DicIndex.Language;
import com.robin.pretreatment.WordDocMatrix;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
/**
 * Pre-processing action of the text-classification micro-service.
 *
 * <p>The action validates the request envelope ({@code kind} must be
 * {@code "pretreatment"}, {@code version} must be {@code "v1"}), builds the index
 * dictionary through {@link DicIndex} and then replaces the {@code text} of every
 * document found under {@code texts} with the output of {@link WordDocMatrix},
 * adding a {@code totalWords} field next to it.
 *
 * <p>Every failure is reported as a JSON object carrying a {@code status} taken
 * from {@link StatusCode}; a request is never answered with {@code OK} when its
 * processing was aborted by a {@link JSONException}.
 *
 * @author Robin (E-mail: xsd-jj@163.com)
 * @version V1.0 Date:2018-04-08
 */
public class PretreatAction implements MircoServiceAction {

    private static final Logger LOGGER = RobinLogger.getLogger();

    /** Status values written to the {@code status} field of an error response. */
    public enum StatusCode {
        OK,
        JSON_ERR,
        KIND_ERR,
        VERSION_ERR,
        MIN_FREQUENCY_ERR,
        TEXTS_NULL,
    }

    /** Immutable outcome of a validation step: a status code plus an optional message. */
    private static final class ActionStatus {

        final StatusCode statusCode;
        final String msg;

        ActionStatus(StatusCode statusCode, String msg) {
            this.statusCode = statusCode;
            this.msg = msg;
        }

        boolean isOk() {
            return this.statusCode == StatusCode.OK;
        }
    }

    /**
     * Converts a failed status into the JSON error response.
     *
     * @param actionStatus the failed status
     * @return JSON object with the fields {@code status} and {@code msg}
     */
    private JSONObject getErrorJson(ActionStatus actionStatus) {
        JSONObject errJson = new JSONObject();
        try {
            errJson.put("status", actionStatus.statusCode.toString());
            errJson.put("msg", actionStatus.msg);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        }
        return errJson;
    }

    /**
     * Checks that {@code key} is present in {@code jsonObj} and that its value is
     * one of the accepted values.
     *
     * @param jsonObj       request object to inspect
     * @param key           name of the field to check
     * @param valueSet      accepted values for the field
     * @param errStatusCode status code to report when the check fails
     * @return a status with {@link StatusCode#OK} on success, otherwise a status
     *         carrying {@code errStatusCode} and an explanatory message
     */
    private ActionStatus checkJSONObjectTerm(JSONObject jsonObj,
            String key,
            HashSet<String> valueSet,
            StatusCode errStatusCode) {
        try {
            if (jsonObj.isNull(key)) {
                return new ActionStatus(errStatusCode,
                        "The input parameter is missing " + key + ".");
            }
            String value = jsonObj.getString(key);
            if (!valueSet.contains(value)) {
                return new ActionStatus(errStatusCode,
                        "The value [" + value + "] of " + key + " is error.");
            }
        } catch (JSONException ex) {
            // A field that cannot be read has not been validated, so it must not pass as OK.
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return new ActionStatus(errStatusCode,
                    "The value of " + key + " can not be read: " + ex.getMessage());
        }
        return new ActionStatus(StatusCode.OK, null);
    }

    /**
     * Validates the {@code kind} and {@code version} fields of the request.
     *
     * @param jsonObj request object
     * @return the first failing status, or a status with {@link StatusCode#OK}
     */
    private ActionStatus checkInputJSONObject(JSONObject jsonObj) {
        HashSet<String> kinds = new HashSet<>();
        kinds.add("pretreatment");
        ActionStatus kindStatus = checkJSONObjectTerm(jsonObj, "kind", kinds, StatusCode.KIND_ERR);
        if (!kindStatus.isOk()) {
            return kindStatus;
        }

        HashSet<String> versions = new HashSet<>();
        versions.add("v1");
        return checkJSONObjectTerm(jsonObj, "version", versions, StatusCode.VERSION_ERR);
    }

    /**
     * Runs the pre-processing on one request.
     *
     * @param obj the request; must be a {@link JSONObject}
     * @return on success a JSON object with {@code status} = {@code "OK"} and
     *         {@code result} = the request object, modified in place; otherwise a
     *         JSON error object whose {@code status} is a {@link StatusCode} name
     */
    @Override
    public Object action(Object obj) {
        if (!(obj instanceof JSONObject)) {
            ActionStatus status = new ActionStatus(StatusCode.JSON_ERR,
                    "The action arguments is not JSONObject.");
            LOGGER.log(Level.SEVERE, status.msg);
            return this.getErrorJson(status);
        }

        JSONObject preJson = (JSONObject) obj;
        ActionStatus checkStatus = this.checkInputJSONObject(preJson);
        if (!checkStatus.isOk()) {
            LOGGER.log(Level.SEVERE, checkStatus.msg);
            return this.getErrorJson(checkStatus);
        }

        try {
            long beginTime = System.currentTimeMillis();

            // getJSONObject() throws instead of returning null, so absence has to be
            // tested explicitly to be able to answer with TEXTS_NULL.
            if (preJson.isNull("texts")) {
                ActionStatus status = new ActionStatus(StatusCode.TEXTS_NULL,
                        "The input texts is null.");
                LOGGER.log(Level.SEVERE, status.msg);
                return this.getErrorJson(status);
            }
            JSONObject textsObj = preJson.getJSONObject("texts");

            JSONObject metadataJson = preJson.getJSONObject("metadata");
            String lang = metadataJson.getJSONObject("corpus").getString("lang");
            // Any language other than "en" is handled as Chinese.
            DicIndex dicIndex = new DicIndex("en".equals(lang) ? Language.EN : Language.CN);

            JSONObject preMetadataJson = metadataJson.getJSONObject("pretreatment");
            dicIndex.create(preJson, preMetadataJson.getInt("minFrequency"));
            HashMap<String, Integer> dicMap = dicIndex.getDicMap(preJson);
            if (dicMap.isEmpty()) {
                // This response historically uses "result" rather than "msg" for the
                // explanation; kept unchanged for existing clients.
                JSONObject errJson = new JSONObject();
                errJson.put("status", StatusCode.MIN_FREQUENCY_ERR.toString());
                errJson.put("result", "The minFrequency is too big.");
                return errJson;
            }

            this.vectorizeTexts(textsObj, dicMap);

            int spendTime = (int) (System.currentTimeMillis() - beginTime);
            preMetadataJson.put("spendTime", spendTime);

            JSONObject rsp = new JSONObject();
            rsp.put("status", "OK");
            rsp.put("result", preJson);
            return rsp;
        } catch (JSONException ex) {
            // A malformed request must not be answered with status OK.
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            return this.getErrorJson(new ActionStatus(StatusCode.JSON_ERR, ex.getMessage()));
        }
    }

    /**
     * Replaces the {@code text} of every document under every label with its
     * word-document representation and stores the word count in {@code totalWords}.
     *
     * @param textsObj JSON object mapping a label to an array of document objects
     * @param dicMap   index dictionary handed to {@link WordDocMatrix}
     * @throws JSONException if a label does not hold an array of JSON objects
     */
    private void vectorizeTexts(JSONObject textsObj, HashMap<String, Integer> dicMap)
            throws JSONException {
        // keys() may be declared as a raw Iterator by the JSON library in use.
        @SuppressWarnings("unchecked")
        Iterator<String> labelsIt = textsObj.keys();
        while (labelsIt.hasNext()) {
            JSONArray labelTexts = textsObj.getJSONArray(labelsIt.next());
            for (int i = 0, len = labelTexts.length(); i < len; i++) {
                JSONObject textJson = labelTexts.getJSONObject(i);
                if (textJson.isNull("text")) {
                    // Documents without text are left untouched.
                    continue;
                }
                String result = WordDocMatrix.create(textJson.getString("text"), dicMap);
                // NOTE(review): result is presumably "<totalWords>-<vector>" - confirm
                // against WordDocMatrix. split() drops a trailing empty part, so a
                // document with an empty vector yields a single element.
                String[] wordsDocArr = result.split("-");
                textJson.remove("text");
                textJson.put("totalWords", Integer.valueOf(wordsDocArr[0]));
                textJson.put("text", wordsDocArr.length > 1 ? wordsDocArr[1] : "");
            }
        }
    }
}
停用词类
package com.robin.pretreatment;
import com.robin.config.ConfigUtil;
import java.util.Arrays;
import com.robin.file.FileUtil;
import com.robin.log.RobinLogger;
import java.util.HashSet;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Provides the stop-word sets used by the pre-processing stage.
 *
 * <p>Each getter looks up a file path through {@link ConfigUtil} and loads the
 * space-separated entries of that file into a set.
 *
 * @author Robin (E-mail: xsd-jj@163.com)
 * @version Version1.0 Date:2018-04-21
 */
public class StopWords {

    /** Logger shared by all instances. */
    private static final Logger LOGGER = RobinLogger.getLogger();

    /**
     * Creates a stop-word loader.
     */
    public StopWords() {
    }

    /**
     * Loads the Chinese stop words from the file configured as
     * {@code stopWords.chinese}.
     *
     * @return the Chinese stop words, or {@code null} if the file could not be read
     */
    public HashSet<String> getChineseSet() {
        return this.load(ConfigUtil.getConfig("stopWords.chinese"));
    }

    /**
     * Loads the English stop words from the file configured as
     * {@code stopWords.english}.
     *
     * @return the English stop words, or {@code null} if the file could not be read
     */
    public HashSet<String> getEnglishSet() {
        return this.load(ConfigUtil.getConfig("stopWords.english"));
    }

    /**
     * Loads the special symbols from the file configured as
     * {@code stopWords.symbol}.
     *
     * @return the special symbols, or {@code null} if the file could not be read
     */
    public HashSet<String> getSymbolSet() {
        return this.load(ConfigUtil.getConfig("stopWords.symbol"));
    }

    /**
     * Reads a stop-word file and splits its content on single spaces.
     *
     * @param stopWordsPath path of the stop-word file
     * @return the set of entries, or {@code null} (after logging) when the file
     *         content could not be read
     */
    private HashSet<String> load(String stopWordsPath) {
        String content = FileUtil.readText(stopWordsPath);
        if (content != null) {
            return new HashSet<>(Arrays.asList(content.split(" ")));
        }
        LOGGER.log(Level.SEVERE, "读取停止词文件失败,检查文件及路径.");
        return null;
    }
}
请求JSON
预处理微服务请求的JSON格式如下,红框所示请求类型以及回填数据参数。
响应JSON
预处理服务响应的JSON格式如下,红框所示返回的结果。
知更鸟博文推荐 | |
---|---|
上一篇 | 基于Kubernetes、Docker的机器学习微服务系统设计系列——(四)中文分词微服务 |
下一篇 | 基于Kubernetes、Docker的机器学习微服务系统设计系列——(六)特征选择微服务 |
推荐篇 | 基于Kubernetes、Docker的机器学习微服务系统设计——完整版 |
研究篇 | RS中文分词 | MP特征选择 | NLV文本分类 | 快速kNN |
作者简介 | |
兴趣爱好 | 机器学习、云计算、自然语言处理、文本分类、深度学习 |
xsd-jj@163.com (欢迎交流) |