1、公司名归类,简称cc码
2、算法:多层感知分类
3、总思路:文本分词 --> Word2Vec --> 矩阵 --> MultilayerPerceptronClassifier
①中文分词使用的是 IK Analyzer
例如:浙江工人日报社印刷厂---->分词后--->浙江|工人日报|社|印刷厂|
代码
import java.io.StringReader
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.spark.{SparkConf, SparkContext}
import org.wltea.analyzer.lucene.IKAnalyzer
/**
 * Created by dongdong on 17/4/24.
 *
 * Spark job that segments company names into words with IK Analyzer.
 *
 * Input: a text file with one record per line, three tab-separated fields
 * (company name, label, and a third field the code calls `cNumber`).
 * Lines that do not split into exactly three fields are dropped.
 *
 * Output: one text line per record in the form `words,label,cNumber`, where
 * `words` is the list of tokens, each followed by `|`
 * (for example `浙江|工人日报|社|印刷厂|`).
 */
object Participles {

  // Defaults used when no command-line arguments are supplied.
  private val DefaultInputPath = "/Users/dongdong/Desktop/cc/small_data/mlj_total_cc.txt"
  private val DefaultOutputPath = "/Users/dongdong/Desktop/cc/participles_small"

  /**
   * Entry point.
   *
   * @param args optional overrides: `args(0)` is the input path and `args(1)`
   *             is the output directory; the built-in defaults are used for
   *             any argument that is missing
   */
  def main(args: Array[String]): Unit = {
    val inpath = args.lift(0).getOrElse(DefaultInputPath)
    val outpath = args.lift(1).getOrElse(DefaultOutputPath)
    val conf = new SparkConf().setMaster("local[2]").setAppName("Participles")
    val sc = new SparkContext(conf)
    try {
      // Read the data and keep only well-formed three-column records.
      val originalData = sc
        .textFile(inpath)
        .map(_.split("\t"))
        .filter(_.length == 3)

      // Segment the company name of every record. The analyzer is created once
      // per partition instead of once per record; each token stream it hands
      // out is closed inside `segment` before the next one is requested.
      val participlesData = originalData.mapPartitions { rows =>
        // `true` presumably selects IK's "smart" segmentation mode -- confirm
        // against the IK Analyzer version in use.
        val analyzer = new IKAnalyzer(true)
        rows.map { fields =>
          val companyName = fields(0).trim
          val label = fields(1).trim
          val cNumber = fields(2).trim
          // val address = fields(3).trim
          segment(analyzer, companyName) + "," + label + "," + cNumber
        }
      }

      // Save the result as a single part file.
      participlesData.repartition(1).saveAsTextFile(outpath)
    } finally {
      // Always release the Spark context, even when the job fails.
      sc.stop()
    }
  }

  /**
   * Tokenizes `text` and joins the tokens, appending `|` after each one.
   *
   * Commas are stripped from the result because the caller uses `,` as the
   * output field separator.
   *
   * @param analyzer analyzer used to produce the token stream
   * @param text     text to segment
   * @return the tokens, each followed by `|`, with all commas removed
   */
  private def segment(analyzer: IKAnalyzer, text: String): String = {
    val ts = analyzer.tokenStream("", new StringReader(text))
    try {
      val term: CharTermAttribute = ts.getAttribute(classOf[CharTermAttribute])
      ts.reset()
      val words = new StringBuilder
      while (ts.incrementToken()) {
        words.append(term.toString).append('|')
      }
      // Finish the stream as the Lucene consumer workflow requires.
      ts.end()
      words.toString.replaceAll(",", "")
    } finally {
      // Release the stream (and its reader) so the analyzer can be reused.
      ts.close()
    }
  }
}
②MultilayerPerceptronClassifier
核心代码
// Map the string labels in column "label" to numeric indices in "indexedLabel".
// The indexer is fitted on the full data set (before the train/test split).
val labelIndexer = new StringIndexer()
  .setInputCol("label")
  .setOutputCol("indexedLabel")
  .fit(originalData)
val labelIndexer_data: DataFrame = labelIndexer.transform(originalData)

// Split the pre-segmented text back into words; the segmentation step
// separates words with '|'.
val tokenizer = new RegexTokenizer()
  .setInputCol("text")
  .setOutputCol("words")
  .setPattern("\\|")
val tokenizer_ts_data: DataFrame = tokenizer.transform(labelIndexer_data)

// Drop generic company-suffix words (and empty tokens).
val arr = Array("有限公司", "有限责任公司", "", "公司", "分公司", "责任公司", "有限", "责任")
val remover = new StopWordsRemover()
  .setInputCol("words")
  .setOutputCol("filtered")
  .setStopWords(arr)
val fitered_data: DataFrame = remover.transform(tokenizer_ts_data)

// Turn each filtered word list into a feature vector of VECTOR_SIZE dimensions.
val word2Vec = new Word2Vec()
  .setInputCol("filtered")
  .setOutputCol("features")
  .setVectorSize(VECTOR_SIZE)
  .setMinCount(1)
  .setMaxIter(100)
// .setNumPartitions(3)

// Network topology: the input layer must match the Word2Vec vector size and the
// output layer must match the number of classes. The class count is taken from
// the fitted label indexer instead of being hard-coded (the original used 43).
// The original note says hidden-layer nodes = 2n + 1 for n input nodes;
// presumably VECTOR_SIZE is 50, which gives the 101 below -- TODO confirm.
val layers = Array[Int](VECTOR_SIZE, 101, 100, labelIndexer.labels.length)
val mlpc = new MultilayerPerceptronClassifier()
  .setLayers(layers)
  .setBlockSize(512)
  .setSeed(1234L)
  .setMaxIter(128)
  .setFeaturesCol("features")
  .setLabelCol("indexedLabel")
  .setPredictionCol("prediction")

// Convert the numeric prediction back to the original string label.
val labelConverter = new IndexToString()
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")
  .setLabels(labelIndexer.labels)

// 80/20 train/test split. Seeded (same seed as the classifier) so that the
// split, and therefore the reported accuracy, is reproducible between runs.
val Array(trainingData, testData) = originalData.randomSplit(Array(0.8, 0.2), 1234L)

// NOTE(review): the pipeline re-applies tokenizer, remover and labelIndexer, so
// the stand-alone *_data frames above are not inputs to it.
val pipeline = new Pipeline().setStages(Array(tokenizer, remover, labelIndexer, word2Vec, mlpc, labelConverter))
③试过用TF-IDF 和LogisticRegression(逻辑回归)组合
TF-IDF 和 NaiveBayes(朴素贝叶斯)组合
效果都不太好,其中LogisticRegression只支持二分类(注:这是较早版本 spark.ml 的限制,自 Spark 2.1 起 LogisticRegression 已支持多分类)
④ 由于先验数据集分布不均匀,最终的正确率只有:0.606549930730621
total_rate 659490 527397 132093 80121 0.606549930730621
(推测各列依次为:总样本数、训练集样本数、测试集样本数、测试集预测正确数、正确率;其中 527397 + 132093 = 659490,80121 / 132093 ≈ 0.6065)