我是第一次使用Java使用Apache Spark进行文本挖掘。 我正在尝试对文本数据进行LDA。 首先,我使用IDF模型提取相关单词。 然后,我创建一个LDA模型来获取我的主题。 结果,我得到一个带有termIndices和termWeights的表。
如何从我的LDA模型中获取主题作为单词?
这是我使用的代码:
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.ml.clustering.LDA;
import org.apache.spark.ml.clustering.LDAModel;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.IDF;
import org.apache.spark.ml.feature.IDFModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.ml.feature.Tokenizer;
import java.util.Arrays;
import java.util.List;
public class Main {
public static void main(String[] args){
Logger.getLogger("org.apache").setLevel(Level.WARN);
SparkSession sparkSession = SparkSession.builder().appName("testing").master("local[*]").getOrCreate();
List data = Arrays.asList(
RowFactory.create(0.0, "Hi I heard about Spark"),
RowFactory.create(0.0, "I wish Java could use case classes"),
RowFactory.create(1.0, "Logistic regression models are neat")
);
StructType schema = new StructType(new StructField[]{
new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
});
Dataset sentenceData = sparkSession.createDataFrame(data, schema);
Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
Dataset wordsData = tokenizer.transform(sentenceData);
int numFeatures = 20;
HashingTF hashingTF = new HashingTF()
.setInputCol("words")
.setOutputCol("rawFeatures")
.setNumFeatures(numFeatures);
Dataset featurizedData = hashingTF.transform(wordsData);
IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
IDFModel idfModel = idf.fit(featurizedData);
Dataset rescaledData = idfModel.transform(featurizedData);
rescaledData.select("label", "features").show();
LDA lda = new LDA().setK(2).setMaxIter(10).setOptimizer("em").setFeaturesCol("features");
LDAModel ldaModel = lda.fit(rescaledData);
ldaModel.describeTopics().show();
Dataset transformed = ldaModel.transform(rescaledData);
transformed.show();
sparkSession.close();
}
}
这是我的代码的输出:
+-----+--------------------+
|label| features|
+-----+--------------------+
| 0.0|(20,[0,5,9,17],[0...|
| 0.0|(20,[2,7,9,13,15]...|
| 1.0|(20,[4,6,13,15,18...|
+-----+--------------------+
+-----+--------------------+--------------------+
|topic| termIndices| termWeights|
+-----+--------------------+--------------------+
| 0|[17, 9, 6, 2, 0, ...|[0.16715273617466...|
| 1|[17, 9, 7, 18, 5,...|[0.15751266315244...|
+-----+--------------------+--------------------+
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|label| sentence| words| rawFeatures| features| topicDistribution|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
| 0.0|Hi I heard about ...|[hi, i, heard, ab...|(20,[0,5,9,17],[1...|(20,[0,5,9,17],[0...|[0.50052275837267...|
| 0.0|I wish Java could...|[i, wish, java, c...|(20,[2,7,9,13,15]...|(20,[2,7,9,13,15]...|[0.49871227849509...|
| 1.0|Logistic regressi...|[logistic, regres...|(20,[4,6,13,15,18...|(20,[4,6,13,15,18...|[0.50063942630916...|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+