头歌 (EduCoder) — Spark Machine Learning: MLlib

Level 1: Introduction to MLlib

package com.educoder.bigData.sparksql5;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.*;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class Test1 {

	public static void main(String[] args) {

		SparkSession spark = SparkSession.builder().appName("test1").master("local").getOrCreate();

		List<Row> trainingList = Arrays.asList(
				RowFactory.create(1.0, "a b c d E spark"), 
				RowFactory.create(0.0, "b d"),
				RowFactory.create(1.0, "hadoop Mapreduce"), 
				RowFactory.create(0.0, "f g h"));
		
		List<Row> testList = Arrays.asList(
				RowFactory.create(0.0, "spark I j k"),
				RowFactory.create(0.0, "l M n"),
				RowFactory.create(0.0, "f g"),
				RowFactory.create(0.0, "apache hadoop")
				);
		
		/********* Begin *********/
		// Define the schema shared by the training and test data
		StructType schema = new StructType(new StructField[] {
				new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
				new StructField("text", DataTypes.StringType, false, Metadata.empty())
		});
		
		// Create DataFrames for the training and test data
		Dataset<Row> training = spark.createDataFrame(trainingList, schema);
		Dataset<Row> test = spark.createDataFrame(testList, schema);
		
		// Tokenizer: splits the text column into words
		Tokenizer tokenizer = new Tokenizer()
				.setInputCol("text")
				.setOutputCol("words");
		
		// HashingTF: maps the words to term-frequency feature vectors
		HashingTF hashingTF = new HashingTF()
				.setNumFeatures(1000)
				.setInputCol("words")
				.setOutputCol("features");
		
		// Logistic regression classifier
		LogisticRegression lr = new LogisticRegression()
				.setMaxIter(10)
				.setRegParam(0.001);
		
		// Chain the stages into a Pipeline
		Pipeline pipeline = new Pipeline()
				.setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
		
		// Fit the pipeline to the training data
		PipelineModel model = pipeline.fit(training);
		
		// Run the fitted model on the test data
		Dataset<Row> predictions = model.transform(test);
		
		// Show the predictions
		predictions.select("prediction").show();
		
		/********* End *********/
	}
}
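
A fitted `PipelineModel` can also be persisted and reloaded, which the exercise itself does not require. A minimal sketch, assuming the `model` and `test` variables from the code above; the path `test1-model` is purely illustrative:

```java
// Not part of the graded exercise: save the fitted pipeline to disk and
// reload it later. The path "test1-model" is an arbitrary example.
model.write().overwrite().save("test1-model");
PipelineModel reloaded = PipelineModel.load("test1-model");

// The reloaded model behaves exactly like the original one
reloaded.transform(test).select("text", "probability", "prediction").show(false);
```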

Level 2: MLlib Spam Detection

package com.educoder.bigData.sparksql5;

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.*;
import org.apache.spark.ml.feature.StringIndexer;
import org.apache.spark.ml.feature.Word2Vec;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class Case2 {

	public static PipelineModel training(SparkSession spark) {

		/********* Begin *********/
		// Read the data file: each line begins with the label (ham/spam),
		// followed by the space-separated words of the message
		JavaRDD<Row> map = spark.read().textFile("SMSSpamCollection").toJavaRDD()
				.map(line -> line.split(" ")).map(new Function<String[], Row>() {
					@Override
					public Row call(String[] v1) throws Exception {
						// Drop the leading label token; the rest is the message
						String[] message = Arrays.copyOfRange(v1, 1, v1.length);
						return RowFactory.create(v1[0], message);
					}
				});
				
		// Define the schema: a string label and an array of words
		StructType schema = new StructType(new StructField[] {
			new StructField("label", DataTypes.StringType, false, Metadata.empty()),
			new StructField("message", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
		});
		
		// Create the DataFrame
		Dataset<Row> df = spark.createDataFrame(map, schema);
		
		// Index the string labels (ham/spam) as numeric labels
		StringIndexer labelIndexer = new StringIndexer()
			.setInputCol("label")
			.setOutputCol("indexedLabel");
		
		// Word2Vec: embed each message as a 200-dimensional feature vector
		Word2Vec word2Vec = new Word2Vec()
			.setInputCol("message")
			.setOutputCol("features")
			.setVectorSize(200)
			.setMinCount(1)
			.setWindowSize(5);
		
		// Random forest classifier
		RandomForestClassifier rf = new RandomForestClassifier()
			.setLabelCol("indexedLabel")
			.setFeaturesCol("features")
			.setNumTrees(50)
			.setMaxDepth(10)
			.setMaxBins(32);
		
		// Chain the stages into a Pipeline
		Pipeline pipeline = new Pipeline()
			.setStages(new PipelineStage[] {labelIndexer, word2Vec, rf});
		
		// Fit the pipeline to the data
		PipelineModel model = pipeline.fit(df);
		
		/********* End *********/
        return model;
	}
} 
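
The exercise only returns the fitted model. To gauge how well it fits, one could score it with Spark's `MulticlassClassificationEvaluator`. A minimal sketch, assuming the `model` and `df` variables from `training(...)` above; in practice the model should be evaluated on a held-out split rather than the training data:

```java
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;

// Not part of the graded exercise: accuracy of the fitted model.
Dataset<Row> predictions = model.transform(df);
double accuracy = new MulticlassClassificationEvaluator()
		.setLabelCol("indexedLabel")
		.setPredictionCol("prediction")
		.setMetricName("accuracy")
		.evaluate(predictions);
System.out.println("Accuracy on the training data: " + accuracy);
```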

Level 3: MLlib Wine Classification

package com.educoder.bigData.sparksql5;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.*;
import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class Case3 {

	public static PipelineModel training(SparkSession spark) {

		/********* Begin *********/
		// Read the CSV data file
		JavaRDD<String> data = spark.sparkContext().textFile("dataset.csv", 1).toJavaRDD();
		
		// Parse each line into a Row: the first field is the label, the rest are features
		JavaRDD<Row> rowRDD = data.map(new Function<String, Row>() {
			@Override
			public Row call(String line) throws Exception {
				String[] parts = line.split(",");
				double label = Double.parseDouble(parts[0]);
				double[] features = new double[parts.length - 1];
				for (int i = 1; i < parts.length; i++) {
					features[i-1] = Double.parseDouble(parts[i]);
				}
				return RowFactory.create(label, Vectors.dense(features));
			}
		});
		
		// Define the schema: a double label and a feature vector
		StructType schema = new StructType(new StructField[] {
			new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
			new StructField("features", new VectorUDT(), false, Metadata.empty())
		});
		
		// Create the DataFrame
		Dataset<Row> df = spark.createDataFrame(rowRDD, schema);
		
		// Random forest classifier
		RandomForestClassifier rf = new RandomForestClassifier()
			.setLabelCol("label")
			.setFeaturesCol("features")
			.setNumTrees(50)
			.setMaxDepth(10)
			.setMaxBins(32);
		
		// The pipeline consists of the single classifier stage
		Pipeline pipeline = new Pipeline()
			.setStages(new PipelineStage[] {rf});
		
		// Fit the pipeline to the data
		PipelineModel model = pipeline.fit(df);
		
		/********* End *********/
		return model;
	}
} 
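
Here the model is fit on the entire dataset. A common variation is to hold out part of the data for testing with `randomSplit`. A minimal sketch, assuming the `df` and `pipeline` variables from the method above; the seed 42 is arbitrary:

```java
// Not part of the graded exercise: an 80/20 train/test split.
Dataset<Row>[] splits = df.randomSplit(new double[] {0.8, 0.2}, 42L);
PipelineModel splitModel = pipeline.fit(splits[0]);

// Compare predicted and actual labels on the held-out 20%
splitModel.transform(splits[1]).select("label", "prediction").show();
```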

### Overview of Apache Spark MLlib

Apache Spark MLlib is Spark's distributed machine learning library, designed to simplify machine learning on large-scale datasets. Besides implementations of common machine learning algorithms, it provides tools for building, evaluating, and tuning machine learning pipelines[^1].

#### Key features

- **Broad algorithm support**: MLlib covers many kinds of machine learning tasks, including classification, regression, clustering, and collaborative filtering.
- **Easy-to-use APIs**: Python and Scala users alike can find an interface that suits them and build models quickly.
- **Efficient performance**: Thanks to Spark's in-memory computing, MLlib remains efficient even on massive datasets.
- **Strong integration**: It combines readily with other Spark components, such as the SQL engine or the streaming module.

```python
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

# Prepare the training data
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

# Configure the transformer and estimator stages
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)

# Build and fit the pipeline
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
model = pipeline.fit(training)
```

This snippet shows how a `Pipeline` strings several data-transformation steps together into a complete machine learning workflow, here ending in logistic regression as the learning algorithm for a binary classification task; calling `model.transform(...)` on new data then yields the predictions[^4].