package io.renren.utils.spark;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.UserDefinedFunction;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Function1;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import static org.apache.spark.sql.functions.*;
/**
* @description: Spark SQL examples in Java, from a series on Spring Boot + Spark big-data development
* @author: yyyyjinying
* @create: 2023-06-20 15:24
**/
public class JavaSparkSQLExample {
public static void runBasicDataFrameExample(SparkSession spark) {
Dataset<Row> df = spark.read().json("renren-admin\\renren-admin-server\\src\\main\\resources\\employees.json");
df.show();
df.printSchema();
df.select("name").show();
// Using built-in column functions
// df.select(col("name"), col("salary").plus(1).as("onsalary")).show();
// +-------+--------+
// |   name|onsalary|
// +-------+--------+
// |Michael|    3001|
// |   Andy|    4501|
// | Justin|    3501|
// |  Berta|    4001|
// +-------+--------+
// Built-in functions producing derived columns
// df.select(col("name"), col("salary").gt(21)).show();
// df.select(col("name").as("zname")).where("salary='3000'").show();
UserDefinedFunction plusOne = udf(
(String in, String out) -> in + out, // UDF2: two String inputs
DataTypes.StringType // return type
);
// Register the UDF so it can be referenced from SQL / selectExpr
spark.udf().register("plusOne", plusOne);
// select: takes Columns / column expressions
// selectExpr: parses string SQL expressions, e.g. expr("random()")
// df.select(col("name"), col("salary"), expr("random()")).show();
df.selectExpr("name", "salary as sal", "plusOne(name, '1')").show();
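// The same UDF can also be invoked through the Column API without
// registering it, via UserDefinedFunction.apply (a sketch using the
// plusOne function defined above):
// df.select(col("name"), plusOne.apply(col("name"), lit("1")).as("tagged")).show();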
}
/**
* @param spark the active SparkSession
* @return void
* @description create a global temp view and query it with a user-defined function
* @author yyyyjinying
* @since 2023/6/20 16:38
*/
public static void createGlobalTempViewExample(SparkSession spark) throws AnalysisException {
Dataset<Row> df = spark.read().json("renren-admin\\renren-admin-server\\src\\main\\resources\\employees.json");
df.createGlobalTempView("employees");
UserDefinedFunction plusOne = udf(
(String in, String out) -> in + out, // UDF2: two String inputs
DataTypes.StringType // return type
);
// Register the UDF so it can be referenced from SQL
spark.udf().register("plusOne", plusOne);
df.sqlContext().sql("select *, plusOne(name, '1') as plus from global_temp.employees").show();
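// Unlike a session-scoped temp view, a global temp view lives in the
// system database `global_temp` and stays visible to other sessions of
// the same application (sketch):
// spark.newSession().sql("select * from global_temp.employees").show();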
}
@Data
@NoArgsConstructor
public static class Person implements Serializable {
private String name;
private Long age;
}
// Function1<T1, R>: the first type parameter is the input, the second is the return type
public static class FnMapOne implements Function1<Long, Long>, Serializable {
@Override
public Long apply(Long v1) {
return v1 + 1L;
}
}
public static void runDatasetCreationExample(SparkSession spark) {
Person person = new Person();
person.setName("zz");
person.setAge(123456L);
Encoder<Person> personEncoder = Encoders.bean(Person.class);
Dataset<Person> javaBeanDS = spark.createDataset(Collections.singletonList(person), personEncoder);
List<Person> persons = javaBeanDS.collectAsList();
System.out.println(persons);
Dataset<Long> longDS = spark.createDataset(Arrays.asList(10000L, 2L, 3L, 4L), Encoders.LONG());
Dataset<Long> map = longDS.map((MapFunction<Long, Long>) value -> value + 1L, Encoders.LONG());
// Dataset<Long> map = longDS.map(new FnMapOne(), Encoders.LONG());
// Task not serializable: a plain lambda cast to scala.Function1 is not
// Serializable, unlike MapFunction or the explicit FnMapOne class above
// Dataset<Long> map = longDS.map((Function1<Long, Long>) x -> x + 1L, Encoders.LONG());
// each element is incremented, e.g. 10000L -> 10001L
map.show();
}
@Data
@AllArgsConstructor
@NoArgsConstructor
public static class Employees {
private String name;
private Long salary;
}
public static void createDataFrameEmployees(SparkSession spark) {
Dataset<Employees> ds = spark.read().json("renren-admin\\renren-admin-server\\src\\main\\resources\\employees.json")
.as(Encoders.bean(Employees.class));
ds.select(col("name"), col("salary").plus(1L).as("plus")).show();
}
public static void runInferSchemaExample(SparkSession spark) {
JavaRDD<Person> prdd = spark.read().textFile("renren-admin\\renren-admin-server\\src\\main\\resources\\people.txt")
.javaRDD().map((line) -> {
String[] split = line.split(",");
Person person = new Person();
person.setName(split[0]);
person.setAge(Long.parseLong(split[1].trim()));
return person;
});
// System.out.println(prdd.collect());
Dataset<Row> df = spark.createDataFrame(prdd, Person.class);
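// The schema is inferred from the Person bean by reflection; it can be
// verified before querying (sketch):
// df.printSchema();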
try {
df.createTempView("people");
Dataset<Row> sql = spark.sql("select * from people");
System.out.println("zzzz");
// sql.map((MapFunction<Row, String>) row -> row.prettyJson(), Encoders.STRING())
// .show(false);
// sql.map((MapFunction<Row, String>) row -> row.toString(), Encoders.STRING())
// .show(false);
sql.map((MapFunction<Row, String>) row -> row.getAs("name"), Encoders.STRING())
.show(false);
} catch (AnalysisException e) {
e.printStackTrace();
}
}
public static void runProgrammaticSchemaExample(SparkSession spark) {
JavaRDD<String> rdd = spark
.sparkContext()
.textFile("renren-admin\\renren-admin-server\\src\\main\\resources\\people.txt", 1)
.toJavaRDD();
String schemaString = "name age";
List<StructField> list = new ArrayList<>();
for (String fieldName : schemaString.split(" ")) {
if ("age".equals(fieldName)) {
StructField structField = DataTypes.createStructField(fieldName, DataTypes.LongType, true);
list.add(structField);
} else {
StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
list.add(structField);
}
}
StructType schema = DataTypes.createStructType(list);
JavaRDD<Row> rowRdd = rdd.map((item) -> {
String[] s = item.split(","); // people.txt is comma-separated, e.g. "Michael, 29"
return RowFactory.create(s[0], Long.parseLong(s[1].trim()));
});
Dataset<Row> dataFrame = spark.createDataFrame(rowRdd, schema);
System.out.println("zzzz");
dataFrame.show();
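// The programmatically built DataFrame supports the full SQL API as well,
// e.g. (a sketch; the view name `people2` is arbitrary):
// dataFrame.createOrReplaceTempView("people2");
// spark.sql("select name from people2 where age > 20").show();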
}
private static void runBasicDataSourceExample(SparkSession spark) {
Dataset<Row> peopleDF = spark.read()
.format("json")
.load("renren-admin\\renren-admin-server\\src\\main\\resources\\employees.json");
peopleDF.select("name", "salary").write().format("json").save("renren-admin\\renren-admin-server\\src\\main\\resources\\a.json");
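// Note: save() writes a directory of part files and fails if the target
// already exists; a SaveMode can be set explicitly (sketch):
// peopleDF.select("name", "salary").write().mode(SaveMode.Overwrite)
//         .format("json").save("renren-admin\\renren-admin-server\\src\\main\\resources\\a.json");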
}
public static void runDataSourceExample(SparkSession spark){
Dataset<Row> jdbcDF = spark.read()
.format("jdbc")
.option("url", "jdbc:mysql://qar2:3306/yourdbname?useSSL=false&useUnicode=true")
.option("dbtable", "emp_add")
.option("user", "root")
.option("password", "123456")
.load();
jdbcDF.show();
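// For a large table, the read can be parallelized by adding partitioning
// options to the reader above (a sketch; the numeric id column and its
// bounds are assumptions about emp_add):
// .option("partitionColumn", "id").option("lowerBound", "1")
// .option("upperBound", "10000").option("numPartitions", "4")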
// jdbcDF.write().format("json").save("renren-admin\\renren-admin-server\\src\\main\\resources\\a.json");
// jdbcDF.write()
// .format("jdbc")
// .option("url", "jdbc:mysql://qar2:3306/yourdbname?useSSL=false&useUnicode=true")
// .option("dbtable", "emp_add_copy")
// .option("user", "root")
// .option("password", "123456")
// .save();
}
public static void main(String[] args) throws AnalysisException {
SparkSession spark = SparkSession
.builder()
.appName("sql")
.master("local[*]")
.getOrCreate();
// runBasicDataFrameExample(spark);
// createGlobalTempViewExample(spark);
// runDatasetCreationExample(spark);
// createDataFrameEmployees(spark);
// runInferSchemaExample(spark);
// runProgrammaticSchemaExample(spark);
// runBasicDataSourceExample(spark);
runDataSourceExample(spark);
}
}