一、多表关联
输入是两个文件,一个代表工厂表,包含工厂名列和地址编号列;另一个代表地址表,包含地址名列和地址编号列。要求从输入数据中找出工厂名和地址名的对应关系,输出"工厂名——地址名"表
二、maven设置
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.mk</groupId>
<artifactId>spark-test</artifactId>
<version>1.0</version>
<name>spark-test</name>
<url>http://spark.mk.com</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<scala.version>2.11.1</scala.version>
<spark.version>2.4.4</spark.version>
<hadoop.version>2.6.0</hadoop.version>
</properties>
<dependencies>
<!-- scala依赖-->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- spark依赖-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<pluginManagement>
<plugins>
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
</plugin>
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.0</version>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.1</version>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>
三、编程代码
/**
 * 多表关联示例:读取工厂表(工厂名、地址编号)与地址表(地址编号、地址名),
 * 以地址编号为 key 做 join,按工厂名排序后输出 "工厂名 —— 地址名" 对应关系。
 */
public class MultiTableJoinApp implements SparkConfInfo {
    public static void main(String[] args) {
        String factoryFilePath = "E:\\spark\\factory.txt";
        SparkSession sparkSession = new MultiTableJoinApp().getSparkConf("MultiTableJoinApp");
        // 工厂表:每行 "工厂名  地址编号"(列间为 2 个以上空白),映射为 (地址编号, 工厂名)
        JavaPairRDD<String, String> addressFactoryRdd = sparkSession.sparkContext()
                .textFile(factoryFilePath, 4)
                .toJavaRDD()
                .flatMap(v -> Arrays.asList(v.split("\n")).iterator())
                .mapToPair(v -> {
                    // 过滤表头行。修复:原正则 "\\s+factoryname\\s+addressed\\s+"
                    // 要求行首/行尾必须有空白,表头顶格书写时匹配不到;改为 \\s* 允许无空白
                    if (v.matches("\\s*factoryname\\s+addressed\\s*")) {
                        return null;
                    }
                    String[] data = v.trim().split("\\s{2,}");
                    if (data.length != 2) {
                        return null; // 列数不为 2 的非法行丢弃
                    }
                    // key 为地址编号,value 为工厂名,便于与地址表 join
                    return new Tuple2<>(data[1], data[0]);
                }).filter(v -> v != null); // 清除上面标记为 null 的表头/非法行
        String addressFilePath = "E:\\spark\\address.txt";
        // 地址表:每行 "地址编号  地址名",映射为 (地址编号, 地址名)
        JavaPairRDD<String, String> addressNameRdd = sparkSession.sparkContext()
                .textFile(addressFilePath, 4)
                .toJavaRDD()
                .flatMap(v -> Arrays.asList(v.split("\n")).iterator())
                .mapToPair(v -> {
                    // 同上:表头匹配放宽为 \\s*,顶格表头也能被过滤
                    if (v.matches("\\s*addressID\\s+addressname\\s*")) {
                        return null;
                    }
                    String[] data = v.trim().split("\\s{2,}");
                    if (data.length != 2) {
                        return null;
                    }
                    return new Tuple2<>(data[0], data[1]);
                }).filter(v -> v != null);
        // 按地址编号内连接:value 为 (工厂名, 地址名)
        JavaPairRDD<String, Tuple2<String, String>> joinRdd = addressFactoryRdd.join(addressNameRdd);
        // 丢弃地址编号,以工厂名为 key 升序排序后收集到 driver 端
        List<Tuple2<String, String>> childGrand = joinRdd.mapToPair(v -> new Tuple2<>(v._2._1, v._2._2))
                .sortByKey(true)
                .collect();
        System.out.println("factoryname\t\taddressname");
        childGrand.forEach(v -> System.out.println(v._1 + "\t\t" + v._2));
        sparkSession.stop();
    }
}
/**
 * 提供构建 SparkSession 的默认方法:Windows 环境走 local[4] 本地模式,
 * 其他环境连接 standalone 集群并设置 driver 地址与应用 jar。
 */
public interface SparkConfInfo {
    /**
     * 根据运行环境构建并返回 SparkSession。
     *
     * @param appName Spark 应用名
     * @return 复用或新建的 SparkSession
     */
    default SparkSession getSparkConf(String appName) {
        SparkConf sparkConf = new SparkConf();
        if (System.getProperty("os.name").toLowerCase().contains("win")) {
            // 本机(Windows)调试:本地 4 线程模拟集群
            sparkConf.setMaster("local[4]");
            System.out.println("使用本地模拟是spark");
        } else {
            sparkConf.setMaster("spark://hadoop01:7077,hadoop02:7077,hadoop03:7077");
            sparkConf.set("spark.driver.host", "192.168.150.1");//本地ip,必须与spark集群能够相互访问,如:同一个局域网
            sparkConf.setJars(new String[] {".\\out\\artifacts\\spark_test\\spark-test.jar"});//项目构建生成的路径
        }
        // 修复:原代码重复调用了两次 .config(sparkConf),删除冗余的一次
        SparkSession session = SparkSession.builder().appName(appName).config(sparkConf).getOrCreate();
        return session;
    }
}
factory.txt文件内容
factoryname addressed
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Back of Beijing 1
address.txt文件内容
addressID addressname
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian
输出
factoryname addressname
Back of Beijing Beijing
Beijing Red Star Beijing
Beijing Rising Beijing
Guangzhou Development Bank Guangzhou
Guangzhou Honda Guangzhou
Shenzhen Thunder Shenzhen
Tencent Shenzhen
四、join方法
<W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other)
关联两个表,返回 key 相同的键值对,其 value 合并为 Tuple2(本表的 value 在前,另一表的 value 在后)