Getting Started with Spark (18): Multi-Table Join

1. Multi-Table Join

The input consists of two files. One is a factory table with a factory-name column and an address-ID column; the other is an address table with an address-ID column and an address-name column. The task is to recover the correspondence between factory names and address names from the input and print it as a (factoryname, addressname) table. The core idea is to key both datasets by address ID and join them on that key.

 

2. Maven Configuration

<?xml version="1.0" encoding="UTF-8"?>
 
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
 
  <groupId>com.mk</groupId>
  <artifactId>spark-test</artifactId>
  <version>1.0</version>
 
  <name>spark-test</name>
  <url>http://spark.mk.com</url>
 
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <scala.version>2.11.12</scala.version>
    <spark.version>2.4.4</spark.version>
    <hadoop.version>2.6.0</hadoop.version>
  </properties>
 
  <dependencies>
    <!-- Scala dependency -->
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
 
    <!-- Spark dependency -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
 
 
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
 
  <build>
    <pluginManagement>
      <plugins>
 
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
 
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>

 

3. Code

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;

public class MultiTableJoinApp implements SparkConfInfo {

    public static void main(String[] args) {

        String factoryFilePath = "E:\\spark\\factory.txt";
        SparkSession sparkSession = new MultiTableJoinApp().getSparkConf("MultiTableJoinApp");

        // Factory table, keyed by address ID: (addressID, factoryname)
        JavaPairRDD<String, String> addressFactoryRdd = sparkSession.sparkContext()
                .textFile(factoryFilePath, 4)
                .toJavaRDD()
                // textFile already splits on line breaks; this flatMap is kept as a no-op safeguard
                .flatMap(v -> Arrays.asList(v.split("\n")).iterator())
                .mapToPair(v -> {
                    // Skip the header line "factoryname        addressed"
                    if (v.matches("\\s*factoryname\\s+addressed\\s*")) {
                        return null;
                    }
                    String[] data = v.trim().split("\\s{2,}");
                    if (data.length != 2) {
                        return null;
                    }
                    return new Tuple2<>(data[1], data[0]);
                })
                .filter(v -> v != null);

        String addressFilePath = "E:\\spark\\address.txt";

        // Address table, keyed by address ID: (addressID, addressname)
        JavaPairRDD<String, String> addressNameRdd = sparkSession.sparkContext()
                .textFile(addressFilePath, 4)
                .toJavaRDD()
                .flatMap(v -> Arrays.asList(v.split("\n")).iterator())
                .mapToPair(v -> {
                    // Skip the header line "addressID        addressname"
                    if (v.matches("\\s*addressID\\s+addressname\\s*")) {
                        return null;
                    }
                    String[] data = v.trim().split("\\s{2,}");
                    if (data.length != 2) {
                        return null;
                    }
                    return new Tuple2<>(data[0], data[1]);
                })
                .filter(v -> v != null);

        // Join on address ID, then keep only the (factoryname, addressname) pairs
        JavaPairRDD<String, Tuple2<String, String>> joinRdd = addressFactoryRdd.join(addressNameRdd);
        List<Tuple2<String, String>> factoryAddress = joinRdd
                .mapToPair(v -> new Tuple2<>(v._2._1, v._2._2))
                .sortByKey(true)
                .collect();

        System.out.println("factoryname\t\taddressname");
        factoryAddress.forEach(v -> System.out.println(v._1 + "\t\t" + v._2));

        sparkSession.stop();
    }
}



import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

public interface SparkConfInfo {

    default SparkSession getSparkConf(String appName) {
        SparkConf sparkConf = new SparkConf();
        if (System.getProperty("os.name").toLowerCase().contains("win")) {
            sparkConf.setMaster("local[4]");
            System.out.println("Running Spark in local mode");
        } else {
            sparkConf.setMaster("spark://hadoop01:7077,hadoop02:7077,hadoop03:7077");
            sparkConf.set("spark.driver.host", "192.168.150.1"); // local IP; the driver and the Spark cluster must be able to reach each other, e.g. on the same LAN
            sparkConf.setJars(new String[]{".\\out\\artifacts\\spark_test\\spark-test.jar"}); // path of the jar produced by the project build
        }

        return SparkSession.builder().appName(appName).config(sparkConf).getOrCreate();
    }
}

Contents of factory.txt

factoryname        addressed 
Beijing Red Star        1
Shenzhen Thunder        3
Guangzhou Honda        2
Beijing Rising        1
Guangzhou Development Bank        2
Tencent        3
Back of Beijing        1

Contents of address.txt

addressID        addressname 
1        Beijing
2        Guangzhou
3        Shenzhen
4        Xian

Output

factoryname		addressname
Back of Beijing		Beijing
Beijing Red Star		Beijing
Beijing Rising		Beijing
Guangzhou Development Bank		Guangzhou
Guangzhou Honda		Guangzhou
Shenzhen Thunder		Shenzhen
Tencent		Shenzhen

 

4. The join Method

<W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other)

Joins this pair RDD with another on their keys: for each key present in both RDDs, the result contains that key mapped to a Tuple2 of the two matching values, one from each RDD. Keys that appear in only one RDD are dropped, which is why address 4 (Xian) never shows up in the output above.
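
A minimal, self-contained sketch of these semantics (the JoinDemo class and its tiny in-memory datasets are illustrative, not part of the original program):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class JoinDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JoinDemo").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // (addressID, factoryname) pairs, keyed the same way as in MultiTableJoinApp
        JavaPairRDD<String, String> factories = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("1", "Beijing Red Star"),
                new Tuple2<>("3", "Tencent")));

        // (addressID, addressname) pairs; key "4" has no matching factory
        JavaPairRDD<String, String> addresses = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("1", "Beijing"),
                new Tuple2<>("3", "Shenzhen"),
                new Tuple2<>("4", "Xian")));

        // join keeps only keys present in both RDDs; each value in the result
        // is a Tuple2 of (value from this RDD, value from the other RDD)
        JavaPairRDD<String, Tuple2<String, String>> joined = factories.join(addresses);
        joined.collect().forEach(v ->
                System.out.println(v._1 + " -> (" + v._2._1 + ", " + v._2._2 + ")"));
        // prints, in some order:
        // 1 -> (Beijing Red Star, Beijing)
        // 3 -> (Tencent, Shenzhen)

        sc.close();
    }
}

If unmatched keys need to be kept, leftOuterJoin, rightOuterJoin, and fullOuterJoin are the usual alternatives.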
