Complete Spark SQL code for reading a file from AWS S3 and writing the result to a MySQL database

Continuing from the previous post, the requirement is the same; the only change is that the local input file path is replaced with an AWS S3 path. The implementation is as follows:

1. First, the pom.xml:

<project xmlns="http://maven.apache.org/POM/4.0.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>JavaDemo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>JavaDemo</name>
    <url>http://maven.apache.org</url>

    
    
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <scala.version>2.11</scala.version>
        <spark.version>2.4.3</spark.version>
        <hadoop.version>2.8.5</hadoop.version>
        <aws.version>1.11.636</aws.version>
        <spark.pom.scope>compile</spark.pom.scope>
    </properties>

    <dependencies>
        <dependency>
            <groupId>io.netty</groupId>
            <artifactId>netty-all</artifactId>
            <version>4.1.17.Final</version>
            <exclusions>
                <exclusion>
                  <artifactId>netty</artifactId>
                  <groupId>io.netty</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-s3 -->
        <!-- <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-s3</artifactId>
            <version>${aws.version}</version>
        </dependency> -->
        <!-- https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-kms -->
        <!-- <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-kms</artifactId>
            <version>${aws.version}</version>
        </dependency> -->
        <!-- https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-core -->
        <!-- <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-core</artifactId>
            <version>${aws.version}</version>
        </dependency> -->
        <!-- https://mvnrepository.com/artifact/com.amazonaws/jmespath-java -->
        <!-- <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>jmespath-java</artifactId>
            <version>${aws.version}</version>
        </dependency> -->
        
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>2.6.3</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.6.3</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-annotations</artifactId>
            <version>2.6.3</version>
        </dependency>
        
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-aws</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>net.java.dev.jets3t</groupId>
            <artifactId>jets3t</artifactId>
            <version>0.9.4</version>
        </dependency>
        
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpcore</artifactId>
            <version>4.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- 
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
         -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.47</version>
        </dependency>
        <!-- 
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>com.databricks</groupId>
            <artifactId>spark-csv_2.10</artifactId>
            <version>1.4.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib-local_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
         -->
        <!-- SparkStreaming + Kafka -->
        <!-- 
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>2.3.1</version>
        </dependency>
         -->
        <!-- Required for producing data to Kafka -->
        <!-- 
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.10.0.0</version>
        </dependency>
         -->
        <!-- Required for connecting to Redis -->
        <!-- 
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.6.1</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.12</version>
        </dependency>
        <dependency>
            <groupId>com.google.collections</groupId>
            <artifactId>google-collections</artifactId>
            <version>1.0</version>
        </dependency>
        <dependency>
            <groupId>org.specs</groupId>
            <artifactId>specs</artifactId>
            <version>1.2.5</version>
            <scope>test</scope>
        </dependency>
        -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
         
    </dependencies>
</project>
-------------------------

2. The code:

package org.example.JavaDemo;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.hadoop.fs.s3a.S3AFileSystem;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Properties;

public class SparkSqlCsvToCsv {


    public static void main(String[] args) {
        /**
         * China region S3 endpoint: s3.cn-north-1.amazonaws.com.cn
         * Ningxia: cn-northwest-1, Beijing: cn-north-1
         */
        System.out.println("=========0000=========");
        String hdfsInAddress = "s3a://emr-demo-input/mydata/"; // previously "hdfs://192.168.209.129:9000/" or a local path such as D:\DevTemp\AWS\
        String inputAddress = "";//"in/";
        String csvFileName="emr-demo-data.csv";
        System.out.println("======111============");
        SparkConf conf = new SparkConf().setMaster("local").setAppName("TestSpark");
        System.out.println("=========222=========");

        /*
         * Properties properties = new Properties(); InputStream inputStream =
         * Object.class.getResourceAsStream("/s3.properties");
         * properties.load(inputStream);
         */
        System.out.println("=========333=========");
        JavaSparkContext sc = new JavaSparkContext(conf); // JavaSparkContext is legacy; SparkSession is the newer entry point
//        SparkContext sc = new SparkContext(conf);
        System.out.println("=========444-1=========");
        /*
         * sc.hadoopConfiguration().set("fs.s3a.access.key",properties.getProperty(
         * "fs.s3a.access.key"));
         * sc.hadoopConfiguration().set("fs.s3a.secret.key",properties.getProperty(
         * "fs.s3a.secret.key"));
         * sc.hadoopConfiguration().set("fs.s3a.endpoint",properties.getProperty(
         * "fs.s3a.endpoint"));//spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
         */
        /* spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem  
            spark.hadoop.fs.s3a.access.key=ACCESSKEY  
            spark.hadoop.fs.s3a.secret.key=SECRETKEY
         */
        //sc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
        sc.hadoopConfiguration().set("fs.s3a.access.key","AKIA2CIDQ6XXXXXXXXX");
        sc.hadoopConfiguration().set("fs.s3a.secret.key","VR1spXe+Jb5pK4m1gKcBFXXXXXXXXXXXX");
        sc.hadoopConfiguration().set("fs.s3a.endpoint","s3.cn-northwest-1.amazonaws.com.cn");//这里使用的是宁夏服务器
        System.out.println("=========444=========");
        SQLContext sqlContext = new SQLContext(sc);
        System.out.println("=========555=========");
        HashMap<String,String> options = new HashMap<String,String> ();
        options.put("header", "true");//设置第一行为头
        options.put("inferSchema", "true");//设置自动分析片段类型
        //options.put("path", hdfsInAddress + inputAddress + filePath);
        options.put("path", hdfsInAddress + inputAddress + csvFileName);
        options.put("dateFormat","YYYY-MM-DD");
        System.out.println("打印上传文件在hdfs的路径:"+hdfsInAddress + inputAddress + csvFileName);
        System.out.println("=========666=========");
        /**** Declare the schema field types ****/
        StructField structFields[] = new StructField[9];
        structFields[0] = DataTypes.createStructField("Tier", DataTypes.StringType,true);
        structFields[1] = DataTypes.createStructField("SellerCode",DataTypes.StringType,true);
        structFields[2] = DataTypes.createStructField("SellerName",DataTypes.StringType,true);
        structFields[3] = DataTypes.createStructField("DataSource",DataTypes.StringType,true);
        structFields[4] = DataTypes.createStructField("SellerProvince",DataTypes.StringType,true);
        structFields[5] = DataTypes.createStructField("_201901",DataTypes.DoubleType,true);
        structFields[6] = DataTypes.createStructField("_201902",DataTypes.DoubleType,true);
        structFields[7] = DataTypes.createStructField("_201903",DataTypes.DoubleType,true);
        structFields[8] = DataTypes.createStructField("flag",DataTypes.StringType,true);
        StructType structType = new StructType(structFields);
        System.out.println("=========777=========");
        Dataset dataFrame = sqlContext.load("com.databricks.spark.csv", structType, options);
        System.out.println("=========8888=========");
        // DataFrame cars = (new CsvParser()).withUseHeader(true).csvFile(sqlContext, "cars.csv"); // read a CSV file via spark-csv's CsvParser
        dataFrame.registerTempTable("result");
        System.out.println("=========9999=========");
        StringBuffer sparkSql = new StringBuffer("select ");
        sparkSql.append("Tier");
        sparkSql.append(", SellerCode");
        sparkSql.append(", SellerName");
        sparkSql.append(", DataSource");
        sparkSql.append(", SellerProvince");
        sparkSql.append(", _201901");
        sparkSql.append(", _201902");
        sparkSql.append(", _201903");
        sparkSql.append(", if(_201903>_201902,'up','down') as flag");
        sparkSql.append(" from result");
        Dataset resultFrame=sqlContext.sql(sparkSql.toString() );
        //resultFrame.createOrReplaceTempView("resultView"); // create a temp view

        //System.out.println("*************** print with Dataset ********" + resultFrame.limit(10).showString(20,0,false));
        System.out.println("******print schema *******");
        resultFrame.printSchema();
        System.out.println("*************");
        //resultFrame.select("SellerName").show();
        System.out.println("*************");
        //Tier    SellerCode    SellerName    DataSource    SellerProvince    _201901    _201902    _201903
        Dataset df = resultFrame.select(
                resultFrame.col("Tier"),
                resultFrame.col("SellerCode"),
                resultFrame.col("SellerName"),
                resultFrame.col("DataSource"),
                resultFrame.col("SellerProvince"),
                resultFrame.col("_201901"),
                resultFrame.col("_201902"),
                resultFrame.col("_201903"),
                resultFrame.col("flag")
        );
        df = df.filter(df.col("Tier").contains("T")); // where condition; alternatives: equalTo, etc.
        //df = df.filter((df.col("_201902").cast(DataTypes.FloatType)).gt((df.col("_201901").cast(DataTypes.FloatType)))); // gt = greater than
        //df = df.orderBy(df.col("_201902").cast(DataTypes.FloatType).asc_nulls_first()); // cast the type and sort ascending, nulls first
        //df.groupBy("age").count(); // group by

        System.out.println("******df.show() print schema *******");
        df.show();
        System.out.println("******df.show() print schema *******");
        
        /************* Write the result to the MySQL database ******************/
        // database connection settings
        String url = "jdbc:mysql://127.0.0.1:3306/hive?useUnicode=true&characterEncoding=utf-8";
        Properties connectionProperties = new Properties();
        connectionProperties.put("user","root");
        connectionProperties.put("password","123456");
        connectionProperties.put("driver","com.mysql.jdbc.Driver");

        /** Write into the database table **/
        df.write().mode(SaveMode.Overwrite).jdbc(url,"t_result",connectionProperties); // Overwrite replaces both the data and the table schema
        sc.stop();
    }
}
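
The S3 credentials are hardcoded above for brevity. The commented-out block in main hints at loading them from a classpath resource instead; a minimal sketch of that approach is shown below (the /s3.properties file name and the fs.s3a.* property keys follow the commented-out code, and the helper class itself is hypothetical):

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

import org.apache.spark.api.java.JavaSparkContext;

public class S3ConfigLoader {

    /**
     * Reads fs.s3a.* settings from /s3.properties on the classpath
     * (e.g. src/main/resources/s3.properties) and applies them to the
     * Hadoop configuration, so credentials never appear in source code.
     */
    public static void applyS3Config(JavaSparkContext sc) throws IOException {
        Properties props = new Properties();
        try (InputStream in = S3ConfigLoader.class.getResourceAsStream("/s3.properties")) {
            props.load(in); // throws NullPointerException if the resource is missing
        }
        sc.hadoopConfiguration().set("fs.s3a.access.key", props.getProperty("fs.s3a.access.key"));
        sc.hadoopConfiguration().set("fs.s3a.secret.key", props.getProperty("fs.s3a.secret.key"));
        sc.hadoopConfiguration().set("fs.s3a.endpoint", props.getProperty("fs.s3a.endpoint"));
    }
}

Calling applyS3Config(sc) right after the JavaSparkContext is created would then replace the three hardcoded hadoopConfiguration().set(...) calls.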

------------------

3. Run output:

=========0000=========
======111============
=========222=========
=========333=========
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
20/03/18 08:58:18 INFO SparkContext: Running Spark version 2.4.3
20/03/18 08:58:18 WARN Shell: Did not find winutils.exe: {}
java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
    at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:528)
    at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:549)
    at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:572)
    at org.apache.hadoop.util.Shell.<clinit>(Shell.java:669)
    at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
    at org.apache.hadoop.conf.Configuration.getBoolean(Configuration.java:1555)
    at org.apache.hadoop.security.SecurityUtil.getLogSlowLookupsEnabled(SecurityUtil.java:497)
    at org.apache.hadoop.security.SecurityUtil.<clinit>(SecurityUtil.java:90)
    at org.apache.hadoop.security.UserGroupInformation.initialize(UserGroupInformation.java:293)
    at org.apache.hadoop.security.UserGroupInformation.ensureInitialized(UserGroupInformation.java:281)
    at org.apache.hadoop.security.UserGroupInformation.loginUserFromSubject(UserGroupInformation.java:837)
    at org.apache.hadoop.security.UserGroupInformation.getLoginUser(UserGroupInformation.java:807)
    at org.apache.hadoop.security.UserGroupInformation.getCurrentUser(UserGroupInformation.java:680)
    at org.apache.spark.util.Utils$$anonfun$getCurrentUserName$1.apply(Utils.scala:2422)
    at org.apache.spark.util.Utils$$anonfun$getCurrentUserName$1.apply(Utils.scala:2422)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.util.Utils$.getCurrentUserName(Utils.scala:2422)
    at org.apache.spark.SparkContext.<init>(SparkContext.scala:293)
    at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
    at org.example.JavaDemo.SparkSqlCsvToCsv.main(SparkSqlCsvToCsv.java:40)
Caused by: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.
    at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:448)
    at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:419)
    at org.apache.hadoop.util.Shell.<clinit>(Shell.java:496)
    ... 16 more
20/03/18 08:58:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
20/03/18 08:58:19 INFO SparkContext: Submitted application: TestSpark
20/03/18 08:58:19 INFO SecurityManager: Changing view acls to: Ace
20/03/18 08:58:19 INFO SecurityManager: Changing modify acls to: Ace
20/03/18 08:58:19 INFO SecurityManager: Changing view acls groups to: 
20/03/18 08:58:19 INFO SecurityManager: Changing modify acls groups to: 
20/03/18 08:58:19 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(Ace); groups with view permissions: Set(); users  with modify permissions: Set(Ace); groups with modify permissions: Set()
20/03/18 08:58:20 INFO Utils: Successfully started service 'sparkDriver' on port 53335.
20/03/18 08:58:20 INFO SparkEnv: Registering MapOutputTracker
20/03/18 08:58:20 INFO SparkEnv: Registering BlockManagerMaster
20/03/18 08:58:20 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
20/03/18 08:58:20 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up
20/03/18 08:58:20 INFO DiskBlockManager: Created local directory at C:\Users\Lenovo\AppData\Local\Temp\blockmgr-49830c1c-118a-4540-a84b-6ba29b2c3bd8
20/03/18 08:58:21 INFO MemoryStore: MemoryStore started with capacity 1984.5 MB
20/03/18 08:58:21 INFO SparkEnv: Registering OutputCommitCoordinator
20/03/18 08:58:21 INFO Utils: Successfully started service 'SparkUI' on port 4040.
20/03/18 08:58:21 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://Ace-Sun:4040
20/03/18 08:58:21 INFO Executor: Starting executor ID driver on host localhost
20/03/18 08:58:21 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 53348.
20/03/18 08:58:21 INFO NettyBlockTransferService: Server created on Ace-Sun:53348
20/03/18 08:58:21 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
20/03/18 08:58:21 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, Ace-Sun, 53348, None)
20/03/18 08:58:21 INFO BlockManagerMasterEndpoint: Registering block manager Ace-Sun:53348 with 1984.5 MB RAM, BlockManagerId(driver, Ace-Sun, 53348, None)
20/03/18 08:58:21 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, Ace-Sun, 53348, None)
20/03/18 08:58:21 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, Ace-Sun, 53348, None)
=========444-1=========
=========444=========
=========555=========
Input file path on S3: s3a://emr-demo-input/mydata/emr-demo-data.csv
=========666=========
=========777=========
20/03/18 08:58:21 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir ('file:/D:/DevWorkspase/eclipse-workspace/JavaDemo/spark-warehouse/').
20/03/18 08:58:21 INFO SharedState: Warehouse path is 'file:/D:/DevWorkspase/eclipse-workspace/JavaDemo/spark-warehouse/'.
20/03/18 08:58:22 INFO StateStoreCoordinatorRef: Registered StateStoreCoordinator endpoint
=========8888=========
=========9999=========
******print schema *******
root
 |-- Tier: string (nullable = true)
 |-- SellerCode: string (nullable = true)
 |-- SellerName: string (nullable = true)
 |-- DataSource: string (nullable = true)
 |-- SellerProvince: string (nullable = true)
 |-- _201901: double (nullable = true)
 |-- _201902: double (nullable = true)
 |-- _201903: double (nullable = true)
 |-- flag: string (nullable = false)

*************
*************
******df.show() print schema *******
20/03/18 08:58:28 INFO FileSourceStrategy: Pruning directories with: 
20/03/18 08:58:28 INFO FileSourceStrategy: Post-Scan Filters: isnotnull(Tier#0),Contains(Tier#0, T)
20/03/18 08:58:28 INFO FileSourceStrategy: Output Data Schema: struct<Tier: string, SellerCode: string, SellerName: string, DataSource: string, SellerProvince: string ... 6 more fields>
20/03/18 08:58:28 INFO FileSourceScanExec: Pushed Filters: IsNotNull(Tier),StringContains(Tier,T)
20/03/18 08:58:28 INFO CodeGenerator: Code generated in 271.4181 ms
20/03/18 08:58:28 INFO CodeGenerator: Code generated in 35.9997 ms
20/03/18 08:58:28 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 244.1 KB, free 1984.3 MB)
20/03/18 08:58:29 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 20.0 KB, free 1984.2 MB)
20/03/18 08:58:29 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on Ace-Sun:53348 (size: 20.0 KB, free: 1984.5 MB)
20/03/18 08:58:29 INFO SparkContext: Created broadcast 0 from show at SparkSqlCsvToCsv.java:126
20/03/18 08:58:29 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4199352 bytes, open cost is considered as scanning 4194304 bytes.
20/03/18 08:58:29 INFO SparkContext: Starting job: show at SparkSqlCsvToCsv.java:126
20/03/18 08:58:29 INFO DAGScheduler: Got job 0 (show at SparkSqlCsvToCsv.java:126) with 1 output partitions
20/03/18 08:58:29 INFO DAGScheduler: Final stage: ResultStage 0 (show at SparkSqlCsvToCsv.java:126)
20/03/18 08:58:29 INFO DAGScheduler: Parents of final stage: List()
20/03/18 08:58:29 INFO DAGScheduler: Missing parents: List()
20/03/18 08:58:29 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[3] at show at SparkSqlCsvToCsv.java:126), which has no missing parents
20/03/18 08:58:29 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 14.9 KB, free 1984.2 MB)
20/03/18 08:58:29 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 7.2 KB, free 1984.2 MB)
20/03/18 08:58:29 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on Ace-Sun:53348 (size: 7.2 KB, free: 1984.5 MB)
20/03/18 08:58:29 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:1161
20/03/18 08:58:29 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[3] at show at SparkSqlCsvToCsv.java:126) (first 15 tasks are for partitions Vector(0))
20/03/18 08:58:29 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
20/03/18 08:58:29 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, executor driver, partition 0, PROCESS_LOCAL, 8323 bytes)
20/03/18 08:58:29 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
20/03/18 08:58:29 INFO FileScanRDD: Reading File path: s3a://emr-demo-input/mydata/emr-demo-data.csv, range: 0-5048, partition values: [empty row]
20/03/18 08:58:29 INFO CodeGenerator: Code generated in 22.3421 ms
20/03/18 08:58:30 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 3241 bytes result sent to driver
20/03/18 08:58:30 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 596 ms on localhost (executor driver) (1/1)
20/03/18 08:58:30 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
20/03/18 08:58:30 INFO DAGScheduler: ResultStage 0 (show at SparkSqlCsvToCsv.java:126) finished in 0.700 s
20/03/18 08:58:30 INFO DAGScheduler: Job 0 finished: show at SparkSqlCsvToCsv.java:126, took 0.737439 s
+----+----------+----------------------------+----------+--------------+--------------+-------------+-------------+----+
|Tier|SellerCode|                  SellerName|DataSource|SellerProvince|       _201901|      _201902|      _201903|flag|
+----+----------+----------------------------+----------+--------------+--------------+-------------+-------------+----+
|  T1|    HE003|          医药有限公司|       DDI|          河北|1.0559443903E8|3.312345429E7|5.428380069E7|  up|
|  T1|  HE0009S|河公司田医药站|       DDI|          河北|     249239.76|    168139.14|    260403.56|  up|
|  T1|  HE006S|          河药集团有限公司|       DDI|          河北|    3856199.08|    1384355.4|   4070853.03|  up|
|  T1|   HEA1S|          邢医药药材有限公司|       DDI|          河北|     405327.83|     63712.79|     89365.28|  up|
|  T1|   H865S|            衡水医药有限公司|       DDI|          河北|      648096.6|     188102.8|     239028.8|  up|
|  T1|   HEA3S|            保定医药有限公司|       DDI|          河北|      794278.6|    143358.86|    280220.74|  up|
|  T1|    HB001|唐公司(新分公司)|       DDI|          河北|    2844517.25|    1066305.9|   1154788.35|  up|
|  T1|   T18S|              华医药有限公司|       DDI|          河北| 1.986586353E7|   3419255.58|   9636006.07|  up|
|  T1|   T34S|    国药控有限公司|       DDI|          河北|    2073843.21|     698878.7|    799672.08|  up|
|  T2|  H135S|    国药医药有限公司|       DDI|          河北|     161440.74|    111466.11|     111115.2|down|
|  T2|  HE3S|      国药堂医药有限公司|       DDI|          河北|    6660979.13|   1417602.22|   2650979.14|  up|
|  T2|  HE9S|      国药堂公司|       DDI|          河北|    4707805.76|   1884585.75|   2670068.27|  up|
|  T2|  H17S|    国药岛医药有限公司|       DDI|          河北|    2889987.07|    997135.23|   1670409.38|  up|
|  T2|  H0368S|      国药堂坊医药有限公司|       DDI|          河北|    2563005.46|    810446.44|   1546372.15|  up|
|  T2|  H0593S|      国药堂药有限公司|       DDI|          河北|    5412119.26|   1241300.64|   1654506.05|  up|
|  T2|   1006S|        河贸易有限公司|       DDI|          河北|      31847.58|      6605.72|      9101.52|  up|
|  T2|   1206S|        承盛限责任公司|       DDI|          河北|     372629.21|     81697.68|    157634.23|  up|
|  T2|   H227S|    国药庄医药有限公司|       DDI|          河北|    1382932.07|    243595.74|    892387.49|  up|
|  T2|   H1S|        有限公司|       DDI|          河北|    1581317.58|    1268579.3|   1270598.91|  up|
|  T2|   330S|            华有限公司|       DDI|          河北|    2133488.87|    685468.02|   1198794.77|  up|
+----+----------+----------------------------+----------+--------------+--------------+-------------+-------------+----+
only showing top 20 rows

******df.show() print schema *******
20/03/18 08:58:30 INFO SparkUI: Stopped Spark web UI at http://Ace-Sun:4040
20/03/18 08:58:30 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
20/03/18 08:58:30 INFO MemoryStore: MemoryStore cleared
20/03/18 08:58:30 INFO BlockManager: BlockManager stopped
20/03/18 08:58:30 INFO BlockManagerMaster: BlockManagerMaster stopped
20/03/18 08:58:30 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
20/03/18 08:58:30 INFO SparkContext: Successfully stopped SparkContext
20/03/18 08:58:30 INFO ShutdownHookManager: Shutdown hook called
20/03/18 08:58:30 INFO ShutdownHookManager: Deleting directory C:\Users\Lenovo\AppData\Local\Temp\spark-eb60c4d7-fe60-4af6-9eb6-3a2e934d1405
---------------------------

The exception thrown above is caused by the absence of a local Hadoop environment (HADOOP_HOME / winutils.exe not set on Windows); it does not affect the run.

It can be fixed by placing the Hadoop binaries (including winutils.exe) in the local development environment and setting HADOOP_HOME, or by installing a full local Hadoop environment. A minimal workaround is sketched below.
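
For example, a common workaround on Windows is to point hadoop.home.dir at a directory containing bin\winutils.exe before the Spark context is created (the D:\hadoop path below is only an assumed example):

// Workaround for "HADOOP_HOME and hadoop.home.dir are unset" on Windows.
// D:\hadoop is an example path; its bin\ subdirectory must contain winutils.exe.
System.setProperty("hadoop.home.dir", "D:\\hadoop");
SparkConf conf = new SparkConf().setMaster("local").setAppName("TestSpark");
JavaSparkContext sc = new JavaSparkContext(conf);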

The code above is my personal study/research code and contains some redundancy; feel free to optimize it. One possible cleanup using the newer SparkSession API is sketched below.
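
As a rough sketch only: the same read-transform-write flow rewritten against the SparkSession API, which replaces the deprecated SQLContext / registerTempTable calls used above. The class name SparkSqlS3ToMysql is hypothetical, the credentials are placeholders, and the schema string assumes the CSV columns match the field names declared earlier.

import java.util.Properties;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.when;

public class SparkSqlS3ToMysql {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local")
                .appName("TestSpark")
                .getOrCreate();

        // S3A credentials and endpoint (placeholders, same configuration keys as the original code)
        spark.sparkContext().hadoopConfiguration().set("fs.s3a.access.key", "YOUR_ACCESS_KEY");
        spark.sparkContext().hadoopConfiguration().set("fs.s3a.secret.key", "YOUR_SECRET_KEY");
        spark.sparkContext().hadoopConfiguration().set("fs.s3a.endpoint", "s3.cn-northwest-1.amazonaws.com.cn");

        // Built-in CSV source with an explicit schema (DDL string instead of a StructField[] array)
        Dataset<Row> df = spark.read()
                .option("header", "true")
                .schema("Tier STRING, SellerCode STRING, SellerName STRING, DataSource STRING, "
                        + "SellerProvince STRING, _201901 DOUBLE, _201902 DOUBLE, _201903 DOUBLE, flag STRING")
                .csv("s3a://emr-demo-input/mydata/emr-demo-data.csv");

        // Derive the flag column and apply the filter without a temp table or SQL string
        Dataset<Row> result = df
                .withColumn("flag", when(col("_201903").gt(col("_201902")), "up").otherwise("down"))
                .filter(col("Tier").contains("T"));

        // Write the result to MySQL, overwriting the target table
        Properties props = new Properties();
        props.put("user", "root");
        props.put("password", "123456");
        props.put("driver", "com.mysql.jdbc.Driver");
        result.write().mode(SaveMode.Overwrite)
                .jdbc("jdbc:mysql://127.0.0.1:3306/hive?useUnicode=true&characterEncoding=utf-8",
                        "t_result", props);

        spark.stop();
    }
}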

For discussion, learning exchange, or feedback, you can add me on WeChat: spsace
