EduCoder (头歌) Big Data Answers (personal notes)

Level 1

# Command line
start-all.sh
nohup hive --service metastore &

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
object cleandata {

  def main(args: Array[String]): Unit = {

    // Create the SparkSession with Hive support
    val spark = SparkSession.builder().appName("HiveSupport").master("local[*]")
      .config("spark.sql.warehouse.dir", "hdfs://127.0.0.1:9000/opt/hive/warehouse")
      .config("hive.metastore.uris", "thrift://127.0.0.1:9083")
      .config("dfs.client.use.datanode.hostname", "true")
      .enableHiveSupport().getOrCreate()
    //############# Begin ############
    // Create the Hive database daobidata
    spark.sql("create database daobidata")
    // Switch to the daobidata database
    spark.sql("use daobidata")
    // Create the diedata table
    spark.sql("create table if not exists diedata(bianh int,com_name string," +
      "com_addr string,cat string,se_cat string,com_des string,born_data string," +
      "death_data string,live_days int,financing string,total_money int,death_reason string,"+
      "invest_name string,ceo_name string,ceo_des string"+
      ")row format delimited fields terminated by ',';")
    // Load the local datadie.csv file into the diedata table
    spark.sql("load data local inpath '/data/workspace/myshixun/data/datadie.csv' into table diedata;")

    // Clean the diedata table: drop rows with null values, remove duplicates,
    // split death_reason to keep only the primary closure reason, and extract
    // the founding year from born_data and the closure year from death_data
    val c1 = spark.table("diedata").na.drop("any").distinct()
      .withColumn("death_reason",split(col("death_reason")," ")(0))
      .withColumn("bornyear",split(col("born_data"),"/")(0))
      .withColumn("deathyear",split(col("death_data"),"/")(0))
      
    c1.createOrReplaceTempView("c1")
    // Create the die_data table (cleaned data plus the two derived year columns)
    spark.sql("create table if not exists die_data(bianh int,com_name string," +
      "com_addr string,cat string,se_cat string,com_des string,born_data string," +
      "death_data string,live_days int,financing string,total_money int,death_reason string,"+
      "invest_name string,ceo_name string,ceo_des string,bornyear string,deathyear string"+
      ")row format delimited fields terminated by ',';")
    // Insert the cleaned data into the die_data table
    spark.sql("insert overwrite table die_data select * from c1")
   //############# End ##############
    spark.stop()
  }
}
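
A quick way to sanity-check Level 1 is to read the cleaned table back from Hive and look at the derived year columns. The sketch below is not part of the graded answer; it assumes the same metastore address as above and that die_data has already been populated.

import org.apache.spark.sql.SparkSession

object checkdata {

  def main(args: Array[String]): Unit = {

    // Assumption: same Hive metastore as in cleandata above
    val spark = SparkSession.builder().appName("CheckDieData").master("local[*]")
      .config("hive.metastore.uris", "thrift://127.0.0.1:9083")
      .enableHiveSupport().getOrCreate()

    spark.sql("use daobidata")
    // Row count after dropping nulls and duplicates
    println(spark.table("die_data").count())
    // Spot-check that bornyear/deathyear were split out correctly
    spark.sql("select com_name, born_data, bornyear, death_data, deathyear from die_data limit 5").show(false)

    spark.stop()
  }
}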

Level 2

import org.apache.spark.sql.{SaveMode, SparkSession}

object citydiedata {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SparkCleanJob")
      .master("local[*]").getOrCreate()
    //************* Begin **************
    // Read the CSV: comma-delimited, with the first row treated as the header
    val df1 = spark.read.option("delimiter", ",").option("header", true).csv("/data/workspace/myshixun/die_data.csv")
    df1.createOrReplaceTempView("df1")
    // Use Spark SQL to compute the top 5 cities by number of failed companies
    spark.sql("select df1.com_addr as com_addr,count(df1.com_addr) as saddr from df1 group by df1.com_addr order by saddr desc limit 5")
      .repartition(1)
      .write
      // Write to MySQL over JDBC
      .format("jdbc")
      .option("url", "jdbc:mysql://127.0.0.1:3306/diedata?useUnicode=true&characterEncoding=utf-8")
      .option("driver", "com.mysql.jdbc.Driver")
      // Target table name in the database
      .option("dbtable", "addr")
      // Username
      .option("user", "root")
      // Database password
      .option("password", "123123")
      // Append without modifying the existing table structure
      .mode(SaveMode.Append)
      .save()
    //************ End ***********
    spark.stop()
  }
}   
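
If you want to confirm what landed in MySQL (optional, not required by the task), the same JDBC options can be used to read the addr table back. This is a minimal sketch assuming the MySQL instance and credentials above.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object checkaddr {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().appName("CheckAddr").master("local[*]").getOrCreate()

    // Assumption: same MySQL instance, database and credentials as in citydiedata above
    spark.read
      .format("jdbc")
      .option("url", "jdbc:mysql://127.0.0.1:3306/diedata?useUnicode=true&characterEncoding=utf-8")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("dbtable", "addr")
      .option("user", "root")
      .option("password", "123123")
      .load()
      // Show the top-5 cities written by the job above
      .orderBy(col("saddr").desc)
      .show(false)

    spark.stop()
  }
}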

import org.apache.spark.sql.{SaveMode, SparkSession}

object industrydata {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SparkCleanJob")
      .master("local[*]").getOrCreate()
     //########## Begin ############
    // Read the CSV: comma-delimited, with the first row treated as the header
    val df1 = spark.read.option("delimiter", ",").option("header", true).csv("/data/workspace/myshixun/die_data.csv")
    df1.createOrReplaceTempView("df1")
    // Use Spark SQL to compute the top 10 industries by number of failed companies
    spark.sql("select df1.cat as industry,count(df1.cat) as catindustry from df1 group by df1.cat order by catindustry desc limit 10")
      .repartition(1)
      .write
      // Write to MySQL over JDBC
      .format("jdbc")
      .option("url", "jdbc:mysql://127.0.0.1:3306/diedata?useUnicode=true&characterEncoding=utf-8")
      .option("driver", "com.mysql.jdbc.Driver")
      // Target table name in the database
      .option("dbtable", "industry")
      .option("user", "root")
      .option("password", "123123")
      // Append without modifying the existing table structure
      .mode(SaveMode.Append)
      .save()
    //############ End ###########
    spark.stop()
  }
}  

import org.apache.spark.sql.{SaveMode, SparkSession}

object closedown {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SparkCleanJob")
      .master("local[*]").getOrCreate()
    //############ Begin ###########
    // Read the CSV: comma-delimited, with the first row treated as the header
    val df1 = spark.read.option("delimiter", ",").option("header", true).csv("/data/workspace/myshixun/die_data.csv")
    df1.createOrReplaceTempView("df1")
    // Use Spark SQL to count how many companies failed for each primary closure reason
    spark.sql("select df1.death_reason as death_reason,count(df1.death_reason) as dreason from df1 group by df1.death_reason order by dreason desc")
      .repartition(1)
      .write
      // Write to MySQL over JDBC
      .format("jdbc")
      // JDBC URL pointing at the diedata database
      .option("url", "jdbc:mysql://127.0.0.1:3306/diedata?useUnicode=true&characterEncoding=utf-8")
      .option("driver", "com.mysql.jdbc.Driver")
      // Target table name in the database
      .option("dbtable", "cldown")
      .option("user", "root")
      .option("password", "123123")
      // Append without modifying the existing table structure
      .mode(SaveMode.Append)
      .save()
    //############ End ###########
    spark.stop()
  }
}
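
The three jobs in Level 2 share the same JDBC write boilerplate, so it could optionally be factored into a small helper; the names below (jdbcutil, writeToMysql) are illustrative, not part of the graded answer.

import org.apache.spark.sql.{DataFrame, SaveMode}

object jdbcutil {

  // Write a DataFrame to the given MySQL table with the same settings used above
  def writeToMysql(df: DataFrame, table: String): Unit = {
    df.repartition(1)
      .write
      .format("jdbc")
      .option("url", "jdbc:mysql://127.0.0.1:3306/diedata?useUnicode=true&characterEncoding=utf-8")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("dbtable", table)
      .option("user", "root")
      .option("password", "123123")
      // Append without modifying the existing table structure
      .mode(SaveMode.Append)
      .save()
  }
}

Each job would then only build its aggregated DataFrame and call, e.g., jdbcutil.writeToMysql(df, "addr").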
