import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._

object cleandata {
  def main(args: Array[String]): Unit = {
    // Create the SparkSession with Hive support
    val spark = SparkSession.builder()
      .appName("HiveSupport")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "hdfs://127.0.0.1:9000/opt/hive/warehouse")
      .config("hive.metastore.uris", "thrift://127.0.0.1:9083")
      .config("dfs.client.use.datanode.hostname", "true")
      .enableHiveSupport()
      .getOrCreate()

    // ############# Begin ############
    // Create the Hive database daobidata and switch to it
    spark.sql("create database daobidata")
    spark.sql("use daobidata")

    // Create the diedata table
    spark.sql("create table if not exists diedata(bianh int,com_name string," +
      "com_addr string,cat string,se_cat string,com_des string,born_data string," +
      "death_data string,live_days int,financing string,total_money int,death_reason string," +
      "invest_name string,ceo_name string,ceo_des string" +
      ") row format delimited fields terminated by ','")

    // Load the local datadie.csv file into the diedata table
    spark.sql("load data local inpath '/data/workspace/myshixun/data/datadie.csv' into table diedata")

    // Clean diedata: drop rows containing nulls, deduplicate, keep the primary closure reason,
    // and extract the founding year and the closure year from the date columns
    val c1 = spark.table("diedata")
      .na.drop("any")
      .distinct()
      .withColumn("death_reason", split(col("death_reason"), " ")(0))
      .withColumn("bornyear", split(col("born_data"), "/")(0))
      .withColumn("deathyear", split(col("death_data"), "/")(0))
    c1.createOrReplaceTempView("c1")

    // Create the die_data table
    spark.sql("create table if not exists die_data(bianh int,com_name string," +
      "com_addr string,cat string,se_cat string,com_des string,born_data string," +
      "death_data string,live_days int,financing string,total_money int,death_reason string," +
      "invest_name string,ceo_name string,ceo_des string,bornyear string,deathyear string" +
      ") row format delimited fields terminated by ','")

    // Write the cleaned data into die_data
    spark.sql("insert overwrite table die_data select * from c1")
    // ############# End ##############

    spark.stop()
  }
}
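As a quick illustration of what the split-based cleaning above produces, here is a minimal sketch on a tiny in-memory DataFrame; the sample rows and the cleandemo object name are made up for illustration, and only the three columns touched by the transformations are included.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, split}

object cleandemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("CleanDemo").master("local[*]").getOrCreate()
    import spark.implicits._

    // Hypothetical rows shaped like the three raw columns used by the cleaning step
    val sample = Seq(
      ("funding competition", "2014/3/1", "2017/6/30"),
      ("pivot management", "2015/11/12", "2019/2/1")
    ).toDF("death_reason", "born_data", "death_data")

    // Same transformations as in cleandata: keep the first space-separated token of
    // death_reason, and take the text before the first "/" as the year
    val cleaned = sample
      .withColumn("death_reason", split(col("death_reason"), " ")(0))
      .withColumn("bornyear", split(col("born_data"), "/")(0))
      .withColumn("deathyear", split(col("death_data"), "/")(0))

    // First row becomes: death_reason -> "funding", bornyear -> "2014", deathyear -> "2017"
    cleaned.show(false)

    spark.stop()
  }
}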
Level 2

Part 1
import org.apache.spark.sql.{SaveMode, SparkSession}

object citydiedata {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SparkCleanJob").master("local[*]").getOrCreate()

    // ************* Begin **************
    // Read the data: comma-delimited, with the first row used as the header rather than as data
    val df1 = spark.read.option("delimiter", ",").option("header", true)
      .csv("/data/workspace/myshixun/die_data.csv")
    df1.createOrReplaceTempView("df1")

    // Use Spark SQL to compute the top 5 cities by number of closed companies,
    // then write the result to MySQL
    spark.sql("select df1.com_addr as com_addr, count(df1.com_addr) as saddr from df1 group by df1.com_addr order by saddr desc limit 5")
      .repartition(1)
      .write
      // Database connection
      .format("jdbc")
      .option("url", "jdbc:mysql://127.0.0.1:3306/diedata?useUnicode=true&characterEncoding=utf-8")
      .option("driver", "com.mysql.jdbc.Driver")
      // Target table in the database
      .option("dbtable", "addr")
      // Username
      .option("user", "root")
      // Password for the database connection
      .option("password", "123123")
      // Append rows without rebuilding the table
      .mode(SaveMode.Append)
      .save()
    // ************ End ***********

    spark.stop()
  }
}
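The same top-5 aggregation can also be written with the DataFrame API instead of an SQL string; a minimal sketch, assuming the same die_data.csv layout as above (the citytop5 object name is invented for illustration). The result could then go through the same .write.format("jdbc") chain as in the job above.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, count}

object citytop5 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("CityTop5DF").master("local[*]").getOrCreate()

    // Same input as above: comma-delimited CSV with a header row
    val df1 = spark.read.option("delimiter", ",").option("header", true)
      .csv("/data/workspace/myshixun/die_data.csv")

    // groupBy/count/orderBy/limit expresses the same query as the SQL version
    val top5 = df1.groupBy("com_addr")
      .agg(count("com_addr").as("saddr"))
      .orderBy(col("saddr").desc)
      .limit(5)

    top5.show(false)
    spark.stop()
  }
}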
Part 2
import org.apache.spark.sql.{SaveMode, SparkSession}

object industrydata {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SparkCleanJob").master("local[*]").getOrCreate()

    // ########## Begin ############
    // Read the data: comma-delimited, with the first row used as the header rather than as data
    val df1 = spark.read.option("delimiter", ",").option("header", true)
      .csv("/data/workspace/myshixun/die_data.csv")
    df1.createOrReplaceTempView("df1")

    // Use Spark SQL to compute the top 10 industries by number of closed companies,
    // then write the result to MySQL
    spark.sql("select df1.cat as industry, count(df1.cat) as catindustry from df1 group by df1.cat order by catindustry desc limit 10")
      .repartition(1)
      .write
      // Database connection
      .format("jdbc")
      .option("url", "jdbc:mysql://127.0.0.1:3306/diedata?useUnicode=true&characterEncoding=utf-8")
      .option("driver", "com.mysql.jdbc.Driver")
      // Target table in the database
      .option("dbtable", "industry")
      .option("user", "root")
      .option("password", "123123")
      // Append rows without rebuilding the table
      .mode(SaveMode.Append)
      .save()
    // ############ End ###########

    spark.stop()
  }
}
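To confirm the rows actually landed in MySQL, the industry table can be read back through the same JDBC options; a minimal sketch, assuming the connection settings above are still valid (the checkindustry object name is invented for illustration).

import org.apache.spark.sql.SparkSession

object checkindustry {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("CheckIndustry").master("local[*]").getOrCreate()

    // Read the industry table back with the same connection settings used for the write
    val industry = spark.read
      .format("jdbc")
      .option("url", "jdbc:mysql://127.0.0.1:3306/diedata?useUnicode=true&characterEncoding=utf-8")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("dbtable", "industry")
      .option("user", "root")
      .option("password", "123123")
      .load()

    industry.show(false)
    spark.stop()
  }
}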
Part 3
import org.apache.spark.sql.{SaveMode, SparkSession}

object closedown {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SparkCleanJob").master("local[*]").getOrCreate()

    // ############ Begin ###########
    // Read the data: comma-delimited, with the first row used as the header rather than as data
    val df1 = spark.read.option("delimiter", ",").option("header", true)
      .csv("/data/workspace/myshixun/die_data.csv")
    df1.createOrReplaceTempView("df1")

    // Use Spark SQL on the death_reason field to find the main closure reasons
    // and count how many companies fall under each, then write the result to MySQL
    spark.sql("select df1.death_reason as death_reason, count(df1.death_reason) as dreason from df1 group by df1.death_reason order by dreason desc")
      .repartition(1)
      .write
      // Database connection
      .format("jdbc")
      // The database name is part of the JDBC URL
      .option("url", "jdbc:mysql://127.0.0.1:3306/diedata?useUnicode=true&characterEncoding=utf-8")
      .option("driver", "com.mysql.jdbc.Driver")
      // Target table in the database
      .option("dbtable", "cldown")
      .option("user", "root")
      .option("password", "123123")
      // Append rows without rebuilding the table
      .mode(SaveMode.Append)
      .save()
    // ############ End ###########

    spark.stop()
  }
}
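The three jobs above repeat the same JDBC settings; one way to cut the duplication is a small helper that only takes the target table name. A minimal sketch, assuming the same database, user, and password as above (MysqlWriter and writeToMysql are names invented here, not part of the original exercise).

import org.apache.spark.sql.{DataFrame, SaveMode}

object MysqlWriter {
  // Hypothetical helper: appends a DataFrame to the given table in the diedata MySQL database
  def writeToMysql(df: DataFrame, table: String): Unit = {
    df.repartition(1)
      .write
      .format("jdbc")
      .option("url", "jdbc:mysql://127.0.0.1:3306/diedata?useUnicode=true&characterEncoding=utf-8")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("dbtable", table)
      .option("user", "root")
      .option("password", "123123")
      .mode(SaveMode.Append)
      .save()
  }
}

With this in place, each job reduces to computing its aggregation and calling MysqlWriter.writeToMysql(result, "addr"), MysqlWriter.writeToMysql(result, "industry"), and so on.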