期末大数据处理技术题目是这个,从平台上可以用小号去买答案,用手机号就能注册小号,完善信息随便填就可以,手机号必须是真的。去做其他实训题目可以赚金币,用金币可以买题目的答案,我用小号赚了几千金币(做什么c语言基础,java基础,全是hello world),买了这个题的答案。
我只买了前两题的答案;后面可视化部分连卖家自己提供的答案都运行不出来,质量很差,不建议购买。
第一题:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, split}
import org.apache.spark.sql.functions._
object tmall_order{
  /**
   * Cleans three raw CSV datasets (Tmall orders, cosmetics sales, order sales)
   * and writes each deduplicated result out as a single CSV part file.
   *
   * Fixes vs. original: `var` -> `val`, removed the unused explicitly-created
   * SparkContext (SparkSession manages its own context from the same conf),
   * and dropped the no-op `.toDF()` calls.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("SparkTest")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()

    // --- Dataset 1: Tmall order report (file has no header row) ---
    val df = spark.read
      .format("csv")
      .option("header", "false")
      .load("/data/workspace/myshixun/step1/DATA//tmall_order_report.csv")
    // Drop every row that contains at least one null value.
    val dd = df.na.drop("any")
    // Normalize region names in _c3 by stripping administrative suffixes.
    val d1 = dd.withColumn("_c3", regexp_replace(col("_c3"), "自治区|维吾尔|回族|壮族|省", ""))
    // _c5 is "<date> <time>": _c8 = payment date part.
    val dd1 = d1.withColumn("_c8", split(col("_c5"), " ")(0))
    // _c10 = payment time-of-day part.
    val dd2 = dd1.withColumn("_c10", split(col("_c5"), " ")(1))
    // _c11 = payment hour, first ':'-separated field of the time-of-day.
    val dd3 = dd2.withColumn("_c11", split(col("_c10"), ":")(0))
    // Deduplicate and write as a single CSV part file.
    dd3.distinct().coalesce(1).write.csv("/root/files/data")

    // --- Dataset 2: Taobao cosmetics data (first line is a header) ---
    val df1 = spark.read
      .format("csv")
      .option("header", "true")
      .load("/data/workspace/myshixun/step1/DATA/meizhuangdata.csv")
    // Fill nulls with the string "0" so the numeric casts below succeed.
    val cc1 = df1.na.fill(value = "0")
    // sale_amount = unit price * units sold.
    val cc2 = cc1.withColumn("sale_amount", col("price").cast("long") * col("sale_count").cast("long"))
    // Normalize the date separator from '/' to '-'.
    val cc3 = cc2.withColumn("update_time", regexp_replace(col("update_time"), "/", "-"))
    cc3.distinct().coalesce(1).write.csv("/root/files/data1")

    // --- Dataset 3: order sales data (first line is a header) ---
    val df2 = spark.read
      .format("csv")
      .option("header", "true")
      .load("/data/workspace/myshixun/step1/DATA/saledata.csv")
    // Unify the order-date separator ('#' -> '/').
    val b = df2.withColumn("订单日期", regexp_replace(col("订单日期"), "#", "/"))
    // Drop rows (not columns, as the original comment claimed) with any null.
    val bb = b.na.drop("any")
    // Normalize province names by stripping administrative suffixes.
    val b1 = bb.withColumn("所在省份", regexp_replace(col("所在省份"), "自治区|维吾尔|回族|壮族|省|市", ""))
    // Month = second '/'-separated field of the order date.
    val b2 = b1.withColumn("month", split(col("订单日期"), "/")(1))
    b2.distinct().coalesce(1).write.csv("/root/files/data2")

    spark.stop()
  }
}
第二题:第一关
import org.apache.spark.sql.{SaveMode, SparkSession}
object test1{
  /**
   * Aggregates Tmall order amounts per payment day and appends the result
   * into the MySQL table `tmallsale.tmallordersum`.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SparkCleanJob")
      .master("local[*]").getOrCreate()

    // Load the comma-separated order file; the first line is treated as the
    // header and therefore not read as data.
    val orders = spark.read
      .option("delimiter", ",")
      .option("header", true)
      .csv("/data/workspace/myshixun/step1/tmallorder.csv")
    orders.createOrReplaceTempView("df1")

    // Total order amount per payment day, most recent day first.
    val dailyTotals = spark.sql("select df1.ptime as ptime,sum(df1.amount) as sum from df1 group by df1.ptime order by ptime desc")

    // Drop rows with nulls, collapse to one partition, and append into
    // MySQL database `tmallsale`, table `tmallordersum`.
    val cleanedTotals = dailyTotals.na.drop("any")
    cleanedTotals
      .repartition(1)
      .write
      .format("jdbc")
      .option("url", "jdbc:mysql://127.0.0.1:3306/tmallsale?useUnicode=true&characterEncoding=utf-8")
      .option("driver","com.mysql.jdbc.Driver")
      .option("dbtable", "tmallordersum")
      .option("user", "root")
      .option("password", "123123")
      // Append so the existing table structure and rows are preserved.
      .mode(SaveMode.Append)
      .save()
    spark.stop()
  }
}
第二题:第二关
import org.apache.spark.sql.{SaveMode, SparkSession}
object tb_mzsale {
  /**
   * Aggregates per-product revenue and unit sales from the cosmetics dataset
   * and appends the result into the MySQL table `tmallsale.meizhuang`.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("tb_mzsale")
      .master("local[*]").getOrCreate()

    // Load the comma-separated cosmetics file.
    // NOTE(review): header is set to false, so if the file actually starts
    // with a header line it is kept as a data row — the original Chinese
    // comment claimed the header is stripped; confirm against the input file.
    val cosmetics = spark.read
      .option("delimiter", ",")
      .option("header", false)
      .csv("/data/workspace/myshixun/step2/meizhuangdata.csv")
    cosmetics.createOrReplaceTempView("df1")

    // Per product (_c6): total revenue (amount = sum of _c7) and total units
    // sold (sale_count = sum of _c4), sorted by revenue descending.
    val perProduct = spark.sql("select df1._c6 as product,sum(df1._c7) as amount,sum(df1._c4) as sale_count from df1 group by df1._c6 order by amount desc")
    perProduct
      .repartition(1)
      .write
      .format("jdbc")
      // Target: MySQL database `tmallsale`, table `meizhuang`.
      .option("url", "jdbc:mysql://127.0.0.1:3306/tmallsale?useUnicode=true&characterEncoding=utf-8")
      .option("driver","com.mysql.jdbc.Driver")
      .option("dbtable", "meizhuang")
      .option("user", "root")
      .option("password", "123123")
      // Append so the existing table is not rebuilt.
      .mode(SaveMode.Append)
      .save()
    spark.stop()
  }
}
第二题:第三关
import org.apache.spark.sql.{SaveMode, SparkSession}
object tmallorderhour{
  /**
   * Counts orders per hour of day and appends the result into the MySQL
   * table `tmallsale.tmallorderhour`.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("tmallorderhour")
      .master("local[*]").getOrCreate()

    // Load the comma-separated order file; the first line is the header.
    val orders = spark.read
      .option("delimiter", ",")
      .option("header", true)
      .csv("/data/workspace/myshixun/step3/tmallorder.csv")
    orders.createOrReplaceTempView("df1")

    // Order count per hour, sorted by hour ascending.
    // (The original comment claimed descending-by-sales; the SQL actually
    // orders by the hour column.)
    val hourlyCounts = spark.sql("select df1.hour as hour,count(df1.hour) as hoursale from df1 group by df1.hour order by hour")
    val cleanedCounts = hourlyCounts.na.drop("any")
    cleanedCounts
      .repartition(1)
      .write
      .format("jdbc")
      // Target: MySQL database `tmallsale`, table `tmallorderhour`.
      .option("url", "jdbc:mysql://127.0.0.1:3306/tmallsale?useUnicode=true&characterEncoding=utf-8")
      .option("driver","com.mysql.jdbc.Driver")
      .option("dbtable", "tmallorderhour")
      .option("user", "root")
      .option("password", "123123")
      // Append so the existing table is not rebuilt.
      .mode(SaveMode.Append)
      .save()
    spark.stop()
  }
}
第二题:第四关
import org.apache.spark.sql.{SaveMode, SparkSession}
object saledata{
  /**
   * From the sales dataset, writes two aggregates into MySQL database
   * `tmallsale`: revenue per month (table `salemonth`) and order quantity
   * per city (table `salecity`).
   *
   * Fix vs. original: `save()` returns Unit, so the `val d = ...` and
   * `val df = ...` bindings were meaningless unused locals and were removed.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("saledata")
      .master("local[*]").getOrCreate()

    // Load the comma-separated sales file; the first line is the header.
    val df1 = spark.read.option("delimiter", ",").option("header", true)
      .csv("/data/workspace/myshixun/step4/saledata.csv")
    df1.createOrReplaceTempView("df1")

    // Monthly revenue (sum of amount per month), newest month first.
    spark.sql("select df1.month as month,sum(df1.amount) as amountmoney from df1 group by df1.month order by month desc")
      .repartition(1)
      .write
      .format("jdbc")
      .option("url", "jdbc:mysql://127.0.0.1:3306/tmallsale?useUnicode=true&characterEncoding=utf-8")
      .option("driver","com.mysql.jdbc.Driver")
      .option("dbtable", "salemonth")
      .option("user", "root")
      .option("password", "123123")
      // Append so the existing table is not rebuilt.
      .mode(SaveMode.Append)
      .save()

    // Order quantity per city, largest first.
    spark.sql("select df1.loc_city as city,sum(df1.order_quantity) as quantity from df1 group by df1.loc_city order by quantity desc")
      .repartition(1)
      .write
      .format("jdbc")
      .option("url", "jdbc:mysql://127.0.0.1:3306/tmallsale?useUnicode=true&characterEncoding=utf-8")
      .option("driver","com.mysql.jdbc.Driver")
      .option("dbtable", "salecity")
      .option("user", "root")
      .option("password", "123123")
      // Append so the existing table is not rebuilt.
      .mode(SaveMode.Append)
      .save()
    spark.stop()
  }
}
第二题:第五关
import org.apache.spark.sql.{SaveMode, SparkSession}
object dataproduct {
  /**
   * Per shop/product (dianming): total unit price and average price (rounded
   * to 2 decimals), sorted by average price descending; appended into the
   * MySQL table `tmallsale.pricedata`.
   *
   * Fix vs. original: `save()` returns Unit, so the `val d = ...` binding was
   * a meaningless unused local and was removed.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("spark").getOrCreate()

    // Load the comma-separated cosmetics file; the first line is the header.
    val df = spark.read.option("delimiter", ",").option("header", true)
      .csv("/data/workspace/myshixun/step5/meizhuangdata.csv")
    df.createOrReplaceTempView("df")

    // Columns: dianming = product/shop name, price = sum of unit prices,
    // aveprice = mean unit price rounded to 2 decimals.
    spark.sql("select df.dianming as dianming,sum(df.price) as price,round(mean(price),2) as aveprice from df group by df.dianming order by aveprice desc")
      .repartition(1)
      .write
      .format("jdbc")
      .option("url", "jdbc:mysql://127.0.0.1:3306/tmallsale?useUnicode=true&characterEncoding=utf-8")
      .option("driver","com.mysql.jdbc.Driver")
      // Target table: pricedata.
      .option("dbtable", "pricedata")
      .option("user", "root")
      .option("password", "123123")
      // Append so the existing table is not rebuilt.
      .mode(SaveMode.Append)
      .save()
    spark.stop()
  }
}