目录
使用Structured Streaming读取Socket数据,把单词和单词的反转组成 json 格式写入到当前目录中的file文件夹中代码:
使用Structured Streaming读取Socket数据,把单词和单词的反转组成 json 格式写入到当前目录中的file文件夹中
代码:
package Spark
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
* 使用Structured Streaming读取Socket数据,把单词和单词的反转组成 json 格式写入到当前目录中的file文件夹中
* (abc,cba)
*/
/**
 * Reads lines from a socket with Structured Streaming, splits every line into
 * words, pairs each word with its reversal (e.g. (abc, cba)), and writes the
 * pairs as JSON files into ./file, checkpointing into ./ck1.
 */
object demo01 {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local[*]")
      .appName("demo01")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._

    // Read each text line arriving on cdh01:10000.
    val lines: DataFrame = spark.readStream
      .format("socket") // socket data source
      .option("host", "cdh01")
      .option("port", 10000)
      .load() // side-effecting call: use explicit parentheses

    // Split each line on non-word characters and pair every word with its
    // reversed form.
    val words: DataFrame = lines.as[String].flatMap { line =>
      line.split("\\W+").map(word => (word, word.reverse))
    }.toDF("原单词", "反转单词")

    // Emit each micro-batch as JSON files; a checkpoint directory is mandatory
    // for the file sink.
    words.writeStream
      .outputMode("append")
      .format("json") // the file sink also supports "orc" and "csv"
      .option("path", "./file") // output directory
      .option("checkpointLocation", "./ck1") // required checkpoint directory
      .start()
      .awaitTermination()
  }
}
统计出文件中的男女生各有多少人
package Spark
import org.apache.spark.SparkContext
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.types.{LongType, StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
object demo02 {
  /**
   * Streams student records from CSV and answers two questions:
   *   2.1 how many students there are per gender (result1)
   *   2.2 how many students surnamed "王" there are per gender (result2)
   * Both aggregations are written to the console.
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .master("local[*]")
      .appName("ReadFromCSV")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    // Expected layout of the student CSV files (DDL type names are
    // case-insensitive; normalized to lowercase for consistency).
    val student_info_schema: StructType = new StructType()
      .add("学号", "string")
      .add("姓名", "string")
      .add("性别", "string")
      .add("班级编号", "integer")
      .add("入学日期", "string")

    // 2. Streaming CSV source; the files carry a header row.
    import spark.implicits._
    val student_info: DataFrame = spark.readStream
      .format("csv")
      .option("header", "true")
      .schema(student_info_schema)
      .load("C:\\Users\\hasee\\Desktop\\4.16\\student_info")

    // 2.1 count of students per gender
    val result1: Dataset[Row] =
      student_info.groupBy("性别").count()
    // 2.2 count of students surnamed "王" per gender
    val result2: Dataset[Row] =
      student_info.filter($"姓名".startsWith("王")).groupBy("性别").count()

    // BUG FIX: result1 was computed but never started, so requirement 2.1
    // produced no output at all. Start both queries, then block until any
    // of them terminates.
    result1.writeStream
      .format("console")
      .outputMode("complete")
      .trigger(Trigger.ProcessingTime(0)) // 0 ms: fire each batch as soon as possible
      .start()
    result2.writeStream
      .format("console")
      .outputMode("complete")
      .trigger(Trigger.ProcessingTime(0))
      .start()
    spark.streams.awaitAnyTermination()
  }
}
统计出各个院系的分别多少条信息
package Spark
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType
object demo03 {
  /**
   * Streams department records from CSV and keeps a running count of rows per
   * department name (院系名称), printing the totals to the console.
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("ReadFromCSV")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    // Expected layout of the department CSV files.
    val department_info_schema: StructType = new StructType()
      .add("院系编号", "string")
      .add("院系名称", "string")

    // Streaming CSV source; the files carry a header row.
    val department_info: DataFrame = spark.readStream
      .format("csv")
      .option("header", "true")
      .schema(department_info_schema)
      .load("C:\\Users\\hasee\\Desktop\\4.16\\department_info\\")
    import spark.implicits._

    // 3.1 number of records per department name
    val result: Dataset[Row] = department_info.groupBy("院系名称").count()

    val query = result.writeStream
      .format("console")
      .outputMode("complete")
      .trigger(Trigger.ProcessingTime(0)) // 0 ms: fire each batch immediately
      .start()
    query.awaitTermination()
  }
}
统计出每个班级的最高分数
package Spark
import org.apache.spark.sql._
import org.apache.spark.sql.types.StructType
import org.apache.spark.{SparkConf, SparkContext}
object demo04 {
  /**
   * Batch-loads student score records from CSV and answers requirements
   * 4.1–4.6 with Spark SQL, printing every result to stdout.
   *
   * NOTE(review): `scores` is read as a string column, so the numeric
   * comparisons and max() below rely on Spark's implicit casting — confirm the
   * source data is purely numeric.
   */
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession.
    val spark: SparkSession = SparkSession.builder().appName("demo04").master("local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    // Expected layout of the score CSV files.
    val student_scores_schema: StructType = new StructType()
      .add("sno", "string")
      .add("sname", "string")
      .add("sex", "string")
      .add("cno", "string")
      .add("scores", "string")
    val frame: DataFrame = spark.read.option("header", true).format("csv").schema(student_scores_schema).load("C:\\Users\\hasee\\Desktop\\4.16\\student_score\\")
    frame.createOrReplaceTempView("student_scores")

    // BUG FIX: every query below was commented out, so the job loaded the data
    // and exited without producing any output. Run each labelled requirement.
    // 4.1 highest score per class
    spark.sql("select cno ,max(scores) from student_scores group by cno ").show()
    // 4.2 highest score among male students
    spark.sql("select max(scores) from student_scores where sex='男' ").show()
    // 4.3 highest score among female students
    spark.sql("select max(scores) from student_scores where sex='女' ").show()
    // 4.4 top three scores per gender (row_number window partitioned by sex)
    spark.sql("select * from (SELECT *, Row_Number() OVER (partition by sex ORDER BY scores desc) rank FROM student_scores) t1 where t1.rank<=3").show()
    // 4.5 number of students scoring above 500
    spark.sql("select count(scores) from student_scores where scores>500").show()
    // 4.6 gender breakdown of students scoring below 300
    spark.sql("select sex, count(*) from student_scores where scores<300 group by sex ").show()
    spark.stop()
  }
}
统计出哪个院系的专业最多
package Spark
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SparkSession}
object demo05 {
  /**
   * Batch-loads class records from CSV, derives each class's major name from
   * its class name with a UDF, and answers requirement 5.1 (number of distinct
   * majors per department). The remaining requirements 5.2–5.5 are kept as
   * ready-to-run commented queries, as in the original.
   */
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession.
    val spark: SparkSession = SparkSession.builder().appName("demo05").master("local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    // Expected layout of the class CSV files.
    val class_info_schema: StructType = new StructType()
      .add("classid", "string")
      .add("classname", "string")
      .add("date", "string")
      .add("yname", "string")
    val frame: DataFrame = spark.read.option("header", true).format("csv").schema(class_info_schema).load("E:\\2020-传智资料1\\第二学期Struct\\day02_作业\\周勇江4.16号练习题50道2.0\\class_info")
    frame.createOrReplaceTempView("class_info")

    // Strips the trailing two characters (the class number, e.g. "01") from a
    // class name to obtain the major name.
    // ROBUSTNESS FIX: the original threw NullPointerException on null cells
    // (CSV rows with a missing classname) and StringIndexOutOfBoundsException
    // on names shorter than two characters; such values now pass through.
    spark.udf.register("getMajorCname", (str: String) => {
      if (str == null || str.length <= 2) str
      else str.substring(0, str.length - 2)
    })
    // spark.sql("select getMajorCname(classname) from class_info ").show(100)
    // 5.1 number of distinct majors per department
    spark.sql("select yname,count(distinct(t1.cname)) as count from (select classid,getMajorCname(classname) as cname,date,yname from class_info) t1 group by t1.yname").show(100)
    // spark.sql("select * from (select yname,count(distinct(t1.cname)) as count from (select classid,getMajorCname(classname) as cname,date,yname from class_info) t1 group by t1.yname) t2 order by count desc limit 1").show()
    // spark.sql(" select * from (select yname,count(distinct(t1.cname)) as count from (select classid,getMajorCname(classname) as cname,date,yname from class_info) t1 group by t1.yname) t2 order by count desc limit 1 ").show()
    // 5.2 number of majors in the computer science department
    // spark.sql("select * from (select yname,count(distinct(t1.cname)) as count from (select classid,getMajorCname(classname) as cname,date,yname from class_info) t1 group by t1.yname ) t2 where yname='计算机学院' ").show()
    // 5.3 class counts for accounting and business administration in the school of economics and management
    // spark.sql("select '会计',count(classname) from class_info where yname='经济管理学院' and classname like '会计%' ").show()
    // spark.sql("select '工商管理',count(classname) from class_info where yname='经济管理学院' and classname like '工商管理%' ").show()
    // 5.4 the major with the most classes in each department
    // spark.sql("select yname,count(classid) as count ,cname from (select classid,getMajorCname(classname) as cname,date,yname from class_info) t1 group by yname,t1.cname ").show()
    // spark.sql("select * from (select t2.*,rank() over( partition by t2.yname order by t2.count desc ) as rank from (select yname,count(classid) as count ,cname from (select classid,getMajorCname(classname) as cname,date,yname from class_info) t1 group by yname,t1.cname ) t2) t3 where t3.rank <= 1").show(100)
    // 5.5 all major names whose class id starts with "2"
    // spark.sql("select getMajorCname(classname) as zhuanye from class_info where classid like '02%' group by zhuanye").show()
    spark.stop()
  }
}