1. In stage three, the data processing can be solved directly with MySQL statements.
2. It can also be solved by querying the two tables with RDDs.
3. It can also be solved by combining MySQL and RDD computation.
sparkSession.sparkContext
This property gives you the underlying SparkContext; a short sketch of the combined approach from point 3 follows.
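A minimal sketch of approach 3 (push part of the work to MySQL as a subquery, finish on the RDD side), reusing the URL and credentials that appear later in these notes; the orders/totalprice table and column names here are placeholders, not fixed by the notes:

import java.util.Properties
import org.apache.spark.sql.SparkSession

object MysqlRddSketch {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().appName("sketch").master("yarn").getOrCreate()
    val sc = sparkSession.sparkContext // the underlying SparkContext, as noted above
    val prop = new Properties()
    prop.setProperty("user", "root")
    prop.setProperty("password", "123456")
    val url = "jdbc:mysql://192.168.10.101:3306/fbb"
    // Step 1: push the column selection/filtering down to MySQL as a subquery
    val df = sparkSession.read.jdbc(url, "(select id, totalprice from orders) as t", prop)
    // Step 2: finish the aggregation on the RDD side
    // (go through toString so this works whatever numeric type JDBC returns)
    val total = df.rdd.map(_.get(1).toString.toDouble).sum()
    println(total)
    sparkSession.stop()
  }
}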
II. Converting String to double
1. select cast("111.11" as decimal(10,2));
2. select convert("111.11", decimal(10,2));
(convert(expr, type) is MySQL-only syntax; Hive and Spark SQL support only the cast form.)
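The cast form also works when issued through Spark SQL; a quick sanity check (sketch, assuming an active sparkSession):

sparkSession.sql("select cast('111.11' as decimal(10,2)) as price").show()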
Copy only the table structure:
create table ods like ods_orders;
Keep two decimal places:
cast(sum(a1.totalprice) as decimal(10,2))
set hive.compute.query.using.stats=false; -- for the case where count(*) returns 0 because Hive answers from stale table statistics
truncate table <table_name>; deletes the data in the table while keeping the table structure
---------------------------- Partitions
1. Create a table and specify a partition
create table test_part_table(word string,num int) partitioned by(dt string)
row format delimited fields terminated by ',' lines terminated by '\n';
Special note: the partition column must not appear in the parentheses of the create statement; it is declared separately afterwards with partitioned by (partition_column type).
2. Load data into a partition (static partitioning)
load data local inpath '<filepath>' into table tableName partition(partition_column='partition_value');
load data local inpath '/home/dongwentao15/dataTest/test1' overwrite into table test_part_table partition(dt='20190811');
3. Add a partition to a table
alter table tableName add partition(partition_column='value');
4. Drop a partition from a table
alter table tableName drop partition(partition_column='value');
Launching in on-yarn mode
Competition command: spark-submit --master yarn --class com.xyzy.Task1 /opt/jars/readmysql2.jar
Static partitioning
Create the table with a partition clause, e.g. partitioned by(dt string), as in the create statement above.
Load data into a static partition:
load data local inpath '/opt/baoxian2.txt' into table baoxian2 partition(yue='3');
5. Dynamic partition insert
Enable dynamic partitioning before inserting:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
The next two settings are only needed when the number of partitions is very large:
set hive.exec.max.dynamic.partitions=50000;
set hive.exec.max.dynamic.partitions.pernode=10000;
Insert the data, letting Hive derive the partition value:
insert into table baoxian2 partition(yue) select *, month(riqi) as yue from baoxian;      -- append: data already in the partitions is kept
insert overwrite table baoxian2 partition(yue) select *, month(riqi) as yue from baoxian; -- overwrite: the affected partitions are replaced
Filtering out already-loaded data (keep only the new rows)
collect() pulls the query result back to the driver as an array
package Spark

import org.apache.spark.sql.SparkSession
import java.util
import java.util.Properties

object day2guolv {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().appName("day2")
      .master("yarn").enableHiveSupport().getOrCreate()
    // Read the source data from MySQL
    val prop = new Properties()
    prop.setProperty("user", "root")
    prop.setProperty("password", "123456")
    val url = "jdbc:mysql://192.168.10.101:3306/fbb"
    val df = sparkSession.read.jdbc(url, "(select * from tb_students) as pyy", prop)
    // Collect the ids already present in Hive so the new rows can be identified
    sparkSession.sql("use odsl")
    val arra = sparkSession.sql("select id from student2").collect()
    val list = new util.ArrayList[Int]()
    for (i <- arra) {
      list.add(i.getInt(0))
    }
    df.show()
    // Keep only the rows whose id is not yet in Hive (the new data)
    val df2 = df.filter(x => !list.contains(x.getInt(0)))
    df2.show()
    df2.createOrReplaceTempView("wyy")
    sparkSession.sql("use odsl")
    sparkSession.sql("set hive.exec.dynamic.partition=true")
    sparkSession.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    sparkSession.sql("insert into student2 partition(class,age) " +
      "select *, classinfo as class, 2022 - birthday as age from wyy")
    sparkSession.stop()
  }
}
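The collect()-based filter above ships every Hive id to the driver. For larger tables, a left anti join keeps the comparison distributed; a minimal sketch with the same table names, assuming the MySQL side also has a column called id:

// Alternative sketch: keep only rows whose id is absent from Hive,
// without collecting the ids to the driver (the "id" column name is an assumption)
val existing = sparkSession.sql("select id from student2")
val newRows = df.join(existing, df("id") === existing("id"), "left_anti")
newRows.show()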
Converting a DataFrame to an RDD and back to a DataFrame
Key lines (from the full program below):
// import the implicit conversions, otherwise the RDD cannot call toDF
import sparkSession.implicits._
}).toDF().withColumnRenamed("_10","time") // rename the specified column
package Spark

import org.apache.spark.sql.SparkSession
import java.util.Properties

object Text1Work {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().appName("fbb").master("yarn").enableHiveSupport().getOrCreate()
    val prop = new Properties()
    prop.setProperty("user", "root")
    prop.setProperty("password", "123456")
    val url = "jdbc:mysql://192.168.10.101/ss_shtd"
    val df1 = sparkSession.read.jdbc(url, "(select * from orders) as fbb", prop)
    df1.show()
    // Import the implicit conversions, otherwise the RDD cannot call toDF
    import sparkSession.implicits._
    val df2 = df1.rdd
    val rdd = df2.map(x => {
      // Build the partition value from the date column, e.g. "2019-08-11" -> "2019081100:00:00"
      val arr = x.getString(4).replaceAll("-", "") + "00:00:00"
      (x.getString(0), x.getString(1), x.getString(2), x.getString(3), x.getString(4),
        x.getString(5), x.getString(6), x.getString(7), x.getString(8), arr)
    }).toDF().withColumnRenamed("_10", "time") // rename the specified column
    rdd.show()
    rdd.createOrReplaceTempView("pyy")
    sparkSession.sql("use ods")
    sparkSession.sql("set hive.exec.dynamic.partition=true")
    sparkSession.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    sparkSession.sql("set hive.exec.max.dynamic.partitions=50000")
    sparkSession.sql("set hive.exec.max.dynamic.partitions.pernode=10000")
    sparkSession.sql("insert into orders partition(time) select * from pyy")
    sparkSession.stop()
  }
}
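As an aside, toDF can take the column names directly, which avoids renaming _10 afterwards; a fragment in the style of the key lines above (the first nine names are hypothetical placeholders):

// Sketch: name all columns up front instead of withColumnRenamed("_10","time")
}).toDF("c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8", "c9", "time")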