package cn.bw.spark.object_cxb
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Joins extends App {
  // Reads a GBK-encoded file as RDD[String]. sc.textFile decodes as UTF-8, so
  // Chinese characters would come out garbled; decode the raw Text bytes as GBK
  // instead. This helper can be copied and reused as-is. Note the bytes are
  // converted to String immediately, since Hadoop reuses its Writable objects.
  def transfer(sc: SparkContext, path: String): RDD[String] = {
    sc.hadoopFile(path, classOf[TextInputFormat],
      classOf[LongWritable], classOf[Text], 1).map(
      p => new String(p._2.getBytes, 0, p._2.getLength, "GBK")
    )
  }
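
  // A hedged alternative sketch using the newer mapreduce API; the GBK charset
  // and local-path conventions are the same assumptions as in transfer above.
  // transferNewApi is a hypothetical name, not part of the original post.
  def transferNewApi(sc: SparkContext, path: String): RDD[String] = {
    sc.newAPIHadoopFile(path,
      classOf[org.apache.hadoop.mapreduce.lib.input.TextInputFormat],
      classOf[LongWritable], classOf[Text])
      .map { case (_, text) => new String(text.getBytes, 0, text.getLength, "GBK") }
  }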

  val conf = new SparkConf().setAppName("Joins").setMaster("local[3]")
  val sc = new SparkContext(conf)

  // Sample record: sex=男, email=2u5wk5@aol.com, tel=15500121817, name=李进, road=1001
  val xiao = transfer(sc, "d:/15.txt").map(t => {
    val files = t.split(",")
    val sex = files(0).split("=")
    val email = files(1)
    val tel = files(2)
    val name = files(3)
    val address = files(4) // road field; the sample record has five fields, indices 0 to 4
    (address, (name, email, tel, sex(1)))
  })
  // println(xiao.take(5).toBuffer) // (( name=李进, email=2u5wk5@aol.com, tel=15500121817),男)
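
  // Note the fields above still carry their "key=" prefixes (only sex is split
  // on "="). A hedged sketch of a more robust parser that strips every prefix
  // by turning a record into a Map; parseRecord is a hypothetical helper, not
  // part of the original post.
  def parseRecord(line: String): Map[String, String] =
    line.split(",").map(_.trim.split("=", 2)).collect {
      case Array(k, v) => k -> v
    }.toMap
  // e.g. parseRecord("sex=男, email=2u5wk5@aol.com")("email") == "2u5wk5@aol.com"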

  val da = transfer(sc, "d:/16.txt").map(t => {
    val daFile = t.split(",")
    (daFile(3), (daFile(0), daFile(1), daFile(2)))
  })
  // println(da.take(5).toBuffer)

  val rdd2 = da.join(xiao).map(t => {
    val id = t._1
    val names = t._2._2._1  // name
    val emails = t._2._2._2 // email
    val tels = t._2._2._3   // phone
    val sexs = t._2._2._4   // sex
    val quming = t._2._1._3 // district name
    (id, names, emails, tels, sexs, quming)
  })
  println(rdd2.collect().toBuffer)
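
  // The title frames this as a small table joining a big table; a common Spark
  // optimization is a map-side join: broadcast the small side instead of
  // shuffling both. A minimal sketch, assuming xiao fits in driver memory;
  // small and mapSideJoined are hypothetical names.
  val small = sc.broadcast(xiao.collectAsMap())
  val mapSideJoined = da.flatMap { case (key, dVal) =>
    small.value.get(key).map(xVal => (key, dVal, xVal))
  }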

  // groupBy makes sexs the key; the values are then processed per group.
  val rdd3 = rdd2.groupBy(_._5).mapValues(t => {
    t.toList.sortBy(_._4).reverse.take(5)
  })
  println(rdd3.collect().toBuffer)
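
  // groupBy materializes every group in memory; for a per-key top-N,
  // aggregateByKey keeps at most N records per key throughout. A hedged sketch
  // equivalent to rdd3 above (top 5 per sex by the tel field, descending);
  // top5PerSex is a hypothetical name.
  val top5PerSex = rdd2.map(r => (r._5, r)).aggregateByKey(
    List.empty[(String, String, String, String, String, String)])(
    (acc, r) => (r :: acc).sortBy(_._4).reverse.take(5),
    (a, b) => (a ++ b).sortBy(_._4).reverse.take(5)
  )
  println(top5PerSex.collect().toBuffer)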
  // rdd3.saveAsTextFile("d:/object/join")
  // println(rdd3.collect().toBuffer)
  sc.stop()
}

Garbled Chinese text in Spark (small-table join big-table)