1. Spark Core word count (IDEA + Scala)

import org.apache.spark.{SparkConf, SparkContext}

object wordCount {
  def main(args: Array[String]): Unit = {
    // Run locally with 2 cores
    val conf = new SparkConf().setAppName("scalaWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Read the input file
    // val rdd1 = sc.textFile(args(0))
    val rdd1 = sc.textFile("d:/data/word.txt")
    // Split each line and flatten into words
    val rdd2 = rdd1.flatMap(x => x.split(" "))
    // Map each word to a (word, 1) pair
    val rdd3 = rdd2.map((_, 1))
    // Sum the counts by key
    val rdd4 = rdd3.reduceByKey(_ + _)
    // Sort by count (ascending by default)
    val rdd5 = rdd4.sortBy(_._2)
    // Save the result to HDFS as a new file
    // val rdd6 = rdd5.saveAsTextFile(args(1))
    println(rdd5.collect().toBuffer)
    sc.stop()
  }
}
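The commented-out lines hint at a cluster version that takes the input and output paths from the command line. Below is a minimal sketch of that variant, with the pipeline chained, sorted in descending count order, and the result written out with saveAsTextFile; the object name wordCountHdfs and the descending sort are illustrative choices, following the commented args(0)/args(1) lines above.

import org.apache.spark.{SparkConf, SparkContext}

object wordCountHdfs {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("scalaWordCount")
    val sc = new SparkContext(conf)
    // args(0): input path, args(1): output path (must not exist yet)
    sc.textFile(args(0))
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)   // most frequent words first
      .saveAsTextFile(args(1))
    sc.stop()
  }
}
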
2. Create an array, build a Bean object from each record that extends Ordered and implements Serializable, and use it to sort the RDD.

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}

object CustomSort1 {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
    val conf = new SparkConf().setAppName("CustomSort1").setMaster("local[*]")
    val sc = new SparkContext(conf)
    // id, name, age, looks rating
    val users = Array("1,tom,18,100", "2,lisa,19,500", "3,lisi,30,50", "4,zhangsan,25,100")
    val userLines = sc.makeRDD(users)
    val userRDD = userLines.map(line => {
      val fields = line.split(",")
      val num = fields(0).toInt
      val name = fields(1)
      val age = fields(2).toInt
      val rate = fields(3).toInt
      new User(num, name, age, rate)
    })
    // Sort using the compare method defined on User
    val sorted = userRDD.sortBy(x => x)
    println(sorted.collect().toBuffer)
    sc.stop()
  }
}

class User(val num: Int, val name: String, val age: Int, val rate: Int) extends Ordered[User] with Serializable {
  override def compare(that: User): Int = {
    // Sort by rating first (descending); break ties by age (ascending)
    if (this.rate == that.rate) {
      this.age - that.age
    } else {
      that.rate - this.rate
    }
  }

  override def toString: String = {
    s"user:$num,$name,$age,$rate"
  }
}
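The Ordered bean is not the only way to get this sort. A lighter alternative is to keep each record as a tuple and hand sortBy a composite key, letting Spark use the built-in Ordering for tuples. The sketch below is only illustrative (the object name CustomSort2 is made up); negating rate gives the same "higher rating first, younger first on ties" order as the compare method above.

import org.apache.spark.{SparkConf, SparkContext}

object CustomSort2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CustomSort2").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val users = Array("1,tom,18,100", "2,lisa,19,500", "3,lisi,30,50", "4,zhangsan,25,100")
    // Keep each record as a tuple instead of a custom bean
    val tupleRDD = sc.makeRDD(users).map(line => {
      val f = line.split(",")
      (f(0).toInt, f(1), f(2).toInt, f(3).toInt)   // (id, name, age, rate)
    })
    // Composite key (-rate, age): rating descending, age ascending on ties
    val sorted = tupleRDD.sortBy(t => (-t._4, t._3))
    println(sorted.collect().toBuffer)
    sc.stop()
  }
}
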
3. Spark Core reading from and saving data to MySQL
import java.sql.{Connection, DriverManager}

import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}

object JDBCDemo {
  // Connection factory; a new connection is opened on each executor
  val conn: () => Connection = () => {
    DriverManager.getConnection("jdbc:mysql://localhost:3306/test?characterEncoding=UTF-8", "root", "123456")
  }

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
    val conf = new SparkConf().setAppName("JDBCDemo").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val jdbcRDD = new JdbcRDD(
      sc,
      conn,
      // The two ? placeholders are bound to each partition's id range
      "select * from logs where id >= ? and id <= ?",
      1,    // lower bound of id
      5,    // upper bound of id
      2,    // number of partitions
      rs => {
        val id = rs.getLong(1)
        val name = rs.getString(2)
        val age = rs.getInt(3)
        (id, name, age)
      }
    )
    val result = jdbcRDD.collect()
    println(result.toBuffer)
    sc.stop()
  }
}
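The JdbcRDD above only reads rows back out of MySQL. To actually save RDD data to MySQL, as the section title says, the common pattern is foreachPartition: open one connection per partition and insert the rows with a PreparedStatement. The following is a minimal sketch, assuming a logs(name, age) table in the same test database; it could sit inside JDBCDemo and reuse the java.sql imports above.

// A minimal save sketch: one connection and one PreparedStatement per partition
def saveToMysql(data: org.apache.spark.rdd.RDD[(String, Int)]): Unit = {
  data.foreachPartition(iter => {
    val connection = DriverManager.getConnection(
      "jdbc:mysql://localhost:3306/test?characterEncoding=UTF-8", "root", "123456")
    val ps = connection.prepareStatement("insert into logs (name, age) values (?, ?)")
    iter.foreach { case (name, age) =>
      ps.setString(1, name)
      ps.setInt(2, age)
      ps.executeUpdate()
    }
    ps.close()
    connection.close()
  })
}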