1. Spark with MySQL (and other relational databases)
package scala

import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.spark.{SparkConf, SparkContext}

object RDDtoMysql {
  // Adjust this to match your blog table schema
  case class Blog(name: String, count: Int)

  // Write one partition's records to the database
  def myFun(iterator: Iterator[(String, Int)]): Unit = {
    var conn: Connection = null
    var ps: PreparedStatement = null
    // To also store a timestamp, MySQL's NOW() function could be used in the SQL
    val sql = "insert into blog(name, count) values (?, ?)"
    try {
      conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/spark",
        "root", "123456")
      // Prepare the statement once and reuse it for every record,
      // instead of creating (and leaking) a new one per row
      ps = conn.prepareStatement(sql)
      iterator.foreach(data => {
        ps.setString(1, data._1)
        ps.setInt(2, data._2)
        ps.executeUpdate()
      })
    } catch {
      case e: Exception => println("MySQL Exception: " + e.getMessage)
    } finally {
      if (ps != null) {
        ps.close()
      }
      if (conn != null) {
        conn.close()
      }
    }
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("RDDToMysql").setMaster("local")
    val sc = new SparkContext(conf)
    val data = sc.parallelize(List(("www", 10), ("iteblog", 20), ("com", 30)))
    // foreachPartition opens one connection per partition, not per record
    data.foreachPartition(myFun)
    sc.stop()
  }
}
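For larger partitions, JDBC batching cuts down the number of round trips to MySQL. A minimal sketch of a batched variant of myFun, assuming the same blog table as above (the batch size of 1000 is an arbitrary choice):

  // Hypothetical batched variant: same table, fewer round trips to MySQL
  def myFunBatched(iterator: Iterator[(String, Int)]): Unit = {
    val conn = DriverManager.getConnection(
      "jdbc:mysql://localhost:3306/spark", "root", "123456")
    val ps = conn.prepareStatement("insert into blog(name, count) values (?, ?)")
    try {
      var count = 0
      iterator.foreach { case (name, n) =>
        ps.setString(1, name)
        ps.setInt(2, n)
        ps.addBatch()
        count += 1
        if (count % 1000 == 0) ps.executeBatch() // flush every 1000 rows (arbitrary)
      }
      ps.executeBatch() // flush the remainder
    } finally {
      ps.close()
      conn.close()
    }
  }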
When using Maven with Spring MVC, the Hadoop dependencies involved are:
<!-- hadoop -->
<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-client</artifactId>
  <version>2.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-common</artifactId>
  <version>2.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-hdfs</artifactId>
  <version>2.6.0</version>
</dependency>
MySQL and Gson dependencies:
<dependency>
  <groupId>mysql</groupId>
  <artifactId>mysql-connector-java</artifactId>
  <version>5.1.24</version>
</dependency>
<dependency>
  <groupId>com.google.code.gson</groupId>
  <artifactId>gson</artifactId>
  <version>2.7</version>
</dependency>
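The Spark code in this section also needs Spark itself on the compile classpath. A sketch of the corresponding dependencies, assuming Spark 1.6.0 built for Scala 2.10 (the SQLContext-era API used below); align the versions with your cluster:

<!-- spark (assumed version; match your cluster) -->
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-core_2.10</artifactId>
  <version>1.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-sql_2.10</artifactId>
  <version>1.6.0</version>
</dependency>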
Runnable example:
package sql

import java.util.Properties

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Submitting the job in a production environment:
 * spark-submit --class sql.SparkSqlMysqlDatasource --master yarn-cluster --executor-memory 2G --num-executors 2 --driver-memory 1g --executor-cores 1 /data1/e_heyutao/sparktest/sparkEnn.jar
 */
object SparkSqlMysqlDatasource {
  // Database configuration
  lazy val url = "jdbc:mysql://your_ip:3306/my_test"
  lazy val username = "root"
  lazy val password = "secret_password"

  def main(args: Array[String]) {
    // val sparkConf = new SparkConf().setAppName("sparkSqlTest").setMaster("local[2]").set("spark.app.id", "sql")
    val sparkConf = new SparkConf().setAppName("sparkSqlTest").setMaster("yarn-cluster").set("spark.app.id", "sqlTest")
    // Serialization settings
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.set("spark.kryoserializer.buffer", "256m")
    sparkConf.set("spark.kryoserializer.buffer.max", "2046m")
    sparkConf.set("spark.akka.frameSize", "500")
    sparkConf.set("spark.rpc.askTimeout", "30")
    // Create the context
    val sc = new SparkContext(sparkConf)
    // Create the SQLContext
    val sqlContext = new SQLContext(sc)
    // Import implicits so Spark SQL built-in functions can be used
    import sqlContext.implicits._
    // Build the JDBC connection string
    val uri = url + "?user=" + username + "&password=" + password + "&useUnicode=true&characterEncoding=UTF-8"
    val prop = new Properties()
    // Note: when running on a cluster, this line is required; without it the
    // job fails with a "MySQL driver not found" error
    prop.put("driver", "com.mysql.jdbc.Driver")
    // Load the MySQL tables
    val df_test1: DataFrame = sqlContext.read.jdbc(uri, "user_t", prop)
    val df_test2: DataFrame = sqlContext.read.jdbc(uri, "t_user2", prop)
    // Select the required columns from the DataFrame
    df_test2.select("id", "name", "age").collect()
      .foreach(row => {
        println("id " + row(0) + " ,name " + row(1) + ", age " + row(2))
      })
    // Register a temporary table
    df_test1.registerTempTable("temp_table")
    val total_sql = "select * from temp_table"
    val total_df: DataFrame = sqlContext.sql(total_sql)
    // Write the result back to the database
    val properties = new Properties()
    properties.setProperty("user", "root")
    properties.setProperty("password", "secret_password")
    total_df.write.mode("append").jdbc("jdbc:mysql://your_ip:3306/my_test?useUnicode=true&characterEncoding=UTF-8", "t_result", properties)
    /**
     * Note: the accepted save modes can be seen in the DataFrameWriter source:
     * def mode(saveMode: String): DataFrameWriter = {
     *   this.mode = saveMode.toLowerCase match {
     *     case "overwrite" => SaveMode.Overwrite
     *     case "append" => SaveMode.Append
     *     case "ignore" => SaveMode.Ignore
     *     case "error" | "default" => SaveMode.ErrorIfExists
     *     case _ => throw new IllegalArgumentException(s"Unknown save mode: $saveMode. " +
     *       "Accepted modes are 'overwrite', 'append', 'ignore', 'error'.")
     *   }
     * }
     */
    // Group by name and compute the average age
    total_df.groupBy("name").avg("age").collect().foreach(x => {
      println("name " + x(0))
      println("age " + x(1))
    })
    sc.stop()
  }
}
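As the source excerpt above shows, mode("append") maps to SaveMode.Append; the type-safe enum can be passed directly instead of a string. A minimal usage sketch, reusing the total_df and properties from the example:

import org.apache.spark.sql.SaveMode

// Overwrite replaces the table contents instead of appending to them
total_df.write
  .mode(SaveMode.Overwrite)
  .jdbc("jdbc:mysql://your_ip:3306/my_test?useUnicode=true&characterEncoding=UTF-8", "t_result", properties)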
2. Learning Scala
Regular expressions: to construct a Regex object, simply call the r method on a string: val regex = "[0-9]+".r. regex.findAllIn("string to search") returns an iterator over all matches; findFirstIn returns the first match (as an Option); findPrefixOf matches only against the beginning of the string.
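A short sketch of these methods (the sample strings are made up):

object RegexDemo {
  def main(args: Array[String]): Unit = {
    val regex = "[0-9]+".r
    val s = "abc 123 def 456"             // sample input (made up)
    // All matches, as an iterator
    val matchIterator = regex.findAllIn(s)
    println(matchIterator.mkString(", ")) // 123, 456
    // First match, as an Option[String]
    println(regex.findFirstIn(s))         // Some(123)
    // Matches only a prefix of the string
    println(regex.findPrefixOf(s))        // None
    println(regex.findPrefixOf("42abc"))  // Some(42)
  }
}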