//利用cogroup 处理分隔符的文件
import org.apache.spark.{ SparkContext, SparkConf }
import java.sql.DriverManager
// Diff two pipe-delimited files via cogroup and persist the unmatched records through JDBC.
object HandleGroup extends App {
  val beginTime = System.currentTimeMillis()

  // Spark setup (local mode, all cores).
  val conf = new SparkConf().setAppName("Simple Application").setMaster("local[*]")
  val sc = new SparkContext(conf)

  // Load both pipe-delimited files; split each line into its fields.
  val txt1 = sc.textFile("i:/1/fg1.txt").map(_.split("\\|"))
  val txt2 = sc.textFile("i:/1/fg2.txt").map(_.split("\\|"))

  // Key each record by fields 0+1+5, cogroup the two sides, and keep only keys
  // missing from one side (present in both => matched => dropped).
  //
  // BUG FIX: the original iterated the RDD with foreach and called addBatch on a
  // driver-side java.sql.Statement from executor tasks. Statement is not
  // serializable, so the job fails with a Task-not-serializable error — and even
  // if it ran, batches added on executors would never reach the driver's
  // executeBatch(). Collect the (already filtered) diff to the driver first and
  // do all JDBC work locally.
  val results = txt1.map(v => (v(0) + v(1) + v(5), v))
    .cogroup(txt2.map(v => (v(0) + v(1) + v(5), v)))
    .filter { case (_, (v1, v2)) => !(v1.nonEmpty && v2.nonEmpty) }
    .collect()

  // JDBC setup. URL/credentials are placeholders to be filled per environment.
  val driver = "oracle.jdbc.driver.OracleDriver"
  val url = "jdbc:oracle:thin:@"
  val username = ""
  val password = ""
  Class.forName(driver)
  val connection = DriverManager.getConnection(url, username, password)
  // FIX: parameterized insert instead of string-concatenated SQL — the original
  // was open to SQL injection and broke on any field containing a quote.
  val insert = connection.prepareStatement(
    "insert into nidaye(v1, v2, v3) values(?, ?, ?)")

  // Queue one insert per unmatched record (from either side).
  def runSql(v1: String, v2: String, v3: String): Unit = {
    insert.setString(1, v1)
    insert.setString(2, v2)
    insert.setString(3, v3)
    insert.addBatch()
  }

  try {
    results.foreach { case (_, (v1, v2)) =>
      v1.foreach(a => runSql(a(0), a(1), a(5)))
      v2.foreach(a => runSql(a(0), a(1), a(5)))
    }
    insert.executeBatch()
  } finally {
    // FIX: the original never closed the statement and skipped connection.close()
    // whenever an exception was thrown earlier.
    insert.close()
    connection.close()
    sc.stop()
  }

  println(System.currentTimeMillis() - beginTime)
}
//利用SPARKSQL 处理定长分隔的文件
import java.sql.DriverManager
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ IntegerType, StringType, StructField, StructType }
// 导入 Row.
import org.apache.spark.sql.Row;
// 导入 Spark SQL 数据类型
import org.apache.spark.sql.types.{ StructType, StructField, StringType };
object HandleLength extends App{
val beginTime = System.currentTimeMillis()
//引用spark
val conf = new SparkConf().