import org.apache.spark.sql.SparkSession
object JoinLabel {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().master("local[1]").appName("make_label").getOrCreate()
    import session.implicits._
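    // Assumed input layout (not confirmed by the source): rel1.csv / rel2.csv hold
    // (node id, entity name) pairs, and ownthink_v2.csv holds (entity, attribute, value)
    // triples from the OwnThink knowledge-graph dump.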
    val csvLabel1 = session.read.csv("./data/rel1.csv").rdd
    val csvLabel2 = session.read.csv("./data/rel2.csv").rdd
    val csvOrigin = session.read.csv("./data/ownthink_v2.csv").rdd
    // Key the triples by the head entity (column 0), dropping malformed rows.
    val origRdd = csvOrigin
      .filter(_.size == 3)
      .filter(x => x.get(0) != null)
      .map(x => (x.get(0).toString, x))
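    // Map each label file to (entity name, node id) so it can be joined with the triples
    // by name. Column roles are an assumption: column 0 = node id, column 1 = entity name.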
    val label1 = csvLabel1.map(x => (x.get(1).toString, x.get(0).toString))
    val label2 = csvLabel2.map(x => (x.get(1).toString, x.get(0).toString))
    // First join: (name, startId) join (name, row) => (name, (startId, row)).
    val midRes = label1.join(origRdd)
    // Re-key by the value column (column 2), dropping rows with a null value,
    // yielding (valueName, (startId, row)).
    val joinRes = midRes
      .filter(x => x._2._2.get(2) != null)
      .map(x => (x._2._2.get(2).toString, (x._2._1, x._2._2)))
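    // Second join: keep only relations whose value is itself a labelled entity,
    // picking up the end node's id from label2.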
    val joinRdd = label2.join(joinRes)
    // joinRdd: (valueName, (endId, (startId, row)))
    val finRes = joinRdd.map { case (_, (idRight, (idLeft, row))) =>
      // The relationship name is taken from the attribute column (column 1) of the triple.
      (idLeft, row.get(1).toString, idRight, "RELATIONSHIP")
    }
    // Columns follow the Neo4j bulk-import relationship header: :START_ID,name,:END_ID,:TYPE
    val finFrame = finRes.toDF(":START_ID", "name", ":END_ID", ":TYPE")
    finFrame.write.option("header", "true").csv("./data/finRes")
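    // Note: Spark writes a directory of part-*.csv files under ./data/finRes; presumably
    // these are intended as a relationship file for the Neo4j import tool.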
  }
}