- import org.apache.spark.sql.SparkSession
- import org.apache.spark.SparkConf
- import org.apache.spark.sql.functions.{col, split}
- /**
- * Created by wt
- */
/**
 * Spark job that assembles training samples for the online click model.
 *
 * Three stages live here and are normally run separately:
 *  - [[acquireOnlineClickData]] reads click/show logs from Hive and writes positive
 *    (clicked) and negative (shown but not clicked) samples joined with the `f1`/`f2`
 *    feature tables.
 *  - [[convertFeature]] turns raw comma-separated user-feature records into the
 *    `f3_feature` .. `f6_feature` CSV tables (same fields, column suffixes `_2` .. `_5`).
 *  - [[mergeFeature]] joins the samples with the follow table and the `f3`..`f6` tables.
 *
 * NOTE(review): acquireOnlineClickData writes under
 * /data/wangtao/social_click_model_new_20190318 whereas mergeFeature reads the
 * `*_no_corr` samples from `datadir`; confirm how the data gets from one to the other.
 */
object AcquireOlineData {
  import org.apache.spark.sql.DataFrame
  import org.apache.spark.sql.functions.posexplode

  val spConfig = new SparkConf().setAppName("Online_click_data")
  val spark = SparkSession.builder().config(spConfig).enableHiveSupport().getOrCreate()
  val sc = spark.sqlContext
  val datadir = "/data/wt/social_click_model_new"

  /** Only the first this-many ids of a `show_ids` list are turned into samples. */
  private val MaxShownPositions = 20

  /**
   * Un-suffixed feature names, in the order the comma-separated fields appear in a raw
   * feature record (76 fields, indices 0..75).
   */
  private val featureNames: Seq[String] = {
    val profile = Seq(
      "uid", "age", "height", "weight", "role", "vbadge", "has_photos", "video_verified",
      "is_human_face", "max_ratio", "max_beauty", "has_description", "ip_location",
      "followed_num", "follower_num", "click", "clicked", "show", "send_session",
      "receive_session")
    // tag_<group>_<k> for k = 1..size, groups in ascending order.
    val tags = Seq(1 -> 4, 2 -> 12, 3 -> 4, 4 -> 12, 5 -> 5).flatMap {
      case (group, size) => (1 to size).map(k => s"tag_${group}_$k")
    }
    val activity = Seq(
      "is_human_body", "desc_len", "has_avatar", "vip",
      "online_click", "online_clicked", "online_show", "online_showed",
      "nearby_click", "nearby_clicked", "nearby_show", "nearby_showed",
      "newbie_click", "newbie_clicked", "newbie_show", "newbie_showed",
      "social_stay_time", "visit_count", "visited_count")
    profile ++ tags ++ activity
  }

  /**
   * Loads and caches the `f3`..`f6` feature CSVs from `datadir` and registers them as
   * temp views of the same names.
   *
   * Deliberately lazy: these files are the output of [[convertFeature]], so reading them
   * while the object initialises would fail before convertFeature could ever create them.
   */
  private lazy val followerFeatureViews: Unit =
    Seq("f3", "f4", "f5", "f6").foreach { view =>
      spark.read.format("csv").option("header", "true")
        .load(s"$datadir/${view}_feature/*.csv")
        .cache()
        .createOrReplaceTempView(view)
    }

  /** Registers the `f3`..`f6` views on first call; later calls are no-ops. */
  private def ensureFollowerFeatureViews(): Unit = followerFeatureViews

  /** Runs `select * from <left> left outer join <right> on <left>.<leftKey> = <right>.<rightKey>`. */
  private def leftJoin(left: String, leftKey: String, right: String, rightKey: String): DataFrame =
    spark.sql(s"select * from $left left outer join $right on $left.$leftKey = $right.$rightKey")

  /**
   * Entry point: runs [[mergeFeature]].
   *
   * @param args optional; `args(0)` is the day to process, defaulting to "0319"
   */
  def main(args: Array[String]): Unit =
    mergeFeature(args.headOption.getOrElse("0319"))

  /**
   * Builds positive and negative samples for one day from the `ai_log` Hive database and
   * writes them as CSV (with header), each joined with the `f1` and `f2` feature tables.
   *
   * Positives are the distinct rows of `click_position`; negatives are the distinct
   * (uid, shown id) pairs from `people_show` that do not appear among the positives.
   * Registers the temp views `show_no_click_uid`, `click_uid_1`, `f1`, `f2`, `d1`, `d3`.
   *
   * @param date value compared against the `day` partition column; it is spliced into
   *             the SQL text verbatim, so it must come from a trusted source
   */
  def acquireOnlineClickData(date: String): Unit = {
    import spark.sql
    sql("use ai_log")

    val click_uid = sql(s"select * from click_position where day=$date and type=1 and from='online'")
      .drop("service", "position", "from", "time", "type", "day")
      .distinct()
    val show = sql(s"select * from people_show where day=$date and type=1 and from='online'")
      .drop("service", "from", "time", "type", "day")

    // One row per (uid, shown id). `show_ids` is a '-'-separated list; posexplode yields
    // the element position in `pos` and the element in `col`. Unlike indexing 20 fixed
    // positions, this emits no (uid, null) rows for lists shorter than 20 entries.
    val shownPairs = show
      .select(col("uid"), posexplode(split(col("show_ids"), "-")))
      .where(col("pos") < MaxShownPositions)
      .select(col("uid"), col("col").as("show_ids"))

    // `except` compares by column position, so click_uid must be (uid, target id) here.
    shownPairs.distinct().except(click_uid)
      .withColumnRenamed("uid", "uid_s")
      .createOrReplaceTempView("show_no_click_uid")
    click_uid.withColumnRenamed("uid", "uid_c").createOrReplaceTempView("click_uid_1")

    val featureDir = "/data/wangtao/social_click_model_new_20190318"
    Seq("f1", "f2").foreach { view =>
      spark.read.format("csv").option("header", "true")
        .load(s"$featureDir/${view}_feature/*.csv")
        .createOrReplaceTempView(view)
    }

    // Positive samples: features of the clicking user (f1) and of the clicked user (f2).
    leftJoin("click_uid_1", "uid_c", "f1", "uid").createOrReplaceTempView("d1")
    leftJoin("d1", "target_uid", "f2", "uid_1")
      .write.format("csv").option("header", "true")
      .save(s"$featureDir/pos_online_${date}_no_corr")

    // Negative samples: features of the viewing user (f1) and of the shown user (f2).
    leftJoin("show_no_click_uid", "uid_s", "f1", "uid").createOrReplaceTempView("d3")
    leftJoin("d3", "show_ids", "f2", "uid_1")
      .write.format("csv").option("header", "true")
      .save(s"$featureDir/neg_online_${date}_no_corr")
  }

  /**
   * Converts raw user-feature records into the four CSV tables `f3_feature` ..
   * `f6_feature` under `datadir`. The tables hold the same 76 fields and differ only in
   * the column suffix (`_2`, `_3`, `_4`, `_5`).
   *
   * The input is read with the JSON reader and the fields are taken from
   * `_corrupt_record`, i.e. each line is expected to be plain comma-separated text that
   * fails JSON parsing. The `cache()` is kept from the original code.
   * NOTE(review): presumably it is also what lets Spark 2.3+ run a query that touches
   * only `_corrupt_record` — confirm against the Spark version in use.
   *
   * @param datadir directory containing `online_nearby_new_user_feature/part-*`; also the
   *                parent of the output directories (shadows the object-level `datadir`)
   */
  def convertFeature(datadir: String): Unit = {
    val feature = spark.read.format("json")
      .load(datadir + "/online_nearby_new_user_feature/part-*")
      .cache()
    val fields = feature.withColumn("_corrupt_record", split(col("_corrupt_record"), ","))

    def withSuffix(n: Int): DataFrame =
      fields.select(featureNames.zipWithIndex.map {
        case (name, i) => col("_corrupt_record").getItem(i).as(s"${name}_$n")
      }: _*)

    // Table f<k> carries suffix _<k-1>: f3 -> _2, f4 -> _3, f5 -> _4, f6 -> _5.
    (3 to 6).foreach { k =>
      withSuffix(k - 1).write.format("csv").option("header", "true").save(s"$datadir/f${k}_feature")
    }
  }

  /**
   * Joins a sample view with the follow table and then with the feature tables `f3`..`f6`
   * on the follow table's `f1_uid`..`f4_uid` columns, dropping the join keys afterwards.
   *
   * @param baseView   temp view holding the samples
   * @param uidColumn  column of `baseView` matched against `follow.uid_f`
   * @param firstStage number of the first intermediate view; views `d<firstStage>` ..
   *                   `d<firstStage + 3>` are (re)registered
   */
  private def attachFollowerFeatures(baseView: String, uidColumn: String, firstStage: Int): DataFrame = {
    def stage(offset: Int): String = s"d${firstStage + offset}"
    leftJoin(baseView, uidColumn, "follow", "uid_f").createOrReplaceTempView(stage(0))
    leftJoin(stage(0), "f1_uid", "f3", "uid_2").createOrReplaceTempView(stage(1))
    leftJoin(stage(1), "f2_uid", "f4", "uid_3").createOrReplaceTempView(stage(2))
    leftJoin(stage(2), "f3_uid", "f5", "uid_4").createOrReplaceTempView(stage(3))
    leftJoin(stage(3), "f4_uid", "f6", "uid_5")
      .drop("uid_f", "f1_uid", "f2_uid", "f3_uid", "f4_uid")
  }

  /**
   * Enriches the positive and negative samples of one day with follower features and
   * writes the results as CSV (with header) to `pos_online_<date>_with_4_follower` and
   * `neg_online_<date>_with_4_follower` under `datadir`.
   *
   * Registers the temp views `pos_data`, `neg_data`, `follow`, `d1`..`d8` and, on first
   * use, `f3`..`f6`.
   *
   * @param date day suffix used in the input and output directory names
   */
  def mergeFeature(date: String): Unit = {
    ensureFollowerFeatureViews()

    spark.read.format("csv").option("header", "true")
      .load(datadir + "/pos_online_" + date + "_no_corr/*.csv")
      .createOrReplaceTempView("pos_data")
    val follow = spark.read.format("csv").option("header", "true")
      .load(datadir + "/follow_uid/" + date + "/*.csv")
      .cache()
    follow.withColumnRenamed("uid", "uid_f").createOrReplaceTempView("follow")

    attachFollowerFeatures("pos_data", "uid_c", firstStage = 1)
      .write.format("csv").option("header", "true")
      .save(datadir + "/pos_online_" + date + "_with_4_follower")

    spark.read.format("csv").option("header", "true")
      .load(datadir + "/neg_online_" + date + "_no_corr/*.csv")
      .cache()
      .createOrReplaceTempView("neg_data")

    attachFollowerFeatures("neg_data", "uid_s", firstStage = 5)
      .write.format("csv").option("header", "true")
      .save(datadir + "/neg_online_" + date + "_with_4_follower")
  }
}
Spark 读取 Hive 和 HDFS 数据
最新推荐文章于 2024-05-16 20:12:02 发布