Spark 读取 Hive 与 HDFS 数据

  import org.apache.spark.SparkConf
  import org.apache.spark.sql.DataFrame
  import org.apache.spark.sql.SparkSession
  import org.apache.spark.sql.functions.{col, split}

  /**
    * Builds click-model training samples from Hive logs and CSV feature files on HDFS.
    *
    * Created by wt
    *
    * The object exposes three independent stages; `main` only runs the last one:
    *  - [[convertFeature]] turns the raw user-feature dump into four suffixed CSV feature tables.
    *  - [[acquireOnlineClickData]] extracts positive (clicked) and negative (shown but not
    *    clicked) samples from Hive and joins them with the `f1`/`f2` feature tables.
    *  - [[mergeFeature]] joins those samples with the follow table and the `f3`..`f6` tables.
    */
  object AcquireOlineData {

    val spConfig: SparkConf = new SparkConf().setAppName("Online_click_data")
    val spark: SparkSession = SparkSession.builder().config(spConfig).enableHiveSupport().getOrCreate()
    // Despite its name this is the SQLContext, not a SparkContext; kept for existing callers.
    val sc = spark.sqlContext
    val datadir = "/data/wt/social_click_model_new"

    // Directory used by acquireOnlineClickData for its feature inputs and sample outputs.
    // NOTE(review): mergeFeature reads "pos_online_*"/"neg_online_*" from `datadir`, not from
    // this directory -- confirm whether the two paths are meant to be the same location.
    private val onlineSampleDir = "/data/wangtao/social_click_model_new_20190318"

    // Number of '-' separated ids taken from each `show_ids` value.
    private val maxShownIds = 20

    /** Names `tag_<group>_1` .. `tag_<group>_<count>`. */
    private def tagFields(group: Int, count: Int): Seq[String] =
      (1 to count).map(i => s"tag_${group}_$i")

    // Positional layout of one comma-separated feature record (76 fields, index 0..75).
    private val featureFields: Seq[String] =
      Seq("uid", "age", "height", "weight", "role", "vbadge", "has_photos", "video_verified",
        "is_human_face", "max_ratio", "max_beauty", "has_description", "ip_location",
        "followed_num", "follower_num", "click", "clicked", "show", "send_session",
        "receive_session") ++
        tagFields(1, 4) ++ tagFields(2, 12) ++ tagFields(3, 4) ++ tagFields(4, 12) ++
        tagFields(5, 5) ++
        Seq("is_human_body", "desc_len", "has_avatar", "vip",
          "online_click", "online_clicked", "online_show", "online_showed",
          "nearby_click", "nearby_clicked", "nearby_show", "nearby_showed",
          "newbie_click", "newbie_clicked", "newbie_show", "newbie_showed",
          "social_stay_time", "visit_count", "visited_count")

    /** Reads CSV files (with a header row) matching `path`. */
    private def readCsv(path: String): DataFrame =
      spark.read.format("csv").option("header", "true").load(path)

    /** Writes `data` as CSV with a header row to `path`. */
    private def writeCsv(data: DataFrame, path: String): Unit =
      data.write.format("csv").option("header", "true").save(path)

    /**
      * Registers the cached temp views `f3`..`f6` from `datadir/f<n>_feature/*.csv`.
      *
      * This used to run in the object initializer, which made every entry point (including
      * convertFeature, the stage that produces these files) fail when the files did not exist yet.
      */
    private def registerFollowerFeatureViews(): Unit =
      (3 to 6).foreach { n =>
        readCsv(s"$datadir/f${n}_feature/*.csv").cache().createOrReplaceTempView(s"f$n")
      }

    /**
      * Entry point: runs [[mergeFeature]].
      *
      * @param args optional first argument is the date to process; defaults to "0319"
      */
    def main(args: Array[String]): Unit =
      mergeFeature(args.headOption.getOrElse("0319"))

    /**
      * Extracts positive and negative samples for one day and joins them with `f1`/`f2`.
      *
      * Reads `click_position` and `people_show` from the `ai_log` database (rows with `type=1`
      * and `from='online'`), registers the temp views `show_no_click_uid`, `click_uid_1`, `f1`,
      * `f2`, `d1` and `d3`, and writes `pos_online_<date>_no_corr` and
      * `neg_online_<date>_no_corr` as CSV.
      *
      * @param date value compared against the `day` column; it is spliced into the SQL text
      *             unquoted, so it must be a trusted value
      */
    def acquireOnlineClickData(date: String): Unit = {

      import spark.sql

      sql("use ai_log")

      // Distinct click rows with the bookkeeping columns removed.
      val click_uid = sql(s"select * from click_position where day=$date and type=1 and from='online'")
        .drop("service", "position", "from", "time", "type", "day")
        .distinct()

      val show = sql(s"select * from people_show where day=$date and type=1 and from='online'")
        .drop("service", "from", "time", "type", "day")

      // One (uid, shown id) row per position 0..19 of the '-' separated `show_ids` value.
      // NOTE(review): positions beyond the end of a shorter list come back as null and are
      // kept, exactly as before -- confirm whether null ids should be filtered out.
      val shownIds = split(col("show_ids"), "-")
      val shownPairs = (0 until maxShownIds)
        .map(i => show.select(col("uid"), shownIds.getItem(i).as("show_ids")))
        .reduce(_ union _)

      // Negatives: shown pairs that do not appear among the click rows.
      // Assumes click_uid is left with exactly (uid, target_uid) after the drops -- TODO confirm.
      shownPairs.distinct()
        .except(click_uid)
        .withColumnRenamed("uid", "uid_s")
        .createOrReplaceTempView("show_no_click_uid")

      click_uid.withColumnRenamed("uid", "uid_c").createOrReplaceTempView("click_uid_1")

      readCsv(s"$onlineSampleDir/f1_feature/*.csv").createOrReplaceTempView("f1")
      readCsv(s"$onlineSampleDir/f2_feature/*.csv").createOrReplaceTempView("f2")

      // Positive samples: clicker features (f1) plus clicked-user features (f2).
      spark.sql("select * from click_uid_1 left outer join f1 on click_uid_1.uid_c = f1.uid")
        .createOrReplaceTempView("d1")
      val d2 = spark.sql("select * from d1 left outer join f2 on d1.target_uid = f2.uid_1")
      writeCsv(d2, s"$onlineSampleDir/pos_online_${date}_no_corr")

      // Negative samples: viewer features (f1) plus shown-user features (f2).
      spark.sql("select * from show_no_click_uid left outer join f1 on show_no_click_uid.uid_s = f1.uid")
        .createOrReplaceTempView("d3")
      val d4 = spark.sql("select * from d3 left outer join f2 on d3.show_ids = f2.uid_1")
      writeCsv(d4, s"$onlineSampleDir/neg_online_${date}_no_corr")
    }

    /**
      * Splits each raw record into the 76 feature columns, each named `<field>_<suffix>`.
      *
      * @param feature frame holding the raw comma-separated record in `_corrupt_record`
      * @param suffix  number appended to every column name
      */
    private def suffixedFeatures(feature: DataFrame, suffix: Int): DataFrame = {
      val columns = featureFields.zipWithIndex.map { case (name, index) =>
        col("_corrupt_record").getItem(index).as(s"${name}_$suffix")
      }
      feature
        .withColumn("_corrupt_record", split(col("_corrupt_record"), ","))
        .select(columns: _*)
    }

    /**
      * Converts the raw user-feature dump into the four follower feature tables.
      *
      * Reads `<datadir>/online_nearby_new_user_feature/part-*` through the JSON reader and
      * uses only its `_corrupt_record` column, then writes the same 76 columns four times
      * with suffixes `_2`..`_5` to `<datadir>/f3_feature`..`<datadir>/f6_feature`.
      *
      * @param datadir base directory for input and output (shadows the object-level `datadir`)
      */
    def convertFeature(datadir: String): Unit = {
      // Presumably the part files are plain comma-separated text, so every line fails JSON
      // parsing and lands in `_corrupt_record` -- confirm against the producer.
      // NOTE(review): keep the cache(); Spark 2.3+ rejects queries that reference only
      // `_corrupt_record` on an uncached JSON source.
      val feature = spark.read.format("json")
        .load(datadir + "/online_nearby_new_user_feature/part-*")
        .cache()

      (2 to 5).foreach { suffix =>
        writeCsv(suffixedFeatures(feature, suffix), s"$datadir/f${suffix + 1}_feature")
      }
    }

    /**
      * Left-joins a sample view with `follow` and then with `f3`..`f6`.
      *
      * @param sourceView name of the registered sample view
      * @param uidColumn  column of `sourceView` matched against `follow.uid_f`
      * @param stageViews names of the four intermediate temp views to register, in order
      * @return the fully joined frame without the follow-table key columns
      */
    private def joinFollowerFeatures(sourceView: String,
                                     uidColumn: String,
                                     stageViews: Seq[String]): DataFrame = {
      require(stageViews.length == 4, "exactly four intermediate view names are required")
      val Seq(v1, v2, v3, v4) = stageViews

      spark.sql(s"select * from $sourceView left outer join follow on $sourceView.$uidColumn = follow.uid_f")
        .createOrReplaceTempView(v1)
      spark.sql(s"select * from $v1 left outer join f3 on $v1.f1_uid = f3.uid_2").createOrReplaceTempView(v2)
      spark.sql(s"select * from $v2 left outer join f4 on $v2.f2_uid = f4.uid_3").createOrReplaceTempView(v3)
      spark.sql(s"select * from $v3 left outer join f5 on $v3.f3_uid = f5.uid_4").createOrReplaceTempView(v4)

      spark.sql(s"select * from $v4 left outer join f6 on $v4.f4_uid = f6.uid_5")
        .drop("uid_f", "f1_uid", "f2_uid", "f3_uid", "f4_uid")
    }

    /**
      * Adds follower features to the positive and negative samples of one day.
      *
      * Reads `pos_online_<date>_no_corr`, `neg_online_<date>_no_corr` and `follow_uid/<date>`
      * under `datadir`, registers the temp views `f3`..`f6`, `pos_data`, `neg_data`, `follow`
      * and `d1`..`d8`, and writes `pos_online_<date>_with_4_follower` and
      * `neg_online_<date>_with_4_follower` as CSV.
      *
      * @param date date component of the input and output directory names
      */
    def mergeFeature(date: String): Unit = {
      registerFollowerFeatureViews()

      readCsv(s"$datadir/pos_online_${date}_no_corr/*.csv").createOrReplaceTempView("pos_data")

      val follow = readCsv(s"$datadir/follow_uid/$date/*.csv").cache()
      follow.withColumnRenamed("uid", "uid_f").createOrReplaceTempView("follow")

      val pos_res = joinFollowerFeatures("pos_data", "uid_c", Seq("d1", "d2", "d3", "d4"))
      writeCsv(pos_res, s"$datadir/pos_online_${date}_with_4_follower")

      readCsv(s"$datadir/neg_online_${date}_no_corr/*.csv").cache().createOrReplaceTempView("neg_data")

      val neg_res = joinFollowerFeatures("neg_data", "uid_s", Seq("d5", "d6", "d7", "d8"))
      writeCsv(neg_res, s"$datadir/neg_online_${date}_with_4_follower")
    }

  }
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值