对两个RDD进行关联操作,如:
1)文件post_data.txt包含:post_id\title\content
2)文件train.txt包含:dev_id\post_id\praise\time
通过post_id关联这两个文件,提取post_id\content\praise三个字段(文件中各字段之间以数量不定的空格分隔);
参考代码如下:
package scs.contest;
import java.util.List;
import java.util.Properties;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
public class pageview {
public static String path="/home/spark/data";
public static void main(String[] args) {
//第一步:设置环境字符集,避免中文乱码
Properties pps=System.getProperties();
pps.setProperty("file.encoding","UTF-8");
//第二步:txt数据导入并分割成到训练集和预