The sample input is as follows (one userId and one advertId per line, separated by a space):
3333 flume
4444 ooize
5555 flume
4444 ooize
5555 flume
2222 hive
3333 hadoop
4444 hbase
3333 flume
4444 ooize
5555 flume
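For a quick local test you can serve this data over a socket with netcat, e.g. nc -lk 9999 (assuming the host and port 9999 are the two program arguments the job receives below), and paste the lines above into it.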
import java.io.Serializable;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.StorageLevels;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import org.apache.spark.streaming.Durations;
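The job relies on two helpers that are not part of the listing: the Advert JavaBean handed to createDataFrame, and the JavaSparkSessionSingleton that reuses one SparkSession across micro-batches. A minimal sketch of both follows; the field names are dictated by the setters and SQL columns used in the job, the singleton mirrors the pattern from Spark's JavaSqlNetworkWordCount example, and everything else (including the file layout) is an assumption:
// In its own Advert.java: Spark's bean reflection requires a public class
public class Advert implements Serializable {
    private String userId;
    private String advertId;
    public String getUserId() { return userId; }
    public void setUserId(String userId) { this.userId = userId; }
    public String getAdvertId() { return advertId; }
    public void setAdvertId(String advertId) { this.advertId = advertId; }
}

// Package-private helper (may live in the same file as the job class):
// lazily creates a single SparkSession from the streaming job's SparkConf
class JavaSparkSessionSingleton {
    private static transient SparkSession instance = null;
    public static SparkSession getInstance(SparkConf sparkConf) {
        if (instance == null) {
            instance = SparkSession.builder().config(sparkConf).getOrCreate();
        }
        return instance;
    }
}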
public class finaltest {
public static void main(String[] args) throws InterruptedException {
// 1. Receive the real-time data
SparkConf sparkConf = new SparkConf().setAppName("Streaming").setMaster("local[2]");
JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(60));
JavaReceiverInputDStream<String> lines = ssc.socketTextStream(args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER);
// 2. Process the data: find the users who click a given ad more than N times per day
JavaPairDStream<String, String> data = lines.mapToPair(f -> {
    String[] fields = f.split(" "); // each record is "<userId> <advertId>", as in the sample data
    return new Tuple2<>(fields[0], fields[1]);
});
data.foreachRDD(rdd -> {
JavaRDD<Advert> adRDD = rdd.map(f -> {
Advert ad = new Advert();
ad.setUserId(f._1);
ad.setAdvertId(f._2);
return ad;
});
SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
Dataset<Row> words = spark.createDataFrame(adRDD, Advert.class);
// A plain temp view is queryable as "words"; a *global* temp view would have to be referenced as global_temp.words
words.createOrReplaceTempView("words");
// Users who clicked the same ad more than N (here N = 2) times in this batch
Dataset<Row> result = spark.sql("select userId from (select userId, advertId, count(*) from words group by userId, advertId having count(*) > 2) a");
// 3. Save the blacklist produced in real time to MySQL
result.write().format("jdbc")