/*
 * 设计思路:
 * 1. 将 IP 截取出来(可用多种方式),用 map 组成二元组(注意区分 map 与 flatMap),
 *    统计相同 IP 的出现次数,据此分析得出疑似爬虫 IP。
 * 2. 将多次重复出现且访问密集的 IP 加入访问黑名单。
 * 3. 将同一时间段内访问某网站的 IP 整理出来。
 */
package Test1225
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
object Spider01{
// Exercise 1: pick out crawler IPs that appear an abnormally high number of times.
// Reads space-separated log lines from a socket, counts occurrences of the first
// field per 10-second micro-batch, and keeps a running total across batches.
def main(args: Array[String]): Unit = {
// Run locally using all available cores; "ip" is the app name shown in the Spark UI.
val conf = new SparkConf().setMaster("local[*]").setAppName("ip")
// Micro-batch interval: one batch every 10 seconds.
val ssc = new StreamingContext(conf,Seconds(10))
ssc.sparkContext.setLogLevel("ERROR")
// Checkpoint directory is mandatory for the stateful updateStateByKey below.
ssc.checkpoint("d://123/eq")
// Text stream from host "lion" on port 9999 (e.g. fed by `nc -lk 9999`).
val ds = ssc.socketTextStream("lion",9999)
// Take the first space-separated token of each line as the key (assumed to be the
// client IP — TODO confirm log format), count per batch, keep keys seen > 5 times.
val paris = ds.map(_.split(" ")(0)).map(x => (x,1)).reduceByKey(_+_).filter(x => x._2 > 5)
// Accumulate counts across batches; `updateFunction` is presumably defined later in
// this file (not visible in this chunk). repartition(1) -> one output part-file.
val res = paris.updateStateByKey(updateFunction).repartition(1)
res.saveAsTextFiles("D://124//spider01")// write the detected results to a local directory
res.print()