准备日志
194.237.142.21 - - [18/Sep/2019:06:49:18 +0000] "GET /wp-content/uploads/2019/07/rstudio-git3.png HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
183.49.46.228 - - [18/Sep/2019:06:49:23 +0000] "-" 400 0 "-" "-"
163.177.71.12 - - [18/Sep/2019:06:49:33 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
163.177.71.12 - - [18/Sep/2019:06:49:33 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
163.177.71.12 - - [18/Sep/2019:06:49:36 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
163.177.71.12 - - [18/Sep/2019:06:49:36 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
101.226.68.137 - - [18/Sep/2019:06:49:42 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
101.226.68.137 - - [18/Sep/2019:06:49:45 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
60.208.6.156 - - [18/Sep/2019:06:49:48 +0000] "GET /wp-content/uploads/2019/07/rcassandra.png HTTP/1.0" 200 185524 "http://cos.name/category/software/packages/" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
60.208.6.156 - - [18/Sep/2019:06:49:48 +0000] "GET /wp-content/uploads/2019/07/rcassandra.png HTTP/1.0" 200 185524 "http://cos.name/category/software/packages/" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
60.208.6.156 - - [18/Sep/2019:06:49:48 +0000] "GET /wp-content/uploads/2019/07/rcassandra.png HTTP/1.0" 200 185524 "http://cos.name/category/software/packages/" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
222.68.172.190 - - [18/Sep/2019:06:49:57 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939 "http://www.angularjs.cn/A00n" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
222.68.172.190 - - [18/Sep/2019:06:50:08 +0000] "-" 400 0 "-" "-"
222.68.172.190 - - [18/Sep/2019:06:50:08 +0000] "-" 400 0 "-" "-"
222.68.172.190 - - [18/Sep/2019:06:50:08 +0000] "-" 400 0 "-" "-"
222.68.172.190 - - [18/Sep/2019:06:50:08 +0000] "-" 400 0 "-" "-"
点击流日志分析
import os
from operator import add
from pyspark import SparkConf
from pyspark.sql import SparkSession
# Click-stream analysis of a web access log: total page views (PV),
# unique visitors by client IP (UV), and the three most active IPs.

# Interpreter that PySpark launches for its Python worker processes.
os.environ['PYSPARK_PYTHON'] = "/usr/bin/python3"
master = "spark://192.168.18.126:7077"
appName = "pv_uv_TopN"
sc_conf = SparkConf()
sc_conf.setMaster(master)
# The SparkConf has to be handed to the builder; otherwise the master URL
# configured above is silently ignored and the session falls back to the
# environment's default master.
spark = SparkSession.builder.config(conf=sc_conf).appName(appName).getOrCreate()
try:
    sc = spark.sparkContext
    # NOTE(review): a file:// path is read from the local filesystem of
    # whichever node runs the task - presumably the log is present at this
    # path on every worker; confirm against the deployment.
    # Blank lines are dropped so they are neither counted as page views nor
    # turned into an empty-string "IP" further down.
    rdd1 = sc.textFile("file:///root/access.log").filter(lambda x: x.strip())

    # PV: every remaining log line counts as one page view.
    rdd_total = rdd1.map(lambda x: ("pv", 1))
    rdd_total_add = rdd_total.reduceByKey(add)
    print(rdd_total_add.collect())

    # UV: the client IP is the first whitespace-separated field of a line;
    # each distinct IP counts as one visitor.
    rdd_ips = rdd1.map(lambda x: x.split()[0])
    rdd_ips_count = rdd_ips.distinct().map(lambda x: ("uv", 1)).reduceByKey(add).collect()
    print(rdd_ips_count)

    # Top N: hits per IP, restricted to IPs with at least two requests,
    # ordered by hit count descending. Filtering before sorting yields the
    # same rows while sorting fewer of them.
    rdd_ips_tuple = rdd_ips.map(lambda x: (x, 1))
    rdd_ips_tuple_add = rdd_ips_tuple.reduceByKey(add)
    rdd_ips_top3 = (
        rdd_ips_tuple_add
        .filter(lambda x: x[1] >= 2)
        .sortBy(lambda x: x[1], ascending=False)
        .take(3)
    )
    print(rdd_ips_top3)
finally:
    # Always release the session so the application does not linger on the
    # cluster when a job fails part-way through.
    spark.stop()
>[('pv', 16)]
>[('uv', 6)]
>[('222.68.172.190', 5), ('163.177.71.12', 4), ('60.208.6.156', 3)]