1 功能
- 统计实战课程访问量
- 统计从搜索引擎引流过来的实战课程访问量
1.1 python 日志产生脚本
# coding=UTF-8
import random
import time
# Candidate URL paths for the fake access log; the "class/*.html" entries are
# the hands-on course pages whose visit counts the downstream job computes.
url_paths = [
"class/112.html",
"class/128.html",
"class/145.html",
"class/146.html",
"class/131.html",
"class/130.html",
"learn/821.html",
"course/list"
]
# Pool of octet values from which random IPv4 addresses are assembled.
ip_slices = [132, 111, 113, 121, 156, 187, 186, 192, 198, 201, 43, 56, 90]
# Search-engine referer URL templates; {query} is filled with a keyword below.
http_referers = [
"http://www.baidu.com/s?wd={query}",
"http://www.sougou.com/web?query={query}",
"https://cn.bing.com/search?q={query}",
"https://search.yahoo.com/search?p={query}"
]
# Search keywords substituted into the referer templates.
search_keyword = [
"Spark SQL实战",
"Hadoop入门",
"Spark Streaming实战",
"Storm实战"
]
# HTTP status codes sampled for each generated log line.
status_codes = ["200", "404", "500"]
def sample_url(paths=None):
    """Return one randomly chosen URL path.

    Args:
        paths: optional non-empty sequence to draw from; defaults to the
            module-level ``url_paths`` pool.
    """
    # random.choice is the idiomatic single-element pick;
    # random.sample(seq, 1)[0] builds a throwaway one-element list.
    return random.choice(url_paths if paths is None else paths)
def sample_ip(pool=None):
    """Return a fake dotted-quad IPv4 address.

    Args:
        pool: optional sequence of octet values (needs >= 4 elements);
            defaults to the module-level ``ip_slices`` pool.

    The four octets are drawn without replacement, so they are always
    distinct within one address (matching the original behavior).
    """
    # Renamed the local from ``slice`` — the original shadowed the builtin.
    octets = random.sample(ip_slices if pool is None else pool, 4)
    return ".".join(str(octet) for octet in octets)
def sample_referer(referers=None, keywords=None):
    """Return a referer field: ``"-"`` most of the time, otherwise a
    search-engine URL with a random keyword substituted into ``{query}``.

    Args:
        referers: optional templates containing a ``{query}`` placeholder;
            defaults to the module-level ``http_referers``.
        keywords: optional search terms; defaults to ``search_keyword``.
    """
    # ~80% of log lines carry no referer, mimicking direct traffic;
    # only the remaining ~20% simulate search-engine referrals.
    if random.uniform(0, 1) > 0.2:
        return "-"
    template = random.choice(http_referers if referers is None else referers)
    keyword = random.choice(search_keyword if keywords is None else keywords)
    return template.format(query=keyword)
def sample_status_code(codes=None):
    """Return one random HTTP status code string.

    Args:
        codes: optional non-empty sequence to draw from; defaults to the
            module-level ``status_codes`` pool.
    """
    # random.choice replaces the wasteful random.sample(seq, 1)[0] idiom.
    return random.choice(status_codes if codes is None else codes)
def generate_log(count=10, log_path="/home/hadoop/tempdata/access.log"):
    """Append *count* fake access-log lines to *log_path* and echo them.

    Each line is tab-separated: ip, timestamp, quoted GET request,
    status code, referer — matching the sample output in this document.

    Args:
        count: number of log lines to generate.
        log_path: destination file (new parameter, defaults to the
            original hard-coded path, so existing callers are unaffected).

    BUG FIX: the original opened the file with mode "w+", which truncates
    it on every run — contradicting the tutorial text ("re-running the
    script appends to access.log") and the one-minute crontab design.
    Append mode ("a") restores the documented accumulate-on-rerun behavior.
    """
    # One timestamp per batch, as in the original (all lines of a run
    # share the same second).
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Context manager guarantees the handle is flushed/closed even if a
    # sampler raises; the original leaked the handle on error.
    with open(log_path, "a") as log_file:
        for _ in range(count):
            query_log = (
                "{ip}\t{local_time}\t\"GET /{url} HTTP/1.1\"\t"
                "{status_code}\t{referer}"
            ).format(
                ip=sample_ip(),
                local_time=time_str,
                url=sample_url(),
                status_code=sample_status_code(),
                referer=sample_referer(),
            )
            print(query_log)
            log_file.write(query_log + "\n")


if __name__ == '__main__':
    generate_log(100)
43.201.187.90 2018-12-14 14:21:40 "GET /class/131.html HTTP/1.1" 200 -
132.201.111.198 2018-12-14 14:21:40 "GET /class/145.html HTTP/1.1" 200 -
111.121.201.56 2018-12-14 14:21:40 "GET /class/145.html HTTP/1.1" 404 -
132.201.56.198 2018-12-14 14:21:40 "GET /class/145.html HTTP/1.1" 500 -
192.43.201.156 2018-12-14 14:21:40 "GET /class/130.html HTTP/1.1" 404 -
201.156.90.121 2018-12-14 14:21:40 "GET /class/145.html HTTP/1.1" 404 http://www.sougou.com/web?query=Spark SQL实战
111.90.113.43 2018-12-14 14:21:40 "GET /class/145.html HTTP/1.1" 404 http://www.baidu.com/s?wd=Hadoop入门
43.56.192.132 2018-12-14 14:21:40 "GET /class/112.html HTTP/1.1" 404 https://search.yahoo.com/search?p=Spark Streaming实战
43.187.132.186 2018-12-14 14:21:40 "GET /class/128.html HTTP/1.1" 200 -
121.43.132.187 2018-12-14 14:21:40 "GET /class/146.html HTTP/1.1" 404 -
156.121.192.113 2018-12-14 14:21:40 "GET /learn/821.html HTTP/1.1" 200 https://search.yahoo.com/search?p=Hadoop入门
113.90.132.121 2018-12-14 14:21:40 "GET /class/146.html HTTP/1.1" 500 -
在 node1 上测试:[hadoop@node1 ~]$ python3 generate_log.py
再次运行脚本,会在 access.log
尾部追加
1.2 定时产生数据
- 编写shell 脚本
log_generator.sh
python3 /home/hadoop/generate_log.py
[hadoop@node1 ~]$ chmod u+x log_generator.sh
- 用 crontab 每一分钟调用一次脚本,例如:*/1 * * * * /home/hadoop/log_generator.sh
[hadoop@node1 tempdata]$ tail -200f access.log