用 Python 模拟生成访问日志,并将生成的日志写入一个文件,后续 Flume 会从该日志文件中采集数据。
1.在服务器中创建日志文件
cd /app/flume/testData
touch generateLog.log
2.代码
cd /app/flume/testData
vi sparkStreamingGenerateLog.py
#coding=UTF-8
import random
import time
# URL paths that sample_url() picks from when building the request line.
url_paths = [
"class/112.hdml",
"class/128.hdml",
"class/145.hdml",
"class/146.hdml",
"class/131.hdml",
"class/130.hdml",
"learn/821.hdml",
"course/list"
]
# Pool of numbers that samle_ip() draws four of and joins with "." into an IP.
ip_slices = [
132,156,124,10,29,167,143,187,30,46,55,63,72,87,98,168
]
# Referer templates (the page the visitor came from); sample_referer()
# fills the {query} placeholder with an entry of search_keyword.
http_referers = [
"http://www.baidu.com/s?wd={query}",
"http://www.sogou.com/web?query={query}",
"http://cn.bing.com/search?q={query}",
"http://www.yahoo.com/search?p={query}"
]
# Search keywords substituted into the referer templates above.
search_keyword = [
"Spark SQL实战",
"Hadoop基础",
"Storm实战",
"Spark Streaming实战",
"理论"
]
# HTTP status codes that sample_status_code() picks from (kept as strings).
status_codes = ["200","404","500"]
# Generate a URL path.
def sample_url():
    """Return one entry of ``url_paths`` chosen at random."""
    # random.choice is the direct way to draw a single element from a list.
    return random.choice(url_paths)
# Generate an IP address.
def samle_ip():
    """Return a dotted IP string made of four entries of ``ip_slices``.

    The four entries are drawn with ``random.sample``, i.e. without
    replacement, so the same list position is never used twice in one
    address.

    NOTE: the function name (sic) is kept unchanged because generate_log()
    calls it by this name.
    """
    # Named ``octets`` rather than ``slice`` to avoid shadowing the builtin.
    octets = random.sample(ip_slices, 4)
    return ".".join(str(octet) for octet in octets)
# Generate a referer.
def sample_referer():
    """Return a referer URL, or the placeholder ``"_"``.

    When the uniform draw from [0, 1] exceeds 0.2 (roughly 80% of calls) the
    placeholder is returned. Otherwise a template from ``http_referers`` is
    chosen and its ``{query}`` field is filled with a random entry of
    ``search_keyword``.
    """
    if random.uniform(0, 1) > 0.2:
        return "_"
    template = random.choice(http_referers)
    keyword = random.choice(search_keyword)
    return template.format(query=keyword)
# Generate an HTTP status code.
def sample_status_code():
    """Return one entry of ``status_codes`` (a string) chosen at random."""
    return random.choice(status_codes)
def generate_log(count=10, log_path="/app/flume/testData/generateLog.log"):
    """Write ``count`` simulated access-log lines to ``log_path``.

    Every line is also printed to stdout. A line consists of five
    tab-separated fields: IP, local time, request line
    (``"GET /<url> HTTP/1.1"``), status code and referer.

    The timestamp is computed once per call, so all lines produced by one
    call carry the same time. The file is opened in ``"w+"`` mode, which
    truncates it: each call replaces the file's previous contents.

    Args:
        count: Number of log lines to produce; values below 1 produce none.
        log_path: Destination file. Defaults to the path used by this
            tutorial; its directory must already exist on the machine where
            the script runs.
    """
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # The with-block guarantees the file is flushed and closed even if
    # building or writing a line raises.
    with open(log_path, "w+") as log_file:
        while count >= 1:
            query_log = "{ip}\t{local_time}\t\"GET /{url} HTTP/1.1\"\t{status_code}\t{referer}".format(
                url=sample_url(),
                ip=samle_ip(),
                referer=sample_referer(),
                status_code=sample_status_code(),
                local_time=time_str,
            )
            print(query_log)
            log_file.write(query_log + "\n")
            count -= 1
# Script entry point: each run produces 100 log lines.
if __name__ == "__main__":
    generate_log(count=100)
3.在服务器上运行Python代码,检查是否成功。
python sparkStreamingGenerateLog.py
more /app/flume/testData/generateLog.log
4.用crontab把python程序做成定时运行(每分钟)
cd /app/flume/testData
touch log_generator.sh
chmod u+x log_generator.sh
vi log_generator.sh
脚本内容如下:
python /app/flume/testData/sparkStreamingGenerateLog.py
配置到crontab中
crontab -e
*/1 * * * * /app/flume/testData/log_generator.sh >> /app/flume/testData/crontabLog_generator
查看是否每分钟一次执行了生成日志的python代码
cd /app/flume/testData
tail -f generateLog.log