1. Package the frame project as a jar, upload it to the Linux cluster, and run it with java -jar; verify that it starts successfully at hdp-2:8181.
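A minimal sketch of this step (the jar name frame.jar is an assumption; use your actual artifact name):
java -jar frame.jar
curl http://hdp-2:8181    # or open the address in a browser to confirm the service responds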
2. Start nginx. Here it is used to generate access logs (load balancing and reverse proxying will be covered in a later update); the key is the configuration file.
cd /usr/local/nginx/conf/
Edit the nginx.conf file:
vi nginx.conf
#user nobody;
worker_processes 1;
#error_log logs/error.log;
#error_log logs/error.log notice;
#error_log logs/error.log info;
#pid logs/nginx.pid;
events {
worker_connections 1024;
}
http {
include mime.types;
default_type application/octet-stream;
log_format main '$remote_addr';
## output log format (only the client IP is recorded)
#access_log logs/access.log main;
sendfile on;
#tcp_nopush on;
#keepalive_timeout 0;
keepalive_timeout 65;
#gzip on;
upstream frame-tomcat {
server hdp-4:8181;
## the upstream address that nginx forwards requests to
}
server {
listen 80;
server_name hdp-8;
## the server name nginx serves
#charset koi8-r;
access_log logs/log.frame.access.log main;
## path and format of the generated access log file
location / {
# root html;
# index index.html index.htm;
proxy_pass http://frame-tomcat;
## proxy pass (forward requests to the upstream)
}
error_page 500 502 503 504 /50x.html;
location = /50x.html {
root html;
}
}
server {
listen 80;
server_name localhost;
#charset koi8-r;
#access_log logs/host.access.log main;
location / {
root html;
index index.html index.htm;
}
#error_page 404 /404.html;
# redirect server error pages to the static page /50x.html
#
error_page 500 502 503 504 /50x.html;
location = /50x.html {
root html;
}
# proxy the PHP scripts to Apache listening on 127.0.0.1:80
#
#location ~ \.php$ {
# proxy_pass http://127.0.0.1;
#}
# pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000
#
#location ~ \.php$ {
# root html;
# fastcgi_pass 127.0.0.1:9000;
# fastcgi_index index.php;
# fastcgi_param SCRIPT_FILENAME /scripts$fastcgi_script_name;
# include fastcgi_params;
#}
# deny access to .htaccess files, if Apache's document root
# concurs with nginx's one
#
#location ~ /\.ht {
# deny all;
#}
}
# another virtual host using mix of IP-, name-, and port-based configuration
#
#server {
# listen 8000;
# listen somename:8080;
# server_name somename alias another.alias;
# location / {
# root html;
# index index.html index.htm;
# }
#}
# HTTPS server
#
#server {
# listen 443;
# server_name localhost;
# ssl on;
# ssl_certificate cert.pem;
# ssl_certificate_key cert.key;
# ssl_session_timeout 5m;
# ssl_protocols SSLv2 SSLv3 TLSv1;
# ssl_ciphers HIGH:!aNULL:!MD5;
# ssl_prefer_server_ciphers on;
# location / {
# root html;
# index index.html index.htm;
# }
#}
}
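After editing, it helps to check the syntax and (re)start nginx; a minimal sketch, assuming nginx is installed under /usr/local/nginx:
/usr/local/nginx/sbin/nginx -t    # test the configuration
/usr/local/nginx/sbin/nginx    # start nginx (use ./nginx -s reload if it is already running)
tail -f /usr/local/nginx/logs/log.frame.access.log    # watch the access log being generated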
3. Use flume to collect the access log generated by nginx and send it to Kafka; the key is the configuration file.
In the flume directory:
vi tail-hdfs.conf
a1.sources = source1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.source1.type = exec
## the source data comes from an executable command
a1.sources.source1.command = tail -F /usr/local/nginx/logs/log.frame.access.log
## the command: tail (follow) the contents of a file
# Describe the sink
## sink type that writes to Kafka
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = animal
a1.sinks.k1.brokerList = hdp-2:9092, hdp-3:9092
a1.sinks.k1.requiredAcks = 1
a1.sinks.k1.batchSize = 20
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.source1.channels = c1
a1.sinks.k1.channel = c1
Notes (a second example config, dir-hdfs.conf, that reads a spooled directory and sinks to HDFS):
# spooldir: a source built into flume that watches a directory; any new file that appears is read
# define the names of the three components
ag1.sources = source1
ag1.sinks = sink1
ag1.channels = channel1
# configure the source component
ag1.sources.source1.type = spooldir # see flume.apache.org
ag1.sources.source1.spoolDir = /root/log/ # the directory to watch
ag1.sources.source1.fileSuffix=.FINISHED # suffix appended to a file once it has been read
ag1.sources.source1.deserializer.maxLineLength=5129 # maximum length of a single line
# configure the sink component: where the data that was read is stored
ag1.sinks.sink1.type = hdfs
ag1.sinks.sink1.hdfs.path =hdfs://hdp-1:9000/access_log/%y-%m-%d/%H-%M
ag1.sinks.sink1.hdfs.filePrefix = app_log
ag1.sinks.sink1.hdfs.fileSuffix = .log
ag1.sinks.sink1.hdfs.batchSize= 100 # how many events are written to HDFS per batch
ag1.sinks.sink1.hdfs.fileType = DataStream # plain data stream
ag1.sinks.sink1.hdfs.writeFormat = Text
## roll: rules for rolling over to a new file when the sink writes to HDFS
ag1.sinks.sink1.hdfs.rollSize = 512000 ## roll by file size in bytes (500 KB here)
# parameters to note: hdfs.rollInterval, hdfs.rollCount, hdfs.writeFormat, hdfs.fileType
ag1.sinks.sink1.hdfs.rollCount = 1000000 ## roll by number of events
ag1.sinks.sink1.hdfs.rollInterval = 60 ## roll by time interval; whichever of the three conditions is met first triggers the roll
## rules for generating directories: how often a new directory is created
ag1.sinks.sink1.hdfs.round = true
ag1.sinks.sink1.hdfs.roundValue = 10 # roll the directory every 10 (see unit below)
ag1.sinks.sink1.hdfs.roundUnit = minute # the unit: minutes
ag1.sinks.sink1.hdfs.useLocalTimeStamp = true # use the local machine's timestamp
# channel component configuration
ag1.channels.channel1.type = memory
ag1.channels.channel1.capacity = 500000 ## maximum number of events buffered in the channel; must be larger than the sink batchSize (100)
ag1.channels.channel1.transactionCapacity = 600 ## buffer capacity for flume transaction control: up to 600 events are grouped into one transaction
# bind the source and sink to the channel
ag1.sources.source1.channels = channel1
ag1.sinks.sink1.channel = channel1
Start an agent. -c: the flume configuration directory; -f: the collection configuration file; -n: the agent name; -Dflume.root.logger=INFO,console: JVM system property, log at INFO level to the console.
Start command: ./flume-ng agent -c ../conf/ -f ../dir-hdfs.conf -n ag1 -Dflume.root.logger=INFO,console
-Dflume.root.logger=INFO,console : print the log to the console
-c ../conf/ : the flume configuration directory
-f ../dir-hdfs.conf : location of the collection configuration file
-n ag1 : the agent name
Test:
cd log
Create the shell script makelog.sh:
vi makelog.sh
while true
do
echo "$(date)" >> access.log
sleep 0.1
done
Add execute permission:
chmod +x makelog.sh
Run makelog.sh:
sh makelog.sh    # simulates writing log entries; use tail -f access.log to follow the output
Start the flume collection agent:
In flume's bin directory:
./flume-ng agent -c ../conf/ -f ../tail-hdfs.conf -n a1 -Dflume.root.logger=INFO,console
Note that the configuration file referenced here is tail-hdfs.conf, and the agent name it defines is a1.
4. Start Kafka (ZooKeeper must be started before Kafka). A Kafka consumer receives the data and writes it to a temporary local file.
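A minimal sketch of the startup commands, assuming Kafka is installed on hdp-2 and hdp-3 and run from the Kafka installation directory; the ZooKeeper address and the --zookeeper form of kafka-topics.sh are assumptions (newer Kafka versions use --bootstrap-server instead), and the topic name animal matches the flume sink configuration above:
bin/zookeeper-server-start.sh -daemon config/zookeeper.properties
bin/kafka-server-start.sh -daemon config/server.properties
bin/kafka-topics.sh --create --zookeeper hdp-2:2181 --replication-factor 1 --partitions 1 --topic animal
bin/kafka-console-consumer.sh --bootstrap-server hdp-2:9092 --topic animal --from-beginning
The last command is optional; it only verifies that events collected by flume are arriving on the topic.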
package com.zpark.onekafka;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.log4j.Logger;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collections;
import java.util.Properties;
public class ConsumerDemo {
public static void main(String[] args) {
//call the method that receives messages
receiveMsg();
}
/**
 * Fetch data from the kafka topic (animal)
 */
private static void receiveMsg() {
Logger logger = Logger.getLogger("logRollingFile");
Properties properties = new Properties();
properties.put("bootstrap.servers", "hdp-3:9092");
properties.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
properties.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
properties.put("group.id","aaaa");
properties.put("enable.auto.commit", true);
//create the consumer with the above properties
KafkaConsumer<String, String> consumer = new KafkaConsumer<String, String>(properties);
consumer.subscribe(Collections.singleton("animal"));
URI uri = null;
Configuration conf = null;
String user = "root";
try {
uri = new URI("hdfs://hdp-1:9000");
conf = new Configuration();
//dfs.replication: number of HDFS replicas
conf.set("dfs.replication", "2");
//dfs.blocksize: HDFS block size (a 100 MB file splits into 64 MB + 36 MB blocks)
conf.set("dfs.blocksize", "64m");
} catch (URISyntaxException e) {
e.printStackTrace();
}
try {
FileOutputStream fos = new FileOutputStream("D:/shuju.log");
OutputStreamWriter osw = new OutputStreamWriter(fos);
// FileSystem fs = FileSystem.get(uri, conf, user);
// FSDataOutputStream fdos = fs.create(new Path("/cf.txt"));
while(true) {
/**
 * poll kafka for new records
 */
ConsumerRecords<String, String> records = consumer.poll(100);
for(ConsumerRecord<String, String> record: records) {
String msg = "key:" + record.key()+ ",value:" + record.value() + ",offset:" + record.offset()+",topic:" + record.topic()+"\r\n";
System.out.printf("key=%s,value=%s,offset=%s,topic=%s%n", record.key(), record.value(), record.offset(), record.topic());
logger.debug(record.value());
// BufferedWriter bw = new BufferedWriter(osw);
// bw.write(msg);
// bw.flush();
}
}
}catch (Exception e) {
e.printStackTrace();
} finally {
consumer.close();
}
}
}
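The consumer relies on the log4j logger named logRollingFile to append each received value to a local rolling file; a minimal log4j.properties sketch, where the appender name rollingFile and the output path D:/testlog/access.log are assumptions to be adapted to your environment:
log4j.logger.logRollingFile=DEBUG, rollingFile
log4j.appender.rollingFile=org.apache.log4j.RollingFileAppender
log4j.appender.rollingFile.File=D:/testlog/access.log
log4j.appender.rollingFile.MaxFileSize=10MB
log4j.appender.rollingFile.MaxBackupIndex=5
log4j.appender.rollingFile.layout=org.apache.log4j.PatternLayout
log4j.appender.rollingFile.layout.ConversionPattern=%m%n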
5. Upload the collected data to the HDFS directory behind the hive table.
Create the table in hive: create external table flumetable2 (ip string) row format delimited location '/usr/';
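Because flumetable2 is an external table located at /usr/, any file written into that HDFS directory (for example /usr/a.txt produced by the HdfsTest program below) is immediately queryable; a quick sanity check from the hive CLI:
select * from flumetable2 limit 10;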
package com.zpark.onekafka;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.net.URISyntaxException;
public class HdfsTest {
public static void main(String[] args) {
URI uri = null;
Configuration conf = null;
String user = "root";
FileSystem fs = null;
try {
uri = new URI("hdfs://hdp-1:9000");
conf = new Configuration();
//dfs.replication: number of HDFS replicas
conf.set("dfs.replication", "2");
//dfs.blocksize: HDFS block size (a 100 MB file splits into 64 MB + 36 MB blocks)
conf.set("dfs.blocksize", "64m");
fs = FileSystem.get(uri, conf, user);
fs.copyFromLocalFile(new Path("d:/testlog/access.log"),new Path("/usr/a.txt"));
/**
 * write a file directly to hdfs (alternative approach, commented out below)
 */
// FSDataOutputStream out = fs.create(new Path("/bc.txt"));
// OutputStreamWriter outWriter = new OutputStreamWriter(out);
// BufferedWriter bw = new BufferedWriter(outWriter);
// bw.write("hello");
// bw.close();
// out.close();
fs.close();
} catch (URISyntaxException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
}
}
}
6. Analysis in hive: select count(*) from flumetable2; counts the total PV (page views).
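Since the access log only records $remote_addr, the same table also supports a per-IP breakdown; an additional example using the column and table defined above:
select ip, count(*) as pv from flumetable2 group by ip order by pv desc;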