1 采集规划
说明:
D1 日志所在服务器1 —bigdata02.com
D2 日志所在服务器2 —bigdata03.com
A flume2 — bigdata02.com
日志收集
C flume3 — bigdata03.com
日志收集
B flume1 — bigdata01.com
日志整合
E kafka —bigdata01.com,bigdata02.com,bigdata03.com
存储到kafka
F HBase —bigdata01.com,bigdata02.com,bigdata03.com
存储到HBase
2版本
- kafka kafka_2.11-0.10.0.0
- flume flume-1.7.0-bin
- hbase hbase-0.98.6-cdh5.3.0
3 安装
3.1 kafka安装
vi config/server.properties
broker.id=1 ##其他机器修改
listeners=PLAINTEXT://bigdata01.com:9092 ##其他机器修改
port=9092
host.name=bigdata01.com ##其他机器修改
num.network.threads=3
num.io.threads=8
socket.send.buffer.bytes=102400
socket.receive.buffer.bytes=102400
socket.request.max.bytes=104857600
log.dirs=/opt/modules/kafka_2.11-0.10.0.0/kafka-logs
num.partitions=1
num.recovery.threads.per.data.dir=1
log.retention.hours=168
log.segment.bytes=1073741824
message.max.bytes=5242880
default.replication.factor=2
replica.fetch.max.bytes=5242880
log.retention.check.interval.ms=300000
log.cleaner.enable=false
zookeeper.connect=bigdata01.com:2181,bigdata02.com:2181,bigdata03.com:2181
zookeeper.connection.timeout.ms=60000
发送到其他机器并修改server.properties
3.2 flume1安装
1 vi conf/flume-env.sh
export JAVA_HOME=/opt/modules/jdk1.7.0_67
export HADOOP_HOME=/opt/modules/hadoop-2.5.0
export HBASE_HOME=/opt/modules/hbase-0.98.6-cdh5.3.0
export JAVA_OPTS="-Xms100m -Xmx2000m -Dcom.sun.management.jmxremote"
2 vi conf/flume-conf.properties
agent1.sources = r1
agent1.channels = kafkaC hbaseC
agent1.sinks = kafkaSink hbaseSink
#********************flume + hbase**************************
agent1.sources.r1.type = avro
agent1.sources.r1.channels = hbaseC kafkaC
agent1.sources.r1.bind = bigdata01.com
agent1.sources.r1.port = 55555
agent1.sources.r1.threads = 5
agent1.channels.hbaseC.type = memory
agent1.channels.hbaseC.capacity = 100000
agent1.channels.hbaseC.transactionCapacity = 100000
agent1.channels.hbaseC.keep-alive = 20
agent1.sinks.hbaseSink.type = asynchbase
agent1.sinks.hbaseSink.table = weblogs
agent1.sinks.hbaseSink.columnFamily = info
agent1.sinks.hbaseSink.serializer = org.apache.flume.sink.hbase.KfkAsyncHbaseEventSerializer
agent1.sinks.hbaseSink.channel = hbaseC
agent1.sinks.hbaseSink.serializer.payloadColumn=datatime,userid,searchname,retorder,cliorder,cliurl
#********************flume + kafka*****************************
agent1.channels.kafkaC.type = memory
agent1.channels.kafkaC.capacity = 100000
agent1.channels.kafkaC.transactionCapacity = 100000
agent1.channels.kafkaC.keep-alive = 10
agent1.sinks.kafkaSink.channel = kafkaC
agent1.sinks.kafkaSink.type = org.apache.flume.sink.kafka.KafkaSink
agent1.sinks.kafkaSink.brokerList = bigdata01.com:9092,bigdata02.com:9092,bigdata03.com:9092
agent1.sinks.kafkaSink.topic = weblogs
agent1.sinks.kafkaSink.zookeeperConnect= bigdata01.com:2181,bigdata02.com:2181,bigdata03.com:2181
agent1.sinks.kafkaSink.requiredAcks = 1
agent1.sinks.kafkaSink.batchSize = 1
agent1.sinks.kafkaSink.serializer.class = kafka.serializer.StringEncoder
3.3 flume2 安装
vi conf/flume-conf.properties
agent2.sources = s1
agent2.channels = c1
agent2.sinks = k1
agent2.sources.s1.inputCharset = GBK
agent2.sources.s1.type = exec
agent2.sources.s1.command = tail -F /opt/datas/flume.log
agent2.sources.s1.channels=c1
#channels configuration
agent2.channels.c1.type = memory
agent2.channels.c1.capacity = 10000
agent2.channels.c1.transactionCapacity = 10000
agent2.channels.c1.keep-alive = 3
#sinks configuration
agent2.sinks.k1.type = avro
agent2.sinks.k1.hostname = bigdata01.com
agent2.sinks.k1.port = 55555
agent2.sinks.k1.channel = c1
3.3 flume3安装
vi conf/flume-conf.properties
agent3.sources = s1
agent3.channels = c1
agent3.sinks = k1
agent3.sources.s1.inputCharset = GBK
agent3.sources.s1.type = exec
agent3.sources.s1.command = tail -F /opt/datas/flume.log
agent3.sources.s1.channels=c1
#channels configuration
agent3.channels.c1.type = memory
agent3.channels.c1.capacity = 10000
agent3.channels.c1.transactionCapacity = 10000
agent3.channels.c1.keep-alive = 3
#sinks configuration
agent3.sinks.k1.type = avro
agent3.sinks.k1.hostname = bigdata01.com
agent3.sinks.k1.port = 55555
agent3.sinks.k1.channel = c1
3.3 数据下载和预处理
数据预处理: 文本中有'\t'和' '两种分隔符, 预处理时将两种分隔符统一替换为","
cat weblog.log |tr "\t" "," >weblog2.log
cat weblog2.log |tr " " "," >weblog3.log
3.4 flume ->hbase 源码修改
源码修改原因: 初始源码中一条数据Event只写入一个列的信息, 而这里一个Event要写入同一列簇下的6个列的信息, 所以需要修改源码
- 下载源码
- 导入idea flume-ng-hbase-sink项目
- 新建org.apache.flume.sink.hbase.KfkAsyncHbaseEventSerializer.java
package org.apache.flume.sink.hbase;
import com.google.common.base.Charsets;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.FlumeException;
import org.apache.flume.conf.ComponentConfiguration;
import org.apache.flume.sink.hbase.SimpleHbaseEventSerializer.KeyType;
import org.hbase.async.AtomicIncrementRequest;
import org.hbase.async.PutRequest;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
/**
 * Async HBase serializer that splits a comma-separated log line into one
 * PutRequest per configured column, all sharing a single row key so every
 * field of an event lands in the same HBase row.
 *
 * Configured via "payloadColumn" = comma-separated qualifier list, e.g.
 * "datatime,userid,searchname,retorder,cliorder,cliurl" (see agent config).
 */
public class KfkAsyncHbaseEventSerializer implements AsyncHbaseEventSerializer {
    private byte[] table;
    private byte[] cf;
    private byte[] payload;         // raw event body, refreshed per event in setEvent()
    private byte[] payloadColumn;   // comma-separated column qualifiers from config
    private byte[] incrementColumn;
    private String rowPrefix;
    private byte[] incrementRow;
    private KeyType keyType;

    @Override
    public void initialize(byte[] table, byte[] cf) {
        this.table = table;
        this.cf = cf;
    }

    /**
     * Builds the PutRequests for the current event.
     *
     * Fixes over the stock serializer:
     * - iterates over the actual number of configured columns instead of a
     *   hard-coded 6, so the qualifier list can be changed in config only;
     * - validates the field count once, before the loop;
     * - generates ONE row key per event (not per column) so all columns of an
     *   event are guaranteed to end up in the same row;
    * - decodes/encodes consistently with UTF-8 instead of the platform charset.
     */
    @Override
    public List<PutRequest> getActions() {
        List<PutRequest> actions = new ArrayList<PutRequest>();
        if (payloadColumn != null) {
            try {
                String[] columns = new String(this.payloadColumn, Charsets.UTF_8).split(",");
                String[] values = new String(this.payload, Charsets.UTF_8).split(",");
                // Malformed lines (field count mismatch) are skipped entirely.
                if (columns.length == values.length && values.length >= 2) {
                    // Row key is derived from the first two fields (datetime, userid)
                    // plus a timestamp inside getKfkTimestampKey — TODO confirm
                    // against SimpleRowKeyGenerator, which is outside this file.
                    String datetime = values[0];
                    String userid = values[1];
                    byte[] rowKey = SimpleRowKeyGenerator.getKfkTimestampKey(datetime, userid);
                    for (int i = 0; i < columns.length; i++) {
                        actions.add(new PutRequest(table, rowKey, cf,
                                columns[i].getBytes(Charsets.UTF_8),
                                values[i].getBytes(Charsets.UTF_8)));
                    }
                }
            } catch (Exception e) {
                throw new FlumeException("Could not get row key!", e);
            }
        }
        return actions;
    }

    /** Emits an atomic increment when "incrementColumn" is configured. */
    @Override
    public List<AtomicIncrementRequest> getIncrements() {
        List<AtomicIncrementRequest> actions = new ArrayList<AtomicIncrementRequest>();
        if (incrementColumn != null) {
            actions.add(new AtomicIncrementRequest(table, incrementRow, cf, incrementColumn));
        }
        return actions;
    }

    @Override
    public void cleanUp() {
        // Nothing to release: all state is plain byte arrays.
    }

    /**
     * Reads serializer settings from the sink context:
     * payloadColumn, incrementColumn, rowPrefix, suffix (row-key strategy),
     * incrementRow. Unknown suffix values fall back to UUID keys.
     */
    @Override
    public void configure(Context context) {
        String pCol = context.getString("payloadColumn", "pCol");
        String iCol = context.getString("incrementColumn", "iCol");
        rowPrefix = context.getString("rowPrefix", "default");
        String suffix = context.getString("suffix", "uuid");
        if (pCol != null && !pCol.isEmpty()) {
            if (suffix.equals("timestamp")) {
                keyType = KeyType.TS;
            } else if (suffix.equals("random")) {
                keyType = KeyType.RANDOM;
            } else if (suffix.equals("nano")) {
                keyType = KeyType.TSNANO;
            } else {
                keyType = KeyType.UUID;
            }
            payloadColumn = pCol.getBytes(Charsets.UTF_8);
        }
        if (iCol != null && !iCol.isEmpty()) {
            incrementColumn = iCol.getBytes(Charsets.UTF_8);
        }
        incrementRow = context.getString("incrementRow", "incRow").getBytes(Charsets.UTF_8);
    }

    @Override
    public void setEvent(Event event) {
        this.payload = event.getBody();
    }

    @Override
    public void configure(ComponentConfiguration conf) {
        // No component-level configuration required.
    }
}
- 重新导出jar包 并修改名字 flume-ng-hbase-sink-1.7.0.jar
- 上传到bigdata01.com flume的lib目录中替换原有的
3.5 模拟用户日志生成代码
package main.java;
import java.io.*;
public class ReadWrite {
static String readFileName;
static String writeFileName;
public static void main(String args[]){
readFileName = args[0];
writeFileName = args[1];
try {
// readInput();
readFileByLines(readFileName);
}catch(Exception e){
}
}
public static void readFileByLines(String fileName) {
FileInputStream fis = null;
InputStreamReader isr = null;
BufferedReader br = null;
String tempString = null;
try {
System.out.println("以行为单位读取文件内容,一次读一整行:");
fis = new FileInputStream(fileName);// FileInputStream
// 从文件系统中的某个文件中获取字节
isr = new InputStreamReader(fis,"GBK");
br = new BufferedReader(isr);
int count=0;
while ((tempString = br.readLine()) != null) {
count++;
// 显示行号
Thread.sleep(300);
String str = new String(tempString.getBytes("UTF8"),"GBK");
// System.out.println("row:"+count+">>>>>>>>"+tempString);
method1(writeFileName,tempString);
//appendMethodA(writeFileName,tempString);
}
isr.close();
} catch (IOException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
} finally {
if (isr != null) {
try {
isr.close();
} catch (IOException e1) {
}
}
}
}
public static void method1(String file, String conent) {
BufferedWriter out = null;
try {
out = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(file, true)));
out.write("\n");
out.write(conent);
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
out.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
打包生成jar 然后上传到bigdata02.com:/opt/datas/,bigdata03.com:/opt/datas/
创建启动脚本
#!/bin/bash
# Replays the preprocessed search log into flume.log for the exec source.
echo "start ..."
java -jar /opt/jar/weblogs.jar /opt/datas/weblog3.log /opt/datas/flume.log
3.6 启动各个组件
- 启动kafka
启动kafka
bin/kafka-server-start.sh config/server.properties
创建topic 命令
bin/kafka-topics.sh --create --zookeeper bigdata01.com:2181,bigdata02.com:2181,bigdata03.com:2181 --replication-factor 3 --partitions 1 --topic weblogs
创建消费脚本
#!/bin/bash
# Console consumer for the weblogs topic (smoke-tests the Kafka leg of the pipeline).
echo "kfk-kafka-consumer.sh start ..."
bin/kafka-console-consumer.sh --zookeeper bigdata01.com:2181,bigdata02.com:2181,bigdata03.com:2181 --from-beginning --topic weblogs
- 启动hbase
- 启动flume(创建3个启动脚本 先启动flume2,flume3 再启动flume1)
#!/bin/bash
# Starts the aggregating agent1 (run AFTER flume2/flume3 per section 3.6).
echo "flume-1 start......"
bin/flume-ng agent --conf conf -f conf/flume-conf.properties -n agent1 -Dflume.root.logger=INFO,console
4 启动weblogs.jar
5 运行
1 查看hbase 中的数据
hbase(main):001:0> count 'weblogs'
Current count: 1000, row: 6062969462004942-00:00:07-1525921408064
1559 row(s) in 1.2250 seconds
=> 1559
2 查看kafka中的数据
00:00:32,269736677015411,[itfm],2,11,chanye.finance.sina.com.cn/fz/2007-09-10/334786.shtml
00:00:32,393693921083884,[奥运圣火河南路线],1,3,yanziha.pengpeng.com/bbs/thread/1148790.html
00:00:31,14386146687311085,[PSP游戏《怪物猎人2G》中文版下载],3,2,bbs.2u.com.cn/archiver/tid-93698.html
00:00:31,6747965581699283,[韩国首都为什么改名],1,1,ks.cn.yahoo.com/question/1406120803100.html
00:00:31,5540122643843461,[感恩的心+试听],4,1,www.yymp3.com/Play/7326/92974.htm
00:00:31,9874717412370105,[小马过河的博客],5,5,gaoshanliuyun200.blog.163.com/blog/static/2448501200692303238515/
00:00:31,3978551963724469,[3.44x33.com/],1,1,3.44x33.com/
00:00:31,6345435406335671,[李成儒+离婚],1,1,ent.sina.com.cn/2004-12-28/0646612073.html
00:00:31,5275533831056154,[华国峰同志逝世],6,1,www.meizu.com/bbs/showerr.asp?BoardID=10&ErrCodes=29&action=%BB%AA%B9%FA%B7%E5%CD%AC%D6%BE%CA%C5%CA%C0
00:00:31,3949828035015059,[old+woman],3,21,www.xxxmovieforum.com/
00:00:31,19186326774082868,[张雅],5,5,tv.mofile.com/tags/???\xa8\xa6??:0,1,20,1,0,0,audittime,0,
00:00:31,6009454949181303,[缅甸第三特区],9,13,www.xzqh.org/bbs/read.php?tid=31074
00:00:31,9472812716405814,[软件],6,12,www.onlinedown.net/
00:00:32,9311412621037496,[哄抢救灾物资],2,1,pic.news.mop.com/gs/2008/0528/12985.shtml
00:00:32,3691729199504175,[哭泣的星空+MP3],2,2,yr0201.blog.sohu.com/22352924.html
00:00:32,40320548674212914,[杨丞琳辱华事件],1,1,you.video.sina.com.cn/b/1084004-1261359184.html
00:00:32,8561366108033201,[哄抢救灾物资],1,3,news.21cn.com/social/daqian/2008/05/29/4777194_1.shtml
00:00:32,141278734311103,[网站建设],1,1,www.qicaispace.com/
00:00:32,056513944508728375,[黎姿],2,1,news.baidu.com/f/17/lizi.html
00:00:32,269736677015411,[itfm],2,11,chanye.finance.sina.com.cn/fz/2007-09-10/334786.shtml
00:00:32,393693921083884,[奥运圣火河南路线],1,3,yanziha.pengpeng.com/bbs/thread/1148790.html
00:00:32,9994672352241558,[高级妓女],6,216,lady.anhuinews.com/system/2003/01/07/000213154.shtml
00:00:32,9994672352241558,[高级妓女],6,216,lady.anhuinews.com/system/2003/01/07/000213154.shtml
00:00:32,7954374672498488,[台湾空军叛逃大陆],6,4,www.hanhuncn.com/Html/Twsj/20060921074835205_2.html
00:00:32,2896977267956338,[荔枝核深加工],4,4,www.ilib.cn/A-spkj200603040.html
00:00:33,41800714861954374,[月见草油],7,13,www.hisuppliers.com/remen/list/yuejiancaoyou/yuejiancaoyou.html
00:00:33,2699849326058153,[见过这样的另类吗],9,1,bbs.vogue.com.cn/archiver/?tid-92752.html
00:00:33,12931893747701723,[美军审讯越南女战俘],15,59,xzd.2000y.net/mb/1/ReadNews.asp?NewsID=547170
00:00:33,4554795388682654,[宁王府],1,4,www.cdol.net/html/29/109929-15687.html
00:00:33,9921372392180088,[尺寸链],12,55,www.ilib.cn/A-kjqbkfyjj200307090.html
00:00:33,14386146687311085,[PSP游戏《怪物猎人2G》中文版下载],1,3,games.qq.com/a/20080401/000413.htm
00:00:33,9700485503618976,[如何让头发快速长长],2,12,zhidao.baidu.com/question/24246694.html
00:00:33,6242029922450475,[扫地车报价],19,40,liqixianjin.b2b.hc360.com/supply/27323118.html
00:00:33,8480586467887667,[科比81分视频],1,1,www.tudou.com/programs/view/cZMRnhWcGtw/
00:00:33,9378259159932798,[隆武帝],120,46,www.openow.net/details/e2007.html
00:00:33,8933412496786006,[沈国放间谍事件],1,9,news.qq.com/a/20060425/
00:00:33,48530765688455246,[胡其美],27,4,bbs1.hxsd.com.cn/user/info?uid=182634
00:00:33,28250791446280643,[命名],10,7,www.namers.cn/
00:00:33,21071231987753036,[莎朗斯通],3,6,ent.qq.com/a/20060214/000136.htm
00:00:33,9586356230570776,[学有所教、劳有所得、老有所养、病有所医、住有所居],1,5,cpc.people.com.cn/GB/67481/94156/105719/105723/106451/6738281.html
00:00:34,2199783436347869,[如何下载56视频],1,1,wenwen.soso.com/z/q8527818.htm