Big Data Project Workflow (Part 1)

1. Package the frame project as a jar, upload it to the Linux cluster, and run it with java -jar to check that it starts successfully at hdp-2:8181.
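A minimal sketch of this step (the jar name and upload path are assumptions, not from the original write-up):

# on hdp-2, after uploading the jar
java -jar /root/frame.jar
# from any machine that can reach hdp-2, confirm the service answers
curl http://hdp-2:8181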

2. Start nginx. Here it is used to generate access logs (its load-balancing and reverse-proxy roles will be covered in a later update); the key part is the configuration file.

cd /usr/local/nginx/conf/

Edit the nginx.conf file:

vi nginx.conf

#user  nobody;
worker_processes  1;
 
#error_log  logs/error.log;
#error_log  logs/error.log  notice;
#error_log  logs/error.log  info;
 
#pid        logs/nginx.pid;
 
 
events {
    worker_connections  1024;
}
 
 
http {
    include       mime.types;
    default_type  application/octet-stream;
 
    log_format  main  '$remote_addr';
##log output format (here only the client IP, $remote_addr)
 
    #access_log  logs/access.log  main;
 
    sendfile        on;
    #tcp_nopush     on;
 
    #keepalive_timeout  0;
    keepalive_timeout  65;
 
    #gzip  on;
    upstream frame-tomcat {
          server hdp-4:8181; 
##upstream address that nginx forwards requests to
    }
    server {
        listen       80;
        server_name  hdp-8;
##server name this nginx virtual host answers to
        #charset koi8-r;
 
        access_log  logs/log.frame.access.log  main;
##path and format of the generated access log
        location / {
            # root   html;
            # index  index.html index.htm;
            proxy_pass http://frame-tomcat;
##proxy (forward) requests to the upstream defined above
        }
 
        error_page   500 502 503 504  /50x.html;
        location = /50x.html {
            root   html;
        }
    }
    server {
        listen       80;
        server_name  localhost;
 
        #charset koi8-r;
 
        #access_log  logs/host.access.log  main;
 
        location / {
            root   html;
            index  index.html index.htm;
        }
 
        #error_page  404              /404.html;
 
        # redirect server error pages to the static page /50x.html
        #
        error_page   500 502 503 504  /50x.html;
        location = /50x.html {
            root   html;
        }
 
        # proxy the PHP scripts to Apache listening on 127.0.0.1:80
        #
        #location ~ \.php$ {
        #    proxy_pass   http://127.0.0.1;
        #}
 
        # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000
        #
        #location ~ \.php$ {
        #    root           html;
        #    fastcgi_pass   127.0.0.1:9000;
        #    fastcgi_index  index.php;
        #    fastcgi_param  SCRIPT_FILENAME  /scripts$fastcgi_script_name;
        #    include        fastcgi_params;
        #}
 
        # deny access to .htaccess files, if Apache's document root
        # concurs with nginx's one
        #
        #location ~ /\.ht {
        #    deny  all;
        #}
    }
 
 
    # another virtual host using mix of IP-, name-, and port-based configuration
    #
    #server {
    #    listen       8000;
    #    listen       somename:8080;
    #    server_name  somename  alias  another.alias;
 
    #    location / {
    #        root   html;
    #        index  index.html index.htm;
    #    }
    #}
 
 
    # HTTPS server
    #
    #server {
    #    listen       443;
    #    server_name  localhost;
 
    #    ssl                  on;
    #    ssl_certificate      cert.pem;
    #    ssl_certificate_key  cert.key;
 
    #    ssl_session_timeout  5m;
 
    #    ssl_protocols  SSLv2 SSLv3 TLSv1;
    #    ssl_ciphers  HIGH:!aNULL:!MD5;
    #    ssl_prefer_server_ciphers   on;
 
    #    location / {
    #        root   html;
    #        index  index.html index.htm;
    #    }
    #}
 
}
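After editing, the configuration can be checked and nginx started or reloaded (a sketch, assuming the usual source-install prefix /usr/local/nginx):

/usr/local/nginx/sbin/nginx -t         # test the configuration syntax
/usr/local/nginx/sbin/nginx            # start nginx
/usr/local/nginx/sbin/nginx -s reload  # or reload it if it is already running
tail -f /usr/local/nginx/logs/log.frame.access.log   # watch client IPs being appended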

3. Use Flume to collect the log file produced by nginx and deliver it to Kafka; again the key part is the configuration file.

In the Flume directory:

vi tail-hdfs.conf

a1.sources = source1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.source1.type = exec
##the source is the output of an executable command
a1.sources.source1.command = tail -F /usr/local/nginx/logs/log.frame.access.log
##the command tails the nginx access log
# Describe the sink
##sink type: Kafka
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = animal
a1.sinks.k1.brokerList = hdp-2:9092,hdp-3:9092
a1.sinks.k1.requiredAcks = 1
a1.sinks.k1.batchSize = 20
a1.sinks.k1.channel = c1
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
 
# Bind the source and sink to the channel
a1.sources.source1.channels = c1
a1.sinks.k1.channel = c1

Notes: for comparison, below is the annotated dir-hdfs.conf agent (spooldir source, HDFS sink) that is started further down. Comments are kept on their own lines, since Flume reads the file as Java properties and a trailing # on a value line would become part of the value.

# spooldir: Flume's built-in directory source; any new file that appears in the directory is read in
# Name the three components
ag1.sources = source1
ag1.sinks = sink1
ag1.channels = channel1

# Configure the source component (see flume.apache.org)
ag1.sources.source1.type = spooldir
# the directory to watch
ag1.sources.source1.spoolDir = /root/log/
# suffix added to a file once it has been consumed
ag1.sources.source1.fileSuffix = .FINISHED
# maximum length of a single line
ag1.sources.source1.deserializer.maxLineLength = 5129

# Configure the sink component: where the collected data is written
ag1.sinks.sink1.type = hdfs
ag1.sinks.sink1.hdfs.path = hdfs://hdp-1:9000/access_log/%y-%m-%d/%H-%M
ag1.sinks.sink1.hdfs.filePrefix = app_log
ag1.sinks.sink1.hdfs.fileSuffix = .log
# number of events written per flush
ag1.sinks.sink1.hdfs.batchSize = 100
# plain data stream (as opposed to a SequenceFile)
ag1.sinks.sink1.hdfs.fileType = DataStream
ag1.sinks.sink1.hdfs.writeFormat = Text

## roll: rules for rolling over to a new file while the sink writes to HDFS
## (related settings: hdfs.rollInterval, hdfs.rollCount, hdfs.writeFormat, hdfs.fileType)
# roll by file size in bytes (here about 500 KB)
ag1.sinks.sink1.hdfs.rollSize = 512000
# roll by number of events
ag1.sinks.sink1.hdfs.rollCount = 1000000
# roll by time interval in seconds; whichever of the three thresholds is reached first triggers the roll
ag1.sinks.sink1.hdfs.rollInterval = 60

## Rules for generating target directories: how often a new directory is started
ag1.sinks.sink1.hdfs.round = true
# round the directory timestamp down to every 10 units
ag1.sinks.sink1.hdfs.roundValue = 10
# the unit used by roundValue
ag1.sinks.sink1.hdfs.roundUnit = minute

# use the local machine's timestamp
ag1.sinks.sink1.hdfs.useLocalTimeStamp = true

# Configure the channel component
ag1.channels.channel1.type = memory
# maximum number of events buffered in the channel; should be larger than the sink's batchSize
ag1.channels.channel1.capacity = 500000
# number of events handled in one Flume transaction
ag1.channels.channel1.transactionCapacity = 600

# Bind the source and sink to the channel
ag1.sources.source1.channels = channel1
ag1.sinks.sink1.channel = channel1




Starting an agent: -c points at Flume's own conf directory, -f at the collection config, -n gives the agent name, and -Dflume.root.logger=INFO,console is a JVM system property that sends Flume's log output to the console.
Start command: ./flume-ng agent -c ../conf/ -f ../dir-hdfs.conf -n ag1 -Dflume.root.logger=INFO,console

 -Dflume.root.logger=INFO,console : print the log to the console

 -c ../conf/ : Flume's own configuration directory

 -f ../dir-hdfs.conf : location of the agent configuration file

 -n ag1 : name of the agent

Test:

cd log

Create the shell script makelog.sh:

vi makelog.sh 

while true
do
echo "$(date)" >> access.log
sleep 0.1
done

Make it executable:

chmod +x makelog.sh

Run makelog.sh:

sh makelog.sh    (simulates log generation; you can follow the result with tail -f access.log)

Start the Flume collection agent:

From Flume's bin directory:

./flume-ng agent -c ../conf/ -f ../tail-hdfs.conf -n a1 -Dflume.root.logger=INFO,console

Note that the configuration file used here is tail-hdfs.conf, so the agent name passed with -n must be a1, the name defined in that file.
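Once Kafka is running (step 4 below), a quick way to confirm that events from the agent actually reach the topic is the console consumer (a sketch; the host and topic names follow the tail-hdfs.conf above):

# run from Kafka's bin directory
kafka-console-consumer.sh --bootstrap-server hdp-2:9092 --topic animal --from-beginning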
 

4. Start Kafka (ZooKeeper must be started first). The Kafka consumer then receives the data and writes it to a temporary local file.
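A minimal startup sketch (script locations and the partition/replication counts for the topic are assumptions to adjust for your cluster):

# on each ZooKeeper node
zkServer.sh start
# on each Kafka broker (hdp-2, hdp-3), from the Kafka installation directory
bin/kafka-server-start.sh -daemon config/server.properties
# create the topic that the Flume sink publishes to
bin/kafka-topics.sh --create --zookeeper hdp-2:2181 --partitions 1 --replication-factor 2 --topic animal

The consumer program below subscribes to the animal topic and logs every record it receives: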

package com.zpark.onekafka;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.log4j.Logger;
 
 
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collections;
import java.util.Properties;
 
 
public class ConsumerDemo {
 
 
    public static void main(String[] args) {
 
        //call the method that receives messages
        receiveMsg();
    }
 
    /**
     * Consume the data on the Kafka topic (animal)
     */
    private static void receiveMsg() {
        Logger logger = Logger.getLogger("logRollingFile");
        Properties properties = new Properties();
        properties.put("bootstrap.servers", "hdp-3:9092");
        properties.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties.put("group.id","aaaa");
        properties.put("enable.auto.commit", true);
        //create the consumer from the properties above
        KafkaConsumer<String, String> consumer = new KafkaConsumer<String, String>(properties);
        consumer.subscribe(Collections.singleton("animal"));
        URI uri = null;
        Configuration conf = null;
        String user = "root";
        try {
            uri = new URI("hdfs://hdp-1:9000");
            conf = new Configuration();
            //dfs.replication: number of replicas in the distributed file system
            conf.set("dfs.replication", "2");
            //dfs.blocksize: block size of the distributed file system (here 64 MB)
            conf.set("dfs.blocksize", "64m");
 
        } catch (URISyntaxException e) {
            e.printStackTrace();
        }
        try {
            FileOutputStream fos = new FileOutputStream("D:/shuju.log");
            OutputStreamWriter osw = new OutputStreamWriter(fos);
 
 
//            FileSystem fs = FileSystem.get(uri, conf, user);
//            FSDataOutputStream fdos = fs.create(new Path("/cf.txt"));
            while(true) {
 
                /**
                 * Poll Kafka for new records
                 */
                ConsumerRecords<String, String> records = consumer.poll(100);
                for(ConsumerRecord<String, String> record: records) {
                    String msg = "key:" + record.key() + ",value:" + record.value() + ",offset:" + record.offset() + ",topic:" + record.topic() + "\r\n";
                    System.out.printf("key=%s,value=%s,offset=%s,topic=%s%n", record.key(), record.value(), record.offset(), record.topic());
                    logger.debug(record.value());
//                    BufferedWriter bw = new BufferedWriter(osw);
//                    bw.write(msg);
//                    bw.flush();
 
                }
            }
        }catch (Exception e) {
            e.printStackTrace();
        } finally {
            consumer.close();
        }
    }
}
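The logger name logRollingFile only produces the temporary rolling file mentioned in step 4 if a matching appender is defined in log4j.properties on the classpath. A minimal sketch (the appender name, file path, and size limits are assumptions):

log4j.logger.logRollingFile=DEBUG, rollingFile
log4j.appender.rollingFile=org.apache.log4j.RollingFileAppender
log4j.appender.rollingFile.File=D:/logs/kafka-data.log
log4j.appender.rollingFile.MaxFileSize=10MB
log4j.appender.rollingFile.MaxBackupIndex=5
log4j.appender.rollingFile.layout=org.apache.log4j.PatternLayout
log4j.appender.rollingFile.layout.ConversionPattern=%m%n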

5. Upload the collected log file to HDFS, into the directory that the Hive external table points at.

Create the table in Hive: create external table flumetable2 (ip string) row format delimited location '/usr/';

package com.zpark.onekafka;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
 
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.net.URISyntaxException;
 
public class HdfsTest {
    public static void main(String[] args) {
        URI uri = null;
        Configuration conf = null;
        String user = "root";
        FileSystem fs = null;
        try {
            uri = new URI("hdfs://hdp-1:9000");
            conf = new Configuration();
            //dfs.replication: number of replicas in the distributed file system
            conf.set("dfs.replication", "2");
            //dfs.blocksize: block size of the distributed file system (here 64 MB)
            conf.set("dfs.blocksize", "64m");
            fs = FileSystem.get(uri, conf, user);
            fs.copyFromLocalFile(new Path("d:/testlog/access.log"),new Path("/usr/a.txt"));
            /**
             * Alternative: write a file to HDFS directly
             */
//            FSDataOutputStream out = fs.create(new Path("/bc.txt"));
//            OutputStreamWriter outWriter = new OutputStreamWriter(out);
//            BufferedWriter bw = new BufferedWriter(outWriter);
//            bw.write("hello");
//            bw.close();
//            out.close();
            fs.close();
        } catch (URISyntaxException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
 
        }
    }
}
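After the upload, the file can be verified from the command line (a sketch; the paths follow the code and the table DDL above):

hdfs dfs -ls /usr/
hdfs dfs -cat /usr/a.txt | head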

6. Analysis in Hive: select count(*) from flumetable2; returns the total number of page views (PV).
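For example, the query can be run non-interactively from the Hive CLI, and a per-IP breakdown is a natural follow-up (the second query is only an illustration, not part of the original flow):

hive -e "select count(*) from flumetable2;"
hive -e "select ip, count(*) as pv from flumetable2 group by ip order by pv desc;"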
