A Kafka-Based Web Log Collection Project

Project topology diagram: (the diagram is not reproduced in this text version)

Project steps:

Deploy the reverse proxy cluster

  1. Install nginx

Options: 1. install with yum

      2. compile and install manually

Here I chose the yum install.

1、yum install  epel-release -y

2、yum install  nginx -y

Check that it installed: rpm -qa |grep nginx

After a yum install, nginx's configuration path is /etc/nginx

Edit the configuration:

# Define a custom virtual host

First, in the main config file /etc/nginx/nginx.conf, add this directive inside the http block:

include conf.d/*.conf;

Create the conf.d directory under /etc/nginx (a yum install usually ships one already) and add an xy.conf file:

[root@kafka01 conf.d]# cat xy.conf

server {

    listen 8080;    # listening port

    server_name www.xy.com www.xy2.com;  # domain names

    access_log logs/xy.log main;  # where the access log is written

    location / {

        root /opt/html;           # site root directory

        index a.html;             # index file

    }

}
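A quick way to check that the virtual host answers, sketched in python (assuming the requests package, pip3 install requests, and that the proxy is reachable at 192.168.1.117; substitute your proxy's address, since the text does not pin it down. The Host header stands in for real DNS):

import requests

# ask the proxy's port 8080 for the www.xy.com virtual host
r = requests.get("http://192.168.1.117:8080/",
                 headers={"Host": "www.xy.com"})
print(r.status_code)   # expect 200
print(r.text)          # contents of /opt/html/a.html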

Layer-7 reverse proxying with nginx (routing requests by URL)

On the back-end server cluster, install python3:

yum install python3 -y

pip3 install  flask

Write a flask test program:

[root@node1 ~]# cat flask-test.py

from flask import Flask

app = Flask(__name__)

@app.route("/")

def index():

     return "this is flask index"

@app.route("/test")

def test():

    return "this is flask test"

app.run(host="0.0.0.0")

# Run the flask web app

[root@node1 ~]# nohup python3 flask-test.py &

[1] 2273

[root@node1 ~]# nohup: ignoring input and appending output to 'nohup.out'

[root@node1 ~]# ps -ef |grep python

root       1056      1  0 10:21 ?        00:00:01 /usr/bin/python2 -Es /usr/sbin/tuned -l -P

root       2273   1981  5 11:52 pts/1    00:00:00 python3 flask-test.py

root       2278   1981  0 11:52 pts/1    00:00:00 grep --color=auto python

# On the back-end real servers, stop the firewall so they can reach the proxy machines

# On the nginx proxy cluster, add:

    location /api {

        proxy_pass http://192.168.1.117:5000/;  # the trailing / means the URI that follows /api is forwarded as-is to the back-end real server

    } 

######## Getting the client's real IP into the back-end server logs

   location /api {

        proxy_set_header    Host    $http_host;

        proxy_set_header    X-Forwarded-For $proxy_add_x_forwarded_for;

        proxy_pass http://192.168.1.117:5000/;  

    }

# Changes on the back-end real server for testing

Install gunicorn  --  pip3 install gunicorn

Comment out the final app.run() line in flask-test.py.

Start the service with gunicorn:

gunicorn flask-test:app -b "0.0.0.0:5000" --access-logfile='./access_sc.log' --access-logformat='%({x-forwarded-for}i)s %(l)s %(u)s %(t)s "%(r)s" %(s)s %(b)s "%(f)s" "%(a)s"' &

Add load balancing:

In xy.conf, at the same level as the server block, add:

upstream flask_backend{

    server 192.168.1.117:5000;

    server 192.168.1.117:6000;

}

Change proxy_pass to:

proxy_pass http://flask_backend/;

###### Load balancing - health checks

nginx -- built-in checking (passive)

When a request happens to be proxied to a back-end that is down, nginx forwards it once more, to a real server that can still handle it.

This checking is passive: it cannot discover a back-end's state ahead of time, and the possible extra forward costs efficiency.

nginx -- active health checks -- nginx_upstream_check_module

Download the module into /opt:

wget https://codeload.github.com/yaoweibin/nginx_upstream_check_module/zip/master

Install the unzip and patch commands:  yum install unzip patch -y

Unzip it in /opt: unzip master

# In the nginx source directory, apply the patch

patch -p1 < /opt/nginx_upstream_check_module-master/check_1.20.1+.patch

# Recompile, adding the health-check module

./configure --prefix=/usr/local/sc  --user=sc --with-threads  --with-http_ssl_module --with-http_v2_module  --with-http_stub_status_module   --with-stream --with-http_realip_module  --with-http_auth_request_module  --add-module=/opt/nginx_upstream_check_module-master/

# Reinstall

make && make install

Edit the config file xy.conf:

upstream flask_backend{

    server 192.168.1.117:5000;

    server 192.168.1.117:6000;

    check interval=5000 rise=2 fall=5 timeout=1000 type=http;

    check_http_send "HEAD / HTTP/1.0\r\n\r\n";

    check_http_expect_alive http_2xx http_3xx;

}

Deploy keepalived for high availability.

# Install keepalived

yum  install keepalived -y

# Config file: /etc/keepalived/keepalived.conf

# Back up the config file first

Master server:

Edit /etc/keepalived/keepalived.conf

! Configuration File for keepalived

global_defs {

   notification_email {

     acassen@firewall.loc

     failover@firewall.loc

     sysadmin@firewall.loc

   }

   notification_email_from Alexandre.Cassen@firewall.loc

   smtp_server 192.168.200.1

   smtp_connect_timeout 30

   router_id LVS_DEVEL

   vrrp_skip_check_adv_addr

#   vrrp_strict  # strictly follow the VRRP protocol

   vrrp_garp_interval 0

   vrrp_gna_interval 0

}

vrrp_instance VI_1 {

    state MASTER

    interface ens33

    virtual_router_id 51

    priority 100

    advert_int 1

    authentication {

        auth_type PASS

        auth_pass 1111

    }

    virtual_ipaddress {

        192.168.100.250

    }

}

Backup server:

  Main config file: /etc/keepalived/keepalived.conf

! Configuration File for keepalived

global_defs {

   notification_email {

     acassen@firewall.loc

     failover@firewall.loc

     sysadmin@firewall.loc

   }

   notification_email_from Alexandre.Cassen@firewall.loc

   smtp_server 192.168.200.1

   smtp_connect_timeout 30

   router_id LVS_DEVEL

   vrrp_skip_check_adv_addr

#   vrrp_strict  # strictly follow the VRRP protocol

   vrrp_garp_interval 0

   vrrp_gna_interval 0

}

vrrp_instance VI_1 {

    state BACKUP   # run as backup

    interface ens33  # NIC the virtual IP binds to

    virtual_router_id 51   # virtual router id (0-255); distinguishes keepalived clusters on the same LAN;

                                      # every host within one keepalived cluster uses the same router id

    priority 50                 # priority (0-255); the higher the priority, the more likely this node holds the virtual IP

    advert_int 1              # send a liveness advertisement every 1s

    authentication {       # authentication method

        auth_type PASS

        auth_pass 1111

    }

    virtual_ipaddress {   # the virtual IP

        192.168.100.250

    }

}

########## Start the service

systemctl start keepalived

####### Track the nginx service, so a node whose nginx has died gives up the VIP, preventing the split-brain style failure where the VIP keeps pointing at a dead proxy and the service goes down

# Add a check script:

/opt/check_nginx.sh

[root@master keepalived]# cat /opt/check_nginx.sh

#!/bin/bash

# exits 0 when an nginx process exists, non-zero otherwise

/usr/sbin/pidof nginx &>/dev/null

# Make it executable:  chmod +x  check_nginx.sh

# Update the master server's configuration:

! Configuration File for keepalived

global_defs {

   notification_email {

     acassen@firewall.loc

     failover@firewall.loc

     sysadmin@firewall.loc

   }

   notification_email_from Alexandre.Cassen@firewall.loc

   smtp_server 192.168.200.1

   smtp_connect_timeout 30

   router_id LVS_DEVEL

   vrrp_skip_check_adv_addr

#   vrrp_strict  # strictly follow the VRRP protocol

   vrrp_garp_interval 0

   vrrp_gna_interval 0

}

vrrp_script chk_http_port {

    script "/opt/check_nginx.sh"  #检测脚本位置

    interval 2                    #检测间隔

    weight -60                    #当检测脚本返回非0,将优先级-60

}

vrrp_instance VI_1 {

    state MASTER

    interface ens33

    virtual_router_id 60

    priority 100

    advert_int 1

    authentication {

        auth_type PASS

        auth_pass 1111

    }

    # reference the check script

    track_script {

        chk_http_port

    }

    virtual_ipaddress {

        192.168.1.250

    }

}

########### Two VIPs, each node master for one and backup for the other, to make use of both machines.

# Configuration on the node that masters 192.168.1.250:

! Configuration File for keepalived

global_defs {

   notification_email {

     acassen@firewall.loc

     failover@firewall.loc

     sysadmin@firewall.loc

   }

   notification_email_from Alexandre.Cassen@firewall.loc

   smtp_server 192.168.200.1

   smtp_connect_timeout 30

   router_id LVS_DEVEL

   vrrp_skip_check_adv_addr

#   vrrp_strict  # strictly follow the VRRP protocol

   vrrp_garp_interval 0

   vrrp_gna_interval 0

}

vrrp_script chk_http_port {

    script "/opt/check_nginx.sh"

    interval 2

    weight -60

}

vrrp_instance VI_1 {

    state MASTER

    interface ens33

    virtual_router_id 60

    priority 100

    advert_int 1

    authentication {

        auth_type PASS

        auth_pass 1111

    }

    track_script {

        chk_http_port

    }

    virtual_ipaddress {

        192.168.1.250

    }

}

vrrp_instance VI_2 {

    state  BACKUP  # run as backup

    interface ens33  # NIC the virtual IP binds to

    virtual_router_id 61   # virtual router id (0-255); distinguishes keepalived clusters on the same LAN;

                                      # every host within one keepalived cluster uses the same router id

    priority 50                 # priority (0-255); the higher the priority, the more likely this node holds the virtual IP

    advert_int 1              # send a liveness advertisement every 1s

    authentication {       # authentication method

        auth_type PASS

        auth_pass 1111

    }

    virtual_ipaddress {   # the virtual IP

        192.168.1.251

    }

}

# Configuration on the node that masters 192.168.1.251:

! Configuration File for keepalived

global_defs {

   notification_email {

     acassen@firewall.loc

     failover@firewall.loc

     sysadmin@firewall.loc

   }

   notification_email_from Alexandre.Cassen@firewall.loc

   smtp_server 192.168.200.1

   smtp_connect_timeout 30

   router_id LVS_DEVEL

   vrrp_skip_check_adv_addr

#   vrrp_strict  # strictly follow the VRRP protocol

   vrrp_garp_interval 0

   vrrp_gna_interval 0

}

vrrp_instance VI_1 {

    state BACKUP   # run as backup

    interface ens33  # NIC the virtual IP binds to

    virtual_router_id 60   # virtual router id (0-255); distinguishes keepalived clusters on the same LAN;

                                      # every host within one keepalived cluster uses the same router id

    priority 50                 # priority (0-255); the higher the priority, the more likely this node holds the virtual IP

    advert_int 1              # send a liveness advertisement every 1s

    authentication {       # authentication method

        auth_type PASS

        auth_pass 1111

    }

    virtual_ipaddress {   # the virtual IP

        192.168.1.250

    }

}

vrrp_instance VI_2 {

    state  MASTER  # run as master

    interface ens33  # NIC the virtual IP binds to

    virtual_router_id 61   # virtual router id (0-255); distinguishes keepalived clusters on the same LAN;

                                      # every host within one keepalived cluster uses the same router id

    priority 100                 # priority (0-255); the higher the priority, the more likely this node holds the virtual IP

    advert_int 1              # send a liveness advertisement every 1s

    authentication {       # authentication method

        auth_type PASS

        auth_pass 1111

    }

    virtual_ipaddress {   # the virtual IP

        192.168.1.251

    }

}

Deploy the back-end server cluster

#1、NFS setup   https://blog.csdn.net/sj349781478/article/details/79970739

   Install: yum  install  nfs-utils

   Start: systemctl  start nfs

   On the server side:

   Create an html directory under /opt and write an index.html

   Edit the /etc/exports file:

   /opt/html 192.168.1.121(ro,no_root_squash)

   [root@web-3 html]# exportfs

   /opt/html        192.168.1.121

   systemctl restart nfs

   On the client side:

   mount -t nfs 192.168.1.117:/opt/flaskapp /opt/flaskapp

2、Start flask on the back-end real servers

In the /opt/flaskapp directory, run:

gunicorn flask-test:app -b "0.0.0.0:5000" --access-logfile="./flask.log" &

3、Reverse proxy: edit the nginx configuration

upstream test {

  server 192.168.1.117:5000;

  server 192.168.1.121:5000;

}

server {

    listen 80;

    server_name www.sctest.com;

    location / {

        proxy_pass http://test;

    }

}

Deploy the zookeeper and kafka cluster

1、Install:

Install java: yum install java wget  -y

Fetch kafka: wget   https://mirrors.bfsu.edu.cn/apache/kafka/2.8.1/kafka_2.12-2.8.1.tgz

Unpack it:

tar  xf  kafka_2.12-2.8.1.tgz

A standalone zookeeper cluster is used here rather than the one bundled with kafka.

Install zookeeper:

wget   https://mirrors.bfsu.edu.cn/apache/zookeeper/zookeeper-3.6.3/apache-zookeeper-3.6.3-bin.tar.gz

2、Configure kafka

Edit config/server.properties:

broker.id=0

listeners=PLAINTEXT://nginx-kafka01:9092

zookeeper.connect=192.168.0.94:2181,192.168.0.95:2181,192.168.0.96:2181

3、Configure zk

Go into /opt/apache-zookeeper-3.6.3-bin/conf

cp zoo_sample.cfg zoo.cfg

Edit zoo.cfg and add these three lines:

server.1=192.168.0.94:3888:4888         # one entry per node of the cluster

server.2=192.168.0.95:3888:4888

server.3=192.168.0.96:3888:4888

3888 and 4888 are both ports: one carries data, the other handles liveness checking and leader election.

Create the /tmp/zookeeper directory and put a myid file in it; the file's content is the zookeeper id assigned to this machine.

For example, on the 192.168.0.94 machine:

echo 1 > /tmp/zookeeper/myid

Start zookeeper:

bin/zkServer.sh start

When bringing up zk and kafka, always start zk first, then kafka.

When shutting down, stop kafka first, then zk.

# Check status

[root@nginx-kafka03 apache-zookeeper-3.6.3-bin]# bin/zkServer.sh status

/usr/bin/java

ZooKeeper JMX enabled by default

Using config: /opt/apache-zookeeper-3.6.3-bin/bin/../conf/zoo.cfg

Client port found: 2181. Client address: localhost. Client SSL: false.

Mode: leader

Start kafka:

bin/kafka-server-start.sh -daemon config/server.properties

Using zookeeper            --------- a distributed application coordination service, usable for unified configuration management, unified naming, distributed locks, and cluster management

Run:

bin/zkCli.sh

[zk: localhost:2181(CONNECTED) 1] ls /

[admin, brokers, cluster, config, consumers, controller, controller_epoch, feature, isr_change_notification, latest_producer_id_block, log_dir_event_notification, sc, zookeeper]

[zk: localhost:2181(CONNECTED) 2] ls /brokers/ids

[1, 2, 3]

[zk: localhost:2181(CONNECTED) 3] create /sc/yy

Created /sc/yy

[zk: localhost:2181(CONNECTED) 4] ls /sc

[page, xx, yy]

[zk: localhost:2181(CONNECTED) 5] set /sc/yy 90

[zk: localhost:2181(CONNECTED) 6] get /sc/yy

90
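The same znode operations can be scripted from python; a minimal sketch assuming the kazoo package (pip3 install kazoo; an assumption, it is not used elsewhere in this project):

from kazoo.client import KazooClient

# connect to any node of the zookeeper cluster
zk = KazooClient(hosts="192.168.0.94:2181")
zk.start()

zk.ensure_path("/sc")                # create the parent path if missing
if not zk.exists("/sc/yy"):
    zk.create("/sc/yy", b"")         # same as: create /sc/yy
zk.set("/sc/yy", b"90")              # same as: set /sc/yy 90
value, _stat = zk.get("/sc/yy")      # same as: get /sc/yy
print(value.decode())                # -> 90

zk.stop()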

# Test

Create a topic:

bin/kafka-topics.sh --create --zookeeper 192.168.0.95:2181 --replication-factor 1 --partitions 1 --topic sc

List topics:

 bin/kafka-topics.sh --list --zookeeper 192.168.0.95:2181

Start a console producer:

[root@localhost kafka_2.12-2.8.0]# bin/kafka-console-producer.sh --broker-list 192.168.0.94:9092 --topic sc

>hello

>sanchuang tongle

>nihao

>world !!!!!!1

>

Start a console consumer:

[root@localhost kafka_2.12-2.8.0]# bin/kafka-console-consumer.sh --bootstrap-server 192.168.0.96:9092 --topic sc --from-beginning
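The console tools have python equivalents; a minimal producer sketch assuming the kafka-python package (pip3 install kafka-python):

from kafka import KafkaProducer

# connect through any broker; messages go to the sc topic created above
producer = KafkaProducer(bootstrap_servers=["192.168.0.94:9092"])
producer.send("sc", b"hello from python")
producer.flush()   # block until the message has actually been delivered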

Connect to zk:

bin/zkCli.sh

[zk: localhost:2181(CONNECTED) 0] ls /

[admin, brokers, cluster, config, consumers, controller, controller_epoch, feature, isr_change_notification, latest_producer_id_block, log_dir_event_notification, zookeeper]

[zk: localhost:2181(CONNECTED) 1] ls /brokers

[ids, seqid, topics]

[zk: localhost:2181(CONNECTED) 2] ls /brokers/ids

[0, 1, 2]

[zk: localhost:2181(CONNECTED) 3] get /brokers/ids

null

[zk: localhost:2181(CONNECTED) 4] get /brokers/ids/0

{"listener_security_protocol_map":{"PLAINTEXT":"PLAINTEXT"},"endpoints":["PLAINTEXT://nginx-kafka02:9092"],"jmx_port":9999,"features":{},"host":"nginx-kafka02","timestamp":"1642300427923","port":9092,"version":5}

[zk: localhost:2181(CONNECTED) 5] ls /brokers/ids/0

[]

[zk: localhost:2181(CONNECTED) 6] get /brokers/ids/0

{"listener_security_protocol_map":{"PLAINTEXT":"PLAINTEXT"},"endpoints":["PLAINTEXT://nginx-kafka02:9092"],"jmx_port":9999,"features":{},"host":"nginx-kafka02","timestamp":"1642300427923","port":9092,"version":5}

zookeeper: a distributed, open-source coordination and configuration management service (etcd plays a similar role)

Deploy filebeat on the back-end servers

# Install

1、rpm --import https://packages.elastic.co/GPG-KEY-elasticsearch

2、Edit vim /etc/yum.repos.d/fb.repo:

[elastic-7.x]

name=Elastic repository for 7.x packages

baseurl=https://artifacts.elastic.co/packages/7.x/yum

gpgcheck=1

gpgkey=https://artifacts.elastic.co/GPG-KEY-elasticsearch

enabled=1

autorefresh=1

type=rpm-md

3、Install via yum

yum  install  filebeat -y

rpm -qa  |grep filebeat  # check whether filebeat is installed; rpm -qa lists every package installed on the machine

rpm -ql  filebeat  # list the files filebeat installed and where they live

4、Enable at boot

systemctl enable filebeat

# YAML structure (the input config below, shown as its equivalent JSON for comparison)

{

  "filebeat.inputs": [

    {

      "type": "log",

      "enabled": true,

      "paths": ["/var/log/nginx/sc_access"]

    }

  ]

}

# Configure

Edit the config file /etc/filebeat/filebeat.yml:

filebeat.inputs:

- type: log

  # Change to true to enable this input configuration.

  enabled: true

  # Paths that should be crawled and fetched. Glob based paths.

  paths:

    - /var/log/nginx/sc_access.log

#-------------------------------- kafka output --------------------------------

output.kafka:

  hosts: ["192.168.229.139:9092","192.168.229.140:9092"]

  topic: nginxlog

  keep_alive: 10s
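A quick sanity check that the edited file still parses as YAML, sketched in python (assuming the PyYAML package, pip3 install pyyaml):

import yaml

# load the filebeat config and print back what was configured
with open("/etc/filebeat/filebeat.yml") as f:
    conf = yaml.safe_load(f)
print(conf["filebeat.inputs"][0]["paths"])   # the log paths to ship
print(conf["output.kafka"]["hosts"])         # the brokers configured above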

# Create the nginxlog topic

bin/kafka-topics.sh --create --zookeeper 192.168.77.132:2181 --replication-factor 3 --partitions 1 --topic nginxlog

# Start the service:

systemctl start  filebeat

[root@nginx-kafka01 opt]# ps -ef |grep filebeat

root        5537       1  0 15:32 ?        00:00:08 /usr/share/filebeat/bin/filebeat --environment systemd -c /etc/filebeat/filebeat.yml --path.home /usr/share/filebeat --path.config /etc/filebeat --path.data /var/lib/filebeat --path.logs /var/log/filebeat

Filebeat's registry data files:

[root@nginx-kafka01 filebeat]# pwd

/var/lib/filebeat/registry/filebeat

[root@nginx-kafka01 filebeat]# less log.json
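To verify logs are flowing end to end, a minimal consumer sketch (assuming the kafka-python package and the brokers/topic configured above):

import json

from kafka import KafkaConsumer

consumer = KafkaConsumer(
    "nginxlog",                                  # the topic filebeat writes to
    bootstrap_servers=["192.168.229.139:9092"],  # any broker in the cluster
    auto_offset_reset="earliest",                # start from the oldest message
)
for msg in consumer:
    # filebeat wraps each log line in a JSON envelope;
    # the raw nginx line sits under the "message" key
    print(json.loads(msg.value)["message"])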

Data ingestion

1、Requirements analysis

   We need the ip, time, and bandwidth fields from the nginx logs.

   The ip field is resolved into the corresponding province and carrier (ISP).

   Fields stored in the database: id, time, province, carrier, bandwidth

# Steps (a combined python sketch follows the table definitions below)

1、Create the tables

2、Write a python script that fetches the nginx logs from kafka

3、From the fetched log lines, extract the ip, time, and bandwidth fields

4、Resolve the extracted ip field into a province and carrier through a Taobao API:

url = "https://ip.taobao.com/outGetIpInfo?accessKey=alibaba-inc&ip=114.114.114.114"

5、Normalize the time field to the form "2021-10-12 12:00:00"

6、Insert into the database

# Create the tables

create table nginxlog (

id  int primary key auto_increment,

dt  datetime not null,

prov int ,

isp  int,

bd  float

) CHARSET=utf8;

create table prov_index(

id  int primary key auto_increment,

prov_name  varchar(256)

) charset=utf8;

create table isp_index(

id int primary key auto_increment,

isp_name varchar(256)

) charset=utf8;
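A minimal end-to-end sketch of steps 2 through 6, assuming the kafka-python, requests, and pymysql packages; the combined-format nginx log layout, the Taobao response fields (data.region / data.isp), and the connection details are assumptions to adapt to your environment:

import json
from datetime import datetime

import pymysql
import requests
from kafka import KafkaConsumer

TAOBAO_URL = "https://ip.taobao.com/outGetIpInfo?accessKey=alibaba-inc&ip={}"

def resolve_ip(ip):
    """Resolve an ip to (province, carrier) via the Taobao API."""
    data = requests.get(TAOBAO_URL.format(ip), timeout=5).json()["data"]
    return data.get("region", ""), data.get("isp", "")

def name_to_id(cursor, table, column, name):
    """Look a name up in an index table, inserting it on first sight."""
    cursor.execute(f"SELECT id FROM {table} WHERE {column}=%s", (name,))
    row = cursor.fetchone()
    if row is not None:
        return row[0]
    cursor.execute(f"INSERT INTO {table} ({column}) VALUES (%s)", (name,))
    return cursor.lastrowid

conn = pymysql.connect(host="192.168.77.132", user="root", password="xxx",
                       database="nginxdb", charset="utf8")   # adapt credentials
consumer = KafkaConsumer("nginxlog",
                         bootstrap_servers=["192.168.229.139:9092"],
                         auto_offset_reset="earliest")

for msg in consumer:
    line = json.loads(msg.value)["message"]        # unwrap filebeat's envelope
    fields = line.split()                          # combined-format access line
    ip, raw_time, bd = fields[0], fields[3].lstrip("["), fields[9]
    # "12/Oct/2021:12:00:00" -> "2021-10-12 12:00:00"
    dt = (datetime.strptime(raw_time, "%d/%b/%Y:%H:%M:%S")
          .strftime("%Y-%m-%d %H:%M:%S"))
    prov, isp = resolve_ip(ip)
    with conn.cursor() as cur:
        prov_id = name_to_id(cur, "prov_index", "prov_name", prov)
        isp_id = name_to_id(cur, "isp_index", "isp_name", isp)
        cur.execute("INSERT INTO nginxlog (dt, prov, isp, bd) "
                    "VALUES (%s, %s, %s, %s)",
                    (dt, prov_id, isp_id, float(bd)))
    conn.commit()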

##################################################

# Install celery

pip install  celery

# the python client library for redis

pip install  redis

# Write the celery app; see flask_log/celery_app for reference

# Configure celery

################ config.py

######## celery's configuration file

from celery.schedules import crontab

# address of the message broker

BROKER_URL = "redis://192.168.77.132:6379/1"

# address where results are stored

CELERY_RESULT_BACKEND = "redis://192.168.77.132:6379/2"

# import tasks when celery starts; only imported tasks can be executed

CELERY_IMPORTS = {

      'celery_tasks'   # the module holding the tasks celery should run

}

# timezone

CELERY_TIMEZONE = "Asia/Shanghai"

# periodic (beat) task schedule

CELERYBEAT_SCHEDULE = {

   'log-every-minute': {

     'task' : 'celery_tasks.scheduled_task',

     'schedule': crontab(minute='*/1')

  }

}

############ app.py: holds the core celery object

from celery import Celery

# instantiate the Celery object; passing a name is all it needs

celery_app = Celery('celery_app')

celery_app.config_from_object('config')

############# celery_tasks.py: holds the tasks

from app import celery_app

@celery_app.task

def scheduled_task(*args, **kwargs):

    print("this is schedule task")

# Start a worker

[root@nginx-kafka01 flask_log]# celery -A app.celery_app worker --loglevel=INFO -n node1

# Start beat

[root@nginx-kafka01 flask_log]# celery -A app.celery_app beat --loglevel=INFO

# celery async tasks

# dispatching a task on demand (see the sketch below)
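With the worker from the previous step running, a task can be dispatched asynchronously from any python shell that can import celery_tasks (a minimal sketch):

from celery_tasks import scheduled_task

# queue the task and return immediately; a running worker picks it up
result = scheduled_task.delay()
print(result.id)                # the task id, visible in flower / the backend
print(result.get(timeout=10))   # block until the worker finishes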

# start the flower dashboard

celery -A celery_app flower --address=127.0.0.1 --port=5555

# Production deployment

nginx + gunicorn

nginx + uwsgi

# compared with uwsgi, gunicorn is simpler to configure and easier to use

# Generate a requirements.txt file

pip freeze > requirements.txt

# On the new host, install the dependencies

pip3 install -r requirements.txt -i "http://pypi.douban.com/simple" --trusted-host pypi.douban.com

##### Start flask

gunicorn -w 2 -b :8000 manage:app

# Update the front-end nginx cluster configuration

server {

    listen 80 default_server;

    server_name  www.sc.com;

    root         /usr/share/nginx/html;

    access_log  /var/log/nginx/sc/access.log main;

    location  /v1 {

           proxy_pass http://127.0.0.1:8000;

    }

    location / {

       root /usr/share/nginx/html;

    }

}
