Recreating a Hadoop Offline Project (Hands-On)

Project architecture:
(architecture diagram omitted)

Creating the Spring Boot project in IDEA:
(project-creation screenshots omitted)

At this point the Spring Boot project is set up; run the application entry point and confirm it starts successfully.
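The post covers this step with screenshots only, so here is a minimal sketch of what it produces (class and package names are illustrative, not from the original project): the generated entry point is a standard @SpringBootApplication main class, and application.yml carries the service port (16666, which the nginx upstream below targets) and the /log context path that the client later prepends to /uploadData.

//Illustrative sketch only; the real project's class name will differ.
//application.yml is assumed to contain roughly:
//  server:
//    port: 16666
//    servlet:
//      context-path: /log     (on Spring Boot 1.x the key is server.context-path)
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class LogServerApplication {
    public static void main(String[] args) {
        SpringApplication.run(LogServerApplication.class, args);
    }
}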

Setting up the nginx environment:
1. Install the compiler and build dependencies
cd /etc/yum.repos.d
rm -rf *
Upload the CentOS6-Base-163.repo file into /etc/yum.repos.d
yum clean all
yum makecache
yum -y install gcc pcre-devel openssl openssl-devel   (this step is required)


2. Unpack nginx-1.11.11.tar.gz
tar -zxvf nginx-1.11.11.tar.gz -C ~/app/

3. cd into the unpacked nginx directory and run the following to check the build environment and set the install prefix:
 ./configure --prefix=/home/hadoop/app/nginx


4. Still inside the unpacked nginx directory, build and install:
make && make install

5. Edit nginx.conf
Add the following right after the "#gzip  on;" line:
    #gzip  on;
    # bigdata.hwz.com is the host name clients use to reach nginx (point it at this machine via DNS or /etc/hosts)
        upstream bigdata.hwz.com {
                # the Spring Boot service address; the port must match the one in application.yml.
                # Add one "server" line per instance if there are several.
                  server hadoop001:16666;
        }

server {
        # the port nginx listens on (default is 80)
        listen       6789;
        ... many more directives (see the sketch below) ...
}
Start nginx with: xxx/nginx/sbin/nginx -c xxx/nginx/conf/nginx.conf
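The rest of the server block is not shown in the post; for the upstream above to actually receive traffic, it presumably contains a location that proxies to it. A minimal sketch (an assumption, not the original config):

server {
        listen       6789;
        server_name  bigdata.hwz.com;

        location / {
                proxy_pass http://bigdata.hwz.com;   # forward requests to the upstream defined above
        }
}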
        

Simulating client requests:

//This class simulates an HTTP client; it posts log data to the Spring Boot service
public class HttpClientDemo {

public static void sendData(String log,String hostname) {
    try{
    	//the full path = the context path from the yml file (/log) + the Controller mapping (/uploadData)
        URL url = new URL(hostname+"/log/uploadData");

        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("POST");
        conn.setRequestProperty("clientTime",System.currentTimeMillis()+"");
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type", "application/json;charset=utf8");

        //conn.connect() is optional: conn.getOutputStream() opens the connection implicitly
        conn.connect();

        OutputStream os = conn.getOutputStream();
        os.write(log.getBytes());
        os.flush();
        os.close();

        //check whether the request succeeded
        if(conn.getResponseCode()==200){
            System.out.println("status: "+200);
        }else{
            System.out.println("status != "+200);
        }
        }
        conn.disconnect();

    }catch (Exception e){
        e.printStackTrace();
    }
 }
}

//This Scala object generates random test data and posts it to the Spring Boot service
object GenereateData {
 def randomDate(): Date = {
   val sdf = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss")   //HH (24-hour), since the hour is generated in 0-23
   val hour = new Random().nextInt(24)
   val minute = new Random().nextInt(60)
   val second = new Random().nextInt(60)

   val sb = new StringBuffer()
   sb.append("2019").append("/").append("09").append("/").append("04")
     .append(" ").append(""+hour).append(":").append(""+minute).append(":").append(""+second)


   sdf.parse(sb.toString)
 }

 def main(args: Array[String]): Unit = {


   val platforms = Array("Android", "iOS","windows phone","Symbian","Harmony")



   for(i <- 1 to 1000){
     val sb = new StringBuffer()
     //pick a random platform
     val platform=platforms(new Random().nextInt(platforms.length))
     //random user
     var user="hwz"+new Random().nextInt(100)
     //random app version
     var version="10.1."+new Random().nextInt(3)
     //random ip
     val ip = new Random().nextInt(100)+"."+new Random().nextInt(100)+"."+new Random().nextInt(100)+"."+new Random().nextInt(100)

     //random access time
     val formatDate = new SimpleDateFormat("dd/MM/yyyy HH:mm:ss ZZZ")
     val time = formatDate.format(randomDate())
     //random traffic
     val traffic = new Random().nextInt(8000)
     //random page-stay duration
     val duration = new Random().nextInt(5000)
     val appId="app001"

     //give some records an invalid (non-numeric) traffic value so the ETL job has dirty data to filter out
     if(i%3==0 && i%5!=0){


       val map = Map("user"->user,"appId"->appId,"ip"->ip,"time"->time,"traffic"->"error","duration"->duration,"platform"->platform,"version"->version)
       val jo = new JSONObject(map)

       HttpClientDemo.sendData(jo.toString(JSONFormat.defaultFormatter),"http://bigdata.hwz.com:6789")
     }else{

       val map = Map("user"->user,"appId"->appId,"ip"->ip,"time"->time,"traffic"->traffic,"duration"->duration,"platform"->platform,"version"->version)
       val jo = new JSONObject(map)

       HttpClientDemo.sendData(jo.toString(JSONFormat.defaultFormatter),"http://bigdata.hwz.com:6789")
     }

   }


 }
}
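Each request body is therefore a JSON object carrying the eight fields set by the generator; an illustrative record (values follow the generator above; the exact field order depends on the JSON library) looks like:

{"user":"hwz42","appId":"app001","ip":"23.45.67.89","time":"04/09/2019 13:22:41 +0800","traffic":4821,"duration":1375,"platform":"Android","version":"10.1.2"}

The AccessLog bean below mirrors these fields on the server side.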
public class AccessLog {
    private String user; //user account, may be null
    private String platform; //operating platform
    private String version; //app version
    private String ip; //from the ip we can derive latitude/longitude and province/city/ISP, and from those build location profiles
    private String traffic; //traffic as a string (may be non-numeric in bad records)
    private String time; //access time
    private String duration;//time spent on the page
    private String appId;//a company may have more than one app

    public void setUser(String user) {
        this.user = user;
    }

    public void setPlatform(String platform) {
        this.platform = platform;
    }

    public void setVersion(String version) {
        this.version = version;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public void setTraffic(String traffic) {
        this.traffic = traffic;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public void setDuration(String duration) {
        this.duration = duration;
    }

    public void setAppId(String appId) {
        this.appId = appId;
    }

    public String getUser() {
        return user;
    }

    public String getPlatform() {
        return platform;
    }

    public String getVersion() {
        return version;
    }

    public String getIp() {
        return ip;
    }

    public String getTraffic() {
        return traffic;
    }

    public String getTime() {
        return time;
    }

    public String getDuration() {
        return duration;
    }

    public String getAppId() {
        return appId;
    }

    @Override
    public String toString() {
        return "AccessLog{" +
                "user='" + user + '\'' +
                ", platform='" + platform + '\'' +
                ", version='" + version + '\'' +
                ", ip='" + ip + '\'' +
                ", traffic='" + traffic + '\'' +
                ", time='" + time + '\'' +
                ", duration='" + duration + '\'' +
                ", appId='" + appId + '\'' +
                '}';
    }
}
//Controller: dispatches each request path to the matching handler method
@Controller
public class DataController {

    private final static Logger logger=Logger.getLogger(DataController.class);
    //request mapping: POST requests to this path are handled by the method below
    @RequestMapping(value = "/uploadData",method = RequestMethod.POST)
    @ResponseBody
    //@RequestBody binds the JSON string in the request body to the bean; it requires a POST request (a GET has no body)
    public AccessLog doPost(@RequestBody AccessLog accessLog){
        //persist the record through log4j
        logger.info(JSON.toJSONString(accessLog));
        System.out.println(JSON.toJSONString(accessLog));
        return accessLog;
    }
}
The log4j.properties configuration:
#writes the received records to access.log in a logs directory one level above where the program runs
#rolls access.log once per day, suffixing the rolled file with .yyyy-MM-dd
log4j.appender.hwz.File=org.apache.log4j.DailyRollingFileAppender
log4j.appender.hwz.File.file=../logs/access.log
log4j.appender.hwz.File.DatePattern='.'yyyy-MM-dd
log4j.appender.hwz.File.layout=org.apache.log4j.PatternLayout
log4j.appender.hwz.File.layout.ConversionPattern=%m%n
log4j.rootLogger=info,hwz.File
Flume agent configuration (tails access.log and ships it to HDFS):
[hadoop@hadoop001 flumeC]$ cat tailDir_memory_hdfs.conf 

a1.sources = r1
a1.channels = c1
a1.sinks = k1

##source: TAILDIR (tracks lines appended to the log file and remembers its read position)
a1.sources.r1.type = TAILDIR
a1.sources.r1.positionFile = /home/hadoop/data/flumeData/flume_position/taildir_position.json
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /home/hadoop/hadoop-project/logs/access.log


##channel: memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 10000
a1.channels.c1.transactionCapacity = 10000



##sink: HDFS
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path =hdfs://hadoop001:9000/hadoop/project/access-logs/%Y%m%d
a1.sinks.k1.hdfs.filePrefix = hwz
a1.sinks.k1.hdfs.fileSuffix= .log
a1.sinks.k1.hdfs.fileType=DataStream
a1.sinks.k1.hdfs.rollInterval=60
a1.sinks.k1.hdfs.rollSize=0
a1.sinks.k1.hdfs.rollCount=0
a1.sinks.k1.hdfs.batchSize = 10000
a1.sinks.k1.hdfs.useLocalTimeStamp = true

##wire the source, channel and sink together
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1 

Putting it all together:
1. Start nginx: /home/hadoop/app/nginx/sbin/nginx -c /home/hadoop/app/nginx/conf/nginx.conf
2. Start the Spring Boot service: nohup java -jar logserver-0.0.1-SNAPSHOT.jar &
Once it is running, logs/access.log is created one directory level above the jar.
3. Start Flume to tail the generated access.log in real time:
flume-ng agent
-n a1
-c ${FLUME_HOME}/conf
-f /home/hadoop/hadoop-project/flumeC/tailDir_memory_hdfs.conf
-Dflume.root.logger=INFO,console
4. Run GenereateData from IDEA.
The data is now on HDFS; next, clean it with MapReduce.
Key points:
1. IP resolution: look each ip up in the IP rules file, loaded via the distributed cache, to obtain the province / city / ISP fields.
2. Filter out bad records: the traffic field must be numeric, so records where it is a string are dropped.
3. Parse the access time to get the year / month / day fields.
4. Because every record is JSON, JSONObject.parseObject can map each line onto a bean; overriding that bean's toString then controls the format of the cleaned output.

//Main class of the MapReduce ETL (data-cleaning) job
public class EtlLog {
    public static class MyMapper extends Mapper<LongWritable,Text,Text,NullWritable>{

        private List<Info> list=new ArrayList<Info>();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            URI[] cacheFiles = context.getCacheFiles();

            String path = cacheFiles[0].getPath();

//            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path)));

            FileSystem fileSystem = FileSystem.get(context.getConfiguration());
            FSDataInputStream fis = fileSystem.open(new Path(path));
            BufferedReader br = new BufferedReader(new InputStreamReader(fis));



            String line;
            while(StringUtils.isNotEmpty(line=br.readLine())){
                String[] split = line.split("[|]");
                Info info = new Info();
                info.setStartIp(Long.parseLong(split[2]));
                info.setEndIp(Long.parseLong(split[3]));
                info.setProvince(split[6]);
                info.setCity(split[7]);
                info.setIsp(split[9]);

                list.add(info);
            }

            IOUtils.closeStream(br);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            //counter: total number of input records
            context.getCounter("etlLog","totalLog").increment(1);

            String json = value.toString();

            AccessLog accessLog = JSONObject.parseObject(json, AccessLog.class);

            //parse year / month / day from the access time
            SimpleDateFormat sdf=new SimpleDateFormat("dd/MM/yyyy HH:mm:ss ZZZ");
            try {
                Date date = sdf.parse(accessLog.getTime());
                Calendar calendar = Calendar.getInstance();
                calendar.setTime(date);


                int  year = calendar.get(Calendar.YEAR);
                int  month = calendar.get(Calendar.MONTH)+1; //Calendar months are 0-based
                int day = calendar.get(Calendar.DATE);

                accessLog.setYear(year+"");
                accessLog.setMonth(month<10?"0"+month:month+"");
                accessLog.setDay(day<10?"0"+day:day+"");



                //records with a non-numeric traffic value ("error") make Long.parseLong throw; the catch block below then drops them
                String traffic = accessLog.getTraffic();
                long size = Long.parseLong(traffic);
                accessLog.setSize(size);

                //look up province / city / ISP for this ip in the cached rules
                Long ip = IPUtil.ip2Long(accessLog.getIp());
                int index = IPUtil.search(ip, list);
                Info info = list.get(index);

                accessLog.setProvince(info.getProvince());
                accessLog.setCity(info.getCity());
                accessLog.setIsp(info.getIsp());

                //counter: number of valid (cleaned) records
                context.getCounter("etlLog","rightLog").increment(1);
                context.write(new Text(accessLog.toString()),NullWritable.get());

            } catch (Exception e) {
                e.printStackTrace();
            }

        }
    }


    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        //optional: enable map-side output compression
//        conf.set("mapreduce.map.output.compress","true");
//        conf.set("mapreduce.map.output.compress.codec","org.apache.hadoop.io.compress.DefaultCodec");
        //optional: enable compression of the final (reduce-side) output
//        conf.set("mapreduce.output.fileoutputformat.compress","true");
//        conf.set("mapreduce.output.fileoutputformat.compress.codec","org.apache.hadoop.io.compress.BZip2Codec");

        Job job = Job.getInstance(conf, "etlLog");

        //optionally point the job at the jar produced by mvn package (when submitting from the IDE)
//        conf.set("mapred.jar","C:\\tools\\IdealUltimateEdition\\hadoop\\target\\hadoop-1.0-SNAPSHOT.jar");
//        job.setJar("C:\\tools\\IdealUltimateEdition\\hadoop\\target\\hadoop-1.0-SNAPSHOT.jar");

        job.setJarByClass(EtlLog.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);




        //with the main class given after "hadoop jar", args[0] is the input path, args[1] the output path, and args[2] the IP rules file
        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        //add the IP rules file (args[2]) to the distributed cache
        job.addCacheFile(new URI(args[2]));

        int status = job.waitForCompletion(true) ? 0 : -1;
//        //read back the counters
//        CounterGroup etlLog = job.getCounters().getGroup("etlLog");
//        Iterator<Counter> ite = etlLog.iterator();
//        while(ite.hasNext()){
//            //each counter value could then be stored in MySQL or another database
//            Counter next = ite.next();
//            System.out.println("----"+next.getName()+"\t"+ next.getValue());
//        }
        System.exit(status);
    }
}
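For reference, submitting the job would look roughly like this. The jar name comes from the commented-out setJar path above; the package name and the output and IP-rules paths are illustrative assumptions, while the input path follows the Flume HDFS sink path defined earlier.

hadoop jar hadoop-1.0-SNAPSHOT.jar com.hwz.project.EtlLog \
  hdfs://hadoop001:9000/hadoop/project/access-logs/20190904 \
  hdfs://hadoop001:9000/hadoop/project/access-wide/day=20190904 \
  hdfs://hadoop001:9000/hadoop/project/ip/ip.txt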
//Utility class for IP parsing and rule lookup
public class IPUtil {
    public static Long ip2Long(String ipaddr){
        String[] ips=ipaddr.split("\\.");
        long l = Long.parseLong(ips[0]) * 256 * 256 * 256 +
                Long.parseLong(ips[1]) * 256 * 256 +
                Long.parseLong(ips[2]) * 256 +
                Long.parseLong(ips[3]);
        return l;
    }




    //binary search over the rules; assumes the list is sorted ascending by startIp
    public static int search(Long ip, List<Info> infos){
        int index=-1;

        int min=0;
        int max=infos.size()-1;

        while(min<=max){
            int mid=(min+max)/2;
            Info info = infos.get(mid);

            if(ip>=info.getStartIp()&&ip<=info.getEndIp()){
                index=mid;
                break;
            }else if(ip>info.getEndIp()){
                min=mid+1;
            }else if(ip<info.getStartIp()){
                max=mid-1;
            }
        }

        return index;
    }
}
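A quick, illustrative sanity check of the two utilities (not part of the original project; the rule values are made up, and search relies on the rule list being sorted by startIp):

import java.util.ArrayList;
import java.util.List;

public class IPUtilDemo {
    public static void main(String[] args) {
        // 1.2.3.4 -> 1*256^3 + 2*256^2 + 3*256 + 4 = 16909060
        System.out.println(IPUtil.ip2Long("1.2.3.4"));                        // 16909060

        List<Info> rules = new ArrayList<Info>();
        Info info = new Info();
        info.setStartIp(IPUtil.ip2Long("1.0.0.0"));                           // 16777216
        info.setEndIp(IPUtil.ip2Long("1.255.255.255"));                       // 33554431
        info.setProvince("demo-province");
        info.setCity("demo-city");
        info.setIsp("demo-isp");
        rules.add(info);

        System.out.println(IPUtil.search(IPUtil.ip2Long("1.2.3.4"), rules));  // 0  (index of the matching rule)
        System.out.println(IPUtil.search(IPUtil.ip2Long("9.9.9.9"), rules));  // -1 (no rule covers this ip)
    }
}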

//JavaBean holding one row of the IP rules (ip range plus province/city/ISP)
public class Info {

    private Long startIp;
    private Long endIp;

    private String province;
    private String city;
    private String isp;

    public void setStartIp(Long startIp) {
        this.startIp = startIp;
    }

    public void setEndIp(Long endIp) {
        this.endIp = endIp;
    }

    public Long getStartIp() {
        return startIp;
    }

    public Long getEndIp() {
        return endIp;
    }

    public String getProvince() {
        return province;
    }

    public void setProvince(String province) {
        this.province = province;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public void setIsp(String isp) {
        this.isp = isp;
    }

    public String getCity() {
        return city;
    }

    public String getIsp() {
        return isp;
    }
}
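One detail the post does not show: the EtlLog mapper calls setYear/setMonth/setDay/setSize/setProvince/setCity/setIsp on AccessLog, which the bean listed earlier does not have. The ETL module presumably uses a richer version of that bean whose toString writes tab-separated columns. A minimal sketch (the class name, and the column order chosen to match the Hive DDL below, are assumptions, not the original code):

//Hypothetical ETL-side bean; the original project most likely just adds these members to its own AccessLog class.
public class WideAccessLog extends AccessLog {
    private String year;
    private String month;
    private String day;
    private long size;       //numeric traffic, set only after Long.parseLong succeeds
    private String province;
    private String city;
    private String isp;

    public void setYear(String year) { this.year = year; }
    public void setMonth(String month) { this.month = month; }
    public void setDay(String day) { this.day = day; }
    public void setSize(long size) { this.size = size; }
    public void setProvince(String province) { this.province = province; }
    public void setCity(String city) { this.city = city; }
    public void setIsp(String isp) { this.isp = isp; }

    @Override
    public String toString() {
        //tab-separated, in the same column order as the Hive table created below
        return getUser() + "\t" + getPlatform() + "\t" + getVersion() + "\t" + getIp() + "\t"
                + size + "\t" + getTime() + "\t" + getDuration() + "\t" + getAppId() + "\t"
                + province + "\t" + city + "\t" + isp + "\t" + year + "\t" + month + "\t" + day;
    }
}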

The cleaned data has now landed on HDFS; next, build the Hive wide table:

Create a Hive external table. It only needs to be created once: the table is partitioned, so each day's data lands in its own partition, and its location points at the ETL output.
create database hadoop;
use hadoop;
create external table hadoop.access_wide(
user string,
platform string,
version string,
ip string,
size bigint,
time string,
duration string,
appId string,
province string,
city string,
isp string,
y string,
month string,
d string
)
partitioned by (day string)
row format delimited fields terminated by '\t'
location 'hdfs://hadoop001:9000/hadoop/project/access-wide/';

Then just add (refresh) the corresponding partition:
ALTER TABLE hadoop.access_wide ADD IF NOT EXISTS PARTITION (day='$time');
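With a concrete date in place of $time, and assuming the ETL output for that day was written to a matching day=<date> directory under the table location (the post does not show the output layout, so this is an assumption), the statement looks like:

ALTER TABLE hadoop.access_wide ADD IF NOT EXISTS PARTITION (day='20190904');

If the day's directory lives elsewhere, add an explicit LOCATION clause pointing at it.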