学习hadoop时老师布置的一个作业

要求:
(1)找出每个IP的访问的次数
(2)找出每一种浏览器访问的次数
(3)把浏览器的访问次数进行图形可视化
要处理的文本(共60多万行,这里就显示前45行):

117.135.212.67 - - [08/Mar/2018:22:35:42 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" -
221.13.7.73 - - [08/Mar/2018:22:35:48 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
117.135.212.67 - - [08/Mar/2018:22:35:52 +0800] "GET /favicon.ico HTTP/1.1" 404 564 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" -
221.13.7.73 - - [08/Mar/2018:22:35:54 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:35:58 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:35:59 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:00 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:01 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:10 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:15 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
117.135.212.67 - - [08/Mar/2018:22:36:19 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" -
221.13.7.73 - - [08/Mar/2018:22:36:21 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
216.126.58.188 - - [08/Mar/2018:22:36:22 +0800] "GET /09/top.php HTTP/1.1" 302 0 "http://google.com" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 GTB7.1 (.NET CLR 3.5.30729)" -
221.13.7.73 - - [08/Mar/2018:22:36:24 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:28 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:32 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:33 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:33 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:34 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:35 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:35 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:36 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:36 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:37 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:39 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:40 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:40 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:43 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:44 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:44 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:45 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:36:45 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.79 - - [08/Mar/2018:22:37:18 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0" -
221.13.7.73 - - [08/Mar/2018:22:37:21 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:37:22 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.73 - - [08/Mar/2018:22:37:22 +0800] "GET / HTTP/1.1" 200 4364 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" -
221.13.7.77 - - [08/Mar/2018:22:37:36 +0800] "GET / HTTP/1.1" 200 4365 "http://www.cnlaoxiang.com/main.html" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" -
117.135.212.67 - - [08/Mar/2018:22:37:37 +0800] "GET / HTTP/1.1" 200 4365 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" -
221.13.7.79 - - [08/Mar/2018:22:37:51 +0800] "GET / HTTP/1.1" 200 4365 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0" -
221.13.7.77 - - [08/Mar/2018:22:37:57 +0800] "GET / HTTP/1.1" 200 4365 "http://www.cnlaoxiang.com/main.html" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" -
221.13.7.79 - - [08/Mar/2018:22:37:59 +0800] "GET / HTTP/1.1" 200 4365 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0" -
221.13.7.77 - - [08/Mar/2018:22:38:01 +0800] "GET / HTTP/1.1" 200 4365 "-" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" -
221.13.7.79 - - [08/Mar/2018:22:38:02 +0800] "GET / HTTP/1.1" 200 4365 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0" -
221.13.7.77 - - [08/Mar/2018:22:38:04 +0800] "GET / HTTP/1.1" 200 4365 "-" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" -
221.13.7.79 - - [08/Mar/2018:22:38:05 +0800] "GET / HTTP/1.1" 200 4365 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0" -
一、找出每个ip的访问次数

类似于wordcount实例。观察发现,ip都在每行的第一个空格之前,可以直接用java的split函数进行切割,我们只要截取第一个词就可以了。

//Take the current line of text and convert it to a String  
String line = value.toString();  
//Split the line into tokens on single space characters  
String[] words=line.split(" ");  
//The IP address is the first token, i.e. the text before the first space
String ipString = words[0];
//Emit <ip,1>  
context.write(new Text(ipString), new IntWritable(1));  
二、找出每一种浏览器访问的次数

老师给我们的定义是:一行中最后出现的那个浏览器名称就是这个浏览器的名称,前面出现的是浏览器内核。先不管这个是什么逻辑吧,不过只要取最后一个,给我们的难度就降低了很多。
思路:
(1)最后一次出现的
(2)可以列个匹配字符数组,浏览器总共就那么几个

//Browser names to look for inside each token of the line
String[] browser_list = {"Safari","Mozilla","Firefox","QQBrowser","Opera","IEXPLORE","Chrome","Sogou","360SE","Netscape"}; //Netscape = Netscape Navigator

//Scan the tokens from the end of the line towards the start; the token
//closest to the end that contains a known name decides the browser.
boolean matched = false; //becomes true once a browser has been emitted
for (int idx = words.length - 1; idx >= 0 && !matched; idx--) {
    for (String browser : browser_list) { //try every known browser name, in list order
        if (words[idx].contains(browser)) { //substring match against the current token
            //Emit <browser,1>
            context.write(new Text(browser), new IntWritable(1));
            matched = true; //stops the outer loop as well
            break;
        }
    }
}
三、把浏览器的访问次数进行图形可视化

使用命令将计算结果文件下载下来,我将它存放在/home/hadoop/result/中:

[root@master myjar]# hadoop fs -get /user/hadoop/output4/part-r-00000 /home/hadoop/result/

计算的结果共有2674行,这里截取最后50行显示:

66.102.7.29 3
66.102.7.30 4
66.240.205.34   4
66.249.83.222   3
66.249.88.67    11
66.249.88.70    3
66.249.88.72    7
67.21.36.2  1
68.71.57.172    16
69.28.88.23 1
69.30.226.234   3
69.58.178.56    5
69.58.178.58    5
69.58.178.59    5
70.42.131.170   22
71.6.202.198    1
71.6.202.204    8
72.13.36.80 14
74.82.47.2  2
74.82.47.3  3
74.82.47.4  1
74.82.47.5  1
75.149.221.170  2
77.72.85.108    7
80.82.77.139    10
80.82.77.33 9
83.41.145.129   1
84.54.232.182   6
89.111.177.210  2
89.234.68.89    4
91.200.12.151   1
91.240.208.14   5
93.179.69.161   1
94.102.49.193   2
95.181.179.32   1
95.211.226.185  4
95.213.130.90   6
95.24.206.186   1
95.27.143.72    1
96.127.158.234  1
96.127.158.237  1
Chrome  125
Firefox 68774
IEXPLORE    5
Mozilla 54481
Netscape    1
Opera   1
QQBrowser   39626
Safari  473669
Sogou   214

然后我想用python来进行数据可视化,做一个饼状图。
直接上代码,都有注释说明:

import matplotlib.pyplot as plt

# (label, count) for every wedge of the pie; the label is drawn outside its wedge
browser_counts = [
    ('Chrome', 125),
    ('Firefox', 68774),
    ('IEXPLORE', 5),
    ('Mozilla', 54329),
    ('Netscape', 1),
    ('QQBrowser', 39626),
    ('OppoBrowser', 2299),
    ('Opera', 1),
    ('Safari', 469773),
    ('Sogou', 214),
    ('VivoBrowser', 1552),
    ('UCBrowser', 213),
]
labels = [name for name, _ in browser_counts]
sizes = [count for _, count in browser_counts]

# Wedge colors; there are fewer colors than wedges, so they are reused cyclically
colors = ['blue', 'red', 'coral', 'green', 'yellow', 'orange', 'yellowgreen', 'gold']

# Offset of each wedge from the center; all zeros keeps the pie closed.
# (A staggered alternative would be e.g. 0, 0.1, 0.2 repeated.)
explode = [0] * len(browser_counts)

plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.2f%%',  # percentage text with two decimals on every wedge
    shadow=True,        # draw a shadow beneath the pie
    startangle=90,      # start the first wedge at 90 degrees instead of 0
)
plt.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.show()

运行结果如下:
(图:各浏览器访问次数饼状图)

附录——完整代码

CountJob 类:

package com.gznc_pcc.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;  
import org.apache.hadoop.io.*;  
import org.apache.hadoop.mapreduce.*;  
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
/**
 * Driver that configures and submits the counting MapReduce job.
 *
 * <p>Usage: {@code CountJob <input path> <output path>}. The process exit code
 * is 0 when the job succeeds, 1 when it fails and 2 when the arguments are missing.
 */
public class CountJob {
    /**
     * Builds the job from {@link CountMapper} and {@link CountReducer} and waits for it to finish.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
     * @throws InterruptedException   if the wait for completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: CountJob <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job wordCountJob = Job.getInstance(conf);

        // Important: tells Hadoop which jar contains this job's classes.
        wordCountJob.setJarByClass(CountJob.class);

        // Mapper and reducer implementations used by the job.
        wordCountJob.setMapperClass(CountMapper.class);
        wordCountJob.setReducerClass(CountReducer.class);
        // The reducer only sums its input values and its input and output types
        // are the same, so it is also used as a combiner to pre-aggregate map output.
        wordCountJob.setCombinerClass(CountReducer.class);

        // Key/value types emitted by the map phase.
        wordCountJob.setMapOutputKeyClass(Text.class);
        wordCountJob.setMapOutputValueClass(IntWritable.class);

        // Key/value types of the final output.
        wordCountJob.setOutputKeyClass(Text.class);
        wordCountJob.setOutputValueClass(IntWritable.class);

        // Where the input text is read from and where the result is written.
        FileInputFormat.setInputPaths(wordCountJob, args[0]);
        FileOutputFormat.setOutputPath(wordCountJob, new Path(args[1]));

        // Submit the job, wait for it, and report success or failure to the caller
        // through the exit code (previously the result was discarded).
        boolean succeeded = wordCountJob.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}

CountMapper 类:

package com.gznc_pcc.hadoop;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper that emits {@code <ip, 1>} and, when a known browser name is found,
 * {@code <browser, 1>} for every line of the access log.
 *
 * <p>Type parameters of {@code Mapper}:
 * KEYIN is the key type of the input pair, VALUEIN the value type of the input pair,
 * KEYOUT the key type of the output pair, VALUEOUT the value type of the output pair.
 */
public class CountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{

    /** Browser names searched for in the tokens of a line (Netscape = Netscape Navigator). */
    private static final String[] BROWSER_LIST = {"Safari","Mozilla","Firefox","QQBrowser","Opera","IEXPLORE","Chrome","Sogou","360SE","Netscape"};

    /** The count of 1 attached to every emitted key; shared instead of allocated per record. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Output key reused across calls, following the standard Hadoop WordCount example. */
    private final Text outKey = new Text();

    /**
     * Called by the map task once per input line.
     *
     * @param key     offset of the line within the input (unused)
     * @param value   text of the line
     * @param context sink for the emitted {@code <key, 1>} pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value,Context context) throws IOException, InterruptedException {
        // Split the line into tokens on single space characters.
        String[] words = value.toString().split(" ");

        // A line consisting only of spaces splits into an empty array, and an
        // empty line or a line starting with a space has an empty first token.
        // Such lines carry no IP address, so they are skipped instead of
        // crashing on words[0] or emitting an empty key.
        if (words.length == 0 || words[0].isEmpty()) {
            return;
        }

        // The IP address is the first token of the line: emit <ip, 1>.
        outKey.set(words[0]);
        context.write(outKey, ONE);

        // Emit <browser, 1> for the browser name closest to the end of the line, if any.
        String browser = findBrowser(words);
        if (browser != null) {
            outKey.set(browser);
            context.write(outKey, ONE);
        }
    }

    /**
     * Finds the browser for a tokenized log line: tokens are scanned from the last
     * to the first, and the first token containing a known name decides the result.
     *
     * @param words tokens of the line
     * @return the matching entry of {@link #BROWSER_LIST}, or {@code null} if no token matches
     */
    private static String findBrowser(String[] words) {
        for (int i = words.length - 1; i >= 0; i--) {
            for (String candidate : BROWSER_LIST) {
                if (words[i].contains(candidate)) {
                    return candidate;
                }
            }
        }
        return null;
    }
}

CountReducer 类:

package com.gznc_pcc.hadoop;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/*
 * Type parameters of Reducer:
 * KEYIN:    key type emitted by the mapper
 * VALUEIN:  value type emitted by the mapper
 * KEYOUT:   key type of the pairs written by reduce
 * VALUEOUT: value type of the pairs written by reduce
 */
public class CountReducer  extends Reducer<Text, IntWritable, Text, IntWritable>{
    /*
     * Invoked by the reduce task once for every group of pairs sharing the same key.
     *
     * Example: for <hello,1><hello,1><hello,1><tom,1><tom,1><tom,1> the method is
     * called once for the "hello" group and once for the "tom" group.
     *
     * Arguments:
     *      key:    the key shared by the group
     *      values: iterable over all values of the group
     *
     * Writes <key, sum of the values> to the context.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
        // Running total of the values in this group.
        int total = 0;
        final java.util.Iterator<IntWritable> cursor = values.iterator();
        while (cursor.hasNext()) {
            total += cursor.next().get();
        }
        // Emit the aggregated count for this key.
        final IntWritable result = new IntWritable(total);
        context.write(key, result);
    }
}
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值