大数据教程(10.5)运营商流量日志解析增强

    上一篇文章分析了如何使用hadoop来实现sql中group by 并且取每组中最大值的需求--订单中成交金额最大的订单项分析,本篇博客博主将继续分享一个mapreduce实战例子--运营商流量日志解析增强;

    一、需求

           电信运营商服务器中记录了用户流量访问的日志,样例数据如下所示(各字段以制表符分隔):

1374609560.11	1374609560.16	1374609560.16	1374609560.16	110	5	8615038208365	460023383869133	8696420056841778	2	460	0	14615			54941	10.188.77.252	61.145.116.27	35020	80	6	cmnet	1	221.177.218.34	221.177.217.161	221.177.218.34	221.177.217.167	ad.veegao.com	http://ad.veegao.com/veegao/iris.action		Apache-HttpClient/UNAVAILABLE (java 1.4)	POST	200	593	310	4	3	0	0	4	3	0	0	0	0	http://ad.veegao.com/veegao/iris.action	5903903079251243019	5903903103500771339	5980728
1374609558.91	1374609558.97	1374609558.97	1374609559.31	112	461	8615038208365	460023383869133	8696420056841778	2	460	0	14615			54941	10.188.77.252	101.226.76.175	37293	80	6	cmnet	1	221.177.218.34	221.177.217.161	221.177.218.34	221.177.217.167	short.weixin.qq.com	http://short.weixin.qq.com/cgi-bin/micromsg-bin/getcdndns		Android QQMail HTTP Client	POST	200	543	563	2	3	0	0	2	3	0	0	0	0	http://short.weixin.qq.com/cgi-bin/micromsg-bin/getcdndns	5903903079251243019	5903903097240039435	5980728
1374609514.70	1374609514.75	1374609514.75	1374609515.58	110	5	8613674976196	460004901700207	8623350100353878	2	460	0	14694			58793	10.184.80.32	111.13.13.222	36181	80	6	cmnet	1	221.177.156.4	221.177.217.145	221.177.156.4	221.177.217.156	retype.wenku.bdimg.com	http://retype.wenku.bdimg.com/img/97308d2b7375a417866f8f09		AMB_400	GET	200	345	4183	5	5	0	0	5	5	0	0	0	0	http://retype.wenku.bdimg.com/img/97308d2b7375a417866f8f09	5903900710696611851	5903902908140003339	5937307

            我们需要对日志中的url进行判断:如果该url在数据库(数据字典)中已有对应的标签内容,则对日志进行增强,即在该行日志末尾追加其标签内容后输出;否则,表示该url在数据字典中不存在,需要将其输出到待爬清单,交由爬虫去爬取;

    二、代码实现

           DBLoader(db数据加载类)

package com.empire.hadoop.mr.logenhance;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Map;

/**
 * Loads the URL-to-content-tag dictionary from the {@code url_rule} table of a MySQL database.
 *
 * <p>NOTE(review): the JDBC URL, user name and password are hard-coded below; consider moving
 * them into the job configuration.
 */
public class DBLoader {

    /**
     * Reads every row of {@code url_rule} and puts it into {@code ruleMap}, using the first
     * selected column ({@code url}) as key and the second ({@code content}) as value.
     *
     * @param ruleMap destination map; must not be {@code null}
     * @throws IllegalArgumentException if {@code ruleMap} is {@code null}
     * @throws Exception if the driver class cannot be loaded or any JDBC call made while
     *         connecting or reading fails; failures while closing resources are only printed
     */
    public static void dbLoader(Map<String, String> ruleMap) throws Exception {
        // Fail fast before touching the database when there is nowhere to store the rows.
        if (ruleMap == null) {
            throw new IllegalArgumentException("ruleMap must not be null");
        }

        Connection conn = null;
        Statement st = null;
        ResultSet res = null;

        try {
            Class.forName("com.mysql.jdbc.Driver");
            conn = DriverManager.getConnection("jdbc:mysql://192.168.29.131:3306/urldb?characterEncoding=utf-8", "root",
                    "123456");
            st = conn.createStatement();
            res = st.executeQuery("select url,content from url_rule");
            while (res.next()) {
                ruleMap.put(res.getString(1), res.getString(2));
            }
        } finally {
            // Close in reverse order of acquisition. Each resource is closed on its own so that
            // a failure closing one of them cannot leave the others open.
            closeQuietly(res);
            closeQuietly(st);
            closeQuietly(conn);
        }
    }

    /**
     * Closes {@code resource} if it is non-null, printing (not propagating) any failure.
     */
    private static void closeQuietly(AutoCloseable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

           LogEnhanceOutputFormat(自定义OutputFormat--用于将增强日志和需要爬虫爬取的url分文件输出)

package com.empire.hadoop.mr.logenhance;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Output format that splits the job output into two files: the enhanced log and the list of
 * URLs that still have to be crawled.
 *
 * <p>When a map task or reduce task produces its final output, it first calls
 * {@link #getRecordWriter(TaskAttemptContext)} to obtain a {@link RecordWriter} and then calls
 * {@code write(k, v)} on it for every record.
 *
 * <p>NOTE(review): both target paths are fixed constants, so every task attempt opens the same
 * two files. With more than one map task the writers would presumably overwrite or conflict
 * with each other; confirm before running on input that produces several splits.
 */
public class LogEnhanceOutputFormat extends FileOutputFormat<Text, NullWritable> {

    /**
     * Opens the enhanced-log file {@code /en/log.dat} and the to-crawl file {@code /crw/url.dat}
     * and returns a writer that routes each record to one of them.
     *
     * @param context task attempt context; only its configuration is used
     * @return a writer owning both output streams
     * @throws IOException if either file cannot be created
     */
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {

        FileSystem fs = FileSystem.get(context.getConfiguration());

        Path enhancePath = new Path("/en/log.dat");
        Path tocrawlPath = new Path("/crw/url.dat");

        FSDataOutputStream enhancedOs = fs.create(enhancePath);
        FSDataOutputStream tocrawlOs = null;
        try {
            tocrawlOs = fs.create(tocrawlPath);
            return new EnhanceRecordWriter(enhancedOs, tocrawlOs);
        } finally {
            // Creating the second file failed: do not leak the stream that is already open.
            if (tocrawlOs == null) {
                enhancedOs.close();
            }
        }
    }

    /**
     * Record writer that sends to-crawl records to one stream and every other record to the
     * enhanced-log stream.
     */
    static class EnhanceRecordWriter extends RecordWriter<Text, NullWritable> {

        /** Trailing marker of a to-crawl record: {@code <url> TAB tocrawl NEWLINE}. */
        private static final String TOCRAWL_SUFFIX = "\ttocrawl\n";

        FSDataOutputStream enhancedOs = null;
        FSDataOutputStream tocrawlOs  = null;

        public EnhanceRecordWriter(FSDataOutputStream enhancedOs, FSDataOutputStream tocrawlOs) {
            super();
            this.enhancedOs = enhancedOs;
            this.tocrawlOs = tocrawlOs;
        }

        /**
         * Writes the key to the to-crawl file when it ends with the to-crawl marker, otherwise
         * to the enhanced-log file. The value is ignored.
         */
        @Override
        public void write(Text key, NullWritable value) throws IOException, InterruptedException {
            String result = key.toString();
            // Match the exact trailing marker rather than any occurrence of "tocrawl", so an
            // enhanced log line whose URL merely contains that word is not misrouted.
            FSDataOutputStream target = result.endsWith(TOCRAWL_SUFFIX) ? tocrawlOs : enhancedOs;
            // Text keeps its content UTF-8 encoded; writing those bytes directly avoids the
            // platform-default charset that String.getBytes() would use.
            target.write(key.getBytes(), 0, key.getLength());
        }

        /**
         * Closes both streams; the enhanced-log stream is closed even if closing the to-crawl
         * stream fails.
         */
        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            try {
                if (tocrawlOs != null) {
                    tocrawlOs.close();
                }
            } finally {
                if (enhancedOs != null) {
                    enhancedOs.close();
                }
            }
        }

    }

}

           LogEnhance(日志分析增强主程序类)

package com.empire.hadoop.mr.logenhance;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Map-only job that enhances carrier traffic logs: lines whose URL has a content tag in the
 * rule dictionary are emitted with the tag appended, and URLs without a tag are emitted as
 * to-crawl records.
 */
public class LogEnhance {

    static class LogEnhanceMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        /**
         * Index of the URL in the array returned by {@code StringUtils.split(line, "\t")}.
         * commons-lang's split treats adjacent separators as one, so empty columns are not
         * counted by this index.
         */
        private static final int URL_FIELD_INDEX = 26;

        /** URL pattern, compiled once instead of once per input record. */
        private static final Pattern HTTP_URL_PATTERN = Pattern.compile(
                "(((https|http)?://)?([a-z0-9]+[.])|(www.))"
                        + "\\w+[.|\\/]([a-z0-9]{0,})?[[.]([a-z0-9]{0,})]+((/[\\S&&[^,;\u4E00-\u9FA5]]+)+)?([.][a-z0-9]{0,}+|/?)");

        Map<String, String> ruleMap = new HashMap<String, String>();

        Text                k       = new Text();
        NullWritable        v       = NullWritable.get();

        /**
         * Loads the rule dictionary from the database into {@code ruleMap}.
         *
         * @throws IOException if the dictionary cannot be loaded; without it every URL would
         *         be reported as to-crawl, so the task fails instead of continuing
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            try {
                DBLoader.dbLoader(ruleMap);
            } catch (Exception e) {
                throw new IOException("Failed to load the url_rule dictionary from the database", e);
            }
        }

        /**
         * Processes one log line. Lines with too few fields increment the
         * {@code malformed/malformedline} counter; lines whose URL field does not look like a
         * URL are dropped; all others produce exactly one output record.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = StringUtils.split(line, "\t");

            // Count lines that do not have a URL field instead of relying on a broad catch,
            // which would also hide I/O failures raised by context.write.
            if (fields == null || fields.length <= URL_FIELD_INDEX) {
                context.getCounter("malformed", "malformedline").increment(1);
                return;
            }

            String url = fields[URL_FIELD_INDEX];
            if (!isHttpUrl(url)) {
                return;
            }

            String contentTag = ruleMap.get(url);
            if (contentTag == null) {
                // No tag known for this URL: emit only the URL, marked for crawling.
                k.set(url + "\t" + "tocrawl" + "\n");
            } else {
                // Tag known: emit the original line with the tag appended.
                k.set(line + "\t" + contentTag + "\n");
            }
            context.write(k, v);
        }

        /**
         * Checks whether a string looks like a URL.
         *
         * @param urls candidate string; surrounding whitespace is ignored
         * @return {@code true} if the trimmed string matches the URL pattern, {@code false}
         *         otherwise (including for {@code null})
         */
        public boolean isHttpUrl(String urls) {
            if (urls == null) {
                return false;
            }
            Matcher mat = HTTP_URL_PATTERN.matcher(urls.trim());
            return mat.matches();
        }

    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String[] args) throws Exception {

        if (args.length < 2) {
            System.err.println("Usage: LogEnhance <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);

        job.setJarByClass(LogEnhance.class);

        job.setMapperClass(LogEnhanceMapper.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // A custom output format is used to send different records to different target paths.
        job.setOutputFormatClass(LogEnhanceOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // The custom output format still extends FileOutputFormat, which writes a _SUCCESS
        // marker file, so an output path has to be set here as well.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Map-only job: no reducer is needed.
        job.setNumReduceTasks(0);

        // Report the real outcome to the caller instead of always exiting with 0.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);

    }

}

    三、数据库字典数据准备

           创建表:      

-- Dictionary table read by DBLoader via `select url,content from url_rule`.
DROP TABLE IF EXISTS `url_rule`;
CREATE TABLE `url_rule` (
  -- URL; DBLoader uses it as the map key
  `url` varchar(2000) DEFAULT NULL,
  -- content tag for the URL; DBLoader uses it as the map value
  `content` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

           导入数据:

INSERT INTO `url_rule` VALUES ('http://timg01.baidu-1img.cn/timg?imagewise_list&size=b140_140&bd_page_type=1&ssid=0&from=2001m&uid=1ADA21F27A014F2180A7E22E8BEE35B9&pu=usm%402%2Csz%401321_1003%2Cta%40utouch_2_2.3_1_9.0&quality=80&sec=1374609614&di=7cec4b45b8d4db319556ad87166932d5&src=http://i1.baidu.com/it/u=975390796,1697384219&fm=21&gp=0.jpg', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://timg01.baidu-1img.cn/timg?imagewise_list&size=b140_140&bd_page_type=1&ssid=0&from=2001m&uid=1ADA21F27A014F2180A7E22E8BEE35B9&pu=usm%402%2Csz%401321_1003%2Cta%40utouch_2_2.3_1_9.0&quality=80&sec=1374609712&di=69f7930c9cc2938e9fb9a09f815b78b8&src=http://i1.baidu.com/it/u=2694289975,690736961&fm=21&gp=0.jpg', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://timg01.baidu-1img.cn/timg?imagewise_list&size=b140_140&bd_page_type=1&ssid=0&from=2001m&uid=1ADA21F27A014F2180A7E22E8BEE35B9&pu=usm%402%2Csz%401321_1003%2Cta%40utouch_2_2.3_1_9.0&quality=80&sec=1374609712&di=9998b896a653eae8a998dbeed5bce6e7&src=http://i1.baidu.com/it/u=433837882,4029071921&fm=21&gp=0.jpg', 'somecontent');
......

    四、运行程序

#上传jar

Alt+p
lcd d:/
 put loge.jar 2013072404-http-combinedBy-1373892200521-log-1.log

#准备hadoop处理的数据文件

cd /home/hadoop/apps/hadoop-2.9.1
hadoop fs  -mkdir -p /loge/input
hdfs dfs -put  2013072404-http-combinedBy-1373892200521-log-1.log /loge/input

#运行日志增强程序

hadoop jar loge.jar  com.empire.hadoop.mr.logenhance.LogEnhance /loge/input /loge/output

    五、运行效果

[hadoop@centos-aaron-h1 ~]$  hadoop jar loge.jar  com.empire.hadoop.mr.logenhance.LogEnhance /loge/input /loge/output
18/12/23 23:48:43 INFO client.RMProxy: Connecting to ResourceManager at centos-aaron-h1/192.168.29.144:8032
18/12/23 23:48:44 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
18/12/23 23:48:44 INFO input.FileInputFormat: Total input files to process : 1
18/12/23 23:48:44 INFO mapreduce.JobSubmitter: number of splits:1
18/12/23 23:48:45 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
18/12/23 23:48:45 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1545579497045_0001
18/12/23 23:48:46 INFO impl.YarnClientImpl: Submitted application application_1545579497045_0001
18/12/23 23:48:46 INFO mapreduce.Job: The url to track the job: http://centos-aaron-h1:8088/proxy/application_1545579497045_0001/
18/12/23 23:48:46 INFO mapreduce.Job: Running job: job_1545579497045_0001
18/12/23 23:48:59 INFO mapreduce.Job: Job job_1545579497045_0001 running in uber mode : false
18/12/23 23:48:59 INFO mapreduce.Job:  map 0% reduce 0%
18/12/23 23:49:14 INFO mapreduce.Job:  map 100% reduce 0%
18/12/23 23:49:14 INFO mapreduce.Job: Job job_1545579497045_0001 completed successfully
18/12/23 23:49:15 INFO mapreduce.Job: Counters: 31
        File System Counters
                FILE: Number of bytes read=0
                FILE: Number of bytes written=196971
                FILE: Number of read operations=0
                FILE: Number of large read operations=0
                FILE: Number of write operations=0
                HDFS: Number of bytes read=61826403
                HDFS: Number of bytes written=9735615
                HDFS: Number of read operations=3
                HDFS: Number of large read operations=0
                HDFS: Number of write operations=2
        Job Counters 
                Launched map tasks=1
                Data-local map tasks=1
                Total time spent by all maps in occupied slots (ms)=11274
                Total time spent by all reduces in occupied slots (ms)=0
                Total time spent by all map tasks (ms)=11274
                Total vcore-milliseconds taken by all map tasks=11274
                Total megabyte-milliseconds taken by all map tasks=11544576
        Map-Reduce Framework
                Map input records=100064
                Map output records=85730
                Input split bytes=154
                Spilled Records=0
                Failed Shuffles=0
                Merged Map outputs=0
                GC time elapsed (ms)=356
                CPU time spent (ms)=3680
                Physical memory (bytes) snapshot=114851840
                Virtual memory (bytes) snapshot=846995456
                Total committed heap usage (bytes)=16556032
        malformed
                malformedline=1
        File Input Format Counters 
                Bytes Read=61826249
        File Output Format Counters 
                Bytes Written=9735615
[hadoop@centos-aaron-h1 ~]$ 

    六、运行结果

[hadoop@centos-aaron-h1 ~]$hdfs dfs -cat /en/log.dat
1374609375.94   1374609375.95   1374609375.99   1374609378.08   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11R7ZV52NVvz8/Id2ZLgBh*B.Lp9DoQNZSRtvekhZJegpqEqmpUZxKNdQ!/m/dCvwjYhxGAAA&bo=WAIgAwAAAAABAF4!    m.qpic.cn       android-qzone      GET     200     705     11884   10      9       0       0       10      9       0       0       0       0       http://m.qpic.cn/psb?/V11R7ZV52NVvz8/Id2ZLgBh*B.Lp9DoQNZSRtvekhZJegpqEqmpUZxKNdQ!/m/dCvwjYhxGAAA&bo=WAIgAwAAAAABAF4!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609386.06   1374609386.07   1374609406.25   1374609406.88   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/718552d9-4af9-4d55-a311-821644911cf9/EwL66FxlQW2lla.SsNEdThCNFPRNB3dvPLlY6KjwoOw!/m/dLw974fOIgAA&bo=rAK7AawCuwEKACw!m.qpic.cn        android-qzone   GET     200     2008    20432   16      18      0       0       16      18      3       8       0       0       http://m.qpic.cn/psb?/718552d9-4af9-4d55-a311-821644911cf9/EwL66FxlQW2lla.SsNEdThCNFPRNB3dvPLlY6KjwoOw!/m/dLw974fOIgAA&bo=rAK7AawCuwEKACw! 5903901810496765953     5903902282558234625     1495263367      somecontent
1374609407.48   1374609407.49   1374609410.03   1374609411.86   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V116gGbo2r9QdU/gJda1CbWinnZjDb0ULQmuotKjtTGbEANVjOEWTVA4lk!/m/dCijJ6JkHwAA&bo=kAFYApABWAIBACc!    m.qpic.cn       android-qzone      GET     200     1061    19785   18      16      0       0       18      16      0       0       0       0       http://m.qpic.cn/psb?/V116gGbo2r9QdU/gJda1CbWinnZjDb0ULQmuotKjtTGbEANVjOEWTVA4lk!/m/dCijJ6JkHwAA&bo=kAFYApABWAIBACc!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609411.98   1374609411.99   1374609412.01   1374609413.90   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/3qGX0RZSUjqreuCQAXieiNMmUGuIgxw2H*qv3IMxxSo!/m/YWGsShr0nAAAYql.Phq0KAAA      m.qpic.cn  android-qzone   GET     200     559     7738    6       6       0       0       6       6       0       0       0       0       http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/3qGX0RZSUjqreuCQAXieiNMmUGuIgxw2H*qv3IMxxSo!/m/YWGsShr0nAAAYql.Phq0KAAA 5903901810496765953     5903902282558234625     1495263367      somecontent
1374609414.02   1374609414.03   1374609414.05   1374609416.90   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/2Kd9C9EeOGYlnXW9rsTQ6gu4l4q**vsZZGkV8hahmas!/m/YXx*QRq*fQAAYpxtyyDIAQAA&     m.qpic.cn  android-qzone   GET     200     600     8304    7       6       0       0       7       6       0       0       0       0       http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/2Kd9C9EeOGYlnXW9rsTQ6gu4l4q**vsZZGkV8hahmas!/m/YXx*QRq*fQAAYpxtyyDIAQAA&        5903901810496765953     5903902282558234625     1495263367      somecontent
1374609417.14   1374609417.15   1374609417.18   1374609419.12   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/s54jR1c6WFenrs9ioiAvm*b.G28MlRL6XNsy0oG.qwg!/m/Yd0aQiB2EQAAYnt7PSCQIAAA      m.qpic.cn  android-qzone   GET     200     559     8072    6       6       0       0       6       6       0       0       0       0       http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/s54jR1c6WFenrs9ioiAvm*b.G28MlRL6XNsy0oG.qwg!/m/Yd0aQiB2EQAAYnt7PSCQIAAA 5903901810496765953     5903902282558234625     1495263367      somecontent
1374609470.46   1374609470.47   1374609470.51   1374609473.60   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V14eT0x12SEIJe/x2xkBFLgD6ye9Kqe2vQwmk*29qLlZTq.ldCu6ZvcoW4!/m/dEmfu6KFIAAA&bo=uAFKArgBSgIBACc!    m.qpic.cn       android-qzone      GET     200     705     11551   10      9       0       0       10      9       0       0       0       0       http://m.qpic.cn/psb?/V14eT0x12SEIJe/x2xkBFLgD6ye9Kqe2vQwmk*29qLlZTq.ldCu6ZvcoW4!/m/dEmfu6KFIAAA&bo=uAFKArgBSgIBACc!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609473.80   1374609473.81   1374609473.82   1374609475.40   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V14eT0x12SEIJe/SPTUGBbUW2b8TYVoPwFdaM*m0jnisYKBjhs2*qMqWb0!/m/dH8ekqHhHgAA&bo=uAFKAbgBSgEBACc!    m.qpic.cn       android-qzone      GET     200     465     5664    4       5       0       0       4       5       0       0       0       0       http://m.qpic.cn/psb?/V14eT0x12SEIJe/SPTUGBbUW2b8TYVoPwFdaM*m0jnisYKBjhs2*qMqWb0!/m/dH8ekqHhHgAA&bo=uAFKAbgBSgEBACc!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609475.58   1374609475.59   1374609475.61   1374609477.02   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V14eT0x12SEIJe/DTXJMtasqR8G3ixySNM7z9h9JqR9JI3n8SLjpqKj6T4!/m/dHpq86AjIgAA&bo=uAFKAbgBSgEBACc!    m.qpic.cn       android-qzone      GET     200     505     6009    5       5       0       0       5       5       0       0       0       0       http://m.qpic.cn/psb?/V14eT0x12SEIJe/DTXJMtasqR8G3ixySNM7z9h9JqR9JI3n8SLjpqKj6T4!/m/dHpq86AjIgAA&bo=uAFKAbgBSgEBACc!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609477.18   1374609477.19   1374609477.21   1374609480.38   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/z.XrXqazeCWC.04HU*U*LTSKf5CPyorZOFdQm9euv3Y!/m/dOsjc5eiLgAA&bo=gALgAQAAAAABAEQ!    m.qpic.cn       android-qzone      GET     200     545     7400    6       6       0       0       6       6       0       0       0       0       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/z.XrXqazeCWC.04HU*U*LTSKf5CPyorZOFdQm9euv3Y!/m/dOsjc5eiLgAA&bo=gALgAQAAAAABAEQ!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609480.56   1374609480.57   1374609480.60   1374609482.26   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/aN827RzQs8kJlfB29EiToksb9qDO.xTfqrOE.Wet*Hw!/m/dO.vdJfnLwAA&bo=gALgAQAAAAABAEQ!    m.qpic.cn       android-qzone      GET     200     545     9028    6       7       0       0       6       7       0       0       0       0       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/aN827RzQs8kJlfB29EiToksb9qDO.xTfqrOE.Wet*Hw!/m/dO.vdJfnLwAA&bo=gALgAQAAAAABAEQ!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609483.20   1374609483.21   1374609483.24   1374609485.44   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/UfNHygCUHpfb.ZsBmHqAyP3nsMMY9xR55ojma50jxec!/m/dEid2pbqKAAA&bo=gALgAQAAAAABAEQ!    m.qpic.cn       android-qzone      GET     200     585     8554    7       7       0       0       7       7       0       0       0       0       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/UfNHygCUHpfb.ZsBmHqAyP3nsMMY9xR55ojma50jxec!/m/dEid2pbqKAAA&bo=gALgAQAAAAABAEQ!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609485.62   1374609485.63   1374609485.66   1374609487.84   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/soj.ZYORrztQVszHsGPn0ZjXaP8L9hbwsSWGKUUPt9Y!/m/dM.X2pb.KQAA&bo=gALgAQAAAAABAEQ!    m.qpic.cn       android-qzone      GET     200     545     9332    6       7       0       0       6       7       0       0       0       0       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/soj.ZYORrztQVszHsGPn0ZjXaP8L9hbwsSWGKUUPt9Y!/m/dM.X2pb.KQAA&bo=gALgAQAAAAABAEQ!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609488.02   1374609488.03   1374609488.06   1374609489.98   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/g5DqmV0u92.x*CO5pg2QTr55tqN6TXM8hGe6*2hSGtY!/m/dMTLEJWENAAA&bo=gALgAQAAAAABAEQ!    m.qpic.cn       android-qzone      GET     200     545     9151    6       7       0       0       6       7       0       0       0       0       http://m.qpic.cn/psb?/V11ecKOE0VsKsD/g5DqmV0u92.x*CO5pg2QTr55tqN6TXM8hGe6*2hSGtY!/m/dMTLEJWENAAA&bo=gALgAQAAAAABAEQ!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609501.12   1374609501.13   1374609502.91   1374609505.50   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V13gplZ20cRvyn/FGcdGR7FwhuOabarVYHnGK5qhJ42GEKjWrFEd70zQZw!/m/dGyIb5q*BAAA&bo=EAEnARABJwEBACc!    m.qpic.cn       android-qzone      GET     200     785     14556   12      11      0       0       12      11      0       0       0       0       http://m.qpic.cn/psb?/V13gplZ20cRvyn/FGcdGR7FwhuOabarVYHnGK5qhJ42GEKjWrFEd70zQZw!/m/dGyIb5q*BAAA&bo=EAEnARABJwEBACc!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609547.42   1374609547.43   1374609547.44   1374609549.00   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V11wxP2v0hGwaf/m7u6bFuzkfJ6GKXhBV8I390AS127QVsOd8o.Jr1Yagw!/m/dNhgOpalLAAA&bo=gAJVAwAAAAABAPM!    m.qpic.cn       android-qzone      GET     200     1540    5450    8       7       0       0       8       7       3       0       0       0       http://m.qpic.cn/psb?/V11wxP2v0hGwaf/m7u6bFuzkfJ6GKXhBV8I390AS127QVsOd8o.Jr1Yagw!/m/dNhgOpalLAAA&bo=gAJVAwAAAAABAPM!       5903901810496765953     5903902282558234625     1495263367      somecontent
1374609549.14   1374609549.15   1374609549.17   1374609551.20   110     362     8618841213864   460078412124864 3562060505359300        2       460     0       14163                   34173   10.13.13.240    211.142.196.65  35681   80cmnet    1       221.177.157.97  221.177.152.242 221.177.157.97  221.177.152.242 m.qpic.cn       http://m.qpic.cn/psb?/V12TDrbk3A3vMJ/aKcud*PHBeeGA6.xo4OMEipBTW3SyWZHFLb1N0ABA2s!/m/dHA8UcJ8JQAA&bo=5gGIAgAAAAABAcat: Filesystem closed
[hadoop@centos-aaron-h1 ~]$ 
[hadoop@centos-aaron-h1 ~]$  hdfs dfs -cat /crw/url.dat |more 
http://m.baidu.com/static/tf/nopic.gif?r=1374609403508&tj=alaxs&ftj=xschp_normal_5_0_10&hasRp=1&ac=nextgp       tocrawl
http://m.baidu.com/static/tf/nopic.gif?r=1374609444479&tj=alaxs&ftj=xschp_normal_5_0_10&hasRp=1&ac=nextgp       tocrawl
http://m.nuomi.com/client/push/list?cid=2000010000&devid=358059043449333&manufacturer=samsung&version=3.0.0&client=android&loc=MTEzLjY1NDQ0OTQ2Mjg5MDYzLDM0LjgwOTE4MTIxMzM3ODkwNg%3D%3D&uuid=ffffffff-db69-afec-4b6f-b9ab3c02bbbc&cityid=20
00010000&model=GT-I9228&userid=1345361458243220&channel=wooboo06.d8&release=2.3.6&mac=NTA6Q0M6Rjg6QTQ6MzQ6RjI%3D        tocrawl
http://m.baidu.com/ssid=0/from=2001m/bd_page_type=1/uid=1ADA21F27A014F2180A7E22E8BEE35B9/pu=usm%402%2Csz%401321_1003%2Cta%40utouch_2_2.3_1_9.0/img?tn=bdwis&word=%E6%A8%B1%E4%BA%95%E8%8E%89%E4%BA%9A%E7%94%B5%E5%BD%B1%E6%88%AA%E5%9B%BE&p
n=0&dw=w320&bs=176_208&pos=0&pinf=12_6_0_@bdwis_@av%E7%94%B5%E5%BD%B1%E9%AB%98%E6%BD%AE%E6%88%AA%E5%9B%BE_@176_208_@w320&fm=rs2&sp=&mid=w320    tocrawl
http://api.app.yiche.com/webapi/reviewtopic.ashx?op=get&topicid=166025  tocrawl
http://hm.baidu.com/hm.gif?si=b7723ac5ec07c308ac1ddf314523c2b0&et=0&nv=0&st=4&lt=1374609378&su=http%3A%2F%2Ffang.xinzheng.cc%2F&u=http%3A%2F%2Fwww.xinzheng.cc%2Ffangwu%2Fxiezilou&v=tc-1.0&rnd=2142219798      tocrawl
http://az.tpwap.cn/config.jsp?pos=448&clientId=435&sid=c2_1.1.1 tocrawl
better01.sinaapp.com    tocrawl
better01.sinaapp.com    tocrawl
http://af.upsdk.com/af/appActive?user_agent=Lenovo+A590&project=Lianxiang_Lenovo_Lenovo+A590&af_channel=Lianxiang&af_project=LXF_AG790_A01&af_version_code=198401&imei=D9B97DEA530324F235E73ABFA2CE003A&sid=1374609488731&brand=Lenovo&veri
fy_code=0ac0ba8ecb2a5455a052fa445c1937d5&encrpytion=DES&wifi_mac=0000000000000000&bt_mac=0000000000000000&cpu_serial=0000000000000000   tocrawl
http://af.upsdk.com/af/afPush?user_agent=Lenovo+A590&project=Lianxiang_Lenovo_Lenovo+A590&af_project=LXF_AG790_A01&af_channel=Lianxiang&af_version_code=198401&imei=D9B97DEA530324F235E73ABFA2CE003A&imsi=FD8CD80070CD6E0F71D8FAC3B8BF4F73&
sdk_version=16&sdk_name=4.1.1&af_version=3.7.1&sid=1374609488711&brand=Lenovo&verify_code=838773006a2a84415a64dea4aa117113&encrpytion=DES&lac=14196&cid=57917&wifi_mac=0000000000000000&bt_mac=0000000000000000&cpu_serial=0000000000000000
        tocrawl
http://app.wapx.cn/action/push/api_ad?app_id=24eeb27a1f5032e40b4561317f5f460c&udid=354096050843721&imsi=460003891750725&net=&app_version=1.0.7&sdk_version=1.5.2&device_name=GT-S7562i&y=6a5434e15a787d6791343c83523df81b&device_type=andro
id&os_version=4.0.4&country_code=CN&language=zh&act=dangerb.game.llk.ReceiverRestrictedContext&channel=WAPS&device_width=480&device_height=800&at=1374600352535 tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=1492849784&src_uin=1665118942&fid=1492849784&spec=100&referer=mqq&term_type=pc&host=1&sign=0C82F9B2A7109F7C2C058C41FE6CB7810AC63C710B4A3474&rsp_type=img     tocrawl
better01.sinaapp.com    tocrawl
http://switching.atm.punchbox.org/v1/?appid=16252171-7DE8-159E-F72B-9A58CFD2D08D&ver=7.1.2      tocrawl
http://q1.qlogo.cn/g?b=mqq&k=S8TokNAI2eSw6kynuB2gjA&t=1374608380&refer=mqq&s=40 tocrawl
http://launchermsg.3g.cn/golaunchermsg/msgservice.do?funid=1&rd=-3535723825567048133    tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=2569887365&src_uin=1665118942&fid=2569887365&spec=100&referer=mqq&term_type=pc&host=1&sign=5D6B95E75ACFC11355D4542D151A876C72D62301FCAEBFD3&rsp_type=img     tocrawl
better01.sinaapp.com    tocrawl
http://app.adsofts.cn/action/connect/active?app_id=3fe23c1a9f592ec42abf6a7d012841ba&udid=865662012462439&imsi=460003778631834&net=cmnet&base=adsofts.cn&app_version=3.7&sdk_version=1.6.10&device_name=TE600+&device_brand=Ctyon&y=2d81b1d1
1e11ae7de129d3cce29a958c&device_type=android&os_version=4.1.2&country_code=CN&language=zh&cid=83mdas8k2mj0g70r4b0b68ko9kgr2dk6&act=com.androidemu.harvechise.ReceiverRestrictedContext&channel=gfan&device_width=320&device_height=480&at=1
374609806979    tocrawl
better01.sinaapp.com    tocrawl
http://mb.hd.sohu.com.cn/mc.gif?uid=ff05360fb8fc15f1eb07c84904a9f863&url=1002&passport=&mtype=6&ltype=&cv=2.8.1&mos=2&mosv=4.2.2&pro=1&mfo=BBK&mfov=vivo%20Xplay&webtype=2G&vid=&time=1374609536953&memo=0&type=1&channelid=91&value=&sim=1
&playlistid=&catecode=&preid=&newuser=0&enterid=0&startid=1374609536961&loc=    tocrawl
http://launchermsg.3g.cn/golaunchermsg/msgservice.do?funid=1&rd=-3535723825567048133    tocrawl
http://wap.baidu.com/bd_page_type=1/pu=usm%400%2Csz%401330%5F640%2Cta%40big%5F%5F5%2E0%5F3%5F525/uid=FC69E3DEB768362786105AE0F78D77A7/t=wap/w=0_10_%E6%9E%81%E5%93%81%E5%A5%B3%E4%BB%99%E4%B9%A6%E5%8C%85%E7%BD%91%E9%98%85%E8%AF%BB/ssid=0
/from=128g/l=0/tc?func=nextp&pi=3&m=0&pn=15&src=http%3A%2F%2Fwww%2Ebookbao%2Ecom%2Fviews%2F201306%2F05%2Fid%5FXMzI4NDU2%5F14%2Ehtml     tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=2251204892&src_uin=1665118942&fid=2251204892&spec=100&referer=mqq&term_type=pc&host=1&sign=CFAFF94D0282A4B4EBC81DBDB307CAD166B756E7433FC920&rsp_type=img     tocrawl
http://r3.11222.cn/resource/bookclub/cover/95/84/2011120917151262.jpg   tocrawl
http://m.i.ppsrc.com/d/82C09949DDA26192C2170EF4CB0A0084/640/640 tocrawl
http://app.wapx.cn/action/push/api_ad?app_id=24eeb27a1f5032e40b4561317f5f460c&udid=358864040778026&imsi=460003827213968&net=cmnet&app_version=1.0.2&sdk_version=1.5.2&device_name=GT-P6200&y=f34b836872c220f87c39d6bf6afeb123&device_type=a
ndroid&os_version=3.2&country_code=CN&language=zh&act=dangerb.game.llk.ReceiverRestrictedContext&channel=Samsung&device_width=600&device_height=976&at=1374609799140    tocrawl
better01.sinaapp.com    tocrawl
http://psb.lenovomm.com/pushservice/2.1/poll?lpsst=B5AAAAAAXEPJgCAAAADAAAAUANHH1VAA0vAFhwaWQ9JmRpZD1PRFkyTlRFMk1ERTJOVE0yT1RRMiZzaWQ9TVVKRVJrTTNORFl6UlVFeVFUUXpOell5TVRFek56VkJSa1ExUTBWR056UXgmZHQ9YVcxbGFRK4GzrB5hOEri5Cx08JrOrw&ack=R1:
2106551173&min=180&max=320      tocrawl
http://api.app.yiche.com/webapi/reviewtopic.ashx?op=get&serialid=2573&level=1&pageindex=2&pagesize=20   tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=2215216651&src_uin=1157251974&fid=2215216651&spec=100&referer=mqq&term_type=pc&host=1&sign=08F0E013042DA3DB50682F97BFBD852A13558806FDF2A2E3&rsp_type=img     tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=306486557&src_uin=1157251974&fid=306486557&spec=100&referer=mqq&term_type=pc&host=1&sign=08F0E013042DA3DB50682F97BFBD852A13558806FDF2A2E3&rsp_type=img       tocrawl
http://psb.lenovomm.com/pushservice/2.1/poll?lpsst=B5AAAAAAIyZAkCAAAADAAAAT_pzFv2AA0vAFhwaWQ9JmRpZD1PRFk1TkRVNU1ERXlNVE16TWpFNSZzaWQ9UmpZNVJEa3hRMFkxTWpKQk56WXpNelkwUWpOQ1FUQkNSamhGTkRNMFJEY3gmZHQ9YVcxbGFRGaRu5QbEjFyJDcKU6sGCSg&ack=R1:
2005075059&min=300&max=300      tocrawl
http://api.changba.com/ktvbox.php?ac=pullnotice&macaddress=1C%3A66%3AAA%3A1A%3AF9%3A56&channelsrc=changba_A&deviceid=1C%3A66%3AAA%3A1A%3AF9%3A56&version=2.3.0&seret=f3f1470646&_userinfo=796   tocrawl
http://q3.qlogo.cn/g?b=mqq&k=9FqISKXvCKCf6sPAxeu8yA&t=1374446924&refer=mqq&s=100        tocrawl
http://q4.qlogo.cn/g?b=mqq&k=puGXojn2c7cOCeWibOce6yw&t=1374373655&refer=mqq&s=100       tocrawl
http://q1.qlogo.cn/g?b=mqq&k=ZRw4QCD4IoG0SsepbqNUfQ&t=1370823532&refer=mqq&s=100        tocrawl
http://a126.photo.store.qq.com/psb?/V10Z7w8y4FYPqA/zzUERobf01h9PPi7s*.q7SkeQsyEm0KTu1EfAYWWZ94!/a/dHi5G0t8JgAA&bo=yAALAQAAAAABAOU!      tocrawl
[hadoop@centos-aaron-h1 ~]$ 

    七、注意事项

           打包mapreduce代码时,需要将mysql的驱动jar包一并打入,可以使用maven插件,或者在开发工具中选取lib文件一起打包;否则map任务在加载数据库驱动时会失败;

          最后寄语,以上是博主本次文章的全部内容,如果大家觉得博主的文章还不错,请点赞;如果您对博主其它服务器大数据技术或者博主本人感兴趣,请关注博主博客,并且欢迎随时跟博主沟通交流。

转载于:https://my.oschina.net/u/2371923/blog/2992123

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值