上一篇文章分析了如何使用hadoop来实现sql中group by 并且取每组中最大值的需求--订单中成交金额最大的订单项分析,本篇博客博主将继续分享一个mapreduce实战例子--运营商流量日志解析增强;
一、需求
电信运营商服务器中记录了用户流量访问的日志,日志内容示例如下:
1374609560.11 1374609560.16 1374609560.16 1374609560.16 110 5 8615038208365 460023383869133 8696420056841778 2 460 0 14615 54941 10.188.77.252 61.145.116.27 35020 80 6 cmnet 1 221.177.218.34 221.177.217.161 221.177.218.34 221.177.217.167 ad.veegao.com http://ad.veegao.com/veegao/iris.action Apache-HttpClient/UNAVAILABLE (java 1.4) POST 200 593 310 4 3 0 0 4 3 0 0 0 0 http://ad.veegao.com/veegao/iris.action 5903903079251243019 5903903103500771339 5980728
1374609558.91 1374609558.97 1374609558.97 1374609559.31 112 461 8615038208365 460023383869133 8696420056841778 2 460 0 14615 54941 10.188.77.252 101.226.76.175 37293 80 6 cmnet 1 221.177.218.34 221.177.217.161 221.177.218.34 221.177.217.167 short.weixin.qq.com http://short.weixin.qq.com/cgi-bin/micromsg-bin/getcdndns Android QQMail HTTP Client POST 200 543 563 2 3 0 0 2 3 0 0 0 0 http://short.weixin.qq.com/cgi-bin/micromsg-bin/getcdndns 5903903079251243019 5903903097240039435 5980728
1374609514.70 1374609514.75 1374609514.75 1374609515.58 110 5 8613674976196 460004901700207 8623350100353878 2 460 0 14694 58793 10.184.80.32 111.13.13.222 36181 80 6 cmnet 1 221.177.156.4 221.177.217.145 221.177.156.4 221.177.217.156 retype.wenku.bdimg.com http://retype.wenku.bdimg.com/img/97308d2b7375a417866f8f09 AMB_400 GET 200 345 4183 5 5 0 0 5 5 0 0 0 0 http://retype.wenku.bdimg.com/img/97308d2b7375a417866f8f09 5903900710696611851 5903902908140003339 5937307
对于日志中的url,如果数据库(数据字典)中已经存在其对应的标签内容,则增强日志,即在该行日志后面追加输出其标签内容;否则,表示该url在数据字典中不存在,需要将其输出到待爬清单,交由爬虫去爬取;
二、代码实现
DBLoader(db数据加载类)
package com.empire.hadoop.mr.logenhance;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Map;
/**
 * Loads the URL-to-content-tag dictionary from the MySQL table {@code url_rule}.
 */
public class DBLoader {
    /**
     * Reads every row of {@code url_rule} and puts it into {@code ruleMap},
     * using the first selected column (url) as key and the second (content) as value.
     *
     * <p>NOTE(review): the JDBC URL, user name and password are hard-coded below;
     * consider moving them into job configuration.
     *
     * @param ruleMap destination map; existing entries with the same URL are overwritten
     * @throws Exception if the JDBC driver class cannot be loaded, or if connecting
     *         or querying fails
     */
    public static void dbLoader(Map<String, String> ruleMap) throws Exception {
        Connection conn = null;
        Statement st = null;
        ResultSet res = null;
        try {
            Class.forName("com.mysql.jdbc.Driver");
            conn = DriverManager.getConnection("jdbc:mysql://192.168.29.131:3306/urldb?characterEncoding=utf-8", "root",
                    "123456");
            st = conn.createStatement();
            res = st.executeQuery("select url,content from url_rule");
            while (res.next()) {
                ruleMap.put(res.getString(1), res.getString(2));
            }
        } finally {
            // Close in reverse order of acquisition. Each resource is closed on its
            // own so that a failure closing one does not leak the others.
            closeQuietly(res);
            closeQuietly(st);
            closeQuietly(conn);
        }
    }

    /**
     * Closes {@code resource} if it is non-null. A failure while closing is
     * reported on stderr but not propagated, so it can never mask an exception
     * thrown by the query itself.
     */
    private static void closeQuietly(AutoCloseable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
LogEnhanceOutputFormat(自定义OutputFormat--用于将增强日志和需要爬虫爬取的url分文件输出)
package com.empire.hadoop.mr.logenhance;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Output format that splits the records it receives into two HDFS files:
 * enhanced log lines go to {@code /en/log.dat} and URLs that still have to be
 * crawled go to {@code /crw/url.dat}.
 *
 * <p>When a map task or reduce task produces its final output, it first calls
 * {@link #getRecordWriter(TaskAttemptContext)} to obtain a {@link RecordWriter}
 * and then calls {@code write(k, v)} on that writer for every record.
 */
public class LogEnhanceOutputFormat extends FileOutputFormat<Text, NullWritable> {
    /**
     * Creates the two output files and returns a writer that routes records
     * between them.
     *
     * <p>NOTE(review): both paths are fixed, so every task that calls this method
     * creates the same two files. Presumably the job is meant to run with a
     * single map task; confirm before running with more than one input split.
     *
     * @param context task context; only its configuration is used, to obtain the file system
     * @return a writer backed by the two newly created files
     * @throws IOException if either file cannot be created
     */
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSystem fs = FileSystem.get(context.getConfiguration());
        Path enhancePath = new Path("/en/log.dat");
        Path tocrawlPath = new Path("/crw/url.dat");
        FSDataOutputStream enhancedOs = fs.create(enhancePath);
        FSDataOutputStream tocrawlOs = null;
        try {
            tocrawlOs = fs.create(tocrawlPath);
            return new EnhanceRecordWriter(enhancedOs, tocrawlOs);
        } finally {
            if (tocrawlOs == null) {
                // Creating the second file failed: release the first stream, but
                // let the original failure propagate rather than a close error.
                try {
                    enhancedOs.close();
                } catch (IOException ignored) {
                    // intentionally ignored, see above
                }
            }
        }
    }

    /**
     * Record writer that sends each record either to the enhanced-log stream or
     * to the to-crawl stream, depending on whether the record ends with the
     * to-crawl marker.
     */
    static class EnhanceRecordWriter extends RecordWriter<Text, NullWritable> {
        /** Marker that terminates a to-crawl record (before the optional line feed). */
        private static final String TOCRAWL_MARKER = "\ttocrawl";

        FSDataOutputStream enhancedOs = null;
        FSDataOutputStream tocrawlOs = null;

        public EnhanceRecordWriter(FSDataOutputStream enhancedOs, FSDataOutputStream tocrawlOs) {
            super();
            this.enhancedOs = enhancedOs;
            this.tocrawlOs = tocrawlOs;
        }

        /**
         * Writes the key to the to-crawl file (/crw/url.dat) when it is a to-crawl
         * record, otherwise to the enhanced-log file (/en/log.dat). The value is ignored.
         */
        @Override
        public void write(Text key, NullWritable value) throws IOException, InterruptedException {
            FSDataOutputStream target = isCrawlRecord(key.toString()) ? tocrawlOs : enhancedOs;
            // Write the Text's own UTF-8 bytes; only the first getLength() bytes of
            // the backing array are valid. This avoids re-encoding the string with
            // the platform default charset.
            target.write(key.getBytes(), 0, key.getLength());
        }

        /**
         * Tells whether {@code record} is a to-crawl record, i.e. ends with the
         * marker optionally followed by a single line feed. Matching only the
         * suffix keeps log lines that merely contain the word "tocrawl"
         * (for example inside a URL) out of the crawl list.
         */
        private static boolean isCrawlRecord(String record) {
            String body = record.endsWith("\n") ? record.substring(0, record.length() - 1) : record;
            return body.endsWith(TOCRAWL_MARKER);
        }

        /** Closes both streams; the second is closed even if closing the first fails. */
        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            try {
                if (tocrawlOs != null) {
                    tocrawlOs.close();
                }
            } finally {
                if (enhancedOs != null) {
                    enhancedOs.close();
                }
            }
        }
    }
}
LogEnhance(日志分析增强主程序类)
package com.empire.hadoop.mr.logenhance;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Map-only job that enhances carrier traffic logs: a log line whose URL has a
 * content tag in the rule dictionary is emitted with the tag appended, and a
 * URL without a tag is emitted as a "tocrawl" record.
 */
public class LogEnhance {
    static class LogEnhanceMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        /** Zero-based index of the URL among the tab-separated fields of a log line. */
        private static final int URL_FIELD_INDEX = 26;

        /** URL-shape check used by {@link #isHttpUrl(String)}; compiled once instead of per record. */
        private static final Pattern URL_PATTERN = Pattern.compile("(((https|http)?://)?([a-z0-9]+[.])|(www.))"
                + "\\w+[.|\\/]([a-z0-9]{0,})?[[.]([a-z0-9]{0,})]+((/[\\S&&[^,;\u4E00-\u9FA5]]+)+)?([.][a-z0-9]{0,}+|/?)");

        Map<String, String> ruleMap = new HashMap<String, String>();
        Text k = new Text();
        NullWritable v = NullWritable.get();

        /**
         * Loads the rule dictionary from the database into {@code ruleMap}.
         *
         * @throws IOException if the dictionary cannot be loaded; without it every
         *         URL would be reported as "tocrawl", so the task is failed instead
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            try {
                DBLoader.dbLoader(ruleMap);
            } catch (Exception e) {
                throw new IOException("Failed to load URL rules from the database", e);
            }
        }

        /**
         * Processes one log line. Lines with too few fields are counted as
         * malformed and skipped; lines whose URL field does not look like a URL
         * are skipped silently.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Counter (group name, counter name) that records the number of malformed log lines.
            Counter counter = context.getCounter("malformed", "malformedline");
            String line = value.toString();
            String[] fields = StringUtils.split(line, "\t");
            if (fields == null || fields.length <= URL_FIELD_INDEX) {
                counter.increment(1);
                return;
            }
            String url = fields[URL_FIELD_INDEX];
            if (!isHttpUrl(url)) {
                return;
            }
            String contentTag = ruleMap.get(url);
            if (contentTag == null) {
                // No tag known for this URL: emit only the URL, marked for the crawl list.
                k.set(url + "\t" + "tocrawl" + "\n");
            } else {
                // Tag found: emit the original line with the tag appended.
                k.set(line + "\t" + contentTag + "\n");
            }
            // Write failures propagate to the framework; they are not malformed input.
            context.write(k, v);
        }

        /**
         * Checks whether a string looks like a URL.
         *
         * @param urls candidate string; leading and trailing whitespace is ignored
         * @return true if the whole trimmed string matches the URL pattern,
         *         false otherwise (including for {@code null})
         */
        public boolean isHttpUrl(String urls) {
            if (urls == null) {
                return false;
            }
            Matcher mat = URL_PATTERN.matcher(urls.trim());
            return mat.matches();
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: LogEnhance <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(LogEnhance.class);
        job.setMapperClass(LogEnhanceMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // A custom output format is used to send different kinds of records to different target paths.
        job.setOutputFormatClass(LogEnhanceOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // The custom output format extends FileOutputFormat, which still writes a
        // _SUCCESS file, so an output path has to be set here as well.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Map-only job: no reducer is needed.
        job.setNumReduceTasks(0);
        // Report job failure through the process exit code.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
三、数据库字典数据准备
创建表:
-- Dictionary table queried by DBLoader ("select url,content from url_rule").
-- Dropping the table first makes the script re-runnable, but discards any existing rows.
DROP TABLE IF EXISTS `url_rule`;
CREATE TABLE `url_rule` (
-- URL used as the lookup key; the mapper looks it up by exact string match.
`url` varchar(2000) DEFAULT NULL,
-- Content tag appended to the log line when the URL is found.
`content` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
导入数据:
INSERT INTO `url_rule` VALUES ('http://timg01.baidu-1img.cn/timg?imagewise_list&size=b140_140&bd_page_type=1&ssid=0&from=2001m&uid=1ADA21F27A014F2180A7E22E8BEE35B9&pu=usm%402%2Csz%401321_1003%2Cta%40utouch_2_2.3_1_9.0&quality=80&sec=1374609614&di=7cec4b45b8d4db319556ad87166932d5&src=http://i1.baidu.com/it/u=975390796,1697384219&fm=21&gp=0.jpg', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://timg01.baidu-1img.cn/timg?imagewise_list&size=b140_140&bd_page_type=1&ssid=0&from=2001m&uid=1ADA21F27A014F2180A7E22E8BEE35B9&pu=usm%402%2Csz%401321_1003%2Cta%40utouch_2_2.3_1_9.0&quality=80&sec=1374609712&di=69f7930c9cc2938e9fb9a09f815b78b8&src=http://i1.baidu.com/it/u=2694289975,690736961&fm=21&gp=0.jpg', 'somecontent');
INSERT INTO `url_rule` VALUES ('http://timg01.baidu-1img.cn/timg?imagewise_list&size=b140_140&bd_page_type=1&ssid=0&from=2001m&uid=1ADA21F27A014F2180A7E22E8BEE35B9&pu=usm%402%2Csz%401321_1003%2Cta%40utouch_2_2.3_1_9.0&quality=80&sec=1374609712&di=9998b896a653eae8a998dbeed5bce6e7&src=http://i1.baidu.com/it/u=433837882,4029071921&fm=21&gp=0.jpg', 'somecontent');
......
四、运行程序
#上传jar
Alt+p
lcd d:/
put loge.jar 2013072404-http-combinedBy-1373892200521-log-1.log
#准备hadoop处理的数据文件
cd /home/hadoop/apps/hadoop-2.9.1
hadoop fs -mkdir -p /loge/input
hdfs dfs -put 2013072404-http-combinedBy-1373892200521-log-1.log /loge/input
#运行日志增强程序
hadoop jar loge.jar com.empire.hadoop.mr.logenhance.LogEnhance /loge/input /loge/output
五、运行效果
[hadoop@centos-aaron-h1 ~]$ hadoop jar loge.jar com.empire.hadoop.mr.logenhance.LogEnhance /loge/input /loge/output
18/12/23 23:48:43 INFO client.RMProxy: Connecting to ResourceManager at centos-aaron-h1/192.168.29.144:8032
18/12/23 23:48:44 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
18/12/23 23:48:44 INFO input.FileInputFormat: Total input files to process : 1
18/12/23 23:48:44 INFO mapreduce.JobSubmitter: number of splits:1
18/12/23 23:48:45 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
18/12/23 23:48:45 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1545579497045_0001
18/12/23 23:48:46 INFO impl.YarnClientImpl: Submitted application application_1545579497045_0001
18/12/23 23:48:46 INFO mapreduce.Job: The url to track the job: http://centos-aaron-h1:8088/proxy/application_1545579497045_0001/
18/12/23 23:48:46 INFO mapreduce.Job: Running job: job_1545579497045_0001
18/12/23 23:48:59 INFO mapreduce.Job: Job job_1545579497045_0001 running in uber mode : false
18/12/23 23:48:59 INFO mapreduce.Job: map 0% reduce 0%
18/12/23 23:49:14 INFO mapreduce.Job: map 100% reduce 0%
18/12/23 23:49:14 INFO mapreduce.Job: Job job_1545579497045_0001 completed successfully
18/12/23 23:49:15 INFO mapreduce.Job: Counters: 31
File System Counters
FILE: Number of bytes read=0
FILE: Number of bytes written=196971
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=61826403
HDFS: Number of bytes written=9735615
HDFS: Number of read operations=3
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=1
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=11274
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=11274
Total vcore-milliseconds taken by all map tasks=11274
Total megabyte-milliseconds taken by all map tasks=11544576
Map-Reduce Framework
Map input records=100064
Map output records=85730
Input split bytes=154
Spilled Records=0
Failed Shuffles=0
Merged Map outputs=0
GC time elapsed (ms)=356
CPU time spent (ms)=3680
Physical memory (bytes) snapshot=114851840
Virtual memory (bytes) snapshot=846995456
Total committed heap usage (bytes)=16556032
malformed
malformedline=1
File Input Format Counters
Bytes Read=61826249
File Output Format Counters
Bytes Written=9735615
[hadoop@centos-aaron-h1 ~]$
六、运行结果
[hadoop@centos-aaron-h1 ~]$hdfs dfs -cat /en/log.dat
1374609375.94 1374609375.95 1374609375.99 1374609378.08 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/V11R7ZV52NVvz8/Id2ZLgBh*B.Lp9DoQNZSRtvekhZJegpqEqmpUZxKNdQ!/m/dCvwjYhxGAAA&bo=WAIgAwAAAAABAF4! m.qpic.cn android-qzone GET 200 705 11884 10 9 0 0 10 9 0 0 0 0 http://m.qpic.cn/psb?/V11R7ZV52NVvz8/Id2ZLgBh*B.Lp9DoQNZSRtvekhZJegpqEqmpUZxKNdQ!/m/dCvwjYhxGAAA&bo=WAIgAwAAAAABAF4! 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609386.06 1374609386.07 1374609406.25 1374609406.88 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/718552d9-4af9-4d55-a311-821644911cf9/EwL66FxlQW2lla.SsNEdThCNFPRNB3dvPLlY6KjwoOw!/m/dLw974fOIgAA&bo=rAK7AawCuwEKACw!m.qpic.cn android-qzone GET 200 2008 20432 16 18 0 0 16 18 3 8 0 0 http://m.qpic.cn/psb?/718552d9-4af9-4d55-a311-821644911cf9/EwL66FxlQW2lla.SsNEdThCNFPRNB3dvPLlY6KjwoOw!/m/dLw974fOIgAA&bo=rAK7AawCuwEKACw! 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609407.48 1374609407.49 1374609410.03 1374609411.86 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/V116gGbo2r9QdU/gJda1CbWinnZjDb0ULQmuotKjtTGbEANVjOEWTVA4lk!/m/dCijJ6JkHwAA&bo=kAFYApABWAIBACc! m.qpic.cn android-qzone GET 200 1061 19785 18 16 0 0 18 16 0 0 0 0 http://m.qpic.cn/psb?/V116gGbo2r9QdU/gJda1CbWinnZjDb0ULQmuotKjtTGbEANVjOEWTVA4lk!/m/dCijJ6JkHwAA&bo=kAFYApABWAIBACc! 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609411.98 1374609411.99 1374609412.01 1374609413.90 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/3qGX0RZSUjqreuCQAXieiNMmUGuIgxw2H*qv3IMxxSo!/m/YWGsShr0nAAAYql.Phq0KAAA m.qpic.cn android-qzone GET 200 559 7738 6 6 0 0 6 6 0 0 0 0 http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/3qGX0RZSUjqreuCQAXieiNMmUGuIgxw2H*qv3IMxxSo!/m/YWGsShr0nAAAYql.Phq0KAAA 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609414.02 1374609414.03 1374609414.05 1374609416.90 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/2Kd9C9EeOGYlnXW9rsTQ6gu4l4q**vsZZGkV8hahmas!/m/YXx*QRq*fQAAYpxtyyDIAQAA& m.qpic.cn android-qzone GET 200 600 8304 7 6 0 0 7 6 0 0 0 0 http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/2Kd9C9EeOGYlnXW9rsTQ6gu4l4q**vsZZGkV8hahmas!/m/YXx*QRq*fQAAYpxtyyDIAQAA& 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609417.14 1374609417.15 1374609417.18 1374609419.12 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/s54jR1c6WFenrs9ioiAvm*b.G28MlRL6XNsy0oG.qwg!/m/Yd0aQiB2EQAAYnt7PSCQIAAA m.qpic.cn android-qzone GET 200 559 8072 6 6 0 0 6 6 0 0 0 0 http://m.qpic.cn/psu?/09b59a9b-e7ba-41da-95aa-1450231724ff/s54jR1c6WFenrs9ioiAvm*b.G28MlRL6XNsy0oG.qwg!/m/Yd0aQiB2EQAAYnt7PSCQIAAA 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609470.46 1374609470.47 1374609470.51 1374609473.60 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/V14eT0x12SEIJe/x2xkBFLgD6ye9Kqe2vQwmk*29qLlZTq.ldCu6ZvcoW4!/m/dEmfu6KFIAAA&bo=uAFKArgBSgIBACc! m.qpic.cn android-qzone GET 200 705 11551 10 9 0 0 10 9 0 0 0 0 http://m.qpic.cn/psb?/V14eT0x12SEIJe/x2xkBFLgD6ye9Kqe2vQwmk*29qLlZTq.ldCu6ZvcoW4!/m/dEmfu6KFIAAA&bo=uAFKArgBSgIBACc! 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609473.80 1374609473.81 1374609473.82 1374609475.40 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/V14eT0x12SEIJe/SPTUGBbUW2b8TYVoPwFdaM*m0jnisYKBjhs2*qMqWb0!/m/dH8ekqHhHgAA&bo=uAFKAbgBSgEBACc! m.qpic.cn android-qzone GET 200 465 5664 4 5 0 0 4 5 0 0 0 0 http://m.qpic.cn/psb?/V14eT0x12SEIJe/SPTUGBbUW2b8TYVoPwFdaM*m0jnisYKBjhs2*qMqWb0!/m/dH8ekqHhHgAA&bo=uAFKAbgBSgEBACc! 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609475.58 1374609475.59 1374609475.61 1374609477.02 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/V14eT0x12SEIJe/DTXJMtasqR8G3ixySNM7z9h9JqR9JI3n8SLjpqKj6T4!/m/dHpq86AjIgAA&bo=uAFKAbgBSgEBACc! m.qpic.cn android-qzone GET 200 505 6009 5 5 0 0 5 5 0 0 0 0 http://m.qpic.cn/psb?/V14eT0x12SEIJe/DTXJMtasqR8G3ixySNM7z9h9JqR9JI3n8SLjpqKj6T4!/m/dHpq86AjIgAA&bo=uAFKAbgBSgEBACc! 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609477.18 1374609477.19 1374609477.21 1374609480.38 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/V11ecKOE0VsKsD/z.XrXqazeCWC.04HU*U*LTSKf5CPyorZOFdQm9euv3Y!/m/dOsjc5eiLgAA&bo=gALgAQAAAAABAEQ! m.qpic.cn android-qzone GET 200 545 7400 6 6 0 0 6 6 0 0 0 0 http://m.qpic.cn/psb?/V11ecKOE0VsKsD/z.XrXqazeCWC.04HU*U*LTSKf5CPyorZOFdQm9euv3Y!/m/dOsjc5eiLgAA&bo=gALgAQAAAAABAEQ! 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609480.56 1374609480.57 1374609480.60 1374609482.26 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/V11ecKOE0VsKsD/aN827RzQs8kJlfB29EiToksb9qDO.xTfqrOE.Wet*Hw!/m/dO.vdJfnLwAA&bo=gALgAQAAAAABAEQ! m.qpic.cn android-qzone GET 200 545 9028 6 7 0 0 6 7 0 0 0 0 http://m.qpic.cn/psb?/V11ecKOE0VsKsD/aN827RzQs8kJlfB29EiToksb9qDO.xTfqrOE.Wet*Hw!/m/dO.vdJfnLwAA&bo=gALgAQAAAAABAEQ! 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609483.20 1374609483.21 1374609483.24 1374609485.44 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/V11ecKOE0VsKsD/UfNHygCUHpfb.ZsBmHqAyP3nsMMY9xR55ojma50jxec!/m/dEid2pbqKAAA&bo=gALgAQAAAAABAEQ! m.qpic.cn android-qzone GET 200 585 8554 7 7 0 0 7 7 0 0 0 0 http://m.qpic.cn/psb?/V11ecKOE0VsKsD/UfNHygCUHpfb.ZsBmHqAyP3nsMMY9xR55ojma50jxec!/m/dEid2pbqKAAA&bo=gALgAQAAAAABAEQ! 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609485.62 1374609485.63 1374609485.66 1374609487.84 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/V11ecKOE0VsKsD/soj.ZYORrztQVszHsGPn0ZjXaP8L9hbwsSWGKUUPt9Y!/m/dM.X2pb.KQAA&bo=gALgAQAAAAABAEQ! m.qpic.cn android-qzone GET 200 545 9332 6 7 0 0 6 7 0 0 0 0 http://m.qpic.cn/psb?/V11ecKOE0VsKsD/soj.ZYORrztQVszHsGPn0ZjXaP8L9hbwsSWGKUUPt9Y!/m/dM.X2pb.KQAA&bo=gALgAQAAAAABAEQ! 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609488.02 1374609488.03 1374609488.06 1374609489.98 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/V11ecKOE0VsKsD/g5DqmV0u92.x*CO5pg2QTr55tqN6TXM8hGe6*2hSGtY!/m/dMTLEJWENAAA&bo=gALgAQAAAAABAEQ! m.qpic.cn android-qzone GET 200 545 9151 6 7 0 0 6 7 0 0 0 0 http://m.qpic.cn/psb?/V11ecKOE0VsKsD/g5DqmV0u92.x*CO5pg2QTr55tqN6TXM8hGe6*2hSGtY!/m/dMTLEJWENAAA&bo=gALgAQAAAAABAEQ! 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609501.12 1374609501.13 1374609502.91 1374609505.50 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/V13gplZ20cRvyn/FGcdGR7FwhuOabarVYHnGK5qhJ42GEKjWrFEd70zQZw!/m/dGyIb5q*BAAA&bo=EAEnARABJwEBACc! m.qpic.cn android-qzone GET 200 785 14556 12 11 0 0 12 11 0 0 0 0 http://m.qpic.cn/psb?/V13gplZ20cRvyn/FGcdGR7FwhuOabarVYHnGK5qhJ42GEKjWrFEd70zQZw!/m/dGyIb5q*BAAA&bo=EAEnARABJwEBACc! 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609547.42 1374609547.43 1374609547.44 1374609549.00 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/V11wxP2v0hGwaf/m7u6bFuzkfJ6GKXhBV8I390AS127QVsOd8o.Jr1Yagw!/m/dNhgOpalLAAA&bo=gAJVAwAAAAABAPM! m.qpic.cn android-qzone GET 200 1540 5450 8 7 0 0 8 7 3 0 0 0 http://m.qpic.cn/psb?/V11wxP2v0hGwaf/m7u6bFuzkfJ6GKXhBV8I390AS127QVsOd8o.Jr1Yagw!/m/dNhgOpalLAAA&bo=gAJVAwAAAAABAPM! 5903901810496765953 5903902282558234625 1495263367 somecontent
1374609549.14 1374609549.15 1374609549.17 1374609551.20 110 362 8618841213864 460078412124864 3562060505359300 2 460 0 14163 34173 10.13.13.240 211.142.196.65 35681 80cmnet 1 221.177.157.97 221.177.152.242 221.177.157.97 221.177.152.242 m.qpic.cn http://m.qpic.cn/psb?/V12TDrbk3A3vMJ/aKcud*PHBeeGA6.xo4OMEipBTW3SyWZHFLb1N0ABA2s!/m/dHA8UcJ8JQAA&bo=5gGIAgAAAAABAcat: Filesystem closed
[hadoop@centos-aaron-h1 ~]$
[hadoop@centos-aaron-h1 ~]$ hdfs dfs -cat /crw/url.dat |more
http://m.baidu.com/static/tf/nopic.gif?r=1374609403508&tj=alaxs&ftj=xschp_normal_5_0_10&hasRp=1&ac=nextgp tocrawl
http://m.baidu.com/static/tf/nopic.gif?r=1374609444479&tj=alaxs&ftj=xschp_normal_5_0_10&hasRp=1&ac=nextgp tocrawl
http://m.nuomi.com/client/push/list?cid=2000010000&devid=358059043449333&manufacturer=samsung&version=3.0.0&client=android&loc=MTEzLjY1NDQ0OTQ2Mjg5MDYzLDM0LjgwOTE4MTIxMzM3ODkwNg%3D%3D&uuid=ffffffff-db69-afec-4b6f-b9ab3c02bbbc&cityid=20
00010000&model=GT-I9228&userid=1345361458243220&channel=wooboo06.d8&release=2.3.6&mac=NTA6Q0M6Rjg6QTQ6MzQ6RjI%3D tocrawl
http://m.baidu.com/ssid=0/from=2001m/bd_page_type=1/uid=1ADA21F27A014F2180A7E22E8BEE35B9/pu=usm%402%2Csz%401321_1003%2Cta%40utouch_2_2.3_1_9.0/img?tn=bdwis&word=%E6%A8%B1%E4%BA%95%E8%8E%89%E4%BA%9A%E7%94%B5%E5%BD%B1%E6%88%AA%E5%9B%BE&p
n=0&dw=w320&bs=176_208&pos=0&pinf=12_6_0_@bdwis_@av%E7%94%B5%E5%BD%B1%E9%AB%98%E6%BD%AE%E6%88%AA%E5%9B%BE_@176_208_@w320&fm=rs2&sp=&mid=w320 tocrawl
http://api.app.yiche.com/webapi/reviewtopic.ashx?op=get&topicid=166025 tocrawl
http://hm.baidu.com/hm.gif?si=b7723ac5ec07c308ac1ddf314523c2b0&et=0&nv=0&st=4<=1374609378&su=http%3A%2F%2Ffang.xinzheng.cc%2F&u=http%3A%2F%2Fwww.xinzheng.cc%2Ffangwu%2Fxiezilou&v=tc-1.0&rnd=2142219798 tocrawl
http://az.tpwap.cn/config.jsp?pos=448&clientId=435&sid=c2_1.1.1 tocrawl
better01.sinaapp.com tocrawl
better01.sinaapp.com tocrawl
http://af.upsdk.com/af/appActive?user_agent=Lenovo+A590&project=Lianxiang_Lenovo_Lenovo+A590&af_channel=Lianxiang&af_project=LXF_AG790_A01&af_version_code=198401&imei=D9B97DEA530324F235E73ABFA2CE003A&sid=1374609488731&brand=Lenovo&veri
fy_code=0ac0ba8ecb2a5455a052fa445c1937d5&encrpytion=DES&wifi_mac=0000000000000000&bt_mac=0000000000000000&cpu_serial=0000000000000000 tocrawl
http://af.upsdk.com/af/afPush?user_agent=Lenovo+A590&project=Lianxiang_Lenovo_Lenovo+A590&af_project=LXF_AG790_A01&af_channel=Lianxiang&af_version_code=198401&imei=D9B97DEA530324F235E73ABFA2CE003A&imsi=FD8CD80070CD6E0F71D8FAC3B8BF4F73&
sdk_version=16&sdk_name=4.1.1&af_version=3.7.1&sid=1374609488711&brand=Lenovo&verify_code=838773006a2a84415a64dea4aa117113&encrpytion=DES&lac=14196&cid=57917&wifi_mac=0000000000000000&bt_mac=0000000000000000&cpu_serial=0000000000000000
tocrawl
http://app.wapx.cn/action/push/api_ad?app_id=24eeb27a1f5032e40b4561317f5f460c&udid=354096050843721&imsi=460003891750725&net=&app_version=1.0.7&sdk_version=1.5.2&device_name=GT-S7562i&y=6a5434e15a787d6791343c83523df81b&device_type=andro
id&os_version=4.0.4&country_code=CN&language=zh&act=dangerb.game.llk.ReceiverRestrictedContext&channel=WAPS&device_width=480&device_height=800&at=1374600352535 tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=1492849784&src_uin=1665118942&fid=1492849784&spec=100&referer=mqq&term_type=pc&host=1&sign=0C82F9B2A7109F7C2C058C41FE6CB7810AC63C710B4A3474&rsp_type=img tocrawl
better01.sinaapp.com tocrawl
http://switching.atm.punchbox.org/v1/?appid=16252171-7DE8-159E-F72B-9A58CFD2D08D&ver=7.1.2 tocrawl
http://q1.qlogo.cn/g?b=mqq&k=S8TokNAI2eSw6kynuB2gjA&t=1374608380&refer=mqq&s=40 tocrawl
http://launchermsg.3g.cn/golaunchermsg/msgservice.do?funid=1&rd=-3535723825567048133 tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=2569887365&src_uin=1665118942&fid=2569887365&spec=100&referer=mqq&term_type=pc&host=1&sign=5D6B95E75ACFC11355D4542D151A876C72D62301FCAEBFD3&rsp_type=img tocrawl
better01.sinaapp.com tocrawl
http://app.adsofts.cn/action/connect/active?app_id=3fe23c1a9f592ec42abf6a7d012841ba&udid=865662012462439&imsi=460003778631834&net=cmnet&base=adsofts.cn&app_version=3.7&sdk_version=1.6.10&device_name=TE600+&device_brand=Ctyon&y=2d81b1d1
1e11ae7de129d3cce29a958c&device_type=android&os_version=4.1.2&country_code=CN&language=zh&cid=83mdas8k2mj0g70r4b0b68ko9kgr2dk6&act=com.androidemu.harvechise.ReceiverRestrictedContext&channel=gfan&device_width=320&device_height=480&at=1
374609806979 tocrawl
better01.sinaapp.com tocrawl
http://mb.hd.sohu.com.cn/mc.gif?uid=ff05360fb8fc15f1eb07c84904a9f863&url=1002&passport=&mtype=6<ype=&cv=2.8.1&mos=2&mosv=4.2.2&pro=1&mfo=BBK&mfov=vivo%20Xplay&webtype=2G&vid=&time=1374609536953&memo=0&type=1&channelid=91&value=&sim=1
&playlistid=&catecode=&preid=&newuser=0&enterid=0&startid=1374609536961&loc= tocrawl
http://launchermsg.3g.cn/golaunchermsg/msgservice.do?funid=1&rd=-3535723825567048133 tocrawl
http://wap.baidu.com/bd_page_type=1/pu=usm%400%2Csz%401330%5F640%2Cta%40big%5F%5F5%2E0%5F3%5F525/uid=FC69E3DEB768362786105AE0F78D77A7/t=wap/w=0_10_%E6%9E%81%E5%93%81%E5%A5%B3%E4%BB%99%E4%B9%A6%E5%8C%85%E7%BD%91%E9%98%85%E8%AF%BB/ssid=0
/from=128g/l=0/tc?func=nextp&pi=3&m=0&pn=15&src=http%3A%2F%2Fwww%2Ebookbao%2Ecom%2Fviews%2F201306%2F05%2Fid%5FXMzI4NDU2%5F14%2Ehtml tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=2251204892&src_uin=1665118942&fid=2251204892&spec=100&referer=mqq&term_type=pc&host=1&sign=CFAFF94D0282A4B4EBC81DBDB307CAD166B756E7433FC920&rsp_type=img tocrawl
http://r3.11222.cn/resource/bookclub/cover/95/84/2011120917151262.jpg tocrawl
http://m.i.ppsrc.com/d/82C09949DDA26192C2170EF4CB0A0084/640/640 tocrawl
http://app.wapx.cn/action/push/api_ad?app_id=24eeb27a1f5032e40b4561317f5f460c&udid=358864040778026&imsi=460003827213968&net=cmnet&app_version=1.0.2&sdk_version=1.5.2&device_name=GT-P6200&y=f34b836872c220f87c39d6bf6afeb123&device_type=a
ndroid&os_version=3.2&country_code=CN&language=zh&act=dangerb.game.llk.ReceiverRestrictedContext&channel=Samsung&device_width=600&device_height=976&at=1374609799140 tocrawl
better01.sinaapp.com tocrawl
http://psb.lenovomm.com/pushservice/2.1/poll?lpsst=B5AAAAAAXEPJgCAAAADAAAAUANHH1VAA0vAFhwaWQ9JmRpZD1PRFkyTlRFMk1ERTJOVE0yT1RRMiZzaWQ9TVVKRVJrTTNORFl6UlVFeVFUUXpOell5TVRFek56VkJSa1ExUTBWR056UXgmZHQ9YVcxbGFRK4GzrB5hOEri5Cx08JrOrw&ack=R1:
2106551173&min=180&max=320 tocrawl
http://api.app.yiche.com/webapi/reviewtopic.ashx?op=get&serialid=2573&level=1&pageindex=2&pagesize=20 tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=2215216651&src_uin=1157251974&fid=2215216651&spec=100&referer=mqq&term_type=pc&host=1&sign=08F0E013042DA3DB50682F97BFBD852A13558806FDF2A2E3&rsp_type=img tocrawl
http://q.qlogo.cn/headimg_dl?bs=qq&dst_uin=306486557&src_uin=1157251974&fid=306486557&spec=100&referer=mqq&term_type=pc&host=1&sign=08F0E013042DA3DB50682F97BFBD852A13558806FDF2A2E3&rsp_type=img tocrawl
http://psb.lenovomm.com/pushservice/2.1/poll?lpsst=B5AAAAAAIyZAkCAAAADAAAAT_pzFv2AA0vAFhwaWQ9JmRpZD1PRFk1TkRVNU1ERXlNVE16TWpFNSZzaWQ9UmpZNVJEa3hRMFkxTWpKQk56WXpNelkwUWpOQ1FUQkNSamhGTkRNMFJEY3gmZHQ9YVcxbGFRGaRu5QbEjFyJDcKU6sGCSg&ack=R1:
2005075059&min=300&max=300 tocrawl
http://api.changba.com/ktvbox.php?ac=pullnotice&macaddress=1C%3A66%3AAA%3A1A%3AF9%3A56&channelsrc=changba_A&deviceid=1C%3A66%3AAA%3A1A%3AF9%3A56&version=2.3.0&seret=f3f1470646&_userinfo=796 tocrawl
http://q3.qlogo.cn/g?b=mqq&k=9FqISKXvCKCf6sPAxeu8yA&t=1374446924&refer=mqq&s=100 tocrawl
http://q4.qlogo.cn/g?b=mqq&k=puGXojn2c7cOCeWibOce6yw&t=1374373655&refer=mqq&s=100 tocrawl
http://q1.qlogo.cn/g?b=mqq&k=ZRw4QCD4IoG0SsepbqNUfQ&t=1370823532&refer=mqq&s=100 tocrawl
http://a126.photo.store.qq.com/psb?/V10Z7w8y4FYPqA/zzUERobf01h9PPi7s*.q7SkeQsyEm0KTu1EfAYWWZ94!/a/dHi5G0t8JgAA&bo=yAALAQAAAAABAOU! tocrawl
[hadoop@centos-aaron-h1 ~]$
七、注意事项
mapreduce代码打包的时候,注意需要将mysql的驱动Jar包一并打入程序jar包中(否则运行时会因找不到驱动类而无法加载数据字典),可以使用maven插件,或者在开发工具中选取lib文件一起打包;
最后寄语,以上是博主本次文章的全部内容,如果大家觉得博主的文章还不错,请点赞;如果您对博主其它服务器大数据技术或者博主本人感兴趣,请关注博主博客,并且欢迎随时跟博主沟通交流。