Production Internship 5

E-commerce Practice

Programming with MapReduce

Setting Up the Runtime Environment

1. Configure Maven.
2. Download Hadoop and set the required environment variables.
3. Add the Hadoop dependency to pom.xml:

    <!-- Hadoop dependency -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>3.1.1</version>
    </dependency>

4. Create a Maven project in IDEA (a quick sanity check is sketched below).
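Before writing the jobs, it can be worth confirming that the Hadoop dependency resolves and that a default Configuration loads. A minimal sanity-check sketch (the class name HadoopSetupCheck is just for illustration, not part of the project):

import org.apache.hadoop.conf.Configuration;

// With no cluster configuration on the classpath, fs.defaultFS falls back to file:///,
// which is what the local runs below rely on.
public class HadoopSetupCheck {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        System.out.println("fs.defaultFS = " + conf.get("fs.defaultFS"));
    }
}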

Writing the Code

ProvinceStatApp.java

Province page-view statistics

package com.task.ds.pro;

import com.task.ds.utils.IPParser;
import com.task.ds.utils.LogParser;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Map;


/**
 * Province page-view statistics
 */
public class ProvinceStatApp {

    public static void main(String[] args) throws Exception {

        Configuration configuration = new Configuration();

        // Delete the output directory if it already exists so the job can be rerun
        FileSystem fileSystem = FileSystem.get(configuration);
        Path outputPath = new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackInfo_out");
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        Job job = Job.getInstance(configuration);
        job.setJarByClass(ProvinceStatApp.class);

        // Mapper and Reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // Map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Input file and output directory (local paths, run in local mode)
        FileInputFormat.setInputPaths(job, new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackinfo_20130721.txt"));
        FileOutputFormat.setOutputPath(job, new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackInfo_out"));

        job.waitForCompletion(true);


    }

    static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        private final LongWritable ONE = new LongWritable(1);

        private LogParser logParser;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Create the log parser once per map task
            logParser = new LogParser();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String log = value.toString();

            // Parse the raw log line and pull out the client IP
            Map<String, String> info = logParser.parse(log);
            String ip = info.get("ip");

            if (StringUtils.isNotBlank(ip)) {
                // Resolve the IP to a region; fall back to "-" when it cannot be resolved
                IPParser.RegionInfo regionInfo = IPParser.getInstance().analyseIp(ip);
                if (regionInfo != null) {
                    String province = regionInfo.getProvince();
                    if (StringUtils.isNotBlank(province)) {
                        context.write(new Text(province), ONE);
                    } else {
                        context.write(new Text("-"), ONE);
                    }
                } else {
                    context.write(new Text("-"), ONE);
                }
            } else {
                context.write(new Text("-"), ONE);
            }

        }


    }

    static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the counts emitted for this province
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            context.write(key, new LongWritable(count));
        }
    }
}


Results (province and page views)
-	923
上海市	72898
云南省	1480
内蒙古自治区	1298
北京市	42501
台湾省	254
吉林省	1435
四川省	4442
天津市	11042
宁夏	352
安徽省	5429
山东省	10145
山西省	2301
广东省	51508
广西	1681
新疆	840
江苏省	25042
江西省	2238
河北省	7294
河南省	5279
浙江省	20627
海南省	814
湖北省	7187
湖南省	2858
澳门特别行政区	6
甘肃省	1039
福建省	8918
西藏	110
贵州省	1084
辽宁省	2341
重庆市	1798
陕西省	2487
青海省	336
香港特别行政区	45
黑龙江省	1968
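Note: ProvinceStatApp (and the etl job below) depends on two helper classes under com.task.ds.utils, LogParser and IPParser, which are not included in this post. Their shape, as implied by the calls above, is roughly as follows; the bodies are placeholder sketches (one class per file), not the original implementations:

package com.task.ds.utils;

import java.util.HashMap;
import java.util.Map;

// Sketch of LogParser: the real parsing of trackinfo_20130721.txt is not shown in the post.
public class LogParser {
    public Map<String, String> parse(String log) {
        Map<String, String> info = new HashMap<>();
        // TODO: split the raw line and fill the keys the jobs read:
        // "ip", "url", "country", "province", "city", "time"
        return info;
    }
}

// Sketch of IPParser: the IP-to-region lookup itself is omitted.
public class IPParser {

    private static final IPParser INSTANCE = new IPParser();

    private IPParser() {
    }

    public static IPParser getInstance() {
        return INSTANCE;
    }

    public RegionInfo analyseIp(String ip) {
        // TODO: look the IP up in an IP-region database and fill a RegionInfo
        return null;
    }

    public static class RegionInfo {
        private String country;
        private String province;
        private String city;

        public String getCountry() { return country; }
        public String getProvince() { return province; }
        public String getCity() { return city; }
    }
}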


etl.java

Extracts the key fields: ip, url, pageId (the page ID corresponding to topicId), country, province, and city.

package com.task.ds.pro;

import com.task.ds.utils.LogParser;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Map;


/**
 * ETL: extract the key fields from the raw logs
 */
public class etl {

    public static void main(String[] args) throws Exception {

        Configuration configuration = new Configuration();

        // Delete the output directory if it already exists so the job can be rerun
        FileSystem fileSystem = FileSystem.get(configuration);
        Path outputPath = new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\outInfo_out");
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        Job job = Job.getInstance(configuration);
        job.setJarByClass(etl.class);

        // Mapper and Reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);

        // Both map output and final output are Text/Text
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input file and output directory (local paths, run in local mode)
        FileInputFormat.setInputPaths(job, new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackinfo_20130721.txt"));
        FileOutputFormat.setOutputPath(job, new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\outInfo_out"));

        job.waitForCompletion(true);


    }

    static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {

        private LogParser logParser;
        private ContentUtils contentUtils;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Create the parsers once per map task instead of once per record
            logParser = new LogParser();
            contentUtils = new ContentUtils();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String log = value.toString();

            // Parse the raw log line and pull out the fields to keep
            Map<String, String> info = logParser.parse(log);
            String ip = info.get("ip");
            String url = info.get("url");
            String pageId = contentUtils.getPageId(url);
            String country = info.get("country");
            String province = info.get("province");
            String city = info.get("city");
            String time = info.get("time");

            // Emit: url -> ,pageId,ip,country,province,city,time
            String out = "," + pageId + "," + ip + "," + country + "," + province + "," + city + "," + time;
            context.write(new Text(url), new Text(out));
        }

    }

    static class MyReduce extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Identity reduce: write every parsed record through unchanged
            for (Text value : values) {
                context.write(key, value);
            }
        }
    }
}




Results (url followed by pageId, ip, country, province, city, time)

http://b2b.yihaodian.com/cms/view.do?topicId=21407	,21407,115.85.250.26,中国,甘肃省,兰州市,2013-07-21 19:25:27
http://b2b.yihaodian.com/cms/view.do?topicId=23542&tracker_u=1010825072	,23542,101.227.253.115,中国,上海市,null,2013-07-21 05:30:27
http://b2b.yihaodian.com/cms/view.do?topicId=23704&ref=ad.12084_2108676_1	,23704,180.155.238.184,中国,上海市,null,2013-07-21 09:12:56
http://b2b.yihaodian.com/cms/view.do?topicId=23705&ref=ad.7901_2115482_1	,23705,119.85.21.23,中国,重庆市,null,2013-07-21 17:25:49
http://b2b.yihaodian.com/customerMessage.do	,,222.69.22.126,中国,上海市,南汇区,2013-07-21 18:59:50
http://b2b.yihaodian.com/index.do	,,222.177.67.13,中国,重庆市,null,2013-07-21 17:50:08
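The etl job also uses a ContentUtils helper that is not shown. Judging from the output above, pageId is simply the value of the topicId query parameter in the URL (and empty when the URL has none), so a minimal sketch, assuming that behaviour, could be:

package com.task.ds.pro;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Sketch of ContentUtils: extract the topicId value from the URL as the pageId.
public class ContentUtils {

    private static final Pattern TOPIC_ID = Pattern.compile("topicId=(\\d+)");

    public String getPageId(String url) {
        if (url == null) {
            return "";
        }
        Matcher matcher = TOPIC_ID.matcher(url);
        return matcher.find() ? matcher.group(1) : "";
    }
}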

