电商实战
使用mapreduce编程
运行环境搭建
1.maven配置
2.hadoop下载,并配置环境变量
3.导入依赖,pom.xml
<!-- 添加hadoop的依赖 -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.1.1</version>
</dependency>
4.idea创建maven项目
编写代码
ProvinceStatApp.java
省份浏览量统计
package com.task.ds.pro;
import com.task.ds.utils.IPParser;
import com.task.ds.utils.LogParser;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Map;
/**
* 省份浏览量统计
*/
public class ProvinceStatApp {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
FileSystem fileSystem = FileSystem.get(configuration);
Path outputPath = new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackInfo_out");
if (fileSystem.exists(outputPath)) {
fileSystem.delete(outputPath, true);
}
Job job = Job.getInstance(configuration);
job.setJarByClass(ProvinceStatApp.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
FileInputFormat.setInputPaths(job, new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackinfo_20130721.txt"));
FileOutputFormat.setOutputPath(job, new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackInfo_out"));
job.waitForCompletion(true);
}
static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
private LongWritable ONE = new LongWritable(1);
private LogParser logParser;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
logParser = new LogParser();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String log = value.toString();
Map<String, String> info = logParser.parse(log);
String ip = info.get("ip");
if (StringUtils.isNotBlank(ip)) {
IPParser.RegionInfo regionInfo = IPParser.getInstance().analyseIp(ip);
if (regionInfo != null) {
String provine = regionInfo.getProvince();
if (StringUtils.isNotBlank(provine)) {
context.write(new Text(provine), ONE);
} else {
context.write(new Text("-"), ONE);
}
} else {
context.write(new Text("-"), ONE);
}
} else {
context.write(new Text("-"), ONE);
}
}
}
static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
long count = 0;
System.out.println(context);
for (LongWritable value : values) {
count++;
}
context.write(key, new LongWritable(count));
}
}
}
结果展示
- 923
上海市 72898
云南省 1480
内蒙古自治区 1298
北京市 42501
台湾省 254
吉林省 1435
四川省 4442
天津市 11042
宁夏 352
安徽省 5429
山东省 10145
山西省 2301
广东省 51508
广西 1681
新疆 840
江苏省 25042
江西省 2238
河北省 7294
河南省 5279
浙江省 20627
海南省 814
湖北省 7187
湖南省 2858
澳门特别行政区 6
甘肃省 1039
福建省 8918
西藏 110
贵州省 1084
辽宁省 2341
重庆市 1798
陕西省 2487
青海省 336
香港特别行政区 45
黑龙江省 1968
etl.java
提取关键信息:ip、url、pageId(topicId对应的页面Id)、country、province、city
package com.task.ds.pro;
import com.task.ds.utils.IPParser;
import com.task.ds.utils.LogParser;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Map;
/**
* etl提取关键信息
*/
/**
 * ETL job: extracts key fields (pageId, ip, country, province, city, time)
 * from each raw log line, keyed by URL.
 *
 * <p>Mapper: parses the log line with {@link LogParser}, derives the pageId
 * from the URL via {@code ContentUtils}, and emits
 * (url, ",pageId,ip,country,province,city,time").
 *
 * <p>Reducer: identity — forwards every (url, record) pair unchanged.
 */
public class etl {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        FileSystem fileSystem = FileSystem.get(configuration);
        // Delete the job's OWN output directory if it exists (the original code
        // deleted trackInfo_out but wrote to outInfo_out, so re-runs failed
        // with FileAlreadyExistsException).
        Path outputPath = new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\outInfo_out");
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }
        Job job = Job.getInstance(configuration);
        // Reference this class, not ProvinceStatApp, so Hadoop ships the right jar.
        job.setJarByClass(etl.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackinfo_20130721.txt"));
        FileOutputFormat.setOutputPath(job, outputPath);
        // Propagate the job result so callers/scripts can detect failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        private LogParser logParser;
        // Created once per task in setup() instead of once per record in map().
        private ContentUtils contentUtils;
        // Reused output objects to avoid per-record allocation.
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            logParser = new LogParser();
            contentUtils = new ContentUtils();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            Map<String, String> info = logParser.parse(value.toString());
            String url = info.get("url");
            String out = "," + contentUtils.getPageId(url)
                    + "," + info.get("ip")
                    + "," + info.get("country")
                    + "," + info.get("province")
                    + "," + info.get("city")
                    + "," + info.get("time");
            outKey.set(url);
            outValue.set(out);
            context.write(outKey, outValue);
        }
    }

    static class MyReduce extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Identity reduce: emit each extracted record as-is.
            for (Text value : values) {
                context.write(key, value);
            }
        }
    }
}
结果展示
http://b2b.yihaodian.com/cms/view.do?topicId=21407 ,21407,115.85.250.26,中国,甘肃省,兰州市,2013-07-21 19:25:27
http://b2b.yihaodian.com/cms/view.do?topicId=23542&tracker_u=1010825072 ,23542,101.227.253.115,中国,上海市,null,2013-07-21 05:30:27
http://b2b.yihaodian.com/cms/view.do?topicId=23704&ref=ad.12084_2108676_1 ,23704,180.155.238.184,中国,上海市,null,2013-07-21 09:12:56
http://b2b.yihaodian.com/cms/view.do?topicId=23705&ref=ad.7901_2115482_1 ,23705,119.85.21.23,中国,重庆市,null,2013-07-21 17:25:49
http://b2b.yihaodian.com/customerMessage.do ,,222.69.22.126,中国,上海市,南汇区,2013-07-21 18:59:50
http://b2b.yihaodian.com/index.do ,,222.177.67.13,中国,重庆市,null,2013-07-21 17:50:08