[To be written] Ubuntu 16.04 Big Data Development: First Hands-on MR Program & Spark Program Development & Storm Program Development

In my view, hands-on development consists of three parts:

  1. Data collection, with Python
  2. Data storage, with HDFS
  3. Data processing, with MR, Spark, and Storm

Python data collection

HDFS data storage

Data processing

MR program development
An introductory program

Environment setup
IDE: IntelliJ IDEA
See these tutorials:
Remote debugging against the cluster:
https://blog.csdn.net/a_bang/article/details/83820022
Building the JAR yourself and running it on the cluster:
https://www.cnblogs.com/liaojie970/p/7920860.html
One detail differs from that tutorial: it says to use "build on make", but the actual setting is the one shown in the screenshot (image not included here).
After that you can see the JAR you want (screenshot not included here).
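If you prefer the command line to the IDE's artifact dialog, a rough equivalent (assuming the pom.xml below and a cluster node where the hadoop command is on the PATH) is to run mvn clean package, which puts wordcount-0.0.1-SNAPSHOT.jar under target/, and then submit it with something like hadoop jar target/wordcount-0.0.1-SNAPSHOT.jar ls.wordcount.WordCount <jobName> <input> <output>, matching the three arguments the WordCount main method below expects.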

Next, let's run a WordCount program to get some practice:
https://blog.csdn.net/l1394049664/article/details/82563637
pom.xml:


<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>wordcount-test</groupId>
  <artifactId>wordcount</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
      <dependencies>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.21</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flume</groupId>
            <artifactId>flume-ng-core</artifactId>
            <version>1.9.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flume.flume-ng-clients</groupId>
            <artifactId>flume-ng-log4jappender</artifactId>
            <version>1.6.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.7</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
        </dependency>




    </dependencies>
  <build>
    <plugins>
      <plugin>
        <artifactId>maven-war-plugin</artifactId>
        <version>2.6</version>
        <configuration>
          <warSourceDirectory>WebContent</warSourceDirectory>
          <failOnMissingWebXml>false</failOnMissingWebXml>
        </configuration>
      </plugin>
      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.5</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>

      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <configuration>
          <archive>
            <manifest>
              <mainClass>ls.wordcount.WordCount</mainClass>
              <addClasspath>true</addClasspath>
              <classpathPrefix>lib/</classpathPrefix>
            </manifest>
          </archive>
          <classesDirectory>
          </classesDirectory>
        </configuration>
      </plugin>


    </plugins>
  </build>
</project>

tree:
(project tree screenshot omitted)
WordCount.java (note the capitalization):

package ls.wordcount;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 
import java.io.IOException;
import java.util.StringTokenizer;
 
public class WordCount {
	
	public static class TokenizerMapper extends  Mapper<Object, Text, Text, IntWritable> {
 
		private final static IntWritable one = new IntWritable(1);
		private Text word = new Text();
		
		public void map(Object key, Text value, Context context)
		        throws IOException, InterruptedException {
		    StringTokenizer itr = new StringTokenizer(value.toString());
		    while (itr.hasMoreTokens()) {
		        word.set(itr.nextToken());
		        context.write(word, one);
		    }
		}
	}
	 public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
		 private IntWritable result = new IntWritable();
		
		 public void reduce(Text key, Iterable<IntWritable> values,
		         Context context) throws IOException, InterruptedException {
		     int sum = 0;
		     for (IntWritable val : values) {
		         sum += val.get();
		     }
		     result.set(sum);
		     context.write(key, result);
		 }
	 }
	
	 public static void main(String[] args) throws Exception {
		// Fall back to a default job name, input path and output path when not enough arguments are given
		if (args == null || args.length < 3) {
			args = new String[] { "wordcount", "/hadoop/input/wordcount.txt", "/hadoop/output/wordcnd2" };
		}


	        Configuration conf = new Configuration();
	        Job job = Job.getInstance(conf, args[0]);
	        job.setJarByClass(WordCount.class);
	        job.setMapperClass(TokenizerMapper.class);
	        job.setCombinerClass(IntSumReducer.class);
	        job.setReducerClass(IntSumReducer.class);
	        job.setOutputKeyClass(Text.class);
	        job.setOutputValueClass(IntWritable.class);
	        // NLineInputFormat gives each map task a fixed number of input lines (1 by default); the more common TextInputFormat would also work here
	        job.setInputFormatClass(NLineInputFormat.class);
	        // Input file path
	        FileInputFormat.addInputPath(job, new Path(args[1]));
	        // Output file path
	        FileOutputFormat.setOutputPath(job, new Path(args[2]));
	        System.exit(job.waitForCompletion(true) ? 0 : 1);
	    }

}
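To sanity-check the job, a tiny hand-made example helps (the file contents here are just an illustration): if /hadoop/input/wordcount.txt contains

hello world
hello hadoop

then part-r-00000 in the output directory should contain the per-word counts, one word per line with a tab-separated count:

hadoop	1
hello	2
world	1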

For reference, https://www.cnblogs.com/hd-zg/p/5911447.html has quite a few sample exercises; let's take the first one as practice.

Problem 1 (from the tutorial above): compute the total salary of each department
  1. Prepare the dept and emp data

emp:

7369,SMITH,CLERK,7902,17-12月-80,800,,20
7499,ALLEN,SALESMAN,7698,20-2月-81,1600,300,30
7521,WARD,SALESMAN,7698,22-2月-81,1250,500,30
7566,JONES,MANAGER,7839,02-4月-81,2975,,20
7654,MARTIN,SALESMAN,7698,28-9月-81,1250,1400,30
7698,BLAKE,MANAGER,7839,01-5月-81,2850,,30
7782,CLARK,MANAGER,7839,09-6月-81,2450,,10
7839,KING,PRESIDENT,,17-11月-81,5000,,10
7844,TURNER,SALESMAN,7698,08-9月-81,1500,0,30
7900,JAMES,CLERK,7698,03-12月-81,950,,30
7902,FORD,ANALYST,7566,03-12月-81,3000,,20
7934,MILLER,CLERK,7782,23-1月-82,1300,,10

dept:

10,ACCOUNTING,NEW YORK
20,RESEARCH,DALLAS
30,SALES,CHICAGO
40,OPERATIONS,BOSTON
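For reference, summing the salary column (the sixth field) of the emp rows by department gives ACCOUNTING 8750, RESEARCH 6775 and SALES 9400, while OPERATIONS has no employees at all, so the job's output should look roughly like those three department names with their totals.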
  2. Prepare the program
    pom.xml: identical to the pom.xml of the WordCount example above, so it is not repeated here.


WordCount.java (I was too lazy to rename it; it was copied over from the wordcount directory, so the class is still called WordCount as well):

package ls.wordcount;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCount extends Configured implements Tool {

    public static class MapClass extends Mapper<LongWritable, Text, Text, Text> {

        // Caches the contents of the dept file (department id -> department name)
        private Map<String, String> deptMap = new HashMap<String, String>();
        private String[] kv;

        // Runs once per map task, before any call to map()
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            BufferedReader in = null;
            try {
                // Get the files cached for this job (DistributedCache is deprecated in Hadoop 2.x but still works)
                Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
                String deptIdName = null;
                for (Path path : paths) {
                    // Split the department file fields and cache them in deptMap,
                    // with the department id as key and the department name as value
                    if (path.toString().contains("dept")) {
                        in = new BufferedReader(new FileReader(path.toString()));
                        while (null != (deptIdName = in.readLine())) {
                            deptMap.put(deptIdName.split(",")[0], deptIdName.split(",")[1]);
                        }
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    if (in != null) {
                        in.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split the employee record; fields: empno, ename, job, mgr, hiredate, sal, comm, deptno
            kv = value.toString().split(",");
            if (kv.length < 8) {
                // Skip malformed lines (sal is field 5, deptno is field 7)
                return;
            }
            // Map-side join: filter out unneeded data in the map phase and
            // emit the department name as key and the employee salary as value
            if (deptMap.containsKey(kv[7].trim()) && !kv[5].trim().isEmpty()) {
                context.write(new Text(deptMap.get(kv[7].trim())), new Text(kv[5].trim()));
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, LongWritable> {

        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Sum the salaries of all employees in the same department
            long sumSalary = 0;
            for (Text val : values) {
                sumSalary += Long.parseLong(val.toString());
            }
            // Emit the department name as key and its total salary as value
            context.write(key, new LongWritable(sumSalary));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        // Create the job and set its name, Mapper and Reducer classes
        Job job = Job.getInstance(getConf(), "WordCount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(MapClass.class);
        job.setReducerClass(Reduce.class);

        // Input format
        job.setInputFormatClass(TextInputFormat.class);

        // Output format and key/value types (map emits Text/Text, reduce emits Text/LongWritable)
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Argument 1 is the department file to cache, argument 2 the employee data path, argument 3 the output path
        String[] otherArgs = new GenericOptionsParser(job.getConfiguration(), args).getRemainingArgs();
        DistributedCache.addCacheFile(new Path(otherArgs[0]).toUri(), job.getConfiguration());
        FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

        job.waitForCompletion(true);
        return job.isSuccessful() ? 0 : 1;
    }

    /**
     * Main method, the entry point.
     * @param args input arguments
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new WordCount(), args);
        System.exit(res);
    }
}
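Submitting it follows the same pattern as the first program; something along these lines should work (the paths are only examples), with the dept file as the first argument, the emp file as the second, and a not-yet-existing output directory as the third: hadoop jar wordcount-0.0.1-SNAPSHOT.jar ls.wordcount.WordCount /hadoop/input/dept /hadoop/input/emp /hadoop/output/deptsalary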
  3. Debug and run
    (screenshot of the run omitted)
  4. Open question
    Why does the output directory have to be changed for every run? Could the job keep appending to the previous run's output instead, or does it always need a brand-new directory name? (A common workaround is sketched after this list.)
    More exercises:
    https://www.cnblogs.com/ManchesterCityCoder/p/10799511.html
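The reason is that FileOutputFormat refuses to start a job whose output directory already exists, so it can never append to earlier results; you either pick a new directory or remove the old one first. A minimal sketch of the usual workaround, placed in the driver (the run() method above) right before setting the output path and calling job.waitForCompletion, assuming you are fine with discarding the previous results:

// requires: import org.apache.hadoop.fs.FileSystem;
// Delete the existing output directory so the same path can be reused across runs
Path outputDir = new Path(otherArgs[2]);                 // the job's output path
FileSystem fs = FileSystem.get(job.getConfiguration());
if (fs.exists(outputDir)) {
    fs.delete(outputDir, true);                          // true = delete recursively
}
FileOutputFormat.setOutputPath(job, outputDir);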
Spark
Storm