hadoop各组件Java API

DRAmonster

已于 2022-11-02 14:41:56 修改

阅读量884

点赞数

文章标签： hadoop java

于 2020-08-12 00:07:46 首次发布

本文链接：https://blog.csdn.net/DRAmonster/article/details/107406997

版权

使用Java API

HDFS

Hdfs Java API

Java API对hdfs文件读写操作：
1.创建获取配置对象
Configuration conf=new Configuration();
//configuration对象 对文件中的属性进行封装 默认的属性信息来自于hadoop的配置文件
//core-site.xml hdfs-site.xml
conf.addResource(new Path(coreSitePath));
conf.addResource(new Path(hdfsSitePath));
//利用fileSystem（抽象类）的静态方法get 获取fileSystem的实例
//主要使用FileSystem接口的方法对hdfs文件进行各种操作
FileSystem fSystem=FileSystem.get(conf);
2.上传文件
fSystem.copyFromLocalFile(localPath,hdfsPath);
3.使用流的方式上传文件
//向hdfs上传文件时，先在hdfs中创建一个文件，但先不使用，等所有的资源和任务分配好之后再真正执行上传文件
 FSDataOutputStream out=fSystem.create(hdfsPath);
//指定要上传的文件路径
FileInputStream fileInputStream=new FileInputStream(localPath);
//使用IOUtils接口上传文件
IOUtils.copy(fileInputStream,out);
4.下载文件
fSystem.copyToLocalFile(hdfsPath,localPath);
5.以流的方式下载文件
//用FSDataInputStream对象读取hdfs文件
FSDataInputStream in =fSystem.open(hdfsPath);
//使用FileOutputStream对象接收流文件
FileOutputStream outputStream=new FileOutputStream(localPath);
IOUtils.copy(in,outputStream);
6.创建目录
 //返回值为布尔类型，返回成功或失败信息
boolean b=fSystem.mkdirs(hdfsPath);
7.删除目录或文件
 boolean b=fSystem.delete(hdfsPath);
8.打印指定目录下的文件或目录信息
 FileStatus[] listStatus=fSystem.listStatus(hdfsPath);
for(FileStatus files:listStatus){
     System.out.println((files.isFile()?"file:":"directory")+files.getPath().getName());
        }
9.读取文件内容
 FSDataInputStream in=fSystem.open(new Path(filePath));
//BufferedReader是为了提高读取效率而设计的包装类：它包装字符输入流并从中读取文本，默认缓冲8192个字符，从而实现字符、数组和行的高效读取。
 BufferedReader bufferedReader=new BufferedReader(new InputStreamReader(in));
10.向文件中写内容
//首先在hdfs中选择要写入的文件
FSDataOutputStream out=fSystem.create(hdfsPath);
//fSystem.create()默认会覆盖已存在的文件；若要以追加形式写文件，应改用fSystem.append(hdfsPath)获取输出流（BufferedWriter本身没有"追加"开关）
BufferedWriter bufferedWriter=new BufferedWriter(new OutputStreamWriter(out));
bufferedWriter.write("hello tonghan");

具体代码

package com.hcip;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import java.io.*;
/**
 * Demo client for basic HDFS file operations built on the Hadoop {@link FileSystem} API:
 * upload, download, mkdir, delete, list, read and write.
 *
 * <p>{@link #init()} must be called before any other method; it creates the
 * {@link Configuration} and obtains the {@link FileSystem} that every operation uses.
 */
public class hcip {
    Configuration conf=null;
    FileSystem fSystem=null;

    /**
     * Loads {@code core-site.xml} and {@code hdfs-site.xml} from the {@code conf}
     * directory under the current working directory and opens the file system.
     *
     * @throws Exception if the file system cannot be obtained
     */
    public void init() throws Exception{
        // Both configuration files are expected in <working dir>/conf.
        String baseDir=System.getProperty("user.dir")+File.separator+"conf";
        String corePath=baseDir+File.separator+"core-site.xml";
        String hdfsPath=baseDir+File.separator+"hdfs-site.xml";

        // Resources added explicitly are layered on top of the defaults that
        // new Configuration() picks up on its own.
        conf=new Configuration();
        conf.addResource(new Path(corePath));
        conf.addResource(new Path(hdfsPath));
        fSystem=FileSystem.get(conf);
    }

    /**
     * Uploads the local test file to the HDFS root directory.
     *
     * <p>Closes {@code fSystem} afterwards, so further operations on this object
     * are expected to fail.
     */
    public void upload() throws Exception{
        fSystem.copyFromLocalFile(new Path("C:/Users/Administrator/Desktop/test.txt"),new Path("/"));
        fSystem.close();
    }

    /**
     * Uploads the local test file to {@code /hcip.txt} by copying streams,
     * overwriting any existing file.
     */
    public void ioupload() throws Exception{
        // Open the local file first: if it is missing we fail before the HDFS target
        // is created/truncated. try-with-resources closes both streams, which is what
        // completes the HDFS write.
        try(FileInputStream localIn=new FileInputStream("C:/Users/Administrator/Desktop/test.txt");
            FSDataOutputStream hdfsOut=fSystem.create(new Path("/hcip.txt"),true)){
            IOUtils.copy(localIn,hdfsOut);
        }
    }

    /**
     * Downloads {@code /test.txt} to the local disk without deleting the source.
     *
     * <p>Closes {@code fSystem} afterwards, so further operations on this object
     * are expected to fail.
     */
    public void download() throws Exception{
        fSystem.copyToLocalFile(false,new Path("/test.txt"),new Path("C:/Users/Administrator/Desktop/hcip/hcip1.txt"),true);
        fSystem.close();
    }

    /** Downloads {@code /test.txt} to the local disk by copying streams. */
    public void Iodownload() throws Exception{
        try(FSDataInputStream hdfsIn=fSystem.open(new Path("/test.txt"));
            FileOutputStream localOut=new FileOutputStream("C:/Users/Administrator/Desktop/hcip/hcip.txt")){
            IOUtils.copy(hdfsIn,localOut);
        }
    }

    /** Creates the directory {@code /tonghan} and prints whether that succeeded. */
    public void mkdir() throws Exception{
        boolean created=fSystem.mkdirs(new Path("/tonghan"));
        System.out.println(created);
    }

    /** Recursively deletes {@code /tonghan} and prints whether that succeeded. */
    public void delete() throws Exception{
        // Explicit two-argument form replaces the deprecated delete(Path).
        boolean deleted=fSystem.delete(new Path("/tonghan"),true);
        System.out.println(deleted);
    }

    /** Prints every entry directly under the HDFS root, tagged as file or directory. */
    public void list() throws Exception{
        FileStatus[] listStatus=fSystem.listStatus(new Path("/"));
        for(FileStatus status:listStatus){
            System.out.println((status.isFile()?"file:":"directory")+status.getPath().getName());
        }
    }

    /**
     * Prints the given HDFS file to standard output line by line.
     *
     * @param filePath HDFS path of the file to read
     */
    public void read(String filePath) throws Exception{
        // Decode explicitly as UTF-8 rather than with the client's platform charset.
        try(BufferedReader bufferedReader=new BufferedReader(
                new InputStreamReader(fSystem.open(new Path(filePath)),"UTF-8"))){
            String line;
            while((line=bufferedReader.readLine())!=null){
                System.out.println(line);
            }
        }
    }

    /**
     * Creates the given HDFS file and writes a fixed greeting into it.
     *
     * @param filePath HDFS path of the file to write
     */
    public void Writer(String filePath) throws Exception{
        // Closing the writer flushes it and closes the underlying HDFS stream.
        try(BufferedWriter bufferedWriter=new BufferedWriter(
                new OutputStreamWriter(fSystem.create(new Path(filePath)),"UTF-8"))){
            bufferedWriter.write("hello tonghan");
        }
    }

    /**
     * Entry point: initialises the client and prints {@code /hcip.txt}.
     *
     * <p>Flow: build a Configuration, obtain a FileSystem through the static
     * {@code FileSystem.get}, then call FileSystem methods for the actual file work.
     */
    public static void main(String[] args) throws Exception{
        hcip hcip = new hcip();
        hcip.init();
        hcip.read("/hcip.txt");
    }
}

BufferedReader
是为了提高读取效率而设计的包装类：它包装字符输入流并从中读取文本，默认缓冲8192个字符，从而实现字符、数组和行的高效读取。
在这里插入图片描述

MapReduce

//首先MapReduce使用的是hdfs上的文件，所以也要创建获取配置对像
static Configuration conf = new Configuration();
//创建job，要将job提交给RM
 Job job=Job.getInstance(conf,"classname");
//设置job运行的主类
job.setJarByClass(MapReduce.class);
//设置job从哪里读数据，怎么处理怎么输出（主体）
Path inputPath=new Path(hdfsPath);
FileInputFormat.setInputPaths(job,inputPath);
//map阶段，wordCountMapper.class是需要自定义的类
job.setMapperClass(wordCountMapper.class);
//map阶段<key,value>输出的数据类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
//shuffle
//reduce阶段，wordCountReducer.class是需要自定义的类
job.setReducerClass(wordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//设置MapReduce（job）任务完后的文件输出路径
Path outputPath=new Path(hdfsPath);
FileOutputFormat.setOutputPath(job,outputPath);
//提交任务
boolean isSuccess=job.waitForCompletion(true);
//成功返回0，失败返回1
System.exit(isSuccess?0:1);

在这里插入图片描述
数据要在网络中传输，必须经过序列化（Hadoop提供了以下几个包装类）。
int——>IntWritable
long——>LongWritable
String——>Text
Text——>String（调用toString()）

向hdfs提交MapReduce的Wordcount任务：

package com.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import javax.xml.crypto.dsig.keyinfo.KeyInfo;
import java.io.File;
import java.io.IOException;

/**
 * Word-count MapReduce job: counts how often each space-separated word occurs in the
 * input and writes one {@code word<TAB>count} record per distinct word.
 *
 * <p>Usage: {@code MapReduce <input path> <output path>}.
 */
public class MapReduce {

    static Configuration conf=null;

    /**
     * Builds the job configuration from {@code core-site.xml} and {@code hdfs-site.xml}
     * located in the {@code conf} directory under the current working directory.
     */
    public  void init(){
        String proDir= System.getProperty("user.dir");
        String baseDir = proDir+File.separator+"conf";
        String corePath = baseDir+File.separator+"core-site.xml";
        String hdfsPath = baseDir+File.separator+"hdfs-site.xml";
        // Resources added explicitly are layered on top of the defaults that
        // new Configuration() picks up on its own.
        conf = new Configuration();
        conf.addResource(new Path(corePath));
        conf.addResource(new Path(hdfsPath));
    }

    /** Emits {@code (word, 1)} for every non-empty space-separated token of a line. */
    private static class wordCountMapper extends Mapper<LongWritable, Text,Text, IntWritable>{
        private final Text mapOutputKey=new Text();
        private final static IntWritable mapOutputValue=new IntWritable(1);

        @Override
        protected  void map(LongWritable key,Text value,Context context)
                throws IOException,InterruptedException{
            String[] items=value.toString().split(" ");
            for (String item:items){
                // Consecutive or leading spaces yield empty tokens; they are not words.
                if (item.isEmpty()){
                    continue;
                }
                mapOutputKey.set(item);
                context.write(mapOutputKey,mapOutputValue);
            }
        }
    }

    /** Sums all counts of one word and emits a single {@code (word, total)} record. */
    private static class wordCountReducer
            extends Reducer<Text,IntWritable,Text,IntWritable>{
        private final IntWritable total=new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum=0;
            for(IntWritable value:values){
                sum+=value.get();
            }
            // Write once, after the loop: writing inside the loop would emit a
            // running partial sum for every value instead of one total per word.
            total.set(sum);
            context.write(key,total);
        }
    }

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String[] args) throws Exception{
        if (args.length<2){
            System.err.println("Usage: MapReduce <input path> <output path>");
            System.exit(2);
        }
        MapReduce mapReduce = new MapReduce();
        mapReduce.init();

        Job job=Job.getInstance(conf,"MapReduce");
        // The jar containing this class is shipped to the cluster.
        job.setJarByClass(MapReduce.class);

        // input
        FileInputFormat.setInputPaths(job,new Path(args[0]));

        // map and its intermediate output types
        job.setMapperClass(wordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // reduce and the final output types
        job.setReducerClass(wordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // output
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        boolean isSuccess=job.waitForCompletion(true);
        System.exit(isSuccess?0:1);
    }
}

向yarn提交join任务：
在这里插入图片描述

package com.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.util.Vector;
import java.io.File;
import java.io.IOException;

/**
 * Reduce-side join of two space-separated text tables.
 *
 * <p>Rows from files whose path contains {@code province} have two fields
 * ({@code key value}); rows from files whose path contains {@code data} have three
 * ({@code key v1 v2}). The mapper tags each row with its origin ({@code a#} / {@code b#})
 * and the reducer emits the cross product of both sides for every key.
 *
 * <p>Usage: {@code join <input path> <output path>}.
 */
public class join {
    static Configuration conf=null;

    /**
     * Builds the job configuration from {@code core-site.xml} and {@code hdfs-site.xml}
     * located in the {@code conf} directory under the current working directory.
     */
    public void init(){
        String proDir=System.getProperty("user.dir");
        String baseDir = proDir+ File.separator+"conf";
        String corePath = baseDir+File.separator+"core-site.xml";
        String hdfsPath = baseDir+File.separator+"hdfs-site.xml";
        // Resources added explicitly are layered on top of the defaults that
        // new Configuration() picks up on its own.
        conf = new Configuration();
        conf.addResource(new Path(corePath));
        conf.addResource(new Path(hdfsPath));
    }

    /** Tags every valid row with the table it came from and keys it by its first field. */
    private static class mapper extends Mapper<LongWritable,Text,Text,Text>{
        private final Text mapOutputKey=new Text();
        private final Text mapOutputVlaue=new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // The path of the file this split belongs to tells the two tables apart.
            FileSplit fileSplit=(FileSplit) context.getInputSplit();
            String path=fileSplit.getPath().toString();
            String[] fields=value.toString().split(" ");

            String valueOut;
            // else-if: a path matching both markers (e.g. /data/province.txt) must be
            // handled by exactly one branch, otherwise its rows fail the other
            // branch's field-count check and are silently dropped.
            if(path.contains("province")){
                // A province row must have exactly 2 fields; anything else is invalid.
                if(fields.length!=2){
                    return;
                }
                valueOut="a# "+fields[1];
            }
            else if(path.contains("data")){
                // A data row must have exactly 3 fields; anything else is invalid.
                if(fields.length!=3){
                    return;
                }
                valueOut="b# "+fields[1]+" "+fields[2];
            }
            else{
                // Unknown source file: emit nothing rather than an empty key/value.
                return;
            }
            mapOutputKey.set(fields[0]);
            mapOutputVlaue.set(valueOut);
            context.write(mapOutputKey,mapOutputVlaue);
        }
    }

    /** Joins both sides of one key by emitting every (province row, data row) pair. */
    private static class reducer extends Reducer<Text,Text,Text,Text>{
        private final Text VALUE=new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Fully-qualified names keep this block independent of the file's imports.
            java.util.List<String> provinceRows=new java.util.ArrayList<String>();
            java.util.List<String> dataRows=new java.util.ArrayList<String>();
            for (Text val :values) {
                String tagged=val.toString();
                // Drop the 2-character tag; trim() also removes the separator space
                // written after the tag so that only the payload remains.
                if (tagged.startsWith("a#")) {
                    provinceRows.add(tagged.substring(2).trim());
                }
                else if (tagged.startsWith("b#")) {
                    dataRows.add(tagged.substring(2).trim());
                }
            }
            // Cross product; nothing is written if either side is empty.
            for (String provinceRow:provinceRows) {
                for (String dataRow:dataRows) {
                    VALUE.set(provinceRow+" "+dataRow);
                    context.write(key, VALUE);
                }
            }
        }
    }

    /**
     * Configures and submits the join job, then exits with 0 on success and 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String[] args) throws Exception{
        if (args.length<2){
            System.err.println("Usage: join <input path> <output path>");
            System.exit(2);
        }
        join jn=new join();
        jn.init();

        Job job=Job.getInstance(conf,"join");
        // The jar containing this class is shipped to the cluster.
        job.setJarByClass(join.class);

        // input
        FileInputFormat.setInputPaths(job,new Path(args[0]));

        // map and its intermediate output types
        job.setMapperClass(mapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // reduce and the final output types
        job.setReducerClass(reducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // output
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        boolean isSuccess=job.waitForCompletion(true);
        System.exit(isSuccess?0:1);
    }
}

HIVE

Hive以hdfs存储数据，MR处理框架，将HQL转化为MR任务进行计算。Hive具有数据分析的能力。
Hive分层架构：
ODS层（源数据：流日志、数据库数据、文档数据）——>DWD层（清洗数据）——>DWS层（以DWD为基础进行轻度汇总，预处理、空值）——>ADS层（数据分析、数据挖掘，web数据分析、数据挖掘）

首先Java想要访问Hive，需要像beeline一样通过JDBC连接HiveServer2，因此在连接（以及启动beeline）之前要先启动hiveserver2；通过beeline则可以方便地对hive进行命令行操作
//JDBC操作Hive
//Hive数据储存在hdfs上，所以要创建获取配置信息对象
Configuration conf = new Configuration();
//加载hive驱动，创建连接
String driverName="org.apache.hive.jdbc.HiveDriver";
Class.forName(driverName);//用class.forname 加载驱动
//创建jdbc连接
Connection connection=null;
connection= DriverManager.getConnection("jdbc:hive2://nna:10000","beeline_username","passwd");
//使用PreparedStatement（JDBC预编译语句对象，并不是存储过程），其中包含预编译的SQL语句
PreparedStatement statement=null;
statement=connection.prepareStatement(sql);
//执行SQL语句
statement.execute();
//有返回结果，resultSet
ResultSet resultSet=statement.executeQuery();

Hive具体实现代码

package com.hive;

import java.sql.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import java.io.File;

/**
 * Demo of running HiveQL through the Hive JDBC driver: creates a table, queries it
 * and drops it again.
 */
public class hive {
    Configuration conf=null;

    /**
     * Builds a Hadoop configuration from {@code core-site.xml} and {@code hdfs-site.xml}
     * located in the {@code conf} directory under the current working directory.
     */
    public  void init(){
        String proDir= System.getProperty("user.dir");
        String baseDir = proDir+ File.separator+"conf";
        String corePath = baseDir+File.separator+"core-site.xml";
        String hdfsPath = baseDir+File.separator+"hdfs-site.xml";
        // Resources added explicitly are layered on top of the defaults that
        // new Configuration() picks up on its own.
        conf = new Configuration();
        conf.addResource(new Path(corePath));
        conf.addResource(new Path(hdfsPath));
    }

    /**
     * Connects to HiveServer2 and runs the create / query / drop statements in order.
     *
     * @throws SQLException if connecting or any statement fails
     * @throws Exception if the Hive JDBC driver class cannot be loaded
     */
    public static void main(String[] args) throws SQLException,Exception{
        hive hive_new=new hive();
        hive_new.init();
        String[] sqls={"CREATE TABLE IF NOT EXISTS th(id INT,name STRING)",
        "SELECT COUNT(*) FROM th","DROP TABLE th"};
        String url="jdbc:hive2://nna:10000";
        String driverName="org.apache.hive.jdbc.HiveDriver";
        // Loading the class registers the Hive driver with DriverManager.
        Class.forName(driverName);

        // try-with-resources closes the connection; failures propagate to the caller
        // instead of being swallowed.
        try(Connection connection=DriverManager.getConnection(url,"tonghan","")){
            // create table
            execDDL(connection,sqls[0]);
            // query
            execDML(connection,sqls[1]);
            // drop table
            execDDL(connection,sqls[2]);
        }
    }

    /**
     * Executes a statement that returns no result set (e.g. CREATE / DROP TABLE).
     *
     * @param connection open JDBC connection
     * @param sql statement to execute
     * @throws SQLException if the statement fails
     */
    private static void execDDL(Connection connection,String sql) throws SQLException {
        try(PreparedStatement statement=connection.prepareStatement(sql)){
            statement.execute();
        }
    }

    /**
     * Executes a query and prints its column labels followed by every row,
     * tab-separated, to standard output.
     *
     * @param connection open JDBC connection
     * @param sql query to execute
     * @throws SQLException if the query fails
     */
    private static void execDML(Connection connection,String sql) throws SQLException{
        try(PreparedStatement statement=connection.prepareStatement(sql);
            ResultSet resultSet=statement.executeQuery()){
            ResultSetMetaData resultSetMetaData=resultSet.getMetaData();
            int columns=resultSetMetaData.getColumnCount();

            // JDBC columns are 1-based and inclusive of the count, hence i<=columns.
            StringBuilder line=new StringBuilder();
            for(int i=1;i<=columns;i++){
                line.append(resultSetMetaData.getColumnLabel(i)).append("\t");
            }
            System.out.println(line);

            while(resultSet.next()){
                line.setLength(0);
                for(int i=1;i<=columns;i++){
                    line.append(resultSet.getString(i)).append("\t");
                }
                System.out.println(line);
            }
        }
    }
}
//通过jdbc调用sql语句

HBase

//HBASE基于hdfs进行存储，所以创建获取配置信息对象
Configuration conf=null;
conf = new Configuration();
//创建HBase的连接
Connection conn=ConnectionFactory.createConnection(conf);
//调用java接口Admin的getAdmin()方法来返回子对象的成员方法
Admin hbAdmin=conn.getAdmin();

HBase 主要包括 5 大类操作：HBase 的配置、HBase 表的管理、列族的管理、列的管理、数据操作等。
HBase常用的java API：

java类	HBase数据模型
HBaseAdmin	数据库（DB）
HTable	表
HTableDescriptor	列族（CF）
Put	列（Column Qualifier）
Get
Scanner

org.apache.hadoop.hbase.client.Admin：Admin 是 Java 接口类型，不能直接用该接口来实例化一个对象，而是必须通过调用 Connection.getAdmin() 方法获得其实现对象，再调用该对象的成员方法。该接口用来管理 HBase 数据库的表信息。它提供的方法包括创建表，删除表，使表有效或无效，以及添加或删除表列等。
org.apache.hadoop.hbase.HTableDescriptor：该类包含了表的详细信息，可以在创建表时添加列族。
org.apache.hadoop.hbase.HColumnDescriptor：提供接口对列族进行操作。

HTableDescriptor hTableDescriptor = new HTableDescriptor(tableName);
//定义列族对象
HColumnDescriptor hColumnDescriptor1 = new HColumnDescriptor(Bytes.toBytes("info1"));
//HBase中的数据都以字节数组（byte[]）存储，写入时通过Bytes.toBytes()方法进行转换
//读取返回的Result时，需要通过Bytes.toString()转化为String对象
//HColumnDescriptor列簇的描述信息类，比如版本，压缩方式，添加一个列的时候会使用
HColumnDescriptor hColumnDescriptor2 = new HColumnDescriptor(Bytes.toBytes("info2"));
hTableDescriptor.addFamily(hColumnDescriptor1);
hTableDescriptor.addFamily(hColumnDescriptor2);
hbAdmin.createTable(hTableDescriptor);

org.apache.hadoop.hbase.client.Table：Table 是 Java 接口类型，不可以用 Table 直接实例化一个对象，而是必须通过调用 connection.getTable() 获得其实现对象，再调用该对象的成员方法。这个接口可以用来和 HBase 表直接通信，对表中数据进行操作。

TableName tableName=TableName.valueOf("tablename");
Table table=conn.getTable(tableName);//table接口可以用来对表中增加一个单元格

过滤器、比较器

过滤器
equal	相等
greater	大于
greater_or_equal	大于等于
less	小于
less_or_equal	小于等于
not_equal	不等于
比较器
binaryComparator	匹配完整字节组
binaryPrefixComparator	匹配字节数组前缀
regexStringComparator	正则表达式匹配
substringComparator	子串匹配

//过滤器，比较器
RowFilter rowFilter = new RowFilter(CompareFilter.CompareOp.EQUAL, new RegexStringComparator("[a-z]+"));

具体实现代码：

package com.hbase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.*;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.File;

/**
 * Demo of basic HBase client operations on the table {@code tonghan}: create, put,
 * delete, get, full scan and scans restricted by row / family / qualifier / value filters.
 *
 * <p>Call {@link #init()} first and {@link #close()} when done.
 */
public class Hbase {
    private Configuration conf=null;
    private Connection conn;
    private Admin hbAdmin;

    /**
     * Loads the site configuration files from the {@code conf} directory under the
     * current working directory, then opens the HBase connection and admin handle.
     *
     * @throws Exception if the connection cannot be created
     */
    public void init() throws Exception{
        String proDir=System.getProperty("user.dir");
        String baseDir = proDir+ File.separator+"conf";
        String corePath = baseDir+File.separator+"core-site.xml";
        String hdfsPath = baseDir+File.separator+"hdfs-site.xml";
        String hbasePath=baseDir+File.separator+"hbase-site.xml";
        // NOTE(review): HBaseConfiguration.create() is the usual way to obtain an
        // HBase client configuration; a plain Configuration is kept here as written.
        conf = new Configuration();
        conf.addResource(new Path(corePath));
        conf.addResource(new Path(hdfsPath));
        conf.addResource(new Path(hbasePath));
        conf.set("hbase.zookeeper.quorum","nna:2181");
        conn= ConnectionFactory.createConnection(conf);
        hbAdmin=conn.getAdmin();
    }

    /**
     * Releases the admin handle and the connection opened by {@link #init()}.
     * Safe to call when {@code init()} failed part-way.
     */
    public void close() throws Exception{
        try{
            if(hbAdmin!=null){
                hbAdmin.close();
            }
        }
        finally {
            if(conn!=null){
                conn.close();
            }
        }
    }

    /** Creates the table with column families {@code info1} and {@code info2} if absent. */
    public void create() throws Exception{
        TableName tableName=TableName.valueOf("tonghan");
        if(!hbAdmin.tableExists(tableName)){
            HTableDescriptor hTableDescriptor = new HTableDescriptor(tableName);
            // HBase stores everything as byte[]; Bytes.toBytes converts the names.
            hTableDescriptor.addFamily(new HColumnDescriptor(Bytes.toBytes("info1")));
            hTableDescriptor.addFamily(new HColumnDescriptor(Bytes.toBytes("info2")));
            hbAdmin.createTable(hTableDescriptor);
        }
    }

    /** Writes one {@code info1:name} cell into each of four rows. */
    public void put() throws Exception{
        String[] str={"t","h","o","n"};
        try(Table table=conn.getTable(TableName.valueOf("tonghan"))){
            for(int i=0;i<str.length;i++){
                // NOTE(review): the row key is the 4-byte binary encoding of the int i,
                // not a string, while get()/delete() and the row filter use string
                // keys - confirm which key format is intended.
                Put put_in=new Put(Bytes.toBytes(i));
                put_in.addColumn(Bytes.toBytes("info1"),Bytes.toBytes("name"),Bytes.toBytes(str[i]));
                table.put(put_in);
            }
        }
    }

    /** Deletes the {@code info1:name} cell of row {@code a}. */
    public void delete() throws Exception{
        try(Table table=conn.getTable(TableName.valueOf("tonghan"))){
            Delete delete=new Delete(Bytes.toBytes("a"));
            delete.addColumn(Bytes.toBytes("info1"),Bytes.toBytes("name"));
            table.delete(delete);
        }
    }

    /** Reads column family {@code info1} of row {@code a} and prints every cell. */
    public void get() throws Exception{
        try(Table table=conn.getTable(TableName.valueOf("tonghan"))){
            Get get=new Get(Bytes.toBytes("a"));
            get.addFamily(Bytes.toBytes("info1"));
            Result result=table.get(get);
            for(Cell cell:result.rawCells()){
                printCell(cell);
            }
        }
    }

    /** Scans the whole table and prints every cell. */
    public void scan() throws Exception{
        runScan(new Scan());
    }

    /**
     * Prints every cell of every row produced by the scanner.
     * The caller remains responsible for closing the scanner.
     *
     * @param resultScanner scanner to drain
     */
    public void showResult(ResultScanner resultScanner) {
        for(Result result:resultScanner){
            for(Cell cell:result.rawCells()){
                printCell(cell);
            }
        }
    }

    /** Scans rows whose key matches the regular expression {@code [a-z]+}. */
    public void filter_rowkey() throws Exception{
        Scan scan=new Scan();
        // "[a-z]+" is a character class; the earlier "\\[a-z]+" escaped the bracket
        // and therefore only matched the literal text "[a-z" followed by ']'.
        RowFilter rowFilter = new RowFilter(CompareFilter.CompareOp.EQUAL, new RegexStringComparator("[a-z]+"));
        scan.setFilter(rowFilter);
        runScan(scan);
    }

    /** Scans with a column-family filter. */
    public void familyFilterScan() throws Exception{
        Scan scan=new Scan();
        // NOTE(review): the scan is limited to family info1 while the filter keeps
        // only family info2, which looks like it can never match - confirm intent.
        scan.addFamily(Bytes.toBytes("info1"));
        FamilyFilter familyFilter=new FamilyFilter(CompareFilter.CompareOp.EQUAL,new BinaryComparator(Bytes.toBytes("info2")));
        scan.setFilter(familyFilter);
        runScan(scan);
    }

    /** Scans cells whose column qualifier equals {@code name}. */
    public void qulifiterScan() throws Exception{
        Scan scan=new Scan();
        QualifierFilter qualifierFilter = new QualifierFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes("name")));
        scan.setFilter(qualifierFilter);
        runScan(scan);
    }

    /** Scans cells whose value equals {@code tonghan}. */
    public void valuefiterScan() throws Exception{
        Scan scan=new Scan();
        ValueFilter valueFilter = new ValueFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes("tonghan")));
        scan.setFilter(valueFilter);
        runScan(scan);
    }

    /** Runs the scan against the demo table, prints the results and closes table and scanner. */
    private void runScan(Scan scan) throws Exception{
        try(Table table=conn.getTable(TableName.valueOf("tonghan"));
            ResultScanner resultScanner=table.getScanner(scan)){
            showResult(resultScanner);
        }
    }

    /** Prints row key, family, qualifier and value of one cell, each on its own line. */
    private static void printCell(Cell cell){
        System.out.println("行键"+Bytes.toString(CellUtil.cloneRow(cell))+"\t");
        System.out.println("列族"+Bytes.toString(CellUtil.cloneFamily(cell))+"\t");
        System.out.println("列名qual"+Bytes.toString(CellUtil.cloneQualifier(cell))+"\t");
        System.out.println("值"+Bytes.toString(CellUtil.cloneValue(cell))+"\t");
    }

    /**
     * Entry point: runs the row-key filter demo. The other demos (create, put, scan,
     * familyFilterScan, qulifiterScan, valuefiterScan) can be called the same way.
     */
     public static void main(String[] args) throws Exception{
         Hbase hbase = new Hbase();
         try{
             hbase.init();
             hbase.filter_rowkey();
         }
         finally {
             hbase.close();
         }
     }
}

kafka

基于发布订阅的消息队列

producer和consumer的重要参数

producer重要参数
bootstrap.servers	broker地址列表	生产者通过这个参数值，创建与broker之间的连接
security.protocol	安全协议类型	SASL协议
key.serializer	消息key值序列化类
value.serializer	消息value值序列化类
client.id	客户端id号	任意指定
producer重要接口函数
send(ProducerRecord<K,V> record)	不带回调函数的发送接口，通常对返回的Future调用get()函数来实现同步发送	返回值：java.util.concurrent.Future
send(ProducerRecord<K,V> record, Callback callback)	带回调函数的发送接口，通常用于异步发送，通过回调函数对发送结果进行处理	返回值：java.util.concurrent.Future
onCompletion(RecordMetadata metadata,Exception e)	回调函数的接口方法通过Callback中的此方法来进行异步发送结果的处理	返回值：void
consumer重要参数
bootstrap.servers	broker地址列表	消费者通过此参数值，创建与broker的连接
key.deserializer	消息key值反序列化类
value.deserializer	消息value值反序列化类
group.id	消费者组id	任意指定
enable.auto.commit	是否自动提交offset
auto.commit.interval.ms	自动提交offset的时间间隔
session.timeout.ms	会话超时时间
重要接口函数
subscribe topic	订阅接口方法	返回值：void
poll（long timeout）	拉取消息的接口方法	返回值：ConsumerRecords<K,V>

producer端
//定义topic
private String topic;
//指定是否异步发送
private Boolean isAsync;
//设置配置文件信息
private Properties props=new Properties();
//创建producer对象（实际代码中必须在props的各项参数设置完成之后再创建，见下方具体实现代码）
private KafkaProducer<Integer,String> producer=new KafkaProducer<Integer,String>(props);
//kafka的地址和端口号
props.put("bootstrap.servers","nna:9092");
//序列化，网络中传输需要将数据序列化
props.put("key.serializer","org.apache.kafka.common.serialization.IntegerSerializer");
props.put("value.serializer","org.apache.kafka.common.serialization.StringSerializer");
//产生消息
//发送消息ProducerRecord
ProducerRecord ProducerRecord = new ProducerRecord<>(topic, number, value);
producer.send(ProducerRecord).get();       
consumer端
//配置参数信息
 Properties props=new Properties();
props.put("bootstrap.servers","nna:9092");
props.put("key.deserializer","org.apache.kafka.common.serialization.IntegerDeserializer");
props.put("value.deserializer","org.apache.kafka.common.serialization.StringDeserializer");
props.put("enable.auto.commit",true);
props.put("session.timeout.ms",60000);
props.put("group.id","kafka");
//创建consumer对象
KafkaConsumer<Integer, String> KafkaConsumer = new KafkaConsumer<>(props);
//订阅数据
KafkaConsumer.subscribe(Arrays.asList("topicname"));
ConsumerRecords<Integer,String> consumers=KafkaConsumer.poll(100);

具体实现代码：

//producer端
package com.kafka;

import org.apache.kafka.clients.producer.*;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import java.util.Properties;

/**
 * Demo Kafka producer that sends a fixed number of {@code (int key, String value)}
 * messages to one topic, either synchronously or asynchronously with a callback.
 */
public class kafka {
    private static final Logger LOG= LoggerFactory.getLogger(kafka.class);
    private final KafkaProducer<Integer,String> producer;
    /** Topic every message is sent to. */
    private final String topic;
    /** {@code true}: send with a callback; {@code false}: block on every send. */
    private final boolean isAsync;
    /** Producer configuration (broker list and serializers). */
    private final Properties props=new Properties();
    /** Number of messages {@link #sendMessage()} produces. */
    private final int messageNumToSend;

    /**
     * Configures and creates the producer.
     *
     * @param isAsync whether to send asynchronously; {@code null} is treated as synchronous
     * @param messageNumToSend how many messages to send
     * @param topic target topic
     */
    public kafka(Boolean isAsync,int messageNumToSend,String topic){
        // Broker host and port.
        props.put("bootstrap.servers","nna:9092");
        // Serializers turn key and value into bytes for the network.
        props.put("key.serializer","org.apache.kafka.common.serialization.IntegerSerializer");
        props.put("value.serializer","org.apache.kafka.common.serialization.StringSerializer");
        this.isAsync=Boolean.TRUE.equals(isAsync);
        this.messageNumToSend=messageNumToSend;
        this.topic=topic;
        // Created last so that it sees the fully populated properties.
        this.producer=new KafkaProducer<Integer,String>(props);
    }

    /**
     * Sends {@code messageNumToSend} messages {@code message0, message1, ...} keyed by
     * their index, then closes the producer (also when sending fails).
     *
     * @throws Exception if a synchronous send fails or is interrupted
     */
    public void sendMessage() throws Exception{
        LOG.info("producer start");
        try{
            for(int number=0;number<messageNumToSend;number++){
                String value="message"+number;
                ProducerRecord<Integer,String> record=new ProducerRecord<>(topic, number, value);
                if(!isAsync){
                    // Synchronous: block until the broker acknowledges the record.
                    producer.send(record).get();
                    LOG.info("send_sync\t{}\t{}",number,value);
                }
                else{
                    // Asynchronous: the callback reports the outcome and elapsed time.
                    long startTime=System.currentTimeMillis();
                    producer.send(record,new AsyncCallback(startTime,number,value));
                    LOG.info("send_Async\t{}\t{}",number,value);
                }
            }
        }
        finally {
            producer.close();
        }
    }

    /** Logs the result of one asynchronous send together with the time it took. */
    static class AsyncCallback implements Callback{
        private final long startTime;
        private final int key;
        private final String message;

        AsyncCallback(long startTime,int key,String message){
            this.startTime=startTime;
            this.key=key;
            this.message=message;
        }

        @Override
        public void onCompletion(RecordMetadata recordMetadata, Exception e) {
            long time=System.currentTimeMillis()-startTime;
            // Check the exception first: the metadata argument may be non-null even
            // for a failed send, so testing it first could hide the error.
            if(e!=null){
                // Passing e as the last argument keeps the stack trace in the log.
                LOG.error("the exception occured",e);
            }
            else if(recordMetadata!=null){
                LOG.info("message:({},{}),offset:{}in{}",key,message,recordMetadata.offset(),time);
            }
        }
    }

    /** Sends ten messages asynchronously to the topic {@code tonghan}. */
    public static void main(String[] args) throws Exception{
        kafka kafka = new kafka(Boolean.TRUE,10,"tonghan");
        kafka.sendMessage();
    }
}

//consumer端
package com.kafka;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.Properties;

/**
 * Minimal Kafka consumer demo: subscribes to the "tonghan" topic and prints the value of
 * every record it receives.
 */
public class comsumer {

    /**
     * Builds the consumer configuration, subscribes and polls forever. The consumer is
     * opened in a try-with-resources block so it is closed if polling ever fails.
     */
    public static void main(String[] args) {
        // Connection, deserialisation and group settings handed to the Kafka client.
        Properties props=new Properties();
        props.put("bootstrap.servers","nna:9092");
        props.put("key.deserializer","org.apache.kafka.common.serialization.IntegerDeserializer");
        props.put("value.deserializer","org.apache.kafka.common.serialization.StringDeserializer");
        props.put("enable.auto.commit",true);
        props.put("session.timeout.ms",60000);
        props.put("group.id","kafka");
        // The variable no longer shares its name with the KafkaConsumer class.
        try (KafkaConsumer<Integer, String> kafkaConsumer = new KafkaConsumer<>(props)) {
            // Subscribe to the topic written by the producer demo.
            kafkaConsumer.subscribe(Arrays.asList("tonghan"));
            while(true){
                ConsumerRecords<Integer,String> records=kafkaConsumer.poll(100);
                for(ConsumerRecord<Integer,String> record:records){
                    System.out.println("消费数据为："+record.value());
                }
            }
        }
    }
}

Spark

用户提交任务：
用户提交应用程序，先使用blockmanager将任务的hadoop配置广播——>slave——>driver构建DAG——>DAG scheduler划分stage——>分发给相应的executor进行执行
job：一个application中，每遇到一个action操作，就生成一个job（多个stage构成的，一个job包含多个rdd及其作用于其上的各种操作）
stage ：基本调度单元，分成多组task——>stage/taskset
sparkcontext是整个application程序操作的入口。
clustermanager：集群资源管理器，资源的分配调度管理，每个worker 内存 CPU 分配给应用程序不负责对executor资源分配。
driver向集群管理器申请资源，driver会创建一个sparkcontext ——》 资源管理器为executor分配资源，启动executor进程 ——》 sparkcontext根据RDD的依赖关系划分构建DAG，DAGscheduler据此划分stage ——》 taskset进行处理，由task scheduler进行处理 ——》 executor向sparkcontext去申请task，提供应用程序代码 ——》 task在executor上运行，把结果反馈给task scheduler，之后向driver注销

spark编程模型

用户使用sparkcontext提供的API（textFile、runjob）编写driver application程序
RDD（map、flatmap、collect）

	spark core常用接口
javasparkcontext	spark对外接口	连接spark集群，创建RDD
spark conf	配置类	应用的名称，执行的模式（local、client、cluster）
javaRDD<>	定义RDD的类
javaPairRDD<>	以key-value的形式定义RDD类
配置过程

//1. 生成sparkconf对象
sparkconf() conf=new sparkconf().setAppName  /steMaster/
//2.实例化一个sparkcontext对象，整个APP程序操作入口 （driver app），同一时刻只允许一个sparkcontext处于活跃状态
sparkcontext sc=new sparkcontext（conf）
//3. 创建RDD
//本地 hdfs 创建列表，生成RDD
javaRDD<String> filePath=sc.textFile("file:///",2）//（partition数量）)
javaPairRDD<String,String> textFile=sc.wholeTextFile("file:///")
//map()将函数用于RDD中的每个元素，将返回值构成新的RDD。
//flatmap()是将函数应用于RDD中的每个元素，将返回的迭代器的所有内容构成新的RDD例子
//mapvalue：针对（key，value）型的value数据进行map操作，而不对key进行操作
//mappartition算子：获取每个分区的迭代器。
//union算子：两个RDD元素的数据类型相同，不进行去重操作，保存所有元素
//groupbykey算子：根据相同key值进行聚合
//combineByKey：局部聚合，根据value值生成一个迭代器
//reduceBykey：每个分区的value值，聚合出结果
//sortByKey：按照key值排序
//join算子：cogroup，笛卡尔积，对应key下面的所有元素形成一个集合
//filter算子：对元素进行过滤。返回值为boolean类型，为true则选择出来，为false则过滤
//boolean类型，为true则选择出来，为false则过滤

具体例子：

package com.company;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Arrays;
import java.lang.Iterable;
import java.util.Iterator;
import java.util.List;

/**
 * Spark core demo containing a map example and a word-count example, both run
 * against a local master.
 */
public class spark {
    /**
     * Creates the local Spark context, runs the word count and stops the context
     * afterwards, even when the job fails.
     */
    public static void main(String[] args) {
        // Configuration: application name and local master.
        SparkConf conf=new SparkConf().setAppName("spark").setMaster("local");
        JavaSparkContext sc=new JavaSparkContext(conf);
        try {
            // mapDemo(sc) can be called here instead to try the map example.
            wordCount(sc);
        } finally {
            sc.stop();
        }
    }

    /**
     * Map example: every input line becomes exactly one {@code String[]} element.
     * No action is invoked on the result, so nothing is computed or printed.
     */
    protected static void mapDemo(JavaSparkContext sc){
        List<String> list= Arrays.asList("hadoop hbase hdfs spark","java scala python");
        JavaRDD<String> strRDD=sc.parallelize(list);
        JavaRDD<String[]> splitRDD=strRDD.map(new Function<String, String[]>() {
            @Override
            public String[] call(String s) throws Exception {
                return s.split(" ");
            }
        });
    }

    /**
     * Word-count example: splits each line on spaces, maps every word to
     * {@code (word, 1)}, sums the counts per word and prints the collected result.
     */
    private static void wordCount(JavaSparkContext sc){
        List<String> list=Arrays.asList("hadoop spark","hadoop hdfs","hdfs tong");
        JavaRDD<String> strRDD=sc.parallelize(list);
        JavaRDD<String> splitRDD=strRDD.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String s) throws Exception {
                // Return only the words of this line. The previous version appended to a
                // list shared across calls, so earlier words were emitted again for every
                // later line and the counts were inflated.
                return Arrays.asList(s.split(" "));
            }
        });
        JavaPairRDD<String,Integer> mapRDD=splitRDD.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<String, Integer>(s,1);
            }
        });
        JavaPairRDD<String,Integer> reduceByKey=mapRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer+integer2;
            }
        });
        System.out.println(reduceByKey.collect());
    }
}

Spark streaming

可以从数据源获取数据：kafka，flume，hdfs
算法：map reduce join 窗口函数
储存：hdfs database
spark streaming批处理：
基于核心spark，接收实时流数据根据一定时间间隔拆分数据
批处理处理引擎也是spark core
流数据处理过程：
输入——>给定一个batch size（时间间隔）把数据流分成时间片——>每段时间片中的数据转换成spark中的RDD——>把对Dstream的transformation转化成RDD当中的transformation
——>把中间结果保存在内存中。
将连续的数据进行持久化，离散化，进行批量处理。
Dstream：（对应于spark中的RDD，加了一个时间的切片）离散流，代表了一个持续不断的数据流。DStream的内部，其实是一系列持续不断产生的RDD；DStream中的每个RDD都包含了一个时间段内的数据；
spark streaming容错机制：血统机制
实时：spark streaming分解成多个spark job
sparkstreaming 框架：
master：记录Dstream 之间的依赖关系或者血缘关系负责任务的调度
worker：从网络上接收数据，进行RDD的计算
client：负责向spark streaming传输数据
receiver：接收外部数据流，数据流通过receiver传入spark streaming内部（封装成spark streaming能处理的格式）
数据不会立即计算，而是先储存到内部缓冲区；分片需要等待，一旦设置的batch-size到了，缓冲区会把数据转换成数据块，放到数据队列中，block manager再从队列中把数据取出来，成为spark能处理的数据

spark streaming作业提交流程：
在这里插入图片描述

spark streaming：streaming context（对应于spark context）
transformation算子
output（执行算子） print
窗口操作：滑动，滚动

	Dstream中常用的接口
JavaStreamingContext	功能入口	创建Dstream
JavaDstream	数据流
JavaPairDstream	<K ,V>Dstream的接口
JavaReceiverInputDStream	定义任何从网络中传入的数据流

Java API配置过程

 // local[n]: n is the number of threads (2 here)
SparkConf conf=new SparkConf().setAppName("spark_streaming").setMaster("local[2]");
// Entry point of the streaming API, used to create DStreams; built with a 5-second Durations value
JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));
// Create a DStream that receives text from a TCP source, given host name and port
JavaReceiverInputDStream<String> lines=jsc.socketTextStream("nna",9999);

具体代码：

package com.company;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import java.util.Arrays;

/**
 * Streaming word count: reads lines from a TCP socket, splits them into words and
 * prints the per-batch count of every word.
 */
public class spark_streaming {

    /** Splits one received line into its space-separated words. */
    private static final class LineSplitter implements FlatMapFunction<String, String> {
        @Override
        public Iterable<String> call(String line) throws Exception {
            String[] tokens = line.split(" ");
            return Arrays.asList(tokens);
        }
    }

    /** Pairs every word with an initial count of one. */
    private static final class WordToPair implements PairFunction<String, String, Integer> {
        @Override
        public Tuple2<String, Integer> call(String word) throws Exception {
            return new Tuple2<String, Integer>(word, 1);
        }
    }

    /** Adds two partial counts of the same word. */
    private static final class CountSum implements Function2<Integer, Integer, Integer> {
        @Override
        public Integer call(Integer left, Integer right) throws Exception {
            return left + right;
        }
    }

    /**
     * The client side: connects to the server socket at nna:9999 and processes what it
     * receives, using a context built with a 5-second Durations value.
     */
    public static void main(String[] args) {
        // local[n]: n is the number of threads
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("spark_streaming");
        sparkConf.setMaster("local[2]");

        // Streaming entry point, used to create the DStreams below.
        JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(5));

        // Text lines received from the TCP source (host name, port).
        JavaReceiverInputDStream<String> socketLines = streamingContext.socketTextStream("nna", 9999);

        JavaDStream<String> tokens = socketLines.flatMap(new LineSplitter());
        // Convert to <K,V> form, then sum the values per key.
        JavaPairDStream<String, Integer> ones = tokens.mapToPair(new WordToPair());
        JavaPairDStream<String, Integer> totals = ones.reduceByKey(new CountSum());

        totals.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    }
}

storm

ISpout接口是实现spout的核心接口，负责将数据送到topology中处理；storm会跟踪spout发出的tuple的ID
成功——>ack message
失败——>fail message

接口中的一些方法
open(Map conf,TopologyContext context,SpoutOutputCollector collector)	在初始化的时候调用，提供Spout的运行环境
conf	配置信息
Context	任务信息 task id
Collector	发送tuple
close()	spout被shutdown的时候调用
activate()	从未激活的状态激活时调用
deactivate()	spout被置为非激活状态的时候调用
nextTuple()	当storm需要发送tuple时调用，循环调用

IBolt接口
是bolt的继承接口

tuple处理方法	描述
filter join	由nimbus分发给worker
prepare	初始化
execute	处理tuple（元数据信息）
cleanup	清空

在这里插入图片描述

//考虑一个实际的例子，Wordcount
//创建一个Spout进行接收数据，两个Bolt用于处理数据，其中一个用于map操作，一个用于reduce操作
//自定义Spout继承BaseRichSpout（BaseRichSpout间接实现了ISpout接口，并提供了ack和fail方法的默认空实现）
public  static  class DataSpolt extends BaseRichSpout{
	//创建Spout对象
	 private SpoutOutputCollector collector;
	 //初始化的时候调用，提供Spout的运行环境
	  public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector spoutOutputCollector) {
            this.collector=spoutOutputCollector;
        }
	 //当storm需要发送tuple的时候调用，循环调用
	 public void nextTuple() {
	 	 //emit发送tuple，方法中有可选参数
            this.collector.emit();
	 }
	 //给要发出的tuple一个名字
	 public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
            //声明键值
            outputFieldsDeclarer.declare(new Fields("word"));
        }
	//下面来定义两个Bolt
	//第一个用来执行 流数据切片操作 IBolt接口继承BaseRichBolt
	 public static class SplitBolt extends BaseRichBolt{
	 	//创建OutputCollector属性
	 	private OutputCollector collector;
	 	//初始化Bolt
	 	 public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
            this.collector=outputCollector;
        }
	 }
	 //处理tuple，接收上游数据
	  public void execute(Tuple tuple) {
	  		//getStringByField()方法接收上游数据
            String[] words=tuple.getStringByField("word").split(",");
            for (String word:words){
                //将word数据值发送给下游bolt
                this.collector.emit(new Values(word));
            }
        }
        //同样，给发送的tuple一个名字
         public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
            outputFieldsDeclarer.declare(new Fields("word1"));
        }
        //第二个Bolt，这里用来执行map操作，继承自BaseRichBolt
        public static class CountBolt extends BaseRichBolt{
        //创建Map属性，HashMap和LinkHashMap区别，（HashMap  无序的，通过key的hash放到不同的桶中。LinkHashMap 有序的 按照插入顺序进行排序的）
        private HashMap<String,Integer> map=new HashMap<String,Integer>();
        @Override
        //因继承自BaseRichBolt，即使没有初始化也要重写方法
        public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        }
		//处理tuple
        @Override
        public void execute(Tuple tuple) {
        //接收上游数据
            String word=tuple.getStringByField("word1");
            if (map.containsKey(word)) {
                map.put(word,map.get(word)+1);
            }
            else{
                map.put(word,1);
            }
            System.out.println(map);
        }
        @Override
        public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {}


        //主函数的定义
        public static void main(String[] args) throws Exception {
        //首先要创建topology，因为Spout和Bolt都是包含在topology中的。
        TopologyBuilder topologyBuilder=new TopologyBuilder();
        //1个 spout,2个bolt
        topologyBuilder.setSpout("dataSpout",new DataSpolt(),3);
        //shuffleGrouping代表上游到下游的分发策略，这里采用的是字段分组
        topologyBuilder.setBolt("splitBolt",new SplitBolt(),3).shuffleGrouping("dataSpout");
        topologyBuilder.setBolt("countBolt",new CountBolt(),3).shuffleGrouping("splitBolt");
        //Config 用于设置配置信息
        Config config =new Config();
        config.setNumWorkers(3);//设置进程数
        if(args!=null&&args.length >0){
        //提交topology应用程序，有参情况下
            StormSubmitter.submitTopology(args[0],config,topologyBuilder.createTopology());

        }else{
        //本地模式提交topology
            LocalCluster localCluster =new LocalCluster();
            localCluster.submitTopology("wordcount",config,topologyBuilder.createTopology());
        }
    }
}

一个完整的具体例子：

flume（事务驱动的日志采集系统）

event是flume传输的最小对象，event由头headers和身体(body)两部分组成：Headers部分是一个map，body部分可以是String或者byte[]等。其中body部分是真正存放数据的地方，headers部分用于interceptor。
channel：为了解决source和sink处理数据速度不一致问题
source——>channel：put事务、channel——>sink：take事务
如果channel容量不够，source中也有一个缓冲区，会暂存event
如果sink数据处理失败：sink中也有一个缓冲区
数据处理过程：1.采集flume/kafka 2.数据预处理spark、MapReduce 3.ETL hive 4.数据导出可视化
source和sink实际上是一个接口：对于不同的数据源有不同的接口
agent：数据源——> source ——> channel ——> sink——> 数据存储
采集数据，flume和kafka有什么区别?

kafka数据源范围更广，flume只是涉及到日志采集，kafka按照时间顺序进行数据消费
flume可以对数据进行筛选，kafka不可以
flume收集的数据可以进行更改，而kafka是只读的（管道和消息队列的区别）
kafka可以主动的对数据进行收集，转发。flume是事务驱动的日志采集系统。

组建日志系统
kafka——》flume——》streaming——》hbase

kafka：高吞吐、分布式。可以根据不同的消费者进行不同分类，对收集的数据进行持久化，有强的顺序性。接受的数据源范围相比flume更广
flume：流式日志采集工具，可以有不同的接收方，可以做简单的数据清洗（interceptor）。
如果streaming发送到hbase中数据量太大，可以把数据暂存在kafka中

采集本地静态文件：通过监控一个文件夹，对新增的文件夹的内容转换成event，数据不会丢失，注意：不能对监控的文件夹下面的新增文件做出任何更改，被监控的文件夹名必须是唯一的。

自定义source
自定义Source需要继承PollableSource （轮询拉取）或者EventDrivenSource （事件驱动），另外还需要实现Configurable接口。
PollableSource或者EventDrivenSource的区别在于：PollableSource是通过线程不断去调用process方法，主动拉取消息，而EventDrivenSource是需要触发一个调用机制，即被动等待。利用PollableSource实现自定义Source时还需要实现Configurable接口。 Configurable接口：便于项目中初始化某些配置。

一个自定义的source：

//自定义source继承抽象类AbstractSource，并实现Configurable、PollableSource接口
/**
 * Skeleton of a custom pollable Flume source: each call to process() builds one event
 * whose body is "prefix--n--suffix" and hands it to the channel processor.
 */
public class source extends AbstractSource implements Configurable,PollableSource {
	// Prefix and suffix wrapped around every event body; read from the agent
	// configuration in configure().
	private String prefix;
	private String suffix;
	// Running number embedded in the event body; it was referenced but never
	// declared in the original sketch.
	private int i;

	/**
	 * Invoked repeatedly by the source runner thread. Emits one event per call.
	 *
	 * @return Status.READY after the event has been handed to the channel processor
	 */
	@Override
    public Status process() throws EventDeliveryException {
    	// Build the event and set its body (a byte array).
    	SimpleEvent event = new SimpleEvent();
    	event.setBody((prefix+"--"+i+"--"+suffix).getBytes());
    	i++;
    	// Hand the event to the channel.
    	getChannelProcessor().processEvent(event);
    	return Status.READY;
    	}

	/** Reads the "prefix" and "suffix" settings of this source from the agent configuration. */
	@Override
    public void configure(Context context) {
        prefix=context.getString("prefix");
        suffix=context.getString("suffix");
    }
}

一个具体的例子：

import org.apache.flume.Context;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.PollableSource;
import org.apache.flume.conf.Configurable;
import org.apache.flume.event.SimpleEvent;
import org.apache.flume.source.AbstractSource;

/**
 * Custom pollable Flume source that simulates incoming data: every call to process()
 * emits five events with bodies "prefix--i--suffix" (i = 0..4) and then pauses
 * for five seconds.
 */
public class source extends AbstractSource implements Configurable,PollableSource {
    // Prefix and suffix wrapped around every event body; set in configure().
    private String prefix;
    private String suffix;

    /**
     * Emits one batch of five simulated events.
     *
     * @return Status.READY when all events were handed to the channel processor,
     *         Status.BACKOFF when handing one over failed
     */
    @Override
    public Status process() throws EventDeliveryException {
        Status status=Status.READY;
        try {
            // Simulate received data.
            for (int i = 0; i < 5; i++) {
                SimpleEvent event = new SimpleEvent ();
                event.setBody((prefix+"--"+i+"--"+suffix).getBytes());
                // Hand the event to the channel.
                getChannelProcessor().processEvent(event);
            }
        } catch (Exception e) {
            // Delivery failed: report BACKOFF to the caller.
            status=Status.BACKOFF;
        }
        try {
            // Pause between simulated batches.
            Thread.sleep(5000);
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of swallowing it, so code further up
            // the call stack can still observe the interruption.
            Thread.currentThread().interrupt();
        }
        return status;
    }

    /** Reads the "prefix" and "suffix" settings of this source from the agent configuration. */
    @Override
    public void configure(Context context) {
        prefix=context.getString("prefix");
        suffix=context.getString("suffix");
    }
}

自定义source对应的配置文件：

# a1是给这个agent取的一个名字
# sources  sinks  channels是一个agent下的三个组件，下面这三行是为三个组件取名字
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# 为agent名为a1的source定义相关的属性
a1.sources.r1.type = com.flume.source
a1.sources.r1.prefix=start
a1.sources.r1.suffix=end

# 为agent名为k1的sink定义相关的属性
a1.sinks.k1.type = logger
#下沉的位置  动态创建的

# 为agent a1中名为c1的channel定义相关的属性
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# 让sources关联channels
a1.sources.r1.channels = c1
# 让sinks关联channel
a1.sinks.k1.channel = c1

自定义sink

//自定义的sink需要继承AbstractSink，并实现Configurable接口
/**
 * Skeleton of a custom Flume sink: takes one event from the channel inside a
 * transaction and reads its body.
 */
public class sink extends AbstractSink implements Configurable{
	/**
	 * Takes at most one event from the channel.
	 *
	 * @return Status.READY when an event was consumed, Status.BACKOFF when the channel
	 *         was empty or the take failed
	 */
	 @Override
    public Status process() throws EventDeliveryException {
		Status status;
		Channel channel = getChannel();
		// Every exchange with the channel has to happen inside a transaction.
		Transaction transaction = channel.getTransaction();
		transaction.begin();
		try {
			// Take transaction: pull one event out of the channel.
			Event event = channel.take();
			if (event == null) {
				// Nothing buffered right now.
				status = Status.BACKOFF;
			} else {
				// The body is the payload of the event.
				String body = new String(event.getBody());
				status = Status.READY;
			}
			transaction.commit();
		} catch (Exception e) {
			// Roll back so the event stays in the channel.
			transaction.rollback();
			status = Status.BACKOFF;
		} finally {
			// A transaction must be committed or rolled back before it is closed.
			transaction.close();
		}
		return status;
	}

	/** This sketch reads no settings. */
	 @Override
    public void configure(Context context) {
         }
}

具体代码：

import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Custom Flume sink that logs the body of every event it takes from the channel,
 * wrapped in a configurable prefix and suffix.
 */
public class sink extends AbstractSink implements Configurable{
    private static final Logger logger = LoggerFactory.getLogger(sink.class);
    // Prefix and suffix put around every logged body; set in configure().
    private String prefix;
    private String suffix;

    /**
     * Takes at most one event from the channel and logs it.
     *
     * @return Status.READY when an event was logged, Status.BACKOFF when the channel
     *         was empty or processing failed
     */
    @Override
    public Status process() throws EventDeliveryException {
        Status status;
        Channel channel = getChannel();
        // Every exchange between source, channel and sink goes through a transaction.
        Transaction transaction = channel.getTransaction();
        transaction.begin();
        try {
            // Take transaction: pull one event out of the channel.
            Event event = channel.take();
            if (event == null) {
                // Empty channel: handled explicitly instead of through a caught
                // NullPointerException.
                status = Status.BACKOFF;
            } else {
                String body = new String(event.getBody());
                logger.info(prefix+body+suffix);
                status = Status.READY;
            }
            transaction.commit();
        } catch (Exception e) {
            // Roll back so the event stays in the channel, and record why.
            transaction.rollback();
            logger.error("failed to process event", e);
            status=Status.BACKOFF;
        }finally {
            transaction.close();
        }
        return status;
    }

    /** Reads the "prefix" and "suffix" settings of this sink from the agent configuration. */
    @Override
    public void configure(Context context) {
        prefix=context.getString("prefix");
        suffix=context.getString("suffix");
    }
}

自定义sink对应的配置文件内容：

# a1是给这个agent取的一个名字
# sources  sinks  channels是一个agent下的三个组件，下面这三行是为三个组件取名字
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# 为agent名为a1的source定义相关的属性
a1.sources.r1.type = netcat
a1.sources.r1.bind=hadoop
a1.sources.r1.port=4444

# 为agent名为k1的sink定义相关的属性
a1.sinks.k1.type = com.flume.sink
#下沉的位置  动态创建的
a1.sinks.k1.prefix=start--
a1.sinks.k1.suffix=--end

# 为agent a1中名为c1的channel定义相关的属性
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# 让sources关联channels
a1.sources.r1.channels = c1
# 让sinks关联channel
a1.sinks.k1.channel = c1

自定义的interceptor（拦截器）

//自定义拦截器继承自Interceptor
public class myIntrceptor implements Interceptor {
	//单个事件拦截器，重写intercept（Event event）方法
	@Override
    public Event intercept(Event event) {
    
        byte[]body =event.getBody();
        //自定义头信息，event中分为header部分和body部分，header部分是map形式的数据
        //下面if……else逻辑是，如果body部分开头是数据，则header部分设为（"type","letter"），否则设为("type", "number")。
        if (body[0]<'z'&& body[0]>'a') {
            event.getHeaders().put("type","letter" );
        }else if (body[0]<'9'&&body[0]>'0') {
        //定义头部信息
            event.getHeaders().put("type", "number");
		}
	  //批量事件拦截器
	  @Override
    public List<Event> intercept(List<Event> list) {
        for (Event event : list) {
        //传入list的数据，通过调用单个拦截器的方法来处理事务
            intercept(event);
        }
        return list;
    }
     //自定义拦截器内部添加静态内部类，实现Builder接口，并实现该接口的方法，
      public static class Builder implements Interceptor.Builder{
        @Override
        public void configure(Context context) {
        }
        @Override
        public Interceptor build() {
        //返回创建的自定义拦截器对象
            return new  myIntrceptor() ;
        }
    }
}

具体代码：

import java.util.List;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

public class myIntrceptor implements Interceptor {

    @Override
    public void close(){}

    @Override
    public void initialize() {}
    //单个事件拦截
    @Override
    public Event intercept(Event event) {
        byte[]body =event.getBody();
        //自定义头信息
        if (body[0]<'z'&& body[0]>'a') {
            event.getHeaders().put("type","letter" );
        }else if (body[0]<'9'&&body[0]>'0') {
        //定义头部信息
            event.getHeaders().put("type", "number");
        }
        return event;
    }
    //批量事件拦截
    @Override
    public List<Event> intercept(List<Event> list) {
        for (Event event : list) {
            intercept(event);
        }
        return list;
    }
    public static class Builder implements Interceptor.Builder{
        @Override
        public void configure(Context context) {
        }
        @Override
        public Interceptor build() {
            return new  myIntrceptor();
        }
    }

自定义拦截器对应的配置文件(这里拦截器的逻辑是将不同的输入流放入不同的channel中，所以要使用级联的agent，有三个配置文件)：

//第一个配置文件，设置了自定义拦截器的信息
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1 c2

# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = hadoop
a1.sources.r1.port = 4444

# 拦截器
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.Flume.myIntrceptor$Builder

# 选择器
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.selector.header = type
# 与自定义拦截器中设置的头信息对应
a1.sources.r1.selector.mapping.letter = c1
a1.sources.r1.selector.mapping.number = c2

# Describe the sink，发送到两个不同的下一跳
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = hadoop
a1.sinks.k1.port = 4141

a1.sinks.k2.type=avro
a1.sinks.k2.hostname = hadoop
a1.sinks.k2.port = 4242

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

a1.channels.c2.type = memory
a1.channels.c2.capacity = 1000
a1.channels.c2.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1 c2
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2

//第二个配置文件
a2.sources = r1
a2.sinks = k1
a2.channels = c1

a2.sources.r1.type = avro
a2.sources.r1.bind = hadoop
a2.sources.r1.port = 4141

a2.sinks.k1.type = logger

a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100

a2.sinks.k1.channel = c1
a2.sources.r1.channels = c1

//第三个配置文件
a3.sources = r1
a3.sinks = k1
a3.channels = c1

a3.sources.r1.type = avro
a3.sources.r1.bind = hadoop
a3.sources.r1.port = 4242

a3.sinks.k1.type = logger

a3.channels.c1.type = memory
a3.channels.c1.capacity = 1000
a3.channels.c1.transactionCapacity = 100

a3.sinks.k1.channel = c1
a3.sources.r1.channels = c1

flink

既能做流处理也能做批处理，灵活的窗口。
异步快照：（保证exactly-once）由barrier实现。

//Obtain the Flink streaming execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//Connect to a socket and read the input, split by the given delimiter
DataStreamSource<String> text = env.socketTextStream(hostname, port, delimiterString);
//Transform the data: every space-separated word becomes a (word, 1) record
DataStream<WordWithCount> WordwithCount = text.flatMap(new FlatMapFunction<String, WordWithCount>(){
    @Override
    public void flatMap(String value, Collector<WordWithCount> out) throws Exception {
        for (String word : value.split(" ")) {
            out.collect(new WordWithCount(word,1L));
        }
    }
});
//Output (print) the data with a parallelism of 1
WordwithCount.print().setParallelism(1);
//Run the job by calling execute()
env.execute("streaming word count");

具体代码：

//wordcount例子
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

/**
 * Flink streaming word count over a fixed set of sentences, summed per word in a
 * sliding time window and printed.
 */
public class flink {
    /** Builds the pipeline and executes it under the job name "streaming word count". */
    public static void main(String[] args) throws Exception {
        // Socket settings, used only by the commented-out socket source below.
        int port =9000;
        String hostname="192.168.25.12";
        String delimiterString ="\n";

        // Flink execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Alternative input: read from a socket instead of the fixed elements.
        //DataStreamSource<String>text = env.socketTextStream(hostname, port,delimiterString);
        DataStreamSource<String> sentences = env.fromElements("this a book", "i love china", "i am chinese");

        // Turn every line into <word,count> records, group by word and sum the counts
        // in a 2-second window sliding every second.
        DataStream<WordWithCount> windowedCounts = sentences
                .flatMap(new FlatMapFunction<String, WordWithCount>() {
                    @Override
                    public void flatMap(String value, Collector<WordWithCount> out) throws Exception {
                        String[] tokens = value.split(" ");
                        for (int idx = 0; idx < tokens.length; idx++) {
                            out.collect(new WordWithCount(tokens[idx], 1L));
                        }
                    }
                })
                .keyBy("word")
                .timeWindow(Time.seconds(2), Time.seconds(1))
                .sum("count");

        // Print with a parallelism of 1, then run the job.
        windowedCounts.print().setParallelism(1);
        env.execute("streaming word count");
    }

    /** Record type of the pipeline: a word and its count, with public fields and a no-arg constructor. */
    public static class WordWithCount{
        public String word;
        public long count;

        public WordWithCount() {}

        public WordWithCount(String word,long count) {
            this.count=count;
            this.word=word;
        }

        /** Renders as {@code WordWithCount{word='w',count=n}}. */
        public String toString() {
            return "WordWithCount{word='" + word + "'" + ",count=" + count + "}";
        }
    }
}

Redis 开源的、以key-value形式存储的数据库

和普通数据库的区别：
数据库完全在内存中读写性能好
磁盘用于实现持久性——持久化方案：

RDB（快照机制，某个时间点的快照先写到临时文件中，当所有的持久化过程都结束会替换上一次持久化好的机制。优点：每一个快照文件都是完整的，随时可以进行备份，缺点：数据恢复时间过长，不适合大规模数据的恢复，会有数据丢失，不适合数据完整性敏感）、
AOF（记录了服务器端收到的写操作日志，采用追加的方式，在server重启时通过回放这些写操作来重建数据集。缺点：文件体积大、文件恢复慢。优点：场景再现（暂停Redis，编辑AOF删除不想要的命令重启Redis，前提是：AOF没有被重写））

重写机制：追加导致日志文件越来越大，当AOF文件大小超过设定的阈值，会进行重写，启动AOF内容压缩，其中只保留了可以恢复数据的最小指令集（把冗余的指令替换为一条指令），AOF文件也是写在临时文件的。
Redis架构：
无中心架构，节点之间互为主从
单机版Redis（只适合学习）：

单点故障无法避免。

主从（实现了Redis server主从，无法避免sentinel单点故障）：

大于50%sentinel进程宕掉，才会进行server主从切换
这时当Redis1故障了，那么slave1和slave2都会转换成master，中断期间落在服务器上的数据都丢失。
解决方法：配置Redis进程，让其在检测到自己网络有问题的时候，立即停止对外界服务，避免在故障期间还有数据进来。

virtual IP：client通过virtual IP与server进行连接，当Redis server之间进行主从切换的时候，返回一个回调脚本，virtual IP会和slave连接。

Redis数据读写流程
1.client选择集群中任意一个server结点进行连接，并发送请求。
2.连接到server节点返回节点拓扑（集群节点列表。槽位跟节点的映射关系，client在内存当中缓存集群）
3.client读写数据时，根据hash（key）值计算得到归属槽位，对应到节点，访问该节点进行读写。
4.目标节点收到client请求，检查自身是否是key值归属节点，如果不是，告知client需要重定向节点，如果是，执行业务
5.如果client收到重定向响应，重新发起读写请求，重复以上过程。

在这里插入图片描述

redis数据结构	描述
String	<String key,String value>
Map<>	<String key,Map map>
List<String key,v1,v2,v3>	有序
Set（无序，无重复）	<String key,v1,v2,v3>
zset(sorted set，有序的集合)	<String key,v1,v2,v3>可以在指定位置插入数据结构
对于不同的数据类型，所使用的接口是不一样的。

redis的常用javaapi(jedis)实现 - liumz - 博客园
https://www.cnblogs.com/liumz0323/p/10508062.html

问题：
击穿：在Redis当中获取某一个key值的时候，key值不存在必须向DB发起请求的行为
原因：第一次访问、恶意访问不存在的key、key过期
雪崩：Redis缓存层由于某种原因宕机，所有的请求——》储存层
规避方法：使用集群、限流

Redis的发布订阅（多个client可以同时向一个channel订阅）：

pub/sub API
SUB
SUBSCRIBE	（一般订阅模式）
PSUBSCRIBE	（正则表达式匹配的订阅模式）。
UNSUBSCRIBE	（取消订阅指定的通道，可以指定一个或者多个，如果不带参数，就是取消所有订阅的通道）
PUB
PUBLISH	发布消息，只能在一个通道上发布消息

基于Redis的发布订阅

// Publisher side: publish messages
// A publisher sends its messages through a channel
// Redis connection pool
private final JedisPool jedisPool;
// Borrow one connection from the pool
Jedis jedis=jedisPool.getResource();
// Publish to a channel: publish(channel, message)
jedis.publish("myChannel",line);

// A listener class extends JedisPubSub to monitor the subscribed channel.
// onPMessage: callback for a message received through a pattern subscription
// onMessage: callback for a message received through a channel subscription
public void onMessage(String channel, String message) {
        // Called when a message is received; prints it and then unsubscribes
        System.out.println(String.format("接收消息成功！channel：%s,message:%s",channel,message));
        this.unsubscribe();
    }
// onSubscribe: callback when a channel is subscribed
public void onSubscribe(String channel, int subscribedChannels) {
        //super.onSubscribe(channel, subscribedChannels);
        System.out.println(String.format("订阅消息成功！channel: %s, num: %d", channel,subscribedChannels));
    }
// onUnsubscribe: callback when a channel subscription is cancelled
public void onUnsubscribe(String channel, int subscribedChannels) {
        //super.onUnsubscribe(channel, subscribedChannels);
        System.out.println(String.format("取消订阅 channel : %s,num: %d", channel,subscribedChannels));
    }
// onPSubscribe: callback when a pattern is subscribed
// onPUnsubscribe: callback when a pattern subscription is cancelled

// Subscriber side (class redis_sublish): subscribe to messages
// Redis connection pool
private final JedisPool jedisPool;
// Borrow one connection from the pool
jedis=jedisPool.getResource();
// Subscribe: subscribe(JedisPubSub listener, channel)
jedis.subscribe(msgListener,channelString);
// Close the connection
jedis.close();

// Main program
// Create the connection pool for the Redis server at host:port
JedisPool jedisPool=new JedisPool(new JedisPoolConfig(),host,port);
// Create and start the publisher
redis_publish publish=new redis_publish(jedisPool);
publish.start();
// Create and start the subscriber
redis_sublish sublish=new redis_sublish(jedisPool);
sublish.start();

一个具体的例子：

import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import java.io.BufferedReader;
import java.io.InputStreamReader;

/**
 * Publisher thread: reads lines from standard input and publishes each one to the
 * Redis channel "myChannel".
 */
public class redis_publish extends Thread{
    // Pool from which the publisher borrows its connection.
    private final JedisPool jedisPool;

    /**
     * @param jedisPool pool used to obtain the Redis connection
     */
    public redis_publish(JedisPool jedisPool){
        this.jedisPool=jedisPool;
    }

    /**
     * Publishes every line read from standard input until input ends or can no longer
     * be read. A failed publish is reported and the loop continues with the next line.
     * The connection is released when the method returns.
     */
    @Override
    public void run() {
        // try-with-resources releases the borrowed connection when the loop ends.
        try (Jedis jedis=jedisPool.getResource()) {
            jedis.auth("123456");
            // Messages are typed on standard input, one per line.
            BufferedReader bufferedReader=new BufferedReader(new InputStreamReader(System.in));
            while(true){
                String line;
                try{
                    line=bufferedReader.readLine();
                }catch (java.io.IOException e){
                    System.err.println("failed to read from standard input: "+e);
                    break;
                }
                // End of input: stop instead of publishing null in a busy loop.
                if(line==null){
                    break;
                }
                try{
                    jedis.publish("myChannel",line);
                    System.out.println(String.format("消息发布成功:%s,message:%s","myChannel",line));
                }catch (Exception e){
                    // Report the failure instead of swallowing it silently.
                    System.err.println("failed to publish message: "+e);
                }
            }
        }
    }
}