I won't cover installing Cassandra and Hadoop here; this article is about integrating the two. Material on Cassandra–Hadoop integration is hard to find online, and since our company relies on Cassandra, I had to turn to the official documentation. It mentions that the Cassandra source tree ships a few WordCount examples for Hadoop integration. I pulled that code out of the source, loaded it locally, and after some debugging got a small part of it working; the rest still needs to be explored step by step.
After finishing the code changes, runs kept failing with NativeIO-related errors. Searching online turned up a workaround: copy the NativeIO.java source from the Hadoop distribution into the local project and modify the part shown below so that it returns true. Why this is necessary is still unclear, but with that change the program finally runs normally.
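For reference, the part in question is roughly the following (a sketch based on the Hadoop 2.7.x source; the exact method and location may differ between versions). In the local copy of NativeIO.java, the access check in the Windows inner class is short-circuited to always return true:
// in the local copy of org.apache.hadoop.io.nativeio.NativeIO, inside the Windows class
public static boolean access(String path, AccessRight desiredAccess)
throws IOException {
// original: return access0(path, desiredAccess.accessRight());
return true; // skip the native Windows permission check so the job can run locally
}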
First, a snippet showing basic HDFS operations: creating and deleting directories and uploading files:
package com.zhuyun.hadoop;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
public class HDFSDemo2 {
static FileSystem fs = null;
@Before
public void init() throws Exception{
// System.setProperty("hadoop.home.dir", "D:/Program Files/hadoop-2.7.3");
fs = FileSystem.get(new URI("hdfs://192.168.10.203:9000"), new Configuration(), "root"); // connect to HDFS as user "root"
}
@Test
public void testUpload() throws Exception{
InputStream in = new FileInputStream("D:/myeclipse/myeclipse.ini"); // local source file
OutputStream out = fs.create(new Path("/user/root/output/myeclipse.ini")); // target file on HDFS
IOUtils.copyBytes(in, out, 4096, true); // copy and close both streams
}
@Test
public void testDelete() throws Exception{
boolean flag = fs.delete(new Path("/user/root/output"),true); // true = delete recursively
System.err.println(flag);
}
@Test
public void testMkdir() throws Exception{
boolean flag = fs.mkdirs(new Path("/user/root/output"));
System.out.println(flag);
}
@After
public void destroy(){
System.out.println(1);
}
}
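Reading a file back out of HDFS works the same way in reverse. A minimal sketch (it additionally needs java.io.FileOutputStream; the local target path is just an example):
@Test
public void testDownload() throws Exception{
InputStream in = fs.open(new Path("/user/root/output/myeclipse.ini")); // file on HDFS
OutputStream out = new FileOutputStream("D:/myeclipse-copy.ini"); // local target file (example path)
IOUtils.copyBytes(in, out, 4096, true); // copy and close both streams
}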
Next is the classic Hadoop WordCount example, which counts how often each word appears in a document:
When running it, the input document and the output location are passed as arguments, e.g. hdfs://192.168.1.10:9000/user/root/input.txt hdfs://192.168.1.10:9000/user/root/output
package com.zhuyun.hadoop;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
//Object key and Text value are the input key/value pair; the third parameter, Context context, is used to write out the resulting key/value pairs
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString()); //tokenize the input line
while (itr.hasMoreTokens()) { //until the end of the line
word.set(itr.nextToken()); //set the key (the word itself)
context.write(word, one); //write the record; the value is 1 (one)
//e.g.: bye 1
// hello 1
// world 1
// world 1
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
//The reducer's input is a key together with the group of values for that key; the reducer's Context plays the same role as the mapper's.
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) { //one iteration for every value that shares this key
sum += val.get(); //how much gets added each time depends on the value passed in
}
result.set(sum); //set the total count for this key
context.write(key, result); //result: bye 1
// hello 1
// world 2
}
}
public static void main(String[] args) throws Exception {
// System.setProperty("hadoop.home.dir", "D:/Program Files/hadoop-2.7.3");
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count"); //the job name
job.setJarByClass(WordCount.class); //the class containing the job
job.setMapperClass(TokenizerMapper.class); //register the map and reduce implementations
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class); //output key type
job.setOutputValueClass(IntWritable.class); //output value type
FileInputFormat.addInputPath(job, new Path(args[0])); //input file path
FileOutputFormat.setOutputPath(job, new Path(args[1])); //output directory path
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Note: the arguments for the Cassandra-integrated WordCount below take the form input_mapper output_reducer=not_filesystem.
When the output goes to the filesystem, the result is read from HDFS: hdfs dfs -cat output/*
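Before running it, the keyspace and the two tables described in the class comment below (cql3_wordcount.inputs and cql3_wordcount.output_words) have to exist. A minimal sketch of creating them with the DataStax Java driver; the contact point, SimpleStrategy and replication factor 1 are assumptions for a single-node test cluster, and cqlsh works just as well:
import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.Session;
public class CreateWordCountSchema {
public static void main(String[] args) {
Cluster cluster = Cluster.builder().addContactPoint("192.168.10.201").build();
Session session = cluster.connect();
// test-cluster replication settings (assumption)
session.execute("CREATE KEYSPACE IF NOT EXISTS cql3_wordcount "
+ "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}");
// input table: one line of text per row
session.execute("CREATE TABLE IF NOT EXISTS cql3_wordcount.inputs (id uuid PRIMARY KEY, line text)");
// output table: one row per word with its count
session.execute("CREATE TABLE IF NOT EXISTS cql3_wordcount.output_words (word text PRIMARY KEY, count_num text)");
cluster.close();
}
}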
package com.zhuyun.hadoop.cassandra;
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlInputFormat;
import org.apache.cassandra.hadoop.cql3.CqlOutputFormat;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.datastax.driver.core.Row;
/**
* This counts the occurrences of words in ColumnFamily
* inputs ( id uuid,
* line text,
* PRIMARY KEY (id))
*
* For each word, we output the total number of occurrences across all body texts.
*
* When outputting to Cassandra, we write the word counts to column family
* output_words ( word text,
* count_num text,
* PRIMARY KEY (word))
* as a {word, count} to columns: word, count_num with a row key of "word sum"
*
 * Data is read from Cassandra, processed by Hadoop, and the result is written into another Cassandra table.
*/
public class WordCount extends Configured implements Tool
{
private static final Logger logger = LoggerFactory.getLogger(WordCount.class);
static final String INPUT_MAPPER_VAR = "input_mapper";
static final String KEYSPACE = "cql3_wordcount";
static final String COLUMN_FAMILY = "inputs";
static final String OUTPUT_REDUCER_VAR = "output_reducer";
static final String OUTPUT_COLUMN_FAMILY = "output_words";
private static final String OUTPUT_PATH_PREFIX = "hdfs://192.168.10.203:9000/user/root/output";
private static final String PRIMARY_KEY = "row_key";
public static void main(String[] args) throws Exception
{
// Let ToolRunner handle generic command-line options
ToolRunner.run(new Configuration(), new WordCount(), args); //launch the job
System.exit(0); //exit
}
//Mapper for multi-row input: each call receives one CQL row as maps of key columns and value columns
public static class TokenizerMapper extends Mapper<Map<String, ByteBuffer>, Map<String, ByteBuffer>, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private ByteBuffer sourceColumn;
protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
throws IOException, InterruptedException
{
}
public void map(Map<String, ByteBuffer> keys, Map<String, ByteBuffer> columns, Context context) throws IOException, InterruptedException
{
for (Entry<String, ByteBuffer> column : columns.entrySet())
{
if (!"line".equalsIgnoreCase(column.getKey()))
continue;
String value = ByteBufferUtil.string(column.getValue());
StringTokenizer itr = new StringTokenizer(value);
while (itr.hasMoreTokens())
{
word.set(itr.nextToken());
context.write(word, one);
}
}
}
}
//Mapper for single-row input: each call receives one row as a native driver Row object
public static class NativeTokenizerMapper extends Mapper<Long, Row, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private ByteBuffer sourceColumn;
protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
throws IOException, InterruptedException
{
}
public void map(Long key, Row row, Context context) throws IOException, InterruptedException
{
String value = row.getString("line");
logger.debug("read {}:{}={} from {}", key, "line", value, context.getInputSplit());
StringTokenizer itr = new StringTokenizer(value);
while (itr.hasMoreTokens())
{
word.set(itr.nextToken());
context.write(word, one);
}
}
}
//Reducer whose output is written to the filesystem (HDFS)
public static class ReducerToFilesystem extends Reducer<Text, IntWritable, Text, IntWritable>
{
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values) //add up one value per occurrence of the key
sum += val.get();
context.write(key, new IntWritable(sum));
}
}
//Reducer whose output is written back into Cassandra
public static class ReducerToCassandra extends Reducer<Text, IntWritable, Map<String, ByteBuffer>, List<ByteBuffer>>
{
private Map<String, ByteBuffer> keys;
private ByteBuffer key;
protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context) //Reducer.Context, so this setup() is actually invoked and keys gets initialized
throws IOException, InterruptedException
{
keys = new LinkedHashMap<String, ByteBuffer>();
}
public void reduce(Text word, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
sum += val.get();
System.out.println("--------word-------" + word.toString());
System.out.println("-------------2222222----------");
System.out.println("ByteBufferUtil.bytes--------" + ByteBufferUtil.bytes(word.toString()));
keys.put("word", ByteBufferUtil.bytes(word.toString()));
System.out.println("--------keys------------------" + keys.toString());
context.write(keys, getBindVariables(word, sum));
}
private List<ByteBuffer> getBindVariables(Text word, int sum)
{
List<ByteBuffer> variables = new ArrayList<ByteBuffer>();
variables.add(ByteBufferUtil.bytes(String.valueOf(sum)));
return variables;
}
}
public int run(String[] args) throws Exception
{
String outputReducerType = "filesystem";
String inputMapperType = "native";
String outputReducer = null;
String inputMapper = null;
if (args != null)
{
if(args[0].startsWith(OUTPUT_REDUCER_VAR))
outputReducer = args[0];
if(args[0].startsWith(INPUT_MAPPER_VAR))
inputMapper = args[0];
if (args.length == 2)
{
if(args[1].startsWith(OUTPUT_REDUCER_VAR))
outputReducer = args[1];
if(args[1].startsWith(INPUT_MAPPER_VAR))
inputMapper = args[1];
}
}
if (outputReducer != null)
{
String[] s = outputReducer.split("=");
if (s != null && s.length == 2)
outputReducerType = s[1];
}
logger.info("output reducer type: " + outputReducerType);
if (inputMapper != null)
{
String[] s = inputMapper.split("=");
if (s != null && s.length == 2)
inputMapperType = s[1];
}
@SuppressWarnings("deprecation")
Job job = new Job(getConf(), "wordcount");
job.setJarByClass(WordCount.class);
if (outputReducerType.equalsIgnoreCase("filesystem"))
{
job.setCombinerClass(ReducerToFilesystem.class);
job.setReducerClass(ReducerToFilesystem.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));
}
else
{
job.setReducerClass(ReducerToCassandra.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Map.class);
job.setOutputValueClass(List.class);
job.setOutputFormatClass(CqlOutputFormat.class);
ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
job.getConfiguration().set(PRIMARY_KEY, "word,sum");
String query = "UPDATE " + KEYSPACE + "." + OUTPUT_COLUMN_FAMILY +
" SET count_num = ? ";
CqlConfigHelper.setOutputCql(job.getConfiguration(), query);
ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "192.168.10.201");
ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
}
if (inputMapperType.equalsIgnoreCase("native"))
{
job.setMapperClass(NativeTokenizerMapper.class);
job.setInputFormatClass(CqlInputFormat.class);
// CqlConfigHelper.setInputCql(job.getConfiguration(), "select * from " + COLUMN_FAMILY + " where token(id) > ? and token(id) <= ? allow filtering");
CqlConfigHelper.setInputCql(job.getConfiguration(), "select * from cql3_wordcount.inputs where "
+ "line='But make allowance for their doubting too:' and token(id)>? and token(id)<=? allow filtering");
}
else
{
job.setMapperClass(TokenizerMapper.class);
job.setInputFormatClass(CqlInputFormat.class);
ConfigHelper.setInputRpcPort(job.getConfiguration(), "9042");
}
ConfigHelper.setInputInitialAddress(job.getConfiguration(), "192.168.10.201");
ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "3");
job.waitForCompletion(true);
return 0;
}
}
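When output_reducer is set to anything other than filesystem, the word counts end up in the cql3_wordcount.output_words table instead of HDFS. A quick way to check them with the DataStax driver (a sketch using Cluster, Session and Row from com.datastax.driver.core; querying from cqlsh works just as well):
Cluster cluster = Cluster.builder().addContactPoint("192.168.10.201").build();
Session session = cluster.connect("cql3_wordcount");
for (Row row : session.execute("SELECT word, count_num FROM output_words")) {
System.out.println(row.getString("word") + " = " + row.getString("count_num")); // one line per word
}
cluster.close();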
The following code counts the total number of rows in a Cassandra table, equivalent to select count(*) from table1; when the table holds too much data that query times out, which is why Hadoop is used for the count:
Note: this code is run without any arguments; the result is read from HDFS: hdfs dfs -cat output/*
package com.zhuyun.hadoop.cassandra;
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.datastax.driver.core.Row;
/**
 * Counts the total number of rows in a Cassandra table.
*/
public class LineCount extends Configured implements Tool
{
private static final Logger logger = LoggerFactory.getLogger(LineCount.class);
static final String INPUT_MAPPER_VAR = "input_mapper";
static final String KEYSPACE = "cql3_wordcount";
static final String COLUMN_FAMILY = "inputs2";
// static final String KEYSPACE = "keyspace1";
// static final String COLUMN_FAMILY = "table4";
static final String OUTPUT_REDUCER_VAR = "output_reducer";
static final String OUTPUT_COLUMN_FAMILY = "output_words";
private static final String OUTPUT_PATH_PREFIX = "hdfs://192.168.10.203:9000/user/root/output";
public static void main(String[] args) throws Exception
{
// Let ToolRunner handle generic command-line options
ToolRunner.run(new Configuration(), new LineCount(), args); //launch the job
System.exit(0); //exit
}
//Mapper for single-row input: emits ("count", 1) for every row read from Cassandra
public static class NativeTokenizerMapper extends Mapper<Long, Row, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
protected void setup(Context context)
throws IOException, InterruptedException
{
}
public void map(Long key, Row row, Context context) throws IOException, InterruptedException
{
word.set("count");
context.write(word, one);
}
}
//Reducer whose output is written to the filesystem (HDFS)
public static class ReducerToFilesystem extends Reducer<Text, IntWritable, Text, IntWritable>
{
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values) //add up one value per occurrence of the key
sum += val.get();
context.write(key, new IntWritable(sum));
}
}
public int run(String[] args) throws Exception
{
String outputReducerType = "filesystem";
String inputMapperType = "native";
String outputReducer = null;
String inputMapper = null;
if (args != null && args.length > 0) //guard: this job may be run without any arguments
{
if(args[0].startsWith(OUTPUT_REDUCER_VAR))
outputReducer = args[0];
if(args[0].startsWith(INPUT_MAPPER_VAR))
inputMapper = args[0];
if (args.length == 2)
{
if(args[1].startsWith(OUTPUT_REDUCER_VAR))
outputReducer = args[1];
if(args[1].startsWith(INPUT_MAPPER_VAR))
inputMapper = args[1];
}
}
if (outputReducer != null)
{
String[] s = outputReducer.split("=");
if (s != null && s.length == 2)
outputReducerType = s[1];
}
logger.info("output reducer type: " + outputReducerType);
if (inputMapper != null)
{
String[] s = inputMapper.split("=");
if (s != null && s.length == 2)
inputMapperType = s[1];
}
@SuppressWarnings("deprecation")
Job job = new Job(getConf(), "wordcount");
job.setJarByClass(LineCount.class);
if (outputReducerType.equalsIgnoreCase("filesystem"))
{
job.setCombinerClass(ReducerToFilesystem.class);
job.setReducerClass(ReducerToFilesystem.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));
}
if (inputMapperType.equalsIgnoreCase("native"))
{
job.setMapperClass(NativeTokenizerMapper.class);
job.setInputFormatClass(CqlInputFormat.class);
// CqlConfigHelper.setInputCql(job.getConfiguration(), "select * from " + COLUMN_FAMILY + " where token(id) > ? and token(id) <= ? allow filtering");
CqlConfigHelper.setInputCql(job.getConfiguration(), "select * from " + COLUMN_FAMILY + " where "
+ "id=6cfc5374-013f-40b9-92d1-ac86d5103bfd and token(id)>? and token(id)<=?");
}
ConfigHelper.setInputInitialAddress(job.getConfiguration(), "192.168.10.201");
ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "10000");
job.waitForCompletion(true);
return 0;
}
}
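Since LineCount writes its result back to HDFS, the FileSystem API from the first snippet can be reused to read it instead of calling hdfs dfs -cat. A small sketch, assuming the reducer's default output file name part-r-00000 under the OUTPUT_PATH_PREFIX directory:
FileSystem fs = FileSystem.get(new URI("hdfs://192.168.10.203:9000"), new Configuration(), "root");
InputStream in = fs.open(new Path("/user/root/output/part-r-00000")); // default reducer output file
IOUtils.copyBytes(in, System.out, 4096, true); // prints a line like "count <total rows>"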