Integrating Cassandra with Hadoop

I will not cover installing Cassandra and Hadoop here; this article is about integrating the two. Material on Cassandra/Hadoop integration is almost impossible to find online, and since Cassandra is the database my company has been using, I had no choice but to look for answers in the official documentation. It turns out the docs do touch on this topic and mention that the Cassandra source tree contains several WordCount examples for Hadoop integration. I dug the relevant code out of the source, loaded it locally, and after a round of debugging got a small part of it working; the rest still needs further exploration.


At first, after finishing the code changes, every run failed with NativeIO-related errors. A search online turned up a rough workaround: copy the NativeIO.java source from the Hadoop source tree into the local project and modify the access check so that it returns true. I am not clear on the underlying reason, but with that change the program finally runs normally.
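
For reference, the change commonly applied (a sketch based on the Hadoop 2.7.x source; check it against the NativeIO.java of your own Hadoop version) is in the access() method of the Windows inner class of the copied org.apache.hadoop.io.nativeio.NativeIO:

    public static boolean access(String path, AccessRight desiredAccess)
        throws IOException {
      // original: return access0(path, desiredAccess.accessRight());
      // workaround for running locally on Windows: skip the native permission check
      return true;
    }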


First, some code for creating and deleting directories and files on HDFS:

package com.zhuyun.hadoop;

import java.io.FileInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

public class HDFSDemo2 {

	static FileSystem fs = null;
	
	@Before
	public void init() throws Exception{
//		System.setProperty("hadoop.home.dir", "D:/Program Files/hadoop-2.7.3");
		fs = FileSystem.get(new URI("hdfs://192.168.10.203:9000"), new Configuration(), "root");
	}
	
	@Test
	public void testUpload() throws Exception{
		InputStream in = new FileInputStream("D:/myeclipse/myeclipse.ini");
		OutputStream out = fs.create(new Path("/user/root/output/myeclipse.ini"));
		IOUtils.copyBytes(in, out, 4096, true);
	}
	
	@Test
	public void testDelete() throws Exception{
		boolean flag = fs.delete(new Path("/user/root/output"),true);
		System.err.println(flag);
	}
	
	@Test
	public void testMkdir() throws Exception{
		boolean flag = fs.mkdirs(new Path("/user/root/output"));
		System.out.println(flag);
	}
	
	@After
	public void destroy(){
		System.out.println(1);
	}
}
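
For completeness, a download counterpart could look like the sketch below. It is not part of the original class; the testDownload name and the path are only for illustration, and it reuses the same fs handle created in init():

	@Test
	public void testDownload() throws Exception{
		InputStream in = fs.open(new Path("/user/root/output/myeclipse.ini"));	//open the file on HDFS
		IOUtils.copyBytes(in, System.out, 4096, false);						//stream its contents to stdout
		in.close();
	}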



Next is the classic Hadoop WordCount example, which counts how many times each word appears in a document:

When running it, pass the input document and the output location as arguments, e.g. hdfs://192.168.1.10:9000/user/root/input.txt    hdfs://192.168.1.10:9000/user/root/output

package com.zhuyun.hadoop;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable>{

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();
    
    //Object key and Text value are the input key/value; the third parameter, Context, is used to emit the output key/value pairs
    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());		//tokenize the input text
      while (itr.hasMoreTokens()) {										//while there are more tokens
        word.set(itr.nextToken());										//set the output key to the current word
        context.write(word, one);										//write the pair to the output, with a value of 1 (one)
        												//e.g.: bye	1
														//	   hello	1
														//	   world	1
        												//	   world	1
      }
    }
    
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    //The input to reduce is a key together with the group of values for that key; reduce also has a Context, which plays the same role as the one in map.
    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {							//accumulate once for each value that shares this key
        sum += val.get();										//how much is added each time depends on the value passed in
      }
      result.set(sum);											//set the total count for this key
      context.write(key, result);								//result:  bye  	1
      															//		hello	1
      															//		world	2
    }
  }

  public static void main(String[] args) throws Exception {
//	System.setProperty("hadoop.home.dir", "D:/Program Files/hadoop-2.7.3");
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");			//job name
    job.setJarByClass(WordCount.class);						//the class containing this job
    job.setMapperClass(TokenizerMapper.class);				//set the mapper and reducer implementation classes
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);						//output key type
    job.setOutputValueClass(IntWritable.class);				//output value type
    FileInputFormat.addInputPath(job, new Path(args[0]));		//input path
    FileOutputFormat.setOutputPath(job, new Path(args[1]));		//output path
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}


The following code is taken from the Cassandra source tree. So far I have only verified the path that reads data out of Cassandra and writes the word-count results to HDFS; writing the results back into Cassandra keeps failing, and the cause is still under investigation. The code follows:

Note: the run arguments are  input_mapper  output_reducer=not_filesystem

The results are read back from HDFS:   hdfs dfs -cat output/*

package com.zhuyun.hadoop.cassandra;
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;

import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlInputFormat;
import org.apache.cassandra.hadoop.cql3.CqlOutputFormat;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.datastax.driver.core.Row;

/**
 * This counts the occurrences of words in ColumnFamily
 *   inputs		 ( id uuid,
 *                   line  text,
 *                   PRIMARY KEY (id))
 *
 * For each word, we output the total number of occurrences across all body texts.
 *
 * When outputting to Cassandra, we write the word counts to column family
 *  output_words ( word text,
 *                 count_num text,
 *                 PRIMARY KEY (word))
 * as a {word, count} to columns: word, count_num with a row key of "word sum"
 * 
 * Pull the data out of Cassandra, run the computation on Hadoop, and write the results back into another Cassandra table.
 */
public class WordCount extends Configured implements Tool
{
    private static final Logger logger = LoggerFactory.getLogger(WordCount.class);
    static final String INPUT_MAPPER_VAR = "input_mapper";
    static final String KEYSPACE = "cql3_wordcount";
    static final String COLUMN_FAMILY = "inputs";

    static final String OUTPUT_REDUCER_VAR = "output_reducer";
    static final String OUTPUT_COLUMN_FAMILY = "output_words";

    private static final String OUTPUT_PATH_PREFIX = "hdfs://192.168.10.203:9000/user/root/output";
    private static final String PRIMARY_KEY = "row_key";

    public static void main(String[] args) throws Exception
    {
        // Let ToolRunner handle generic command-line options
        ToolRunner.run(new Configuration(), new WordCount(), args);			//run the job
        System.exit(0);														//exit
    }

    //mapper for multi-row input (each row delivered as a map of columns)
    public static class TokenizerMapper extends Mapper<Map<String, ByteBuffer>, Map<String, ByteBuffer>, Text, IntWritable>
    {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private ByteBuffer sourceColumn;
        protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
        throws IOException, InterruptedException
        {
        }

        public void map(Map<String, ByteBuffer> keys, Map<String, ByteBuffer> columns, Context context) throws IOException, InterruptedException
        {
            for (Entry<String, ByteBuffer> column : columns.entrySet())
            {
                if (!"line".equalsIgnoreCase(column.getKey()))
                    continue;

                String value = ByteBufferUtil.string(column.getValue());

                StringTokenizer itr = new StringTokenizer(value);
                while (itr.hasMoreTokens())
                {
                    word.set(itr.nextToken());
                    context.write(word, one);
                }
            }
        }
    }

    //mapper for single-row (native CQL Row) input
    public static class NativeTokenizerMapper extends Mapper<Long, Row, Text, IntWritable>
    {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private ByteBuffer sourceColumn;
        protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
        throws IOException, InterruptedException
        {
        }

        public void map(Long key, Row row, Context context) throws IOException, InterruptedException
        {
            String value = row.getString("line");
            logger.debug("read {}:{}={} from {}", key, "line", value, context.getInputSplit());
            StringTokenizer itr = new StringTokenizer(value);
            while (itr.hasMoreTokens())
            {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    //reducer that writes the output to the filesystem (HDFS)
    public static class ReducerToFilesystem extends Reducer<Text, IntWritable, Text, IntWritable>
    {
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
        {
            int sum = 0;
            for (IntWritable val : values)				//accumulate once for each value
                sum += val.get();
            context.write(key, new IntWritable(sum));
        }
    }
    
    //reducer that writes the output into the Cassandra database
    public static class ReducerToCassandra extends Reducer<Text, IntWritable, Map<String, ByteBuffer>, List<ByteBuffer>>
    {
        private Map<String, ByteBuffer> keys;
        private ByteBuffer key;
        protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context)	//must be Reducer.Context; with Mapper.Context this does not override Reducer#setup and keys stays null
        throws IOException, InterruptedException
        {
            keys = new LinkedHashMap<String, ByteBuffer>();
        }

        public void reduce(Text word, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
        {
            int sum = 0;
            for (IntWritable val : values)
                sum += val.get();
            
            System.out.println("--------word-------" + word.toString());
            System.out.println("-------------2222222----------");
            System.out.println("ByteBufferUtil.bytes--------" + ByteBufferUtil.bytes(word.toString()));
            keys.put("word", ByteBufferUtil.bytes(word.toString()));
            System.out.println("--------keys------------------" + keys.toString());
            context.write(keys, getBindVariables(word, sum));
        }

        private List<ByteBuffer> getBindVariables(Text word, int sum)
        {
            List<ByteBuffer> variables = new ArrayList<ByteBuffer>();
            variables.add(ByteBufferUtil.bytes(String.valueOf(sum)));         
            return variables;
        }
    }

    public int run(String[] args) throws Exception
    {
        String outputReducerType = "filesystem";
        String inputMapperType = "native";
        String outputReducer = null;
        String inputMapper = null;

        if (args != null && args.length > 0)
        {
            if(args[0].startsWith(OUTPUT_REDUCER_VAR))
                outputReducer = args[0];
            if(args[0].startsWith(INPUT_MAPPER_VAR))
                inputMapper = args[0];
            
            if (args.length == 2)
            {
                if(args[1].startsWith(OUTPUT_REDUCER_VAR))
                    outputReducer = args[1];
                if(args[1].startsWith(INPUT_MAPPER_VAR))
                    inputMapper = args[1]; 
            }
        }

        if (outputReducer != null)
        {
            String[] s = outputReducer.split("=");
            if (s != null && s.length == 2)
                outputReducerType = s[1];
        }
        logger.info("output reducer type: " + outputReducerType);
        if (inputMapper != null)
        {
            String[] s = inputMapper.split("=");
            if (s != null && s.length == 2)
                inputMapperType = s[1];
        }
        
        @SuppressWarnings("deprecation")
		Job job = new Job(getConf(), "wordcount");
        job.setJarByClass(WordCount.class);

        if (outputReducerType.equalsIgnoreCase("filesystem"))
        {
            job.setCombinerClass(ReducerToFilesystem.class);
            job.setReducerClass(ReducerToFilesystem.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));
        }
        else
        {
            job.setReducerClass(ReducerToCassandra.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(Map.class);
            job.setOutputValueClass(List.class);

            job.setOutputFormatClass(CqlOutputFormat.class);

            ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
            job.getConfiguration().set(PRIMARY_KEY, "word,sum");
            String query = "UPDATE " + KEYSPACE + "." + OUTPUT_COLUMN_FAMILY +
                           " SET count_num = ? ";
            CqlConfigHelper.setOutputCql(job.getConfiguration(), query);
            ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "192.168.10.201");
            ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
            
        }

        if (inputMapperType.equalsIgnoreCase("native"))
        {
            job.setMapperClass(NativeTokenizerMapper.class);
            job.setInputFormatClass(CqlInputFormat.class);
//            CqlConfigHelper.setInputCql(job.getConfiguration(), "select * from " + COLUMN_FAMILY + " where token(id) > ? and token(id) <= ? allow filtering");
            CqlConfigHelper.setInputCql(job.getConfiguration(), "select * from cql3_wordcount.inputs where "
            		+ "line='But make allowance for their doubting too:' and token(id)>? and token(id)<=? allow filtering");
        }
        else
        {
            job.setMapperClass(TokenizerMapper.class);
            job.setInputFormatClass(CqlInputFormat.class);
            ConfigHelper.setInputRpcPort(job.getConfiguration(), "9042");
        }

        ConfigHelper.setInputInitialAddress(job.getConfiguration(), "192.168.10.201");
        ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
        ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
        
        CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "3");
        job.waitForCompletion(true);
        return 0;
    }
}
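
The Javadoc above describes the input and output tables the job expects. As a convenience, the sketch below creates them with the DataStax Java driver 3.x (the same driver the Row import above comes from). It is not part of the Cassandra examples; the SchemaSetup class name, the contact point, and the replication settings are assumptions to adjust for your own cluster:

package com.zhuyun.hadoop.cassandra;

import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.Session;

//hypothetical helper: creates the keyspace and tables used by the WordCount job above
public class SchemaSetup {
    public static void main(String[] args) {
        try (Cluster cluster = Cluster.builder().addContactPoint("192.168.10.201").build();
             Session session = cluster.connect()) {
            session.execute("CREATE KEYSPACE IF NOT EXISTS cql3_wordcount "
                    + "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}");
            session.execute("CREATE TABLE IF NOT EXISTS cql3_wordcount.inputs ("
                    + "id uuid PRIMARY KEY, line text)");
            session.execute("CREATE TABLE IF NOT EXISTS cql3_wordcount.output_words ("
                    + "word text PRIMARY KEY, count_num text)");
        }
    }
}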

The following code counts the total number of rows in a Cassandra table, equivalent to select count(*) from table1;  but when the table holds too much data that query times out, which is why Hadoop is used for the count:

Note: this code takes no arguments when run; the result is read back from HDFS:   hdfs dfs -cat output/*

package com.zhuyun.hadoop.cassandra;
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.datastax.driver.core.Row;

/**
 * Counts the total number of rows in a Cassandra table
 */
public class LineCount extends Configured implements Tool
{
    private static final Logger logger = LoggerFactory.getLogger(LineCount.class);
    static final String INPUT_MAPPER_VAR = "input_mapper";
    static final String KEYSPACE = "cql3_wordcount";
    static final String COLUMN_FAMILY = "inputs2";
//    static final String KEYSPACE = "keyspace1";
//    static final String COLUMN_FAMILY = "table4";

    static final String OUTPUT_REDUCER_VAR = "output_reducer";
    static final String OUTPUT_COLUMN_FAMILY = "output_words";

    private static final String OUTPUT_PATH_PREFIX = "hdfs://192.168.10.203:9000/user/root/output";
    public static void main(String[] args) throws Exception
    {
        // Let ToolRunner handle generic command-line options
        ToolRunner.run(new Configuration(), new LineCount(), args);			//run the job
        System.exit(0);														//exit
    }



    //mapper for single-row (native CQL Row) input
    public static class NativeTokenizerMapper extends Mapper<Long, Row, Text, IntWritable>
    {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        protected void setup(Context context)
        throws IOException, InterruptedException
        {
        }

        public void map(Long key, Row row, Context context) throws IOException, InterruptedException
        {
        	word.set("count");
        	context.write(word, one);
        }
    }

    //reducer that writes the output to the filesystem (HDFS)
    public static class ReducerToFilesystem extends Reducer<Text, IntWritable, Text, IntWritable>
    {
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
        {
            int sum = 0;
            for (IntWritable val : values)				//accumulate once for each value
                sum += val.get();
            context.write(key, new IntWritable(sum));
        }
    }
    

    public int run(String[] args) throws Exception
    {
        String outputReducerType = "filesystem";
        String inputMapperType = "native";
        String outputReducer = null;
        String inputMapper = null;

        if (args != null && args.length > 0)
        {
            if(args[0].startsWith(OUTPUT_REDUCER_VAR))
                outputReducer = args[0];
            if(args[0].startsWith(INPUT_MAPPER_VAR))
                inputMapper = args[0];
            
            if (args.length == 2)
            {
                if(args[1].startsWith(OUTPUT_REDUCER_VAR))
                    outputReducer = args[1];
                if(args[1].startsWith(INPUT_MAPPER_VAR))
                    inputMapper = args[1]; 
            }
        }

        if (outputReducer != null)
        {
            String[] s = outputReducer.split("=");
            if (s != null && s.length == 2)
                outputReducerType = s[1];
        }
        logger.info("output reducer type: " + outputReducerType);
        if (inputMapper != null)
        {
            String[] s = inputMapper.split("=");
            if (s != null && s.length == 2)
                inputMapperType = s[1];
        }
        
        @SuppressWarnings("deprecation")
		Job job = new Job(getConf(), "wordcount");
        job.setJarByClass(LineCount.class);

        if (outputReducerType.equalsIgnoreCase("filesystem"))
        {
            job.setCombinerClass(ReducerToFilesystem.class);
            job.setReducerClass(ReducerToFilesystem.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));
        }

        if (inputMapperType.equalsIgnoreCase("native"))
        {
            job.setMapperClass(NativeTokenizerMapper.class);
            job.setInputFormatClass(CqlInputFormat.class);
//            CqlConfigHelper.setInputCql(job.getConfiguration(), "select * from " + COLUMN_FAMILY + " where token(id) > ? and token(id) <= ? allow filtering");
            CqlConfigHelper.setInputCql(job.getConfiguration(), "select * from " + COLUMN_FAMILY + " where "
            		+ "id=6cfc5374-013f-40b9-92d1-ac86d5103bfd and token(id)>? and token(id)<=?");
        }

        ConfigHelper.setInputInitialAddress(job.getConfiguration(), "192.168.10.201");
        ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
        ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
        
        CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "10000");
        job.waitForCompletion(true);
        return 0;
    }
}


