I won't cover installing Cassandra and Hadoop here; this article is about integrating the two. Material on Cassandra–Hadoop integration is hard to find online, and since our company relies on Cassandra, I had to turn to the official documentation. It mentions that the Cassandra source tree ships a few WordCount examples for Hadoop integration. I pulled that code out of the source, loaded it locally, and after some debugging got a small part of it working; the rest still needs to be explored step by step.
After finishing the code changes, runs kept failing with NativeIO-related errors. Searching online turned up a workaround: copy the NativeIO.java source from the Hadoop distribution into the local project and modify the part shown below so that it returns true. Why this is necessary is still unclear, but with that change the program finally runs normally.
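For reference, the part in question is roughly the following (a sketch based on the Hadoop 2.7.x source; the exact method and location may differ between versions). In the local copy of NativeIO.java, the access check in the Windows inner class is short-circuited to always return true:
// in the local copy of org.apache.hadoop.io.nativeio.NativeIO, inside the Windows class
public static boolean access(String path, AccessRight desiredAccess)
throws IOException {
// original: return access0(path, desiredAccess.accessRight());
return true; // skip the native Windows permission check so the job can run locally
}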
First, a snippet showing basic HDFS operations: creating and deleting directories and uploading files:
package com.zhuyun.hadoop;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
public class HDFSDemo2 {
static FileSystem fs = null;
@Before
public void init() throws Exception{
// System.setProperty("hadoop.home.dir", "D:/Program Files/hadoop-2.7.3");
fs = FileSystem.get(new URI("hdfs://192.168.10.203:9000"), new Configuration(), "root"); // connect to HDFS as user "root"
}
@Test
public void testUpload() throws Exception{
InputStream in = new FileInputStream("D:/myeclipse/myeclipse.ini"); // local source file
OutputStream out = fs.create(new Path("/user/root/output/myeclipse.ini")); // target file on HDFS
IOUtils.copyBytes(in, out, 4096, true); // copy and close both streams
}
@Test
public void testDelete() throws Exception{
boolean flag = fs.delete(new Path("/user/root/output"),true); // true = delete recursively
System.err.println(flag);
}
@Test
public void testMkdir() throws Exception{
boolean flag = fs.mkdirs(new Path("/user/root/output"));
System.out.println(flag);
}
@After
public void destroy(){
System.out.println(1);
}
}
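Reading a file back out of HDFS works the same way in reverse. A minimal sketch (it additionally needs java.io.FileOutputStream; the local target path is just an example):
@Test
public void testDownload() throws Exception{
InputStream in = fs.open(new Path("/user/root/output/myeclipse.ini")); // file on HDFS
OutputStream out = new FileOutputStream("D:/myeclipse-copy.ini"); // local target file (example path)
IOUtils.copyBytes(in, out, 4096, true); // copy and close both streams
}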
Next is the classic Hadoop WordCount example, which counts how often each word appears in a document:
When running it, the input document and the output location are passed as arguments, e.g. hdfs://192.168.1.10:9000/user/root/input.txt hdfs://192.168.1.10:9000/user/root/output
package com.zhuyun.hadoop;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
//Object key and Text value are the input key/value pair; the third parameter, Context context, is used to write out the resulting key/value pairs
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString()); //tokenize the input line
while (itr.hasMoreTokens()) { //until the end of the line
word.set(itr.nextToken()); //set the key (the word itself)
context.write(word, one); //write the record; the value is 1 (one)
//e.g.: bye 1
// hello 1
// world 1
// world 1
}
}
}
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
//The reducer's input is a key together with the group of values for that key; the reducer's Context plays the same role as the mapper's.
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) { //one iteration for every value that shares this key
sum += val.get(); //how much gets added each time depends on the value passed in
}
result.set(sum); //set the total count for this key
context.write(key, result); //result: bye 1
// hello 1
// world 2
}
}
public static void main(String[] args) throws Exception {
// System.setProperty("hadoop.home.dir", "D:/Program Files/hadoop-2.7.3");
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count"); //the job name
job.setJarByClass(WordCount.class); //the class containing the job
job.setMapperClass(TokenizerMapper.class); //register the map and reduce implementations
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class); //output key type
job.setOutputValueClass(IntWritable.class); //output value type
FileInputFormat.addInputPath(job, new Path(args[0])); //input file path
FileOutputFormat.setOutputPath(job, new Path(args[1])); //output directory path
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Note: the arguments for the Cassandra-integrated WordCount below take the form input_mapper output_reducer=not_filesystem.
When the output goes to the filesystem, the result is read from HDFS: hdfs dfs -cat output/*
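Before running it, the keyspace and the two tables described in the class comment below (cql3_wordcount.inputs and cql3_wordcount.output_words) have to exist. A minimal sketch of creating them with the DataStax Java driver; the contact point, SimpleStrategy and replication factor 1 are assumptions for a single-node test cluster, and cqlsh works just as well:
import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.Session;
public class CreateWordCountSchema {
public static void main(String[] args) {
Cluster cluster = Cluster.builder().addContactPoint("192.168.10.201").build();
Session session = cluster.connect();
// test-cluster replication settings (assumption)
session.execute("CREATE KEYSPACE IF NOT EXISTS cql3_wordcount "
+ "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}");
// input table: one line of text per row
session.execute("CREATE TABLE IF NOT EXISTS cql3_wordcount.inputs (id uuid PRIMARY KEY, line text)");
// output table: one row per word with its count
session.execute("CREATE TABLE IF NOT EXISTS cql3_wordcount.output_words (word text PRIMARY KEY, count_num text)");
cluster.close();
}
}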
package com.zhuyun.hadoop.cassandra;
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlInputFormat;
import org.apache.cassandra.hadoop.cql3.CqlOutputFormat;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.datastax.driver.core.Row;
/**
* This counts the occurrences of words in ColumnFamily
* inputs ( id uuid,
* line text,
* PRIMARY KEY (id))
*
* For each word, we output the total number of occurrences across all body texts.
*
* When outputting to Cassandra, we write the word counts to column family
* output_words ( word text,
* count_num text,
* PRIMARY KEY (word))
* as a {word, count} to columns: word, count_num with a row key of "word sum"
*
 * Data is read from Cassandra, processed by Hadoop, and the result is written into another Cassandra table.
*/
public class WordCount extends Configured implements Tool
{
private static final Logger logger = LoggerFactory.getLogger(WordCount.class);
static final String INPUT_MAPPER_VAR = "input_mapper";
static final String KEYSPACE = "cql3_wordcount";
static final String COLUMN_FAMILY = "inputs";
static final String OUTPUT_REDUCER_VAR = "output_reducer";
static final String OUTPUT_COLUMN_FAMILY = "output_words";
private static final String OUTPUT_PATH_PREFIX = "hdfs://192.168.10.203:9000/user/root/output";
private static final String PRIMARY_KEY = "row_key";
public static void main(String[] args) throws Exception
{
// Let ToolRunner handle generic command-line options
ToolRunner.run(new Configuration(), new WordCount(), args); //launch the job
System.exit(0); //exit
}
//Mapper for multi-row input: each call receives one CQL row as maps of key columns and value columns
public static class TokenizerMapper extends Mapper<Map<String, ByteBuffer>, Map<String, ByteBuffer>, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private ByteBuffer sourceColumn;
protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
throws IOException, InterruptedException
{
}
public void map(Map<String, ByteBuffer> keys, Map<String, ByteBuffer> columns, Context context) throws IOException, InterruptedException
{
for (Entry<String, ByteBuffer> column : columns.entrySet())
{
if (!"line".equalsIgnoreCase(column.getKey()))
continue;
String value = ByteBufferUtil.string(column.getValue());
StringTokenizer itr = new StringTokenizer(value);
while (itr.hasMoreTokens())
{
word.set(itr.nextToken());
context.write(word, one);
}
}
}
}
//Mapper for single-row input: each call receives one row as a native driver Row object
public static class NativeTokenizerMapper extends Mapper<Long, Row, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private ByteBuffer sourceColumn;
protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
throws IOException, InterruptedException
{
}
public void map(Long key, Row row, Context context) throws IOException, InterruptedException
{
String value = row.getString("line");
logger.debug("read {}:{}={} from {}", key, "line", value, context.getInputSplit());
StringTokenizer itr = new StringTokenizer(value);
while (itr.hasMoreTokens())
{
word.set(itr.nextToken());
context.write(word, one);
}
}
}
//Reducer whose output is written to the filesystem (HDFS)
public static class ReducerToFilesystem extends Reducer<Text, IntWritable, Text, IntWritable>
{
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values) //add up one value per occurrence of the key
sum += val.get();
context.write(key, new IntWritable(sum));
}
}
//Reducer whose output is written back into Cassandra
public static class ReducerToCassandra extends Reducer<Text, IntWritable, Map<String, ByteBuffer>, List<ByteBuffer>>
{
private Map<String, ByteBuffer> keys;
private ByteBuffer key;
protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context) //Reducer.Context, so this setup() is actually invoked and keys gets initialized
throws IOException, InterruptedException
{
keys = new LinkedHashMap<String, ByteBuffer>();
}
public void reduce(Text word, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
sum += val.get();
System.out.println("--------word-------" + word.toString());
System.out.println("-------------2222222----------");
System.out.println("ByteBufferUtil.bytes--------" + ByteBufferUtil.bytes(word.toString()));
keys.put("word", ByteBufferUtil.bytes(word.toString()));
System.out.println("--------keys------------------" + keys.toString());
context.write(keys, getBindVariables(word, sum));
}
private List<ByteBuffer> getBindVariables(Text word, int sum)
{
List<ByteBuffer> variables = new ArrayList<ByteBuffer>();
variables.add(ByteBufferUtil.bytes(String.valueOf(sum)));
return variables;
}
}
public int run(String[] args) throws Exception
{
String outputReducerType = "filesystem";
String inputMapperType = "native";
String outputReducer = null;
String inputMapper = null;
if (args != null)
{
if(args[0].startsWith(OUTPUT_REDUCER_VAR))
outputReducer = args[0];
if(args[0].startsWith(INPUT_MAPPER_VAR))
inputMapper = args[0];
if (args.length == 2)
{
if(args[1].startsWith(OUTPUT_REDUCER_VAR))
outputReducer = args[1];
if(args[1].startsWith(INPUT_MAPPER_VAR))
inputMapper = args[1];
}
}
if (outputReducer != null)
{
String[] s = outputReducer.split("=");
if (s != null && s.length == 2)
outputReducerType = s[1];
}
logger.info("output reducer type: " + outputReducerType);
if (inputMapper != null)
{
String[] s = inputMapper.split("=");
if (s != null && s.length == 2)
inputMapperType = s[1];
}
@SuppressWarnings("deprecation")
Job job = new Job(getConf(), "wordcount");
job.setJarByClass(WordCount.class);
if (outputReducerType.equalsIgnoreCase("filesystem"))
{
job.setCombinerClass(ReducerToFilesystem.class);
job.setReducerClass(ReducerToFilesystem.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));
}
else
{
job.setReducerClass(ReducerToCassandra.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Map.class);
job.setOutputValueClass(List.class);
job.setOutputFormatClass(CqlOutputFormat.class);
ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
job.getConfiguration().set(PRIMARY_KEY, "word,sum");
String query = "UPDATE " + KEYSPACE + "." + OUTPUT_COLUMN_FAMILY +
" SET count_num = ? ";
CqlConfigHelper.setOutputCql(job.getConfiguration(), query);
ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "192.168.10.201");
ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
}
if (inputMapperType.equalsIgnoreCase("native"))
{
job.setMapperClass(NativeTokenizerMapper.class);
job.setInputFormatClass(CqlInputFormat.class);
// CqlConfigHelper.setInputCql(job.getConfiguration(), "select * from " + COLUMN_FAMILY + " where token(id) > ? and token(id) <= ? allow filtering");
CqlConfigHelper.setInputCql(job.getConfiguration(), "select * from cql3_wordcount.inputs where "
+ "line='But make allowance for their doubting too:' and token(id)>? and token(id)<=? allow filtering");
}
else
{
job.setMapperClass(TokenizerMapper.class);
job.setInputFormatClass(CqlInputFormat.class);
ConfigHelper.setInputRpcPort(job.getConfiguration(), "9042");
}
ConfigHelper.setInputInitialAddress(job.getConfiguration(), "192.168.10.201");
ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "3");
job.waitForCompletion(true);
return 0;
}
}
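When output_reducer is set to anything other than filesystem, the word counts end up in the cql3_wordcount.output_words table instead of HDFS. A quick way to check them with the DataStax driver (a sketch using Cluster, Session and Row from com.datastax.driver.core; querying from cqlsh works just as well):
Cluster cluster = Cluster.builder().addContactPoint("192.168.10.201").build();
Session session = cluster.connect("cql3_wordcount");
for (Row row : session.execute("SELECT word, count_num FROM output_words")) {
System.out.println(row.getString("word") + " = " + row.getString("count_num")); // one line per word
}
cluster.close();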
The following code counts the total number of rows in a Cassandra table, equivalent to select count(*) from table1; when the table holds too much data that query times out, which is why Hadoop is used for the count:
Note: this code is run without any arguments; the result is read from HDFS: hdfs dfs -cat output/*
package com.zhuyun.hadoop.cassandra;
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.datastax.driver.core.Row;
/**
 * Counts the total number of rows in a Cassandra table.
*/
public class LineCount extends Configured implements Tool
{
private static final Logger logger = LoggerFactory.getLogger(LineCount.class);
static final String INPUT_MAPPER_VAR = "input_mapper";
static final String KEYSPACE = "cql3_wordcount";
static final String COLUMN_FAMILY = "inputs2";
// static final String KEYSPACE = "keyspace1";
// static final String COLUMN_FAMILY = "table4";
static final String OUTPUT_REDUCER_VAR = "output_reducer";
static final String OUTPUT_COLUMN_FAMILY = "output_words";
private static final String OUTPUT_PATH_PREFIX = "hdfs://192.168.10.203:9000/user/root/output";
public static void main(String[] args) throws Exception
{
// Let ToolRunner handle generic command-line options
ToolRunner.run(new Configuration(), new LineCount(), args); //launch the job
System.exit(0); //exit
}
//Mapper for single-row input: emits ("count", 1) for every row read from Cassandra
public static class NativeTokenizerMapper extends Mapper<Long, Row, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
protected void setup(Context context)
throws IOException, InterruptedException
{
}
public void map(Long key, Row row, Context context) throws IOException, InterruptedException
{
word.set("count");
context.write(word, one);
}
}
//Reducer whose output is written to the filesystem (HDFS)
public static class ReducerToFilesystem extends Reducer<Text, IntWritable, Text, IntWritable>
{
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values) //add up one value per occurrence of the key
sum += val.get();
context.write(key, new IntWritable(sum));
}
}
public int run(String[] args) throws Exception
{
String outputReducerType = "filesystem";
String inputMapperType = "native";
String outputReducer = null;
String inputMapper = null;
if (args != null && args.length > 0) //guard: this job may be run without any arguments
{
if(args[0].startsWith(OUTPUT_REDUCER_VAR))
outputReducer = args[0];
if(args[0].startsWith(INPUT_MAPPER_VAR))
inputMapper = args[0];
if (args.length == 2)
{
if(args[1].startsWith(OUTPUT_REDUCER_VAR))
outputReducer = args[1];
if(args[1].startsWith(INPUT_MAPPER_VAR))
inputMapper = args[1];
}
}
if (outputReducer != null)
{
String[] s = outputReducer.split("=");
if (s != null && s.length == 2)
outputReducerType = s[1];
}
logger.info("output reducer type: " + outputReducerType);
if (inputMapper != null)
{
String[] s = inputMapper.split("=");
if (s != null && s.length == 2)
inputMapperType = s[1];
}
@SuppressWarnings("deprecation")
Job job = new Job(getConf(), "wordcount");
job.setJarByClass(LineCount.class);
if (outputReducerType.equalsIgnoreCase("filesystem"))
{
job.setCombinerClass(ReducerToFilesystem.class);
job.setReducerClass(ReducerToFilesystem.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));
}
if (inputMapperType.equalsIgnoreCase("native"))
{
job.setMapperClass(NativeTokenizerMapper.class);
job.setInputFormatClass(CqlInputFormat.class);
// CqlConfigHelper.setInputCql(job.getConfiguration(), "select * from " + COLUMN_FAMILY + " where token(id) > ? and token(id) <= ? allow filtering");
CqlConfigHelper.setInputCql(job.getConfiguration(), "select * from " + COLUMN_FAMILY + " where "
+ "id=6cfc5374-013f-40b9-92d1-ac86d5103bfd and token(id)>? and token(id)<=?");
}
ConfigHelper.setInputInitialAddress(job.getConfiguration(), "192.168.10.201");
ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "10000");
job.waitForCompletion(true);
return 0;
}
}
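Since LineCount writes its result back to HDFS, the FileSystem API from the first snippet can be reused to read it instead of calling hdfs dfs -cat. A small sketch, assuming the reducer's default output file name part-r-00000 under the OUTPUT_PATH_PREFIX directory:
FileSystem fs = FileSystem.get(new URI("hdfs://192.168.10.203:9000"), new Configuration(), "root");
InputStream in = fs.open(new Path("/user/root/output/part-r-00000")); // default reducer output file
IOUtils.copyBytes(in, System.out, 4096, true); // prints a line like "count <total rows>"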