Reference:
https://www.elastic.co/guide/en/elasticsearch/hadoop/current/mapreduce.html
1. Download the dependency jar
elasticsearch-hadoop-2.2.0.jar; download it from the private Maven repository.
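If the project builds with Maven, the jar on the private repository should correspond to the standard elasticsearch-hadoop coordinates; assuming the usual naming, the dependency would look like this:

<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-hadoop</artifactId>
    <version>2.2.0</version>
</dependency>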
2. Data flow:
HBase export -> HDFS -> Elasticsearch 2.x
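The first hop (HBase to HDFS) can be done with HBase's built-in Export job, which writes sequence files of (ImmutableBytesWritable, Result) pairs, exactly the input the mapper below consumes. Table name and output directory here are placeholders:

hbase org.apache.hadoop.hbase.mapreduce.Export <table_name> <hdfs_output_dir>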
3. The code, pasted directly below.

import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.mapreduce.MutationSerialization;
import org.apache.hadoop.hbase.mapreduce.ResultSerialization;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.elasticsearch.hadoop.mr.EsOutputFormat;
import org.elasticsearch.hadoop.mr.LinkedMapWritable;
public class MyJob extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Path input = new Path(args[0]);
        Configuration conf = getConf();
        // es-hadoop requires speculative execution to be disabled,
        // otherwise duplicate documents can be written to the index.
        conf.setBoolean("mapred.map.tasks.speculative.execution", false);
        conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
        // Register HBase serializations so the Result values in the sequence
        // files can be deserialized. Data exported from HBase 0.9x does not
        // need this; 1.x and above does.
        conf.setStrings("io.serializations", conf.get("io.serializations"),
                MutationSerialization.class.getName(), ResultSerialization.class.getName());
        //conf.set("es.nodes", "host228"); // Elasticsearch node(s) to connect to
        conf.set("es.port", "9200"); // Elasticsearch REST port
        //conf.set("es.resource", "ehlindex/tr_plate"); // index/type used for storing data
        // Apply generic -D options (es.nodes, es.resource, ...) to conf.
        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        for (Entry<String, String> entry : conf) {
            System.out.printf("%s=%s\n", entry.getKey(), entry.getValue());
        }
        Job job = Job.getInstance(conf, "hfile 2 es");
        job.setJarByClass(MyJob.class);
        FileInputFormat.addInputPath(job, input);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(EsOutputFormat.class);
        job.setMapOutputValueClass(LinkedMapWritable.class);
        job.setNumReduceTasks(0); // map-only job; mappers write straight to ES
        job.setMapperClass(MyMaper.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int run = ToolRunner.run(new MyJob(), args);
        System.exit(run);
    }
}
import java.io.IOException;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.elasticsearch.hadoop.mr.LinkedMapWritable;
import com.ehl.im.transfer.TRFieldEnum;
import com.ehl.im.transfer.TravelRecord;
// NOTE: CarPlateCommonUtil (used below) must also be imported from its project package.
public class MyMaper extends Mapper<ImmutableBytesWritable, Result, NullWritable, LinkedMapWritable> {

    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context)
            throws IOException, InterruptedException {
        // Dry-run switch: pass -D notinsert=true to skip writing to ES.
        if ("true".equals(context.getConfiguration().get("notinsert"))) {
            return;
        }
        try {
            LinkedMapWritable linkObj = result2Map(value);
            context.write(NullWritable.get(), linkObj);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    private LinkedMapWritable result2Map(Result r) {
        LinkedMapWritable linkObj = new LinkedMapWritable();
        // The whole record is stored as a single value in column family "cf".
        byte[] passCarRowValue = r.getValue("cf".getBytes(), null);
        TravelRecord record = new TravelRecord(passCarRowValue);
        linkObj.put(new Text("timestamp"), new LongWritable(Long.parseLong(record.getStringValue(TRFieldEnum.TIMESTAMP))));
        linkObj.put(new Text("car_plate_number"), new Text(record.getStringValue(TRFieldEnum.CAR_PLATE_NUMBER)));
        try {
            linkObj.put(new Text("carplateindex"),
                    new Text(CarPlateCommonUtil.produceCarPlateIndexStr(record.getStringValue(TRFieldEnum.CAR_PLATE_NUMBER))));
        } catch (Exception e) {
            e.printStackTrace();
        }
        linkObj.put(new Text("speed"), new LongWritable(Long.parseLong(record.getStringValue(TRFieldEnum.SPEED))));
        linkObj.put(new Text("lane_id"), new Text(record.getStringValue(TRFieldEnum.LANE_ID)));
        linkObj.put(new Text("camera_location"), new Text(record.getStringValue(TRFieldEnum.CAMERA_LOCATION)));
        linkObj.put(new Text("bay_id"), new Text(record.getStringValue(TRFieldEnum.BAY_ID)));
        linkObj.put(new Text("camera_orientation"), new Text(record.getStringValue(TRFieldEnum.CAMERA_ORIENTATION)));
        linkObj.put(new Text("car_brand"), new Text(record.getStringValue(TRFieldEnum.CAR_BRAND)));
        linkObj.put(new Text("car_color"), new Text(record.getStringValue(TRFieldEnum.CAR_COLOR)));
        linkObj.put(new Text("car_plate_color"), new Text(record.getStringValue(TRFieldEnum.CAR_PLATE_COLOR)));
        linkObj.put(new Text("car_plate_type"), new Text(record.getStringValue(TRFieldEnum.CAR_PLATE_TYPE)));
        linkObj.put(new Text("car_status"), new Text(record.getStringValue(TRFieldEnum.CAR_STATUS)));
        linkObj.put(new Text("travel_orientation"), new Text(record.getStringValue(TRFieldEnum.TRAVEL_ORIENTATION)));
        linkObj.put(new Text("plate_coordinates"), new Text(record.getStringValue(TRFieldEnum.PLATE_COORDINATES)));
        linkObj.put(new Text("driver_coordinates"), new Text(record.getStringValue(TRFieldEnum.DRIVER_COORDINATES)));
        // Up to three image URLs are mapped to the fields tp1..tp3.
        String[] imgUrls = record.getStringArrayValue(TRFieldEnum.IMAGE_URLS);
        if (imgUrls != null) {
            for (int i = 0; i < Math.min(imgUrls.length, 3); i++) {
                if (imgUrls[i] != null && !"".equals(imgUrls[i])) {
                    linkObj.put(new Text("tp" + (i + 1)), new Text(imgUrls[i]));
                }
            }
        }
        return linkObj;
    }
}
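TravelRecord, TRFieldEnum, and CarPlateCommonUtil are internal to this project, so the mapper above will not compile elsewhere. For readers without those classes, here is a minimal sketch (the class name GenericResultMapper is my own) that copies every HBase cell into the document as a string field named after the column qualifier, assuming plain UTF-8 values:

import java.io.IOException;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.elasticsearch.hadoop.mr.LinkedMapWritable;

public class GenericResultMapper extends Mapper<ImmutableBytesWritable, Result, NullWritable, LinkedMapWritable> {

    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context)
            throws IOException, InterruptedException {
        if (value.isEmpty()) {
            return;
        }
        LinkedMapWritable doc = new LinkedMapWritable();
        // One ES field per HBase cell, named after the column qualifier;
        // cell values are assumed to be UTF-8 strings.
        for (Cell cell : value.rawCells()) {
            String field = Bytes.toString(CellUtil.cloneQualifier(cell));
            String val = Bytes.toString(CellUtil.cloneValue(cell));
            doc.put(new Text(field), new Text(val));
        }
        context.write(NullWritable.get(), doc);
    }
}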
The following creates the ES index:
curl -XPOST host213:9200/ehlindex -d '{
  "settings" : { "number_of_shards" : 20, "number_of_replicas" : 0 },
  "mappings" : {
    "tr_plate" : {
      "properties" : {
        "timestamp" : { "type" : "long", "index" : "not_analyzed" },
        "car_plate_number" : { "type" : "string", "index" : "not_analyzed" },
        "speed" : { "type" : "long", "index" : "not_analyzed" },
        "lane_id" : { "type" : "string", "index" : "not_analyzed" },
        "camera_location" : { "type" : "string", "index" : "not_analyzed" },
        "bay_id" : { "type" : "string", "index" : "not_analyzed" },
        "camera_orientation" : { "type" : "string", "index" : "not_analyzed" },
        "car_brand" : { "type" : "string", "index" : "not_analyzed" },
        "car_color" : { "type" : "string", "index" : "not_analyzed" },
        "car_plate_color" : { "type" : "string", "index" : "not_analyzed" },
        "car_plate_type" : { "type" : "string", "index" : "not_analyzed" },
        "tp1" : { "type" : "string", "index" : "not_analyzed" },
        "tp2" : { "type" : "string", "index" : "not_analyzed" },
        "tp3" : { "type" : "string", "index" : "not_analyzed" },
        "car_status" : { "type" : "string", "index" : "not_analyzed" },
        "travel_orientation" : { "type" : "string", "index" : "not_analyzed" },
        "plate_coordinates" : { "type" : "string", "index" : "not_analyzed" },
        "driver_coordinates" : { "type" : "string", "index" : "not_analyzed" },
        "carplateindex" : { "type" : "string", "index" : "analyzed" }
      }
    }
  }
}'
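Before running the job, it is worth confirming the index and mapping were created as intended (same host as above):

curl 'host213:9200/ehlindex/_mapping?pretty'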
hadoop jar downloads/Hfile2Es-0.0.1-SNAPSHOT-jar-with-dependencies.jar -D es.resource=ehlindex/tr_plate -D es.nodes=host228 /yangxTest/qhd_data1/qhd_data1
Both -D parameters (es.resource and es.nodes) are picked up inside MyJob via ToolRunner, which keeps the job flexible. Pure hands-on material; digest it yourself.
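Once the job completes, a quick sanity check on the loaded document count (assuming the index/type above):

curl 'host213:9200/ehlindex/tr_plate/_count?pretty'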