MapReduce: parsing an XML config to batch-import data (IPv4, IPv6) into HBase

First, create the corresponding table in HBase:

hbase(main):003:0> create 'messages','cf'


[hadoop@h71 hui]$ vi messages3.java

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

public class messages3 {

        public static void main(String[] args) throws Exception {

                final Configuration configuration = new Configuration();
                // ZooKeeper quorum the HBase client connects through
                configuration.set("hbase.zookeeper.quorum", "192.168.8.71");

                // target table for TableOutputFormat
                configuration.set(TableOutputFormat.OUTPUT_TABLE, "messages");

                configuration.set("dfs.socket.timeout", "180000");

                final Job job = new Job(configuration, "HBaseBatchImport");
                job.setJarByClass(messages3.class);

                job.setMapperClass(BatchImportMapper.class);
                job.setReducerClass(BatchImportReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setMapOutputValueClass(Text.class);

                job.setInputFormatClass(TextInputFormat.class);
                // the reducer writes Puts straight into HBase
                job.setOutputFormatClass(TableOutputFormat.class);

                FileInputFormat.setInputPaths(job, "hdfs://192.168.8.71:9000/messages");

                job.waitForCompletion(true);
        }

        static class BatchImportMapper extends
                Mapper<LongWritable, Text, LongWritable, Text> {
                Text v2 = new Text();

                protected void map(LongWritable key, Text value, Context context)
                                throws java.io.IOException, InterruptedException {
                        // read the field separator from the XML config on the local filesystem
                        SAXReader reader = new SAXReader();
                        Document document = null;
                        try {
                                document = reader.read("/home/hadoop/hui/dao.xml");
                        } catch (DocumentException e1) {
                                e1.printStackTrace();
                        }
                        List e2 = document.selectNodes("/peizhi/hbase/fengefu/@fuhao");
                        String h2 = ((Attribute) e2.get(0)).getText();
                        final String[] splited = value.toString().split(h2);
                        if (splited[3].length() < 5) {
                                // a field this short cannot be an IP literal
                                // (this is what skips the h107 lines)
                                System.out.println("skipping line: no useful data");
                        } else {
                                try {
                                        // syslog lines carry no year, so prepend one and use
                                        // the reformatted timestamp as the rowkey
                                        final String date0 = splited[0] + " " + splited[1] + " " + splited[2];
                                        SimpleDateFormat dateformat1 = new SimpleDateFormat("MMM dd HH:mm:ss", Locale.ENGLISH);
                                        Date date = dateformat1.parse(date0);
                                        SimpleDateFormat datef = new SimpleDateFormat("MMddHHmmss");
                                        String rowKey = "2017" + datef.format(date);
                                        v2.set(rowKey + " " + value.toString());
                                        context.write(key, v2);
                                } catch (NumberFormatException e) {
                                        final Counter counter = context.getCounter("BatchImport", "ErrorFormat");
                                        counter.increment(1L);
                                        System.out.println("parse error: " + splited[0] + " " + e.getMessage());
                                } catch (ParseException e) {
                                        e.printStackTrace();
                                }
                        }
                }
        }

        static class BatchImportReducer extends
                        TableReducer<LongWritable, Text, NullWritable> {
                protected void reduce(LongWritable key,
                                java.lang.Iterable<Text> values, Context context)
                                throws java.io.IOException, InterruptedException {
                        // re-read the config: separator, column definitions and the IP regex
                        SAXReader reader = new SAXReader();
                        Document document = null;
                        try {
                                document = reader.read("/home/hadoop/hui/dao.xml");
                        } catch (DocumentException e1) {
                                e1.printStackTrace();
                        }
                        Element root = document.getRootElement();
                        List e = document.selectNodes("/peizhi/hbase/zhengze");
                        List e2 = document.selectNodes("/peizhi/hbase/fengefu/@fuhao");
                        String h1 = ((Element) e.get(e.size() - 1)).getText();
                        String h2 = ((Attribute) e2.get(0)).getText();
                        for (Text text : values) {
                                final String[] splited = text.toString().split(h2);
                                // splited[0] is the rowkey built in the mapper, e.g. 20170123195900
                                final Put put = new Put(Bytes.toBytes(splited[0]));
                                for (Iterator i = root.element("hbase").elementIterator(); i.hasNext();) {
                                        Element element = (Element) i.next();
                                        if (element.getQualifiedName().equals("ziduan")) {
                                                // each <ziduan> maps a column name to a field index
                                                String name = element.attributeValue("name");
                                                int a = Integer.parseInt(element.getText());
                                                Pattern p1 = Pattern.compile(h1);
                                                Matcher matcher1 = p1.matcher(splited[4]);
                                                if (matcher1.matches()) {
                                                        put.add(Bytes.toBytes("cf"), name.getBytes(), Bytes.toBytes(splited[a + 1]));
                                                        context.write(NullWritable.get(), put);
                                                } else {
                                                        // the IP field does not match the regex; drop this line
                                                        break;
                                                }
                                        }
                                }
                        }
                }
        }
}
[hadoop@h71 hui]$ hadoop fs -cat hdfs://192.168.8.71:9000/messages
Jan 23 19:59:00 192.168.101.254 s_sys@hui trafficlogger: empty map for 1:4097 in classnames
Feb 20 06:25:04 h107 rsyslogd: [origin software="rsyslogd" swVersion="8.4.2" x-pid="22204" x-info="http://www.rsyslog.com"] rsyslogd was HUPed
Jan 24 19:59:01 192.168.101.254 s_sys@hui trafficlogger: empty map for 1:4097 in classnames
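
As a sanity check, the mapper's rowkey derivation can be run on the first sample line in isolation. A minimal standalone sketch (the class name RowKeyDemo is made up, the logic is copied from the mapper above):

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

// Sketch of the mapper's rowkey derivation: syslog timestamps carry no year,
// so the job prepends "2017" by hand before reformatting.
public class RowKeyDemo {
        public static void main(String[] args) throws Exception {
                String line = "Jan 23 19:59:00 192.168.101.254 s_sys@hui trafficlogger: empty map";
                String[] splited = line.split(" ");
                String date0 = splited[0] + " " + splited[1] + " " + splited[2];
                Date date = new SimpleDateFormat("MMM dd HH:mm:ss", Locale.ENGLISH).parse(date0);
                String rowKey = "2017" + new SimpleDateFormat("MMddHHmmss").format(date);
                System.out.println(rowKey);   // prints 20170123195900
        }
}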


[hadoop@h71 hui]$ vi dao.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- peizhi = config; jianbao = table name, liezu = column family,
     ziduan = field (the value is the field's index in the split line),
     guolv = host filter, fengefu = field separator, zhengze = regex -->
<peizhi>
        <hbase>
                <jianbao name="messages"></jianbao>
                <liezu name="cf"></liezu>
                <ziduan name="ipv4">3</ziduan>
                <ziduan name="ipv6">3</ziduan>
                <ziduan name="host">4</ziduan>
                <ziduan name="leixing">5</ziduan>
                <guolv ip="h107"></guolv>
                <fengefu fuhao=" "></fengefu>
                <zhengze>^([\d.]+)</zhengze>
        </hbase>
</peizhi>
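
The XPath expressions can likewise be sanity-checked outside MapReduce. A minimal sketch (the class name DaoXmlDemo is made up) that reads the separator and regex from dao.xml with the same selectNodes() calls the job uses, then tests them against the IP field of one sample line:

import java.util.List;
import java.util.regex.Pattern;
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

// Sketch: extract fuhao (separator) and zhengze (regex) from dao.xml,
// then verify the regex accepts an IPv4 literal. Requires dom4j and jaxen.
public class DaoXmlDemo {
        public static void main(String[] args) throws Exception {
                Document document = new SAXReader().read("/home/hadoop/hui/dao.xml");
                List e2 = document.selectNodes("/peizhi/hbase/fengefu/@fuhao");
                List e = document.selectNodes("/peizhi/hbase/zhengze");
                String fuhao = ((Attribute) e2.get(0)).getText();   // " "
                String zhengze = ((Element) e.get(0)).getText();    // ^([\d.]+)
                String[] splited = "Jan 23 19:59:00 192.168.101.254 s_sys@hui x".split(fuhao);
                // splited[3] is the raw IP field (the reducer sees it shifted by the rowkey)
                System.out.println(Pattern.matches(zhengze, splited[3]));  // true
                System.out.println(Pattern.matches(zhengze, "h107"));      // false
        }
}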

Pitfalls encountered:

Both dom4j-1.6.1.jar and jaxen-1.1-beta-7.jar must be placed in hbase-1.0.0-cdh5.5.2/lib/ on every node. At first I copied only dom4j-1.6.1.jar, and only onto the master node, which produced this error:
17/03/17 19:12:12 INFO mapreduce.Job: Task Id : attempt_1489747351579_0006_r_000000_0, Status : FAILED
Error: java.lang.ClassNotFoundException: org.dom4j.DocumentException
        at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
        at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
        at java.security.AccessController.doPrivileged(Native Method)
        at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
        at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
        at java.lang.Class.forName0(Native Method)
        at java.lang.Class.forName(Class.java:270)
        at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2138)
        at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2103)
        at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2197)
        at org.apache.hadoop.mapreduce.task.JobContextImpl.getReducerClass(JobContextImpl.java:220)
        at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:611)
        at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:389)
        at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:415)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1671)
        at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
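Copying the jars into every node's lib directory works, but it is not the only option. As a hedged, untested alternative sketch, the jars can be shipped with the job from HDFS, so only the client machine needs them (the HDFS paths below are placeholders):

// in main(), after the Job is created
// prerequisite: hadoop fs -put dom4j-1.6.1.jar jaxen-1.1-beta-7.jar /lib/
job.addFileToClassPath(new org.apache.hadoop.fs.Path("/lib/dom4j-1.6.1.jar"));
job.addFileToClassPath(new org.apache.hadoop.fs.Path("/lib/jaxen-1.1-beta-7.jar"));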
The required XML file must likewise be copied to the same path on every node. At first I put it only on the master node, which produced this error:
Error: java.lang.NullPointerException
        at messages3$BatchImportMapper.map(messages3.java:70)
        at messages3$BatchImportMapper.map(messages3.java:56)
        at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
        at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
        at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
        at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:163)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:415)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1671)
        at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
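Similarly, instead of copying dao.xml to every node by hand, the file could travel with the job through the distributed cache. A hedged sketch (the HDFS path is a placeholder):

// prerequisite: hadoop fs -put dao.xml /conf/dao.xml
// the "#dao.xml" fragment creates a symlink in each task's working directory,
// so the mapper/reducer would call reader.read("dao.xml") instead of the
// hard-coded absolute local path
job.addCacheFile(new java.net.URI("hdfs://192.168.8.71:9000/conf/dao.xml#dao.xml"));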

The next version uses Java's built-in InetAddress classes to recognize and normalize IPv4 and IPv6 addresses.
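The core trick in isolation first; a minimal standalone sketch (the class name InetDemo is made up):

import java.net.Inet4Address;
import java.net.Inet6Address;
import java.net.InetAddress;

// InetAddress.getByName() parses both address families, and instanceof tells
// them apart. Note that "::" expands to the unabbreviated 0:0:0:0:0:0:0:0,
// which is the form that ends up in HBase.
public class InetDemo {
        public static void main(String[] args) throws Exception {
                for (String s : new String[] { "192.168.101.254", "::" }) {
                        InetAddress addr = InetAddress.getByName(s);
                        if (addr instanceof Inet4Address) {
                                System.out.println("ipv4: " + addr.getHostAddress());
                        } else if (addr instanceof Inet6Address) {
                                System.out.println("ipv6: " + addr.getHostAddress());
                        }
                }
        }
}

One caveat: for strings that are not address literals (such as the hostname h107), getByName attempts a DNS lookup, which is why the full job below has to catch UnknownHostException. The full messages33.java: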

[hadoop@h71 q1]$ vi messages33.java

import java.net.Inet4Address;
import java.net.Inet6Address;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

public class messages33 {

        public static void main(String[] args) throws Exception {

                final Configuration configuration = new Configuration();
                configuration.set("hbase.zookeeper.quorum", "192.168.8.71");

                configuration.set(TableOutputFormat.OUTPUT_TABLE, "messages");

                configuration.set("dfs.socket.timeout", "180000");

                final Job job = new Job(configuration, "HBaseBatchImport");
                job.setJarByClass(messages33.class);
                
                job.setMapperClass(BatchImportMapper.class);
                job.setReducerClass(BatchImportReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setMapOutputValueClass(Text.class);

                job.setInputFormatClass(TextInputFormat.class);
                job.setOutputFormatClass(TableOutputFormat.class);

                FileInputFormat.setInputPaths(job, "hdfs://192.168.8.71:9000/messages");

                job.waitForCompletion(true);
        }

        static class BatchImportMapper extends
                        Mapper<LongWritable, Text, LongWritable, Text> {
                Text v2 = new Text();

                protected void map(LongWritable key, Text value, Context context)
                                throws java.io.IOException, InterruptedException {
                        // read the field separator from the XML config
                        SAXReader reader = new SAXReader();
                        Document document = null;
                        try {
                                document = reader.read("/home/hadoop/hui/dao.xml");
                        } catch (DocumentException e1) {
                                e1.printStackTrace();
                        }
                        List e2 = document.selectNodes("/peizhi/hbase/fengefu/@fuhao");
                        String h2 = ((Attribute) e2.get(0)).getText();
                        final String[] splited = value.toString().split(h2);
                        try {
                                // same rowkey scheme as before: hard-coded year + MMddHHmmss
                                final String date0 = splited[0] + " " + splited[1] + " " + splited[2];
                                SimpleDateFormat dateformat1 = new SimpleDateFormat("MMM dd HH:mm:ss", Locale.ENGLISH);
                                Date date = dateformat1.parse(date0);
                                SimpleDateFormat datef = new SimpleDateFormat("MMddHHmmss");
                                String rowKey = "2017" + datef.format(date);
                                v2.set(rowKey + " " + value.toString());
                                context.write(key, v2);
                        } catch (NumberFormatException e) {
                                final Counter counter = context.getCounter("BatchImport", "ErrorFormat");
                                counter.increment(1L);
                                System.out.println("parse error: " + splited[0] + " " + e.getMessage());
                        } catch (ParseException e) {
                                e.printStackTrace();
                        }
                }
        }

        static class BatchImportReducer extends
                        TableReducer<LongWritable, Text, NullWritable> {
                protected void reduce(LongWritable key,
                                java.lang.Iterable<Text> values, Context context)
                                throws java.io.IOException, InterruptedException {
                        SAXReader reader = new SAXReader();
                        Document document = null;
                        try {
                                document = reader.read("/home/hadoop/hui/dao.xml");
                        } catch (DocumentException e1) {
                                e1.printStackTrace();
                        }
                        Element root = document.getRootElement();
                        List e2 = document.selectNodes("/peizhi/hbase/fengefu/@fuhao");
                        String h2 = ((Attribute) e2.get(0)).getText();
                        for (Text text : values) {
                                final String[] splited = text.toString().split(h2);
                                // splited[0] is the mapper-built rowkey, e.g. 20170123195900
                                final Put put = new Put(Bytes.toBytes(splited[0]));
                                for (Iterator i = root.element("hbase").elementIterator(); i.hasNext();) {
                                        Element element = (Element) i.next();
                                        if (element.getQualifiedName().equals("ziduan")) {
                                                String name = element.attributeValue("name");
                                                int a = Integer.parseInt(element.getText());
                                                InetAddress address = null;
                                                try {
                                                        // parses both IPv4 and IPv6 literals; non-literal
                                                        // strings trigger a DNS lookup and may throw
                                                        address = InetAddress.getByName(splited[a + 1]);
                                                } catch (UnknownHostException e1) {
                                                        e1.printStackTrace();
                                                }
                                                if (name.equals("ipv4")) {
                                                        if (address instanceof Inet4Address) {
                                                                put.add(Bytes.toBytes("cf"), name.getBytes(), Bytes.toBytes(address.getHostAddress()));
                                                                context.write(NullWritable.get(), put);
                                                        }
                                                } else if (name.equals("ipv6")) {
                                                        if (address instanceof Inet6Address) {
                                                                put.add(Bytes.toBytes("cf"), name.getBytes(), Bytes.toBytes(address.getHostAddress()));
                                                                context.write(NullWritable.get(), put);
                                                        } else if (splited[a + 1].equals("h107")) {
                                                                // drop lines from host h107 entirely
                                                                break;
                                                        }
                                                } else {
                                                        put.add(Bytes.toBytes("cf"), name.getBytes(), Bytes.toBytes(splited[a + 1]));
                                                        context.write(NullWritable.get(), put);
                                                }
                                        }
                                }
                        }
                }
        }
}
[hadoop@h71 hui]$ vi dao.xml
[hadoop@h72 hui]$ vi dao.xml
[hadoop@h73 hui]$ vi dao.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- same config as before, minus guolv and zhengze: in messages33 the
     filtering is done in Java via InetAddress instead of a regex -->
<peizhi>
        <hbase>
                <jianbao name="messages"></jianbao>
                <liezu name="cf"></liezu>
                <ziduan name="ipv4">3</ziduan>
                <ziduan name="ipv6">3</ziduan>
                <ziduan name="host">4</ziduan>
                <ziduan name="leixing">5</ziduan>
                <fengefu fuhao=" "></fengefu>
        </hbase>
</peizhi>
[hadoop@h71 ~]$ hadoop fs -cat /messages
Jan 23 19:59:00 192.168.101.254 s_sys@hui trafficlogger: empty map for 1:4097 in classnames
Feb 20 06:25:04 h107 rsyslogd: [origin software="rsyslogd" swVersion="8.4.2" x-pid="22204" x-info="http://www.rsyslog.com"] rsyslogd was HUPed
Jan 24 19:59:01 :: s_sys@hui trafficlogger: empty map for 1:4097 in classnames
Jan 23 19:59:02 192.168.101.254 s_sys@hui trafficlogger: empty map for 1:4097 in classnames
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac messages33.java
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar messages33*class
[hadoop@h71 q1]$ hadoop jar xx.jar messages33
hbase(main):108:0> scan 'messages'
ROW                                                          COLUMN+CELL                                                                                                                                                                     
 20170123195900                                              column=cf:host, timestamp=1489827720926, value=s_sys@hui                                                                                                                    
 20170123195900                                              column=cf:ipv4, timestamp=1489827720926, value=192.168.101.254                                                                                                                  
 20170123195900                                              column=cf:leixing, timestamp=1489827720926, value=trafficlogger:                                                                                                                
 20170123195902                                              column=cf:host, timestamp=1489827720926, value=s_sys@hui                                                                                                                    
 20170123195902                                              column=cf:ipv4, timestamp=1489827720926, value=192.168.101.254                                                                                                                  
 20170123195902                                              column=cf:leixing, timestamp=1489827720926, value=trafficlogger:                                                                                                                
 20170124195901                                              column=cf:host, timestamp=1489827720926, value=s_sys@hui                                                                                                                    
 20170124195901                                              column=cf:ipv6, timestamp=1489827720926, value=0:0:0:0:0:0:0:0                                                                                                                  
 20170124195901                                              column=cf:leixing, timestamp=1489827720926, value=trafficlogger:                                                                                                                
3 row(s) in 0.0190 seconds
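
The same check can also be done programmatically. A minimal, untested sketch using the HBase 1.0 client API (the class name GetDemo is made up) that reads back one imported row:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

// Sketch: fetch the IPv6 row written by the job and print one cell.
public class GetDemo {
        public static void main(String[] args) throws Exception {
                Configuration conf = HBaseConfiguration.create();
                conf.set("hbase.zookeeper.quorum", "192.168.8.71");
                try (Connection conn = ConnectionFactory.createConnection(conf);
                     Table table = conn.getTable(TableName.valueOf("messages"))) {
                        Result r = table.get(new Get(Bytes.toBytes("20170124195901")));
                        System.out.println(Bytes.toString(
                                r.getValue(Bytes.toBytes("cf"), Bytes.toBytes("ipv6"))));
                        // expected: 0:0:0:0:0:0:0:0
                }
        }
}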

