Hadoop multi-table join

A multi-table join is similar to a single-table (self) join: the raw data is processed to extract the relationships we care about. The task here is as follows.

The input consists of two files. One represents a factory table with a factory-name column and an address-ID column; the other represents an address table with an address-ID column and an address-name column. The goal is to work out which factory is at which address and output a factory-name / address-name table.

Sample input:

factory:

factoryname addressID
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Bank of Beijing 1

address:

addressID addressname
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian
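
The program below performs a reduce-side join: the map tags every record with a flag telling which table it came from ("1" for the factory table, "2" for the address table) and keys it by the address ID; the reduce then collects all tagged records that share an address ID and emits the Cartesian product of the factory names and address names found there. The complete program:
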
import java.io.IOException;

import java.util.*;


import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;


public class MTJoin {
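	/* used to emit the header row exactly once */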
	public static int time = 0;


	/*
	 * The map first determines whether an input line belongs to the left table
	 * (factory) or the right table (address), then splits the columns: the join
	 * column (address ID) becomes the key, and the remaining column plus a
	 * left/right-table flag become the value.
	 */
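	/*
	 * Example (from the sample data): the factory row "Beijing Red Star 1" is
	 * emitted as key "1", value "1+Beijing Red Star "; the address row
	 * "1 Beijing" is emitted as key "1", value "2+Beijing ".
	 */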

	public static class Map extends Mapper<Object, Text, Text, Text> {
		/* map implementation */

		public void map( Object key, Text value, Context context )

		throws IOException, InterruptedException
		{
			String line = value.toString();         /* one input line */

			String relationtype = "";               /* left/right table flag */


			/* skip the header line of either input file */

			if ( line.contains( "factoryname" )

			     || line.contains( "addressname" ) )
			{
				return;
			}


			/* tokenize the line */

			StringTokenizer itr = new StringTokenizer( line );

			String mapkey = "";

			String mapvalue = "";

			int i = 0;

			while ( itr.hasMoreTokens() )
			{
				/* read the next token */

				String token = itr.nextToken();

				/*
				 * A token starting with a digit is the address ID: use it as the
				 * join key. Its position tells the tables apart: in factory rows
				 * the ID comes after the name (i > 0), in address rows it comes
				 * first (i == 0).
				 */

				if ( token.charAt( 0 ) >= '0' && token.charAt( 0 ) <= '9' )
				{
					mapkey = token;

					if ( i > 0 )
					{
						relationtype = "1";
					} else {
						relationtype = "2";
					}

					continue;
				}


				/* accumulate the name column (factory name or address name) */

				mapvalue += token + " ";

				i++;
			}


			/* emit: key = address ID, value = table flag + "+" + name */

			context.write( new Text( mapkey ), new Text( relationtype + "+" + mapvalue ) );
		}
	}


	/*
	 * The reduce parses the map output, separates the values into the left
	 * (factory) and right (address) tables, then emits their Cartesian product.
	 */
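	/*
	 * Example: for key "1" the reduce receives values such as
	 * { "1+Beijing Red Star ", "1+Beijing Rising ", "1+Bank of Beijing ", "2+Beijing " }
	 * (order not guaranteed) and emits Beijing Red Star -> Beijing,
	 * Beijing Rising -> Beijing and Bank of Beijing -> Beijing.
	 */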

	public static class Reduce extends Reducer<Text, Text, Text, Text> {
		/* reduce implementation */

		public void reduce( Text key, Iterable<Text> values, Context context )

		throws IOException, InterruptedException
		{
			/* emit the header row once */

			if ( 0 == time )
			{
				context.write( new Text( "factoryname" ), new Text( "addressname" ) );

				time++;
			}


			int factorynum = 0;

			String[] factory = new String[10];

			int addressnum = 0;

			String[] address = new String[10];


			Iterator<Text> ite = values.iterator();

			while ( ite.hasNext() )
			{
				String record = ite.next().toString();

				int len = record.length();

				int i = 2;

				if ( 0 == len )
				{
					continue;
				}


				/* the first character is the table flag */

				char relationtype = record.charAt( 0 );


				/* left table: factory */

				if ( '1' == relationtype )
				{
					factory[factorynum] = record.substring( i );

					factorynum++;
				}


				/* right table: address */

				if ( '2' == relationtype )
				{
					address[addressnum] = record.substring( i );

					addressnum++;
				}
			}


			/* Cartesian product: pair every factory with every address for this ID */

			if ( 0 != factorynum && 0 != addressnum )
			{
				for ( int m = 0; m < factorynum; m++ )
				{
					for ( int n = 0; n < addressnum; n++ )
					{
						/* emit one joined row */

						context.write( new Text( factory[m] ),

							       new Text( address[n] ) );
					}
				}
			}
		}
	}


	public static void main( String[] args ) throws Exception
	{
		Configuration conf = new Configuration();

		/* Uncomment and adjust to point the job at a specific JobTracker: */

		/*      conf.set("mapred.job.tracker", "192.168.1.2:9001"); */


		/*
		 * The input/output paths can also be hard-coded instead of read from args:
		 *      String[] ioArgs = new String[] { "MTjoin_in", "MTjoin_out" };
		 */

		String[] otherArgs = new GenericOptionsParser( conf, args ).getRemainingArgs();

		if ( otherArgs.length != 2 )
		{
			System.err.println( "Usage: Multiple Table Join <in> <out>" );

			System.exit( 2 );
		}


		Job job = new Job( conf, "Multiple Table Join" );

		job.setJarByClass( MTJoin.class );


		/* mapper and reducer classes */

		job.setMapperClass( Map.class );

		job.setReducerClass( Reduce.class );


		/* output key/value types */

		job.setOutputKeyClass( Text.class );

		job.setOutputValueClass( Text.class );


		/* input and output paths */

		FileInputFormat.addInputPath( job, new Path( otherArgs[0] ) );

		FileOutputFormat.setOutputPath( job, new Path( otherArgs[1] ) );

		System.exit( job.waitForCompletion( true ) ? 0 : 1 );
	}
}
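
Two caveats. The header row is written inside reduce() and guarded by the static time counter, so it appears exactly once only when the job runs a single reduce task; the factory and address buffers are also fixed at 10 entries per address ID, so more than 10 factories or addresses sharing one ID would overflow them. To force a single reducer, one extra line in main() is enough (not in the original listing, just a sketch):

		job.setNumReduceTasks( 1 );     /* one reduce task: the header is emitted exactly once */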

Output:

factoryname     addressname
Beijing Red Star        Beijing
Beijing Rising  Beijing
Bank of Beijing         Beijing
Guangzhou Honda         Guangzhou
Guangzhou Development Bank      Guangzhou
Shenzhen Thunder        Shenzhen
Tencent         Shenzhen


Compile and package the job:

javac -classpath hadoop-core-1.1.2.jar:/opt/hadoop-1.1.2/lib/commons-cli-1.2.jar -d firstProject firstProject/MTJoin.java
jar -cvf MTJoin.jar -C firstProject/ .

Delete any existing output directory, create the input directory, and upload the two sample files:

hadoop fs -rmr output
hadoop fs -mkdir input
hadoop fs -put factory input
hadoop fs -put address input
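
An optional sanity check that both files are now in HDFS:

hadoop fs -ls input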

Run the job:

hadoop jar MTJoin.jar MTJoin input output


View the result:

hadoop fs -cat output/part-r-00000
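
With a single reduce task the whole result is in part-r-00000; if the job is run with more reducers there is one part-r-NNNNN file per reducer, and hadoop fs -cat output/part-r-* prints them all.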


                