The following are top voted examples showing how to use parquet.hadoop.ParquetOutputFormat. These examples are extracted from open source projects. You can vote up the examples you like; your votes will be used in our system to produce more good examples.
Example 1
Project: iow-hadoop-streaming    File: ParquetAsTextOutputFormat.java    (7 votes)
private static CompressionCodecName getCodec(JobConf conf) {
  CompressionCodecName codec;
  if (ParquetOutputFormat.isCompressionSet(conf)) { // explicit parquet config
    codec = ParquetOutputFormat.getCompression(conf);
  } else if (getCompressOutput(conf)) { // from hadoop config
    // find the right codec
    Class<?> codecClass = getOutputCompressorClass(conf, DefaultCodec.class);
    LOG.info("Compression set through hadoop codec: " + codecClass.getName());
    codec = CompressionCodecName.fromCompressionCodec(codecClass);
  } else {
    codec = CompressionCodecName.UNCOMPRESSED;
  }
  LOG.info("Compression: " + codec.name());
  return codec;
}
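The helper resolves the codec in a fixed order: an explicit Parquet setting wins, then Hadoop's generic output-compression flags, then UNCOMPRESSED. A minimal sketch of how a driver might exercise each path (the class name and codec choices here are illustrative, not from the project):

import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.metadata.CompressionCodecName;

public class CodecSelectionSketch {

  // Path 1: explicit Parquet setting -- getCodec() returns it directly.
  static JobConf snappyViaParquetFlag() {
    JobConf conf = new JobConf();
    conf.set(ParquetOutputFormat.COMPRESSION, CompressionCodecName.SNAPPY.name());
    return conf;
  }

  // Path 2: only generic Hadoop output compression is configured --
  // getCodec() maps the Hadoop codec class to a CompressionCodecName.
  static JobConf gzipViaHadoopFlags() {
    JobConf conf = new JobConf();
    FileOutputFormat.setCompressOutput(conf, true);
    FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    return conf;
  }

  // With neither set, getCodec() falls back to CompressionCodecName.UNCOMPRESSED.
}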
Example 2
Project: tajo    File: ParquetAppender.java    (6 votes)
/**
 * Creates a new ParquetAppender.
 *
 * @param conf Configuration properties.
 * @param schema The table schema.
 * @param meta The table metadata.
 * @param workDir The path of the Parquet file to write to.
 */
public ParquetAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema,
                       TableMeta meta, Path workDir) throws IOException {
  super(conf, taskAttemptId, schema, meta, workDir);
  this.blockSize = Integer.parseInt(
      meta.getOption(ParquetOutputFormat.BLOCK_SIZE, StorageConstants.PARQUET_DEFAULT_BLOCK_SIZE));
  this.pageSize = Integer.parseInt(
      meta.getOption(ParquetOutputFormat.PAGE_SIZE, StorageConstants.PARQUET_DEFAULT_PAGE_SIZE));
  this.compressionCodecName = CompressionCodecName.fromConf(
      meta.getOption(ParquetOutputFormat.COMPRESSION, StorageConstants.PARQUET_DEFAULT_COMPRESSION_CODEC_NAME));
  this.enableDictionary = Boolean.parseBoolean(
      meta.getOption(ParquetOutputFormat.ENABLE_DICTIONARY, StorageConstants.PARQUET_DEFAULT_IS_DICTIONARY_ENABLED));
  this.validating = Boolean.parseBoolean(
      meta.getOption(ParquetOutputFormat.VALIDATION, StorageConstants.PARQUET_DEFAULT_IS_VALIDATION_ENABLED));
}
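Here the block size, page size, codec, dictionary, and validation settings all come from Tajo table options keyed by ParquetOutputFormat's property names, with Tajo-specific defaults. Outside Tajo, the same keys can be set directly on a Hadoop Configuration; a rough sketch with illustrative values:

import org.apache.hadoop.conf.Configuration;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.metadata.CompressionCodecName;

public class ParquetWriteConfigSketch {
  public static Configuration configure() {
    Configuration conf = new Configuration();
    // Row-group (block) and page sizes in bytes; 128 MB / 1 MB are illustrative values.
    conf.setInt(ParquetOutputFormat.BLOCK_SIZE, 128 * 1024 * 1024);
    conf.setInt(ParquetOutputFormat.PAGE_SIZE, 1024 * 1024);
    // Compression codec by name, e.g. SNAPPY, GZIP, UNCOMPRESSED.
    conf.set(ParquetOutputFormat.COMPRESSION, CompressionCodecName.SNAPPY.name());
    // Dictionary encoding and schema validation toggles.
    conf.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, true);
    conf.setBoolean(ParquetOutputFormat.VALIDATION, false);
    return conf;
  }
}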
Example 3
Project: pbase    File: TestInputOutputFormat.java    (6 votes)
private void runMapReduceJob(CompressionCodecName codec, Map<String, String> extraConf)
    throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = new Configuration(this.conf);
  for (Map.Entry<String, String> entry : extraConf.entrySet()) {
    conf.set(entry.getKey(), entry.getValue());
  }
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    writeJob = new Job(conf, "write");
    TextInputFormat.addInputPath(writeJob, inputPath);
    writeJob.setInputFormatClass(TextInputFormat.class);
    writeJob.setNumReduceTasks(0);
    ParquetOutputFormat.setCompression(writeJob, codec);
    ParquetOutputFormat.setOutputPath(writeJob, parquetPath);
    writeJob.setOutputFormatClass(ParquetOutputFormat.class);
    writeJob.setMapperClass(readMapperClass);
    ParquetOutputFormat.setWriteSupportClass(writeJob, MyWriteSupport.class);
    GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema), writeJob.getConfiguration());
    writeJob.submit();
    waitForJob(writeJob);
  }
  {
    conf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
    readJob = new Job(conf, "read");
    readJob.setInputFormatClass(ParquetInputFormat.class);
    ParquetInputFormat.setReadSupportClass(readJob, MyReadSupport.class);
    ParquetInputFormat.setInputPaths(readJob, parquetPath);
    readJob.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(readJob, outputPath);
    readJob.setMapperClass(writeMapperClass);
    readJob.setNumReduceTasks(0);
    readJob.submit();
    waitForJob(readJob);
  }
}
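This test drives a full write-then-read round trip through ParquetOutputFormat and ParquetInputFormat. For just the write side, here is a condensed sketch of the same wiring using the bundled Group object model; the class names, schema string, paths, and SNAPPY choice are illustrative, and ExampleOutputFormat is used as a ParquetOutputFormat that is already bound to GroupWriteSupport:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import parquet.example.data.Group;
import parquet.example.data.simple.SimpleGroupFactory;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.example.ExampleOutputFormat;
import parquet.hadoop.example.GroupWriteSupport;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.schema.MessageTypeParser;

public class TextToParquetSketch {

  // Turns each input line into a single-field Parquet Group.
  public static class LineToGroupMapper extends Mapper<LongWritable, Text, Void, Group> {
    private SimpleGroupFactory factory;

    @Override
    protected void setup(Context context) {
      factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(context.getConfiguration()));
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      context.write(null, factory.newGroup().append("content", value.toString()));
    }
  }

  public static Job buildWriteJob(Configuration conf, Path input, Path output) throws IOException {
    // Register the Parquet schema that GroupWriteSupport will use.
    GroupWriteSupport.setSchema(
        MessageTypeParser.parseMessageType("message line { required binary content; }"), conf);

    Job job = new Job(conf, "text-to-parquet");
    job.setJarByClass(TextToParquetSketch.class);
    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, input);
    job.setMapperClass(LineToGroupMapper.class);
    job.setNumReduceTasks(0);

    // ExampleOutputFormat is ParquetOutputFormat pre-wired with GroupWriteSupport.
    job.setOutputFormatClass(ExampleOutputFormat.class);
    ExampleOutputFormat.setOutputPath(job, output);
    ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    return job;
  }
}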
Example 4
Project: pbase    File: CodecConfigTest.java    (6 votes)
public void shouldUseParquetFlagToSetCodec(String codecNameStr, CompressionCodecName expectedCodec)
    throws IOException {
  // Test mapreduce API
  Job job = new Job();
  Configuration conf = job.getConfiguration();
  conf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
  TaskAttemptContext task = ContextUtil.newTaskAttemptContext(conf,
      new TaskAttemptID(new TaskID(new JobID("test", 1), false, 1), 1));
  Assert.assertEquals(CodecConfig.from(task).getCodec(), expectedCodec);

  // Test mapred API
  JobConf jobConf = new JobConf();
  jobConf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
  Assert.assertEquals(CodecConfig.from(jobConf).getCodec(), expectedCodec);
}
Example 5
Project: pbase    File: DeprecatedParquetOutputFormat.java    (6 votes)
public RecordWriterWrapper(ParquetOutputFormat<V> realOutputFormat, FileSystem fs,
                           JobConf conf, String name, Progressable progress) throws IOException {
  CompressionCodecName codec = getCodec(conf);
  String extension = codec.getExtension() + ".parquet";
  Path file = getDefaultWorkFile(conf, name, extension);
  try {
    realWriter = (ParquetRecordWriter<V>) realOutputFormat.getRecordWriter(conf, file, codec);
  } catch (InterruptedException e) {
    Thread.interrupted();
    throw new IOException(e);
  }
}
Example 6
Project: tajo-cdh    File: ParquetAppender.java    (6 votes)
/**
 * Creates a new ParquetAppender.
 *
 * @param conf Configuration properties.
 * @param schema The table schema.
 * @param meta The table metadata.
 * @param path The path of the Parquet file to write to.
 */
public ParquetAppender(Configuration conf, Schema schema, TableMeta meta, Path path) throws IOException {
  super(conf, schema, meta, path);
  this.blockSize = Integer.parseInt(meta.getOption(ParquetOutputFormat.BLOCK_SIZE));
  this.pageSize = Integer.parseInt(meta.getOption(ParquetOutputFormat.PAGE_SIZE));
  this.compressionCodecName = CompressionCodecName.fromConf(meta.getOption(ParquetOutputFormat.COMPRESSION));
  this.enableDictionary = Boolean.parseBoolean(meta.getOption(ParquetOutputFormat.ENABLE_DICTIONARY));
  this.validating = Boolean.parseBoolean(meta.getOption(ParquetOutputFormat.VALIDATION));
}
Example 7
Project: tajo-cdh    File: StorageUtil.java    (6 votes)
public static Options newPhysicalProperties(CatalogProtos.StoreType type) {
  Options options = new Options();
  if (CatalogProtos.StoreType.CSV == type) {
    options.put(CSVFILE_DELIMITER, DEFAULT_FIELD_DELIMITER);
  } else if (CatalogProtos.StoreType.RCFILE == type) {
    options.put(RCFILE_SERDE, DEFAULT_BINARY_SERDE);
  } else if (CatalogProtos.StoreType.SEQUENCEFILE == type) {
    options.put(SEQUENCEFILE_SERDE, DEFAULT_TEXT_SERDE);
    options.put(SEQUENCEFILE_DELIMITER, DEFAULT_FIELD_DELIMITER);
  } else if (type == CatalogProtos.StoreType.PARQUET) {
    options.put(ParquetOutputFormat.BLOCK_SIZE, PARQUET_DEFAULT_BLOCK_SIZE);
    options.put(ParquetOutputFormat.PAGE_SIZE, PARQUET_DEFAULT_PAGE_SIZE);
    options.put(ParquetOutputFormat.COMPRESSION, PARQUET_DEFAULT_COMPRESSION_CODEC_NAME);
    options.put(ParquetOutputFormat.ENABLE_DICTIONARY, PARQUET_DEFAULT_IS_DICTIONARY_ENABLED);
    options.put(ParquetOutputFormat.VALIDATION, PARQUET_DEFAULT_IS_VALIDATION_ENABLED);
  }
  return options;
}
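Tajo records these defaults as catalog options keyed by ParquetOutputFormat's property names. In a plain MapReduce job, the same knobs can be applied through ParquetOutputFormat's static setters; a sketch with illustrative values (the sizes and codec below are not Tajo's actual defaults):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.metadata.CompressionCodecName;

public class ParquetJobDefaultsSketch {
  public static Job withParquetDefaults(Configuration conf) throws IOException {
    Job job = new Job(conf, "parquet-defaults");
    ParquetOutputFormat.setBlockSize(job, 128 * 1024 * 1024); // row-group size in bytes
    ParquetOutputFormat.setPageSize(job, 1024 * 1024);        // page size in bytes
    ParquetOutputFormat.setCompression(job, CompressionCodecName.UNCOMPRESSED);
    ParquetOutputFormat.setEnableDictionary(job, true);
    ParquetOutputFormat.setValidation(job, false);
    return job;
  }
}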
Example 8
Project: hadoop-arch-book    File: JavaSessionize.java    (5 votes)
public static void main(String[] args) throws Exception {
  if (args.length == 0) {
    System.err.println("Usage: JavaSessionize <master> [input file]");
    System.exit(1);
  }
  System.out.println("Output:" + outputPath);

  JavaSparkContext jsc = new JavaSparkContext(args[0], "JavaSessionize",
      System.getenv("SPARK_HOME"), JavaSparkContext.jarOfClass(JavaSessionize.class));
  JavaRDD<String> dataSet = (args.length == 2) ? jsc.textFile(args[1]) : jsc.parallelize(testLines);

  // @formatter:off
  JavaPairRDD<String, SerializableLogLine> parsed = dataSet.map(
      new PairFunction<String, String, SerializableLogLine>() {
        // @formatter:on
        @Override
        public Tuple2<String, SerializableLogLine> call(String s) throws Exception {
          return new Tuple2<String, SerializableLogLine>(getIP(s), getFields(s));
        }
      });

  // This groups clicks by IP address
  JavaPairRDD<String, List<SerializableLogLine>> grouped = parsed.groupByKey();

  JavaPairRDD<String, List<SerializableLogLine>> sessionized = grouped.mapValues(
      new Function<List<SerializableLogLine>, List<SerializableLogLine>>() {
        @Override
        public List<SerializableLogLine> call(List<SerializableLogLine> logLines) throws Exception {
          return sessionize(logLines);
        }
      });

  sessionized.foreach(new VoidFunction<Tuple2<String, List<SerializableLogLine>>>() {
    @Override
    public void call(Tuple2<String, List<SerializableLogLine>> stringListTuple2) throws Exception {
      System.out.println("IP: " + stringListTuple2._1());
      for (SerializableLogLine line : stringListTuple2._2()) {
        System.out.println(line);
      }
    }
  });

  // right now sessionize is an RDD of pairs: <String, List<LogLine>>.
  // We want to output an RDD of <String, LogLine>
  // First, grab the Lists, then flatten them,
  // then pair them with something empty to make Hadoop happy
  // @formatter:off
  JavaRDD<List<SerializableLogLine>> nokeys = sessionized.map(
      new Function<Tuple2<String, List<SerializableLogLine>>, List<SerializableLogLine>>() {
        // @formatter:on
        @Override
        public List<SerializableLogLine> call(
            Tuple2<String, List<SerializableLogLine>> stringListTuple2) throws Exception {
          return stringListTuple2._2();
        }
      });

  // @formatter:off
  JavaRDD<SerializableLogLine> flatLines = nokeys.flatMap(
      new FlatMapFunction<List<SerializableLogLine>, SerializableLogLine>() {
        // @formatter:on
        @Override
        public Iterable<SerializableLogLine> call(List<SerializableLogLine> serializableLogLines)
            throws Exception {
          return serializableLogLines;
        }
      });

  JavaPairRDD<Void, SerializableLogLine> outputPairs = flatLines.map(
      new PairFunction<SerializableLogLine, Void, SerializableLogLine>() {
        @Override
        public Tuple2<Void, SerializableLogLine> call(SerializableLogLine serializableLogLine)
            throws Exception {
          return new Tuple2<Void, SerializableLogLine>(null, serializableLogLine);
        }
      });

  Job job = new Job();
  ParquetOutputFormat.setWriteSupportClass(job, AvroWriteSupport.class);
  AvroParquetOutputFormat.setSchema(job, LogLine.SCHEMA$);

  // dummy instance, because that's the only way to get the class of a
  // parameterized type
  ParquetOutputFormat<LogLine> pOutput = new ParquetOutputFormat<LogLine>();

  // System.out.println("job write support - " +
  //     job.getConfiguration().get("parquet.write.support.class") +
  //     " job schema - " + job.getConfiguration().get("parquet.avro.schema"));

  outputPairs.saveAsNewAPIHadoopFile(outputPath,  // path
      Void.class,                                 // key class
      LogLine.class,                              // value class
      pOutput.getClass(),                         // output format class
      job.getConfiguration());                    // configuration
}
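Only the tail of this job touches Parquet: a Hadoop Job is configured with AvroWriteSupport and the Avro schema, and its Configuration is handed to saveAsNewAPIHadoopFile. Below is a stripped-down sketch of that output step alone, written against Avro GenericRecord instead of the generated LogLine class; the class, method, and path names are hypothetical, and the input RDD is assumed to already be serializable for Spark (e.g. via Kryo):

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.mapreduce.Job;
import org.apache.spark.api.java.JavaPairRDD;
import parquet.avro.AvroParquetOutputFormat;

public class ParquetSparkOutputSketch {
  public static void writeParquet(JavaPairRDD<Void, GenericRecord> records,
                                  Schema schema, String outputPath) throws Exception {
    // AvroParquetOutputFormat is ParquetOutputFormat already wired to AvroWriteSupport,
    // so only the Avro schema needs to be registered on the job.
    Job job = new Job();
    AvroParquetOutputFormat.setSchema(job, schema);

    records.saveAsNewAPIHadoopFile(
        outputPath,                     // output directory
        Void.class,                     // key class (ignored by Parquet)
        GenericRecord.class,            // value class
        AvroParquetOutputFormat.class,  // output format
        job.getConfiguration());
  }
}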
Example 9
Project: tajo    File: HiveCatalogStore.java    (5 votes)
@Override
public final void createTable(final CatalogProtos.TableDescProto tableDescProto) {
  HiveCatalogStoreClientPool.HiveCatalogStoreClient client = null;

  TableDesc tableDesc = new TableDesc(tableDescProto);
  String[] splitted = CatalogUtil.splitFQTableName(tableDesc.getName());
  String databaseName = splitted[0];
  String tableName = splitted[1];

  try {
    client = clientPool.getClient();

    org.apache.hadoop.hive.metastore.api.Table table = new org.apache.hadoop.hive.metastore.api.Table();
    table.setDbName(databaseName);
    table.setTableName(tableName);
    table.setParameters(new HashMap<>(tableDesc.getMeta().getOptions().getAllKeyValus()));
    // TODO: set owner
    //table.setOwner();

    StorageDescriptor sd = new StorageDescriptor();
    sd.setSerdeInfo(new SerDeInfo());
    sd.getSerdeInfo().setParameters(new HashMap<>());
    sd.getSerdeInfo().setName(table.getTableName());

    // If tableType is a managed table, the location is the hive-warehouse dir
    // and it will be the wrong path in output committing
    table.setTableType(TableType.EXTERNAL_TABLE.name());
    table.putToParameters("EXTERNAL", "TRUE");

    Path tablePath = new Path(tableDesc.getUri());
    FileSystem fs = tablePath.getFileSystem(conf);
    if (fs.isFile(tablePath)) {
      LOG.warn("A table path is a file, but HiveCatalogStore does not allow a file path.");
      sd.setLocation(tablePath.getParent().toString());
    } else {
      sd.setLocation(tablePath.toString());
    }

    // set column information
    List<Column> columns = tableDesc.getSchema().getRootColumns();
    ArrayList<FieldSchema> cols = new ArrayList<>(columns.size());
    for (Column eachField : columns) {
      cols.add(new FieldSchema(eachField.getSimpleName(),
          HiveCatalogUtil.getHiveFieldType(eachField.getDataType()), ""));
    }
    sd.setCols(cols);

    // set partition keys
    if (tableDesc.hasPartition() && tableDesc.getPartitionMethod().getPartitionType().equals(PartitionType.COLUMN)) {
      List<FieldSchema> partitionKeys = new ArrayList<>();
      for (Column eachPartitionKey : tableDesc.getPartitionMethod().getExpressionSchema().getRootColumns()) {
        partitionKeys.add(new FieldSchema(eachPartitionKey.getSimpleName(),
            HiveCatalogUtil.getHiveFieldType(eachPartitionKey.getDataType()), ""));
      }
      table.setPartitionKeys(partitionKeys);
    }

    if (tableDesc.getMeta().getDataFormat().equalsIgnoreCase(BuiltinStorages.RCFILE)) {
      StorageFormatDescriptor descriptor = storageFormatFactory.get(IOConstants.RCFILE);
      sd.setInputFormat(descriptor.getInputFormat());
      sd.setOutputFormat(descriptor.getOutputFormat());

      String serde = tableDesc.getMeta().getOption(StorageConstants.RCFILE_SERDE);
      if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) {
        sd.getSerdeInfo().setSerializationLib(ColumnarSerDe.class.getName());
      } else {
        sd.getSerdeInfo().setSerializationLib(LazyBinaryColumnarSerDe.class.getName());
      }

      if (tableDesc.getMeta().getOptions().containsKey(StorageConstants.RCFILE_NULL)) {
        table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT,
            StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.RCFILE_NULL)));
      }
    } else if (tableDesc.getMeta().getDataFormat().equals(BuiltinStorages.TEXT)) {
      // TextFileStorageFormatDescriptor has a deprecated class, so the class names are set directly
      sd.setInputFormat(TextInputFormat.class.getName());
      sd.setOutputFormat(HiveIgnoreKeyTextOutputFormat.class.getName());
      sd.getSerdeInfo().setSerializationLib(LazySimpleSerDe.class.getName());

      String fieldDelimiter = tableDesc.getMeta().getOption(StorageConstants.TEXT_DELIMITER,
          StorageConstants.DEFAULT_FIELD_DELIMITER);

      // Users can give a unicode character such as \u0001 or \001 as the field delimiter.
      // In this case, the java console will convert this value into "\\u001",
      // and hive will un-escape this value again.
      // So, we have to un-escape this value here for the right field delimiter to be used.
      sd.getSerdeInfo().putToParameters(serdeConstants.SERIALIZATION_FORMAT,
          StringEscapeUtils.unescapeJava(fieldDelimiter));
      sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM,
          StringEscapeUtils.unescapeJava(fieldDelimiter));
      table.getParameters().remove(StorageConstants.TEXT_DELIMITER);

      if (tableDesc.getMeta().containsOption(StorageConstants.TEXT_NULL)) {
        table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT,
            StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.TEXT_NULL)));
        table.getParameters().remove(StorageConstants.TEXT_NULL);
      }
    } else if (tableDesc.getMeta().getDataFormat().equalsIgnoreCase(BuiltinStorages.SEQUENCE_FILE)) {
      StorageFormatDescriptor descriptor = storageFormatFactory.get(IOConstants.SEQUENCEFILE);
      sd.setInputFormat(descriptor.getInputFormat());
      sd.setOutputFormat(descriptor.getOutputFormat());

      String serde = tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_SERDE);
      if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) {
        sd.getSerdeInfo().setSerializationLib(LazySimpleSerDe.class.getName());

        String fieldDelimiter = tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_DELIMITER,
            StorageConstants.DEFAULT_FIELD_DELIMITER);

        // Users can give a unicode character such as \u0001 or \001 as the field delimiter.
        // In this case, the java console will convert this value into "\\u001",
        // and hive will un-escape this value again.
        // So, we have to un-escape this value here for the right field delimiter to be used.
        sd.getSerdeInfo().putToParameters(serdeConstants.SERIALIZATION_FORMAT,
            StringEscapeUtils.unescapeJava(fieldDelimiter));
        sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM,
            StringEscapeUtils.unescapeJava(fieldDelimiter));
        table.getParameters().remove(StorageConstants.SEQUENCEFILE_DELIMITER);
      } else {
        sd.getSerdeInfo().setSerializationLib(LazyBinarySerDe.class.getName());
      }

      if (tableDesc.getMeta().containsOption(StorageConstants.SEQUENCEFILE_NULL)) {
        table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT,
            StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_NULL)));
        table.getParameters().remove(StorageConstants.SEQUENCEFILE_NULL);
      }
    } else if (tableDesc.getMeta().getDataFormat().equalsIgnoreCase(BuiltinStorages.PARQUET)) {
      StorageFormatDescriptor descriptor = storageFormatFactory.get(IOConstants.PARQUET);
      sd.setInputFormat(descriptor.getInputFormat());
      sd.setOutputFormat(descriptor.getOutputFormat());
      sd.getSerdeInfo().setSerializationLib(descriptor.getSerde());

      if (tableDesc.getMeta().containsOption(ParquetOutputFormat.COMPRESSION)) {
        table.putToParameters(ParquetOutputFormat.COMPRESSION,
            tableDesc.getMeta().getOption(ParquetOutputFormat.COMPRESSION));
      }
    } else {
      throw new UnsupportedException(tableDesc.getMeta().getDataFormat() + " in HivecatalogStore");
    }

    sd.setSortCols(new ArrayList<>());

    table.setSd(sd);
    client.getHiveClient().createTable(table);
  } catch (Throwable t) {
    throw new TajoInternalError(t);
  } finally {
    if (client != null) client.release();
  }
}
Example 10
Project: clickstream-tutorial    File: JavaSessionize.java    (5 votes)
public static void main(String[] args) throws Exception {
  if (args.length == 0) {
    System.err.println("Usage: JavaSessionize <master> [input file]");
    System.exit(1);
  }

  String outputPath;
  if (args.length == 3) {
    outputPath = args[2];
  } else {
    outputPath = new File(temp, "output").getAbsolutePath();
  }
  System.out.println("Output:" + outputPath);

  JavaSparkContext jsc = new JavaSparkContext(args[0], "JavaSessionize",
      System.getenv("SPARK_HOME"), JavaSparkContext.jarOfClass(JavaSessionize.class));
  JavaRDD<String> dataSet = (args.length == 2) ? jsc.textFile(args[1]) : jsc.parallelize(testLines);

  JavaPairRDD<String, SerializableLogLine> parsed = dataSet.mapToPair(
      new PairFunction<String, String, SerializableLogLine>() {
        @Override
        public Tuple2<String, SerializableLogLine> call(String s) throws Exception {
          return new Tuple2<String, SerializableLogLine>(getIP(s), getFields(s));
        }
      });

  // This groups clicks by IP address
  JavaPairRDD<String, Iterable<SerializableLogLine>> grouped = parsed.groupByKey();

  JavaPairRDD<String, Iterable<SerializableLogLine>> sessionized = grouped.mapValues(
      new Function<Iterable<SerializableLogLine>, Iterable<SerializableLogLine>>() {
        @Override
        public Iterable<SerializableLogLine> call(Iterable<SerializableLogLine> logLines) throws Exception {
          return sessionize(logLines);
        }
      });

  sessionized.foreach(new VoidFunction<Tuple2<String, Iterable<SerializableLogLine>>>() {
    @Override
    public void call(Tuple2<String, Iterable<SerializableLogLine>> stringListTuple2) throws Exception {
      System.out.println("IP: " + stringListTuple2._1());
      for (SerializableLogLine line : stringListTuple2._2()) {
        System.out.println(line);
      }
    }
  });

  // right now sessionize is an RDD of pairs: <String, List<LogLine>>.
  // We want to output an RDD of <String, LogLine>
  // First, grab the Lists, then flatten them,
  // then pair them with something empty to make Hadoop happy
  JavaRDD<Iterable<SerializableLogLine>> nokeys = sessionized.map(
      new Function<Tuple2<String, Iterable<SerializableLogLine>>, Iterable<SerializableLogLine>>() {
        @Override
        public Iterable<SerializableLogLine> call(
            Tuple2<String, Iterable<SerializableLogLine>> stringListTuple2) throws Exception {
          return stringListTuple2._2();
        }
      });

  // @formatter:off
  JavaRDD<SerializableLogLine> flatLines = nokeys.flatMap(
      new FlatMapFunction<Iterable<SerializableLogLine>, SerializableLogLine>() {
        // @formatter:on
        @Override
        public Iterable<SerializableLogLine> call(Iterable<SerializableLogLine> serializableLogLines)
            throws Exception {
          return serializableLogLines;
        }
      });

  JavaPairRDD<Void, SerializableLogLine> outputPairs = flatLines.mapToPair(
      new PairFunction<SerializableLogLine, Void, SerializableLogLine>() {
        @Override
        public Tuple2<Void, SerializableLogLine> call(SerializableLogLine serializableLogLine)
            throws Exception {
          return new Tuple2<Void, SerializableLogLine>(null, serializableLogLine);
        }
      });

  Job job = new Job();
  ParquetOutputFormat.setWriteSupportClass(job, AvroWriteSupport.class);
  AvroParquetOutputFormat.setSchema(job, LogLine.SCHEMA$);

  // dummy instance, because that's the only way to get the class of a
  // parameterized type
  ParquetOutputFormat<LogLine> pOutput = new ParquetOutputFormat<LogLine>();

  // System.out.println("job write support - " +
  //     job.getConfiguration().get("parquet.write.support.class") +
  //     " job schema - " + job.getConfiguration().get("parquet.avro.schema"));

  outputPairs.saveAsNewAPIHadoopFile(outputPath,  // path
      Void.class,                                 // key class
      LogLine.class,                              // value class
      pOutput.getClass(),                         // output format class
      job.getConfiguration());                    // configuration
}