Hive 创建表时指定自定义的多字符分隔符

最新推荐文章于 2023-05-30 10:23:45 发布

Fei-joe

最新推荐文章于 2023-05-30 10:23:45 发布

阅读量1k

点赞数

分类专栏： hive mapreduce hadoop

本文链接：https://blog.csdn.net/qq_33290422/article/details/84346323

版权

mapreduce hadoop 同时被 2 个专栏收录

10 篇文章 0 订阅

订阅专栏

hive

5 篇文章 0 订阅

订阅专栏

Hive 创建表时指定的字段分隔符默认不支持多个字符。如果想使用多个字符作为分隔符，就需要自定义 InputFormat，主要是在其 RecordReader 中重写 next 方法，把多字符分隔符替换成单字符分隔符。代码如下：

package gaode_84;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
public class MyHiveInputFormat extends TextInputFormat implements
JobConfigurable {

public RecordReader<LongWritable, Text> getRecordReader(
InputSplit genericSplit, JobConf job, Reporter reporter)
throws IOException {
reporter.setStatus(genericSplit.toString());
return new MyRecordReader((FileSplit) genericSplit, job);
}
}

================================================================

package gaode_84;

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.util.LineReader;

/**
 * Line-oriented record reader that rewrites a multi-character field delimiter
 * into a single character before handing each line to the caller.
 *
 * <p>Every occurrence of {@code "-##-"} in a line is replaced with the single
 * character {@code '\001'}. Keys are the byte offset at which the line was
 * read; values are the rewritten line text.
 */
public class MyRecordReader implements RecordReader<LongWritable, Text> {

    /** Configuration key for the maximum line length passed to the line reader. */
    private static final String MAX_LENGTH_KEY =
            "mapred.mutilCharRecordReader.maxlength";

    /** Multi-character delimiter as it appears in the raw data files. */
    private static final String CUSTOM_DELIMITER = "-##-";

    /** Single-character delimiter substituted for {@link #CUSTOM_DELIMITER}. */
    private static final String FIELD_DELIMITER = "\001";

    private CompressionCodecFactory compressionCodecs = null;
    /** Offset of the first byte this reader is responsible for. */
    private long start;
    /** Offset of the next byte to be read. */
    private long pos;
    /** Offset at which this reader stops starting new records. */
    private long end;
    private LineReader lineReader;
    int maxLineLength;

    /**
     * Opens the file behind a split and positions the reader on the first
     * complete line that belongs to the split.
     *
     * @param inputSplit the file split to read
     * @param job        configuration; supplies the maximum line length and
     *                   the compression codecs
     * @throws IOException if the file cannot be opened or read
     */
    public MyRecordReader(FileSplit inputSplit, Configuration job)
            throws IOException {
        maxLineLength = job.getInt(MAX_LENGTH_KEY, Integer.MAX_VALUE);
        start = inputSplit.getStart();
        end = start + inputSplit.getLength();
        final Path file = inputSplit.getPath();
        // Look up a compression codec matching the file, if any.
        compressionCodecs = new CompressionCodecFactory(job);
        final CompressionCodec codec = compressionCodecs.getCodec(file);
        // Open the file on its file system.
        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(file);
        boolean skipFirstLine = false;

        if (codec != null) {
            // Compressed input is read through the codec stream until EOF,
            // so the end offset is effectively unbounded.
            lineReader = new LineReader(codec.createInputStream(fileIn), job);
            end = Long.MAX_VALUE;
        } else {
            if (start != 0) {
                // Back up one byte and discard everything through the next
                // line terminator. If the split begins exactly at a line
                // start, only the preceding terminator is consumed.
                skipFirstLine = true;
                --start;
                fileIn.seek(start);
            }
            lineReader = new LineReader(fileIn, job);
        }

        if (skipFirstLine) {
            start += lineReader.readLine(new Text(), 0,
                    (int) Math.min((long) Integer.MAX_VALUE, end - start));
        }
        this.pos = start;
    }

    /**
     * Creates a reader over an already opened stream.
     *
     * @param in            stream to read lines from
     * @param offset        offset of the first byte of {@code in}
     * @param endOffset     offset at which reading stops
     * @param maxLineLength maximum line length passed to the line reader
     */
    public MyRecordReader(InputStream in, long offset, long endOffset,
            int maxLineLength) {
        this.maxLineLength = maxLineLength;
        this.start = offset;
        this.lineReader = new LineReader(in);
        this.pos = offset;
        this.end = endOffset;
    }

    /**
     * Creates a reader over an already opened stream, taking the maximum line
     * length from the configuration.
     *
     * @param in        stream to read lines from
     * @param offset    offset of the first byte of {@code in}
     * @param endOffset offset at which reading stops
     * @param job       configuration supplying the maximum line length
     * @throws IOException if the line reader cannot be created
     */
    public MyRecordReader(InputStream in, long offset, long endOffset,
            Configuration job) throws IOException {
        this.maxLineLength = job.getInt(MAX_LENGTH_KEY, Integer.MAX_VALUE);
        this.lineReader = new LineReader(in, job);
        this.start = offset;
        // Previously left at 0, which broke keys, getPos() and getProgress()
        // for any non-zero offset.
        this.pos = offset;
        this.end = endOffset;
    }

    /** Closes the underlying line reader, if one was created. */
    @Override
    public void close() throws IOException {
        if (lineReader != null) {
            lineReader.close();
        }
    }

    /** @return a new, empty key object */
    @Override
    public LongWritable createKey() {
        return new LongWritable();
    }

    /** @return a new, empty value object */
    @Override
    public Text createValue() {
        return new Text();
    }

    /** @return the offset of the next byte to be read */
    @Override
    public long getPos() throws IOException {
        return pos;
    }

    /**
     * @return the fraction of the range {@code [start, end)} consumed so far,
     *         capped at 1.0; 0.0 when the range is empty
     */
    @Override
    public float getProgress() throws IOException {
        if (start == end) {
            return 0.0f;
        } else {
            return Math.min(1.0f, (pos - start) / (float) (end - start));
        }
    }

    /**
     * Reads the next line, stores its starting offset in {@code key} and the
     * line text, with {@code "-##-"} replaced by {@code '\001'}, in
     * {@code value}.
     *
     * <p>Lines whose consumed size is not smaller than the maximum line
     * length are skipped.
     *
     * @param key   receives the offset at which the line was read
     * @param value receives the rewritten line
     * @return {@code true} if a record was produced, {@code false} at the end
     *         of the input range
     * @throws IOException if reading fails
     */
    @Override
    public boolean next(LongWritable key, Text value) throws IOException {
        while (pos < end) {
            key.set(pos);
            int maxBytesToConsume = Math.max(
                    (int) Math.min(Integer.MAX_VALUE, end - pos),
                    maxLineLength);
            int newSize = lineReader.readLine(value, maxLineLength,
                    maxBytesToConsume);
            if (newSize == 0) {
                // Nothing left to read.
                return false;
            }
            pos += newSize;
            if (newSize < maxLineLength) {
                // Only rewrite lines that are actually returned: swap the
                // multi-character delimiter for the single-character one.
                value.set(value.toString()
                        .replace(CUSTOM_DELIMITER, FIELD_DELIMITER));
                return true;
            }
            // Line too long: drop it and try the next one.
        }
        return false;
    }
}

Then put the jar into Hive's lib directory.

Enter the Hive CLI and run the following statements:

-- Make the jar containing gaode_84.MyHiveInputFormat available to this session.
add jar /mnt/modules/hive/lib/joe_hive.jar;

-- External table whose files are read through the custom input format.
-- No ROW FORMAT clause is given; presumably the table relies on Hive's
-- default field delimiter matching the '\001' character that the custom
-- record reader substitutes for "-##-" (verify against the Hive version used).
-- NOTE(review): LOCATION contains a '*' wildcard; confirm that the target
-- Hive version accepts such a path.
create external table taping_bi(
shop_id string,
is_closed string,
name string,
city_id string,
city string,
real_city string,
province3 string,
area_code string,
phone string
) STORED AS inputformat 'gaode_84.MyHiveInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' location 'hdfs://master:8020/user/hdfs/population/*/data';

OK!

thupdi_fei

Fei-joe

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
hive创建表指定自定义多个字符

hive创建表指定分隔符，不支持多个字符作为分隔符,如果想使用多个字符作为分割符的话就需要实现InputFormat.主要重写next方法,代码如下package gaode_84;import java.io.IOException;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;...
复制链接

扫一扫

专栏目录