Problems encountered:
1. Writing to HDFS with a custom MultipleTextOutputFormat (see the RDDMultipleTextOutputFormat code below):
generateActualKey: return NullWritable so the key is not written into the output file
generateActualValue: return the value itself
generateFileNameForKeyValue: sets the output file name
checkOutputSpecs: lets the output directory already exist (overwrite mode)
2. Converting the DataFrame to an RDD: Row.toString() wraps each row in "[ ]", which has to be stripped before writing; the fields inside are separated by "," (see the alternative sketch after the run() method below).
3. The Hive table has to be created with explicit field and line delimiters; otherwise the whole line lands in c1 and c2 comes back NULL when the table is queried:
dingdang,love	NULL
xuejiao,love1312	NULL
Solution: create the table with the delimiters specified:
CREATE TABLE IF NOT EXISTS src(
c1 string,
c2 string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
4. Partitioned table
Register the partition directory in the metastore (run this after the partitioned table below has been created):
alter table src add partition(date_key='20200828')
Create the partitioned table:
CREATE TABLE IF NOT EXISTS src(
c1 string,
c2 string
)
PARTITIONED BY (date_key string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
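Since the job below writes its files straight into the partition directory, the partition still has to be registered in the metastore, either with the ALTER TABLE statement above in the Hive CLI or from the Spark driver. A minimal sketch of the latter (it assumes the SparkSession named session used in the code below):

// Register the partition after the files have been written into its directory;
// IF NOT EXISTS makes the call safe to repeat.
session.sql("ALTER TABLE src ADD IF NOT EXISTS PARTITION (date_key='20200828')");

Running MSCK REPAIR TABLE src in Hive is another common way to pick up partition directories that were created outside of Hive.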
Code:
import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
import org.apache.hadoop.mapreduce.security.TokenCache;

public class RDDMultipleTextOutputFormat<K, V> extends MultipleTextOutputFormat<K, V> {

    // Write NullWritable as the key so only the value ends up in the output file.
    @Override
    protected K generateActualKey(K key, V value) {
        return (K) NullWritable.get();
    }

    // Write the value as its string form.
    @Override
    protected V generateActualValue(K key, V value) {
        return (V) value.toString();
    }

    // Use the record key as the file name prefix, e.g. "part-00000" -> "<key>-00000".
    @Override
    protected String generateFileNameForKeyValue(K key, V value, String name) {
        name = name.replace("part", key.toString());
        return super.generateFileNameForKeyValue(key, value, name);
    }

    // Same as FileOutputFormat.checkOutputSpecs, except the "output directory
    // already exists" check is disabled so Spark can write into an existing directory.
    @Override
    public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException {
        Path outDir = getOutputPath(job);
        if (outDir == null && job.getNumReduceTasks() != 0) {
            throw new InvalidJobConfException("Output directory not set in JobConf.");
        }
        if (outDir != null) {
            FileSystem fs = outDir.getFileSystem(job);
            // normalize the output directory
            outDir = fs.makeQualified(outDir);
            setOutputPath(job, outDir);
            // get delegation tokens for the outDir's file system
            TokenCache.obtainTokensForNamenodes(job.getCredentials(),
                    new Path[] { outDir }, job);
            // existence check intentionally disabled so the output directory may already exist
            /*if (fs.exists(outDir)) {
                throw new FileAlreadyExistsException("Output directory "
                        + outDir + " already exists");
            }*/
        }
    }

    public static class OutputFormatUtil {
        public static String prefixOutputName = "";
    }
}
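A note on the checkOutputSpecs override: because the existence check is commented out, the job no longer fails when the output directory already exists. If a clean overwrite of the partition directory is wanted, one option (a sketch, not part of the original job) is to delete it in the driver before calling saveAsHadoopFile, using the FileSystem obtained in run() below:

// Hypothetical cleanup before writing; fs is the FileSystem from run().
Path target = new Path("/user/hive/warehouse/src/date_key=20200828");
if (fs.exists(target)) {
    fs.delete(target, true);  // recursive delete of the old partition files
}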
@Override
public int run(String[] args) throws Exception {
    SparkConf conf = getSparkconf();
    logger.warn("===========HiveMain run start===========");
    try {
        // get the JavaSparkContext
        JavaSparkContext jsc = getJavaSparkContext();
        // SparkSession for SQL
        SparkSession session = getSparkSession();
        Configuration hadoopconf = jsc.hadoopConfiguration();
        //conf.set("spark.hadoop.fs.defaultFS", "hdfs://192.168.13.124:8020");
        hadoopconf.set("mapreduce.output.fileoutputformat.compress", "false");
        //conf.set("spark.sql.warehouse.dir", "hdfs://192.168.13.124:9000/user/hive/warehouse");
        FileSystem fs = FileSystem.get(hadoopconf);
        session.sql("use default");
        String strPath = args[0];
        JavaRDD<String> rowrdd = jsc.textFile(strPath);
        StructType schema = createSchema(new String[]{"c1", "c2"},
                new DataType[]{DataTypes.StringType, DataTypes.StringType});
        JavaRDD<Row> rowJavaRDD = parserdata2Row(rowrdd);
        Dataset<Row> df = session.createDataFrame(rowJavaRDD, schema);
        df.createOrReplaceTempView("src");
        df.show();
        // Row.toString() wraps the fields in "[...]", so strip the brackets
        // and replace literal "null" before writing the line out.
        JavaPairRDD<Text, Text> pairRDD = df.javaRDD().mapToPair(f -> {
            String replace = f.toString().replace("[", "")
                    .replace("]", "").replace("null", " ");
            String key = "S003_WA_SOURCE_0005";
            String[] value = replace.split(",");
            return new Tuple2<Text, Text>(new Text(key), new Text(value[0] + "," + value[1]));
        });
        // write straight into the Hive partition directory
        pairRDD.saveAsHadoopFile("/user/hive/warehouse/src/date_key=20200828", Text.class,
                Text.class, RDDMultipleTextOutputFormat.class);
    } catch (Exception e) {
        logger.error("HiveMain run failed", e);
    }
    return 0;
}
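The mapToPair above strips the "[ ]" that Row.toString() adds (problem 2). An alternative sketch that reads the fields from the Row directly, so no bracket stripping is needed (it assumes the same two string columns c1 and c2):

// Drop-in alternative for the mapToPair block above: build the output line
// from the Row fields instead of parsing Row.toString().
JavaPairRDD<Text, Text> pairRDD = df.javaRDD().mapToPair(row -> {
    String key = "S003_WA_SOURCE_0005";
    String c1 = row.isNullAt(0) ? " " : row.getString(0);  // keep the blank placeholder used above for nulls
    String c2 = row.isNullAt(1) ? " " : row.getString(1);
    return new Tuple2<Text, Text>(new Text(key), new Text(c1 + "," + c2));
});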
private static StructType createSchema(String[] strFields, DataType[] dts) {
    if (strFields.length != dts.length) {
        System.out.println("Schema definition error: field names and types differ in length");
        return null;
    }
    ArrayList<StructField> fields = new ArrayList<StructField>();
    for (int i = 0; i < strFields.length; i++) {
        fields.add(DataTypes.createStructField(strFields[i], dts[i], true));
    }
    return DataTypes.createStructType(fields);
}
public JavaRDD<Row> parserdata2Row(JavaRDD<String> sourceDataRDD) {
    return sourceDataRDD.map(f -> {
        // input lines are tab-separated
        String[] strLine = f.split("\t");
        // take the first two fields as the query columns of the TXT input view
        String[] res = new String[2];
        for (int i = 0; i < 2; i++) {
            res[i] = strLine[i];
        }
        return RowFactory.create(res);
    });
}
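To tie the pieces together with the sample data above (assuming a tab-separated input file): an input line such as
dingdang	love
is parsed into a two-column Row by parserdata2Row, rewritten by the mapToPair step as
dingdang,love
and written by saveAsHadoopFile into /user/hive/warehouse/src/date_key=20200828/ as a file such as S003_WA_SOURCE_0005-00000 (generateFileNameForKeyValue replaces the default "part" prefix with the key). Because the table was created with FIELDS TERMINATED BY ',', querying the registered partition should now return both c1 and c2 instead of NULL in the second column.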