// Implementation of a class that writes byte[] payloads to files.
package lm;
import java. io. File;
import java. io. FileOutputStream;
import java. io. IOException;
/**
 * Writes byte[] payloads into files under a target directory.
 *
 * Each {@link #append(byte[])} writes one payload to the current file and then
 * rolls over to a fresh file; the finished file is renamed with an ".ok"
 * suffix so a downstream consumer can tell completed files from the one in
 * progress. File names are {@code <toPath>/<topic>_<millis>_<n>}.
 */
public class FileMgr {
    // File-name prefix: <toPath>/<topic>_
    private String toPath = null;
    // Monotonically increasing suffix keeping generated file names unique
    // even when two files are created within the same millisecond.
    private long n = 0;

    /** Holder for the file currently being written. Static: it never touches the enclosing instance. */
    static class Out {
        public FileOutputStream writer;
        public File file;
        public long count = 0; // bytes written to the current file since the last roll-over
    }

    private Out out = new Out();

    /**
     * Creates the manager and opens the first output file immediately.
     *
     * @param topic  logical name used as the file-name prefix
     * @param toPath directory in which output files are created
     */
    public FileMgr(String topic, String toPath) {
        this.toPath = toPath + File.separator + topic + "_";
        newFile();
    }

    /**
     * Closes the current output file. An empty in-progress file (nothing was
     * appended since the last roll-over) is deleted rather than left behind;
     * a non-empty one is renamed ".ok" like every completed file.
     *
     * @throws IOException if closing the underlying stream fails
     */
    public void close() throws IOException {
        if (out != null) {
            if (out.writer != null) {
                out.writer.close();
                out.writer = null;
            }
            if (out.file != null) {
                if (out.count == 0) {
                    // Fix: the original left an empty, never-".ok"-renamed file behind.
                    out.file.delete();
                } else {
                    out.file.renameTo(new File(out.file.getAbsolutePath() + ".ok"));
                }
            }
            out.count = 0;
            out.file = null;
        }
    }

    /**
     * Rolls over to a new output file: closes and ".ok"-renames the current
     * one (if any), then opens {@code <toPath><millis>_<n>}. An IOException is
     * only logged, matching the original best-effort behavior.
     */
    public void newFile() {
        try {
            if (out.file != null) {
                out.writer.close();
                out.file.renameTo(new File(out.file.getAbsolutePath() + ".ok"));
            }
            File f = new File(this.toPath + System.currentTimeMillis() + "_" + n++);
            out.file = f;
            out.writer = new FileOutputStream(f);
            out.count = 0;
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes one payload to the current file, then rolls to a new file so the
     * just-written file is immediately marked ".ok".
     *
     * @param data payload to write
     * @throws IOException if the write fails
     */
    public void append(byte[] data) throws IOException {
        out.writer.write(data);
        out.writer.flush();
        out.count += data.length; // fix: count was declared but never updated
        newFile();
    }
}
// Utility class that converts CSV rows into Avro GenericRecord binary data.
package lm;
import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient;
import io.confluent.kafka.schemaregistry.client.rest.exceptions.RestClientException;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.Encoder;
import org.apache.avro.io.EncoderFactory;

import static org.apache.avro.Schema.Type.RECORD;
/**
 * Converts CSV lines into Avro binary ({@link GenericRecord}) batches using
 * the latest schema registered for a topic in a Confluent schema registry,
 * and writes each encoded batch to a file via {@link FileMgr}.
 */
public class Csv2AvroUtil {
    // Avro schema fetched from the registry; stays null if the lookup fails,
    // in which case convert() will NPE — original behavior, surfaced by the
    // "get schema failed!" log in the constructor.
    private Schema schema = null;
    // Field separator. NOTE(review): String.split treats this as a regex,
    // so separators like "|" must be passed pre-escaped by the caller.
    private String split = ",";
    // Initial capacity of the per-batch output buffer (5 MiB).
    private final int bufferSize = 5 * 1024 * 1024;
    // Number of CSV lines encoded per output file.
    private int batchSize = 10000;
    private String topic;

    /**
     * Fetches the latest schema for {@code topic} from the registry.
     *
     * @param schemaUrl base URL of the Confluent schema registry
     * @param topic     registry subject whose latest schema version is used
     * @param split     field separator (regex) used to split CSV lines
     */
    public Csv2AvroUtil(String schemaUrl, String topic, String split) {
        try {
            this.topic = topic;
            Schema.Parser parser = new Schema.Parser();
            CachedSchemaRegistryClient client = new CachedSchemaRegistryClient(schemaUrl, 100);
            schema = parser.parse(client.getLatestSchemaMetadata(topic).getSchema());
            System.out.println("get schema successfully, schema: " + schema.toString());
            this.split = split;
        } catch (IOException | RestClientException e) {
            System.out.println("get schema failed!");
            e.printStackTrace();
        }
    }

    /**
     * Encodes the given CSV lines as Avro binary using the topic schema.
     * Lines whose field count does not match the schema are silently skipped
     * (original behavior).
     *
     * @param lines CSV lines to encode
     * @return concatenated Avro binary encoding of all valid lines
     */
    public byte[] convert(List<String> lines) {
        ByteArrayOutputStream out = new ByteArrayOutputStream(bufferSize);
        GenericDatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
        Encoder encoder = EncoderFactory.get().binaryEncoder(out, null);
        try {
            final int fieldCount = schema.getFields().size();
            for (String line : lines) {
                // Limit -1 keeps trailing empty fields; a plain split(regex)
                // drops them, which made rows ending in an empty column fail
                // the field-count check below and be lost.
                String[] vals = line.split(this.split, -1);
                if (vals.length != fieldCount) {
                    continue;
                }
                GenericRecord record = new GenericData.Record(this.schema);
                for (int i = 0; i < fieldCount; ++i) {
                    record.put(i, convert(schema.getFields().get(i).schema(), vals[i]));
                }
                writer.write(record, encoder);
            }
            encoder.flush();
            out.flush();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return out.toByteArray();
    }

    /**
     * Converts one CSV field value into the Java object matching an Avro type.
     * An empty field becomes an empty STRING/BYTES, falls through for UNION so
     * the members can decide, and becomes null for every other type.
     *
     * @param schema Avro schema of the field
     * @param v      raw CSV field value (may be null)
     * @return the converted value, or null for NULL / unparseable values
     * @throws Exception for unsupported Avro types (RECORD, MAP, ENUM, ARRAY, FIXED)
     */
    private Object convert(Schema schema, String v) throws Exception {
        if (null == v) {
            return null;
        }
        if (v.isEmpty()) {
            switch (schema.getType()) {
                case STRING:
                    return v;
                case BYTES:
                    return ByteBuffer.wrap(v.getBytes(StandardCharsets.UTF_8));
                case UNION:
                    break; // let the union members handle the empty value below
                default:
                    return null;
            }
        }
        switch (schema.getType()) {
            case NULL:
                return null;
            case BOOLEAN:
                // NOTE(review): parseBoolean never fails — it returns false for
                // anything but "true", so a BOOLEAN union member matches any value.
                return Boolean.parseBoolean(v);
            case INT:
                return Integer.parseInt(v);
            case LONG:
                return Long.parseLong(v);
            case FLOAT:
                return Float.parseFloat(v);
            case DOUBLE:
                return Double.parseDouble(v);
            case BYTES:
                return ByteBuffer.wrap(v.getBytes(StandardCharsets.UTF_8));
            case STRING:
                return v;
            case UNION:
                for (Schema member : schema.getTypes()) {
                    try {
                        Object o = convert(member, v);
                        if (o != null) {
                            return o;
                        }
                    } catch (NumberFormatException ignored) {
                        // Fix: the original let this escape, aborting union
                        // resolution instead of trying the next member type.
                    }
                }
                return null;
            default:
                // Fix: the original misspelled "Unsupported" and always
                // reported RECORD regardless of the actual offending type.
                throw new Exception("Unsupported test.avro type:" + schema.getType());
        }
    }

    /**
     * Reads a CSV file, converts it in batches of {@code batchSize} lines,
     * and appends each encoded batch to a new output file under toPath.
     *
     * @param file   path of the CSV file to read
     * @param toPath directory where output files are created
     */
    public void readFile(String file, String toPath) {
        try {
            FileMgr mgr = new FileMgr(this.topic, toPath);
            try {
                // try-with-resources closes the reader even if convert/append
                // fails (the original leaked it on any exception). UTF-8 is
                // made explicit to match the UTF-8 byte conversion above.
                try (BufferedReader in = new BufferedReader(
                        new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
                    List<String> lines = new ArrayList<String>(this.batchSize);
                    String line;
                    while ((line = in.readLine()) != null) {
                        lines.add(line);
                        if (lines.size() >= batchSize) {
                            mgr.append(this.convert(lines));
                            lines.clear();
                        }
                    }
                    if (!lines.isEmpty()) {
                        mgr.append(this.convert(lines)); // flush the final partial batch
                    }
                }
            } finally {
                mgr.close(); // original skipped this when any earlier step threw
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * CLI entry point: schemaRegistryUrl topic split csvFile outputDir.
     */
    public static void main(String[] args) {
        if (args.length != 5) {
            // Fix: the original exited silently on a wrong argument count.
            System.out.println("usage: Csv2AvroUtil <schemaRegistryUrl> <topic> <split> <csvFile> <toPath>");
            return;
        }
        String schema = args[0];
        String topic = args[1];
        String split = args[2];
        String file = args[3];
        String toPath = args[4];
        Csv2AvroUtil c2a = new Csv2AvroUtil(schema, topic, split);
        c2a.readFile(file, toPath);
    }
}