After a week of working with Flink, three kinds of modules came up: a Kafka source, and sinks to MySQL, Kudu and Kafka.
Sourcing from Kafka is the simplest part.
import scala.collection.JavaConverters._
import com.alibaba.fastjson.JSON
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, createTypeInformation}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.kafka.clients.consumer.ConsumerConfig
import java.util.Properties

val env = StreamExecutionEnvironment.getExecutionEnvironment

val prop = new Properties()
prop.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "host:9092")
prop.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "kafkagroup")
// These are deserializer settings, so the classes must be StringDeserializer, not StringSerializer.
prop.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
prop.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
prop.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")

val ds = env.addSource(
  new FlinkKafkaConsumer[String](
    "topic_name",
    new SimpleStringSchema(),
    prop
  )
)
// Reading holds the three fields extracted from each JSON message
// (typically defined at object level, outside the main method).
case class Reading(appid: String, userid: String, time: String)

// Parse each message as JSON, keep only records that carry a "report" field,
// and extract the fields into a Reading.
val dataStream = ds
  .map(x => JSON.parseObject(x))
  .filter(x => x.containsKey("report"))
  .map(x => {
    val appid = x.get("appid").toString
    val userid = x.get("userid").toString
    val time = x.get("time").toString
    Reading(appid, userid, time)
  })
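The filter above assumes every Kafka message is a flat JSON object carrying a "report" marker plus the three extracted fields. A quick local sanity check of the parsing logic, using a hypothetical sample message (field values are made up for illustration):

val sample = """{"report":"pv","appid":"app_01","userid":"u_1001","time":"2021-06-01 12:00:00"}"""
val obj = JSON.parseObject(sample)
if (obj.containsKey("report")) {
  // prints: Reading(app_01,u_1001,2021-06-01 12:00:00)
  println(Reading(obj.get("appid").toString, obj.get("userid").toString, obj.get("time").toString))
}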
Sink MySQL
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}

class JDBCSink() extends RichSinkFunction[Reading] {
  var conn: Connection = _
  var insertStmt: PreparedStatement = _

  override def open(parameters: Configuration): Unit = {
    super.open(parameters)
    // Replace db_name / user / password with your own connection settings.
    conn = DriverManager.getConnection("jdbc:mysql://IP:3306/db_name", "user", "password")
    insertStmt = conn.prepareStatement("INSERT INTO salary_table (appid, userid, create_time) VALUES (?,?,?)")
  }

  override def invoke(value: Reading, context: SinkFunction.Context): Unit = {
    insertStmt.setString(1, value.appid)
    insertStmt.setString(2, value.userid)
    insertStmt.setString(3, value.time) // three parameters in the statement, so the index is 3, not 4
    insertStmt.execute()
  }

  override def close(): Unit = {
    insertStmt.close()
    conn.close()
  }
}
dataStream.addSink(new JDBCSink())
env.execute("job")
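The sink above issues one INSERT per element. A minimal batched variation is sketched below; this is my own variation, not part of the original code, and the class name BatchedJDBCSink is hypothetical. It buffers rows with addBatch and flushes them in close; a real job would also flush periodically or on checkpoints.

// Sketch only: same imports and Reading type as JDBCSink above, but batched JDBC writes.
class BatchedJDBCSink extends RichSinkFunction[Reading] {
  var conn: Connection = _
  var insertStmt: PreparedStatement = _

  override def open(parameters: Configuration): Unit = {
    super.open(parameters)
    conn = DriverManager.getConnection("jdbc:mysql://IP:3306/db_name", "user", "password")
    insertStmt = conn.prepareStatement("INSERT INTO salary_table (appid, userid, create_time) VALUES (?,?,?)")
  }

  override def invoke(value: Reading, context: SinkFunction.Context): Unit = {
    insertStmt.setString(1, value.appid)
    insertStmt.setString(2, value.userid)
    insertStmt.setString(3, value.time)
    insertStmt.addBatch() // buffer instead of executing immediately
  }

  override def close(): Unit = {
    insertStmt.executeBatch() // flush whatever is still buffered
    insertStmt.close()
    conn.close()
  }
}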
Sink KUDU
package org.example;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.kudu.Schema;
import org.apache.kudu.Type;
import org.apache.kudu.client.*;

import java.util.Map;

public class SinkKudu extends RichSinkFunction<Map<String, String>> {

    private KuduClient client;
    private KuduTable table;
    private final String kuduMaster;
    private final String tableName;
    private Schema schema;
    private KuduSession kuduSession;

    public SinkKudu(String kuduMaster, String tableName) {
        this.kuduMaster = kuduMaster;
        this.tableName = tableName;
    }

    @Override
    public void open(Configuration parameters) throws Exception {
        client = new KuduClient.KuduClientBuilder(kuduMaster).build();
        table = client.openTable(tableName);
        schema = table.getSchema();
        kuduSession = client.newSession();
        kuduSession.setFlushMode(SessionConfiguration.FlushMode.AUTO_FLUSH_BACKGROUND);
    }
    @Override
    public void invoke(Map<String, String> map) {
        if (map == null) {
            return;
        }
        try {
            int columnCount = schema.getColumnCount();
            Insert insert = table.newInsert();
            PartialRow row = insert.getRow();
            // Fill every column of the target table from the incoming map, keyed by column name.
            for (int i = 0; i < columnCount; i++) {
                String value = map.get(schema.getColumnByIndex(i).getName());
                insertData(row, schema.getColumnByIndex(i).getType(), schema.getColumnByIndex(i).getName(), value);
            }
            OperationResponse response = kuduSession.apply(insert);
            // With AUTO_FLUSH_BACKGROUND the response may be null; only log real row errors.
            if (response != null && response.hasRowError()) {
                System.out.println(response.getRowError().toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    @Override
    public void close() throws Exception {
        try {
            kuduSession.close();
            client.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    // Write one cell into the row, converting the string value to the column's Kudu type.
    private void insertData(PartialRow row, Type type, String columnName, String value) {
        try {
            switch (type) {
                case STRING:
                    row.addString(columnName, value);
                    return;
                case INT32:
                    row.addInt(columnName, Integer.valueOf(value));
                    return;
                case INT64:
                    row.addLong(columnName, Long.valueOf(value));
                    return;
                case DOUBLE:
                    row.addDouble(columnName, Double.valueOf(value));
                    return;
                case FLOAT:
                    row.addFloat(columnName, Float.valueOf(value));
                    return;
                default:
                    throw new UnsupportedOperationException("Unknown type " + type);
            }
        } catch (Exception e) {
            System.out.println("Failed to insert value for column " + columnName);
        }
    }
}
val kuduMaster = "ip"
val tableInfo = "tablename"

// SinkKudu expects a java.util.Map keyed by column name, so convert each Reading first.
dataStream
  .map(r => Map("appid" -> r.appid, "userid" -> r.userid, "time" -> r.time).asJava)
  .addSink(new SinkKudu(kuduMaster, tableInfo))
env.execute("flink_kudu_job")
Sink Kafka: no custom sink is needed, Flink supports it natively.
import org.apache.kafka.clients.producer.ProducerConfig
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer

val prop2 = new Properties()
prop2.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "10.0.20.7:9092")
prop2.setProperty(ProducerConfig.RETRIES_CONFIG, "0")
prop2.setProperty(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
prop2.setProperty(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")

// FlinkKafkaProducer[String] expects String elements, so flatten each Reading to a CSV line,
// and pass the producer properties instead of a bare broker list so prop2 is actually used.
dataStream
  .map(r => s"${r.appid},${r.userid},${r.time}")
  .addSink(new FlinkKafkaProducer[String](
    "topic",
    new SimpleStringSchema(),
    prop2))
env.execute("event_attendees_ff")