Part 1: The official documentation gives two ways for Spark to read from Kafka:
The first is the Receiver-based Approach, which relies on a receiver.
The second is the Direct Approach (No Receivers), which requires Kafka 0.10 or later.
Note: for details, see http://spark.apache.org/docs/latest/streaming-programming-guide.html#output-operations-on-dstreams
Part 2: Differences and characteristics of the two approaches:
The Receiver-based approach uses a receiver to consume data: records received from Kafka are stored on the Spark executors. Under Spark's default configuration, however, data can be lost when an executor goes down. To guarantee zero data loss you must additionally enable Write Ahead Logs, a synchronous step that persists the received data to HDFS (the write-ahead-log path) so that anything lost can be recovered from the log.
One caveat of this approach: the number of Kafka partitions being read bears no relationship to the number of partitions in the RDDs it produces.
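For reference, a minimal sketch of the Receiver-based approach using the older spark-streaming-kafka-0-8 integration (the ZooKeeper address, topic name, and HDFS checkpoint path are placeholder assumptions). The write-ahead log is switched on with a configuration flag and is written under the checkpoint directory:
import java.util.HashMap;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

public class KafkaReceiverSketch {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("ReceiverSketch")
                // Persist received blocks to the write-ahead log before acknowledging them.
                .set("spark.streaming.receiver.writeAheadLog.enable", "true");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));
        // The WAL lives under the checkpoint directory; use an HDFS path in production.
        jssc.checkpoint("hdfs://namenode:8020/spark/checkpoint"); // placeholder path
        Map<String, Integer> topics = new HashMap<String, Integer>();
        topics.put("ltsdata", 1); // value = receiver threads per topic, not RDD partitions
        JavaPairReceiverInputDStream<String, String> stream =
                KafkaUtils.createStream(jssc, "localhost:2181", "test", topics);
        stream.print();
        jssc.start();
        jssc.awaitTermination();
    }
}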
Direct Approach: a newer integration API available since Spark 1.3. It consumes end-to-end without a receiver, periodically querying Kafka for the latest offsets and fetching data from there. It has the following advantages:
1. Simplified parallelism: Spark Streaming creates one RDD partition per Kafka partition, so data is read from Kafka in parallel across all partitions (a quick way to observe this is sketched after this list).
2. Efficiency: to achieve zero data loss, the Receiver-based approach must first pull data from Kafka into the write-ahead log and then read it back out, paying I/O for both the write and the read. The Direct approach operates on data read straight from Kafka, skipping the persist-and-reread step while still guaranteeing no data loss.
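To observe advantage 1 in practice, you can print the RDD's partition count next to the number of offset ranges in each batch; they match one-to-one. This fragment is meant to drop into the foreachRDD callback of the full example in Part 3 below, not to stand alone:
// One RDD partition per Kafka topic partition in the batch, so the two counts agree.
OffsetRange[] ranges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
System.out.println("RDD partitions=" + rdd.getNumPartitions()
        + ", Kafka partitions in this batch=" + ranges.length);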
Part 3: The official documentation recommends the second approach. Sample reference code:
import java.io.Serializable;
import java.sql.Connection;
import java.sql.Statement;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.HasOffsetRanges;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.apache.spark.streaming.kafka010.OffsetRange;

import com.zaxxer.hikari.HikariConfig;

public class KafkaSimple implements Serializable {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setMaster("local[8]").setAppName("NetworkWordCount");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));

        // Kafka consumer configuration for the direct (no-receiver) stream.
        Map<String, Object> kafkaParams = new HashMap<String, Object>();
        kafkaParams.put("bootstrap.servers", "localhost:6667");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "test");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        // MySQL connection pool. Note: build a HikariConfig here, not a HikariDataSource.
        HikariConfig hikariConf = new HikariConfig();
        hikariConf.setMaximumPoolSize(20);
        hikariConf.setDriverClassName("com.mysql.jdbc.Driver");
        hikariConf.setJdbcUrl("jdbc:mysql://localhost/test");
        hikariConf.setUsername("root");
        hikariConf.setPassword("Aa2123456");
        hikariConf.setConnectionTimeout(500000);
        // In local[8] mode driver and executors share one JVM, so this initialization
        // is visible inside foreachPartition; on a real cluster the pool would have
        // to be initialized on each executor instead.
        HikariCPConnectionProvider.prepare(hikariConf);

        Collection<String> topics = Arrays.asList("ltsdata");
        final JavaInputDStream<ConsumerRecord<String, String>> stream =
                KafkaUtils.createDirectStream(
                        jssc,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams)
                );

        stream.foreachRDD(new VoidFunction<JavaRDD<ConsumerRecord<String, String>>>() {
            private static final long serialVersionUID = 1L;
            public void call(JavaRDD<ConsumerRecord<String, String>> rdd) throws Exception {
                // Offset ranges are only defined on RDDs produced by the direct stream.
                final OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
                rdd.foreachPartition(new VoidFunction<Iterator<ConsumerRecord<String, String>>>() {
                    private static final long serialVersionUID = 1L;
                    public void call(Iterator<ConsumerRecord<String, String>> records) throws Exception {
                        Connection conn = HikariCPConnectionProvider.getConnection();
                        try {
                            Statement statement = conn.createStatement();
                            OffsetRange o = offsetRanges[TaskContext.get().partitionId()];
                            System.out.println(o.topic() + ";" + o.partition() + ";"
                                    + o.fromOffset() + ";" + o.untilOffset());
                            // Each record value is expected to be an executable SQL statement.
                            while (records.hasNext()) {
                                ConsumerRecord<String, String> record = records.next();
                                System.out.println("key:" + record.key() + ";value=" + record.value()
                                        + ";topic=" + record.topic());
                                statement.execute(record.value());
                            }
                            statement.close();
                        } finally {
                            conn.close(); // return the connection to the pool
                        }
                    }
                });
            }
        });

        jssc.start(); // Start the computation
        jssc.awaitTermination();
    }
}
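One thing the example above leaves open: enable.auto.commit is set to false, but the offsets are never committed, so the consumer group's position in Kafka never advances. The kafka-0-10 integration exposes CanCommitOffsets for exactly this; committing after the partition work succeeds keeps the stored offsets in step with what was actually written to MySQL (at-least-once semantics). The line below goes at the end of the foreachRDD call(), after rdd.foreachPartition(...) returns:
// requires: import org.apache.spark.streaming.kafka010.CanCommitOffsets;
// Commit this batch's offsets back to Kafka once processing has succeeded.
((CanCommitOffsets) stream.inputDStream()).commitAsync(offsetRanges);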
import java.io.Serializable;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;

import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;

public class HikariCPConnectionProvider implements Serializable {
    // Static fields are per-JVM and are never serialized with the class.
    private static HikariDataSource dataSource;
    private static HikariConfig conf;
    private static Connection conn;
    private static Statement statement;

    public static void prepare(HikariConfig config) {
        dataSource = new HikariDataSource(config);
        conf = config;
        prepare();
    }

    // (Re)create the pool plus a cached connection and statement as needed.
    public static void prepare() {
        try {
            if (dataSource == null || dataSource.isClosed()) {
                dataSource = new HikariDataSource(conf);
            }
            if (conn == null || conn.isClosed()) {
                conn = HikariCPConnectionProvider.getConnection();
            }
            if (statement == null || statement.isClosed()) {
                statement = conn.createStatement();
            }
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    public static Connection getConnection() throws SQLException {
        return dataSource.getConnection();
    }

    public static Statement getStatement() {
        return statement;
    }

    public static void close() {
        try {
            if (statement != null) {
                statement.close();
                statement = null;
            }
            if (conn != null) {
                conn.close();
                conn = null;
            }
            if (dataSource != null) {
                dataSource.close();
                dataSource = null;
            }
        } catch (SQLException e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    public static boolean isClosed() {
        return dataSource.isClosed();
    }
}
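A caveat on this provider: all of its state is static, so the pool created by prepare() on the driver exists only in the driver JVM. The example works because local[8] runs driver and executors in one process; on a real cluster each executor would see a null dataSource. A common pattern, sketched below as a hypothetical helper reusing the same (assumed) connection settings, is to build the pool lazily in whichever JVM first requests a connection:
import java.sql.Connection;
import java.sql.SQLException;

import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;

// Hypothetical helper: creates the pool on first use in the current JVM,
// so executors do not depend on driver-side initialization.
public class LazyPoolHolder {
    private static volatile HikariDataSource dataSource;

    public static Connection getConnection() throws SQLException {
        if (dataSource == null) {
            synchronized (LazyPoolHolder.class) {
                if (dataSource == null) {
                    HikariConfig config = new HikariConfig();
                    config.setMaximumPoolSize(20);
                    config.setDriverClassName("com.mysql.jdbc.Driver");
                    config.setJdbcUrl("jdbc:mysql://localhost/test"); // same settings as the example
                    config.setUsername("root");
                    config.setPassword("Aa2123456");
                    dataSource = new HikariDataSource(config);
                }
            }
        }
        return dataSource.getConnection();
    }
}
The foreachPartition code would then call LazyPoolHolder.getConnection() directly, with no driver-side prepare step.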