Spark Streaming 自定义 Receiver
Spark Streaming 除了内置支持的数据源之外,还可以自定义数据源。只需要继承 Receiver&lt;T&gt; 类,重写 onStart() 和 onStop() 方法,并在收到数据时调用 store() 方法把数据交给 Spark 即可。下面以从 MySQL 获取数据为例:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Map;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.receiver.Receiver;
public class DataBaseReceiver extends Receiver<String> {

    // Connection settings, parsed once from the config map in the constructor.
    private final String url;
    private final String username;
    private final String password;
    private final String dbtable;
    private final String[] fields;
    private final String driverClassName;
    private final int pageSize;

    // JDBC resources opened in onStart() and released in onStop().
    private Connection conn;
    private Statement st;

    // Number of rows already read and pushed to Spark; only the single
    // polling thread started in onStart() updates it.
    private int totalSize;

    /**
     * Creates a receiver that polls a MySQL table and feeds newly appended
     * rows to Spark Streaming as comma-joined strings.
     *
     * @param storageLevel storage level Spark uses for the received blocks
     * @param map          config map; expected keys: db_jdbc_url,
     *                     db_jdbc_userName, db_jdbc_password, dbTableName,
     *                     fields (comma-separated column names), pageSize,
     *                     db_jdbc_driverClass
     */
    public DataBaseReceiver(StorageLevel storageLevel, Map<String, Object> map) {
        super(storageLevel);
        // NOTE: map.get(key)+"" turns a missing key into the string "null"
        // instead of failing fast — kept for backward compatibility.
        this.url = map.get("db_jdbc_url") + "";
        this.username = map.get("db_jdbc_userName") + "";
        this.password = map.get("db_jdbc_password") + "";
        this.dbtable = map.get("dbTableName") + "";
        this.fields = (map.get("fields") + "").split(",");
        this.totalSize = 0;
        this.pageSize = Integer.parseInt(map.get("pageSize") + "");
        this.driverClassName = map.get("db_jdbc_driverClass") + "";
    }

    /**
     * Opens the JDBC connection and starts the polling thread.
     * If the connection cannot be opened, the thread is NOT started
     * (the original started a thread that could only fail).
     */
    @Override
    public void onStart() {
        try {
            Class.forName(driverClassName);
            conn = DriverManager.getConnection(url, username, password);
            st = conn.createStatement();
        } catch (Exception e) {
            e.printStackTrace();
            return; // without a connection the polling thread is useless
        }
        new Thread(new DBRunnable(), "DataBaseReceiver-poller").start();
    }

    /** Releases the JDBC resources; the polling thread exits via isStopped(). */
    @Override
    public void onStop() {
        closeQuietly(st);
        st = null;
        closeQuietly(conn);
        conn = null;
    }

    /** Closes a resource, logging instead of propagating any failure. */
    private static void closeQuietly(AutoCloseable resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /** Polling loop: detects newly appended rows and stores them page by page. */
    class DBRunnable implements Runnable {
        @Override
        public void run() {
            while (!isStopped()) {
                try {
                    System.out.println("===================================已读行数:" + totalSize + "=========================================");
                    int newTotalSize = getTotalSize();
                    int size = newTotalSize - totalSize;
                    System.out.println("===================================新增行数:" + size + "=========================================");
                    if (size > 0) {
                        // Ceiling division: one extra page for a partial last page.
                        int pageNum = (size + pageSize - 1) / pageSize;
                        executeSql(pageNum);
                        totalSize = newTotalSize;
                    } else {
                        System.out.println("===================================没有新的数据,进入休眠=========================================");
                        Thread.sleep(5000);
                    }
                } catch (InterruptedException ie) {
                    // The original swallowed interruption; honour it and exit.
                    Thread.currentThread().interrupt();
                    return;
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }

        /**
         * Returns the current row count of the table, or 0 on failure.
         * WARNING: dbtable comes from configuration and is concatenated into
         * SQL — it must never be user-controlled input.
         */
        private int getTotalSize() {
            String countSql = "select count(*) from " + dbtable;
            try (ResultSet countRs = st.executeQuery(countSql)) {
                return countRs.next() ? countRs.getInt(1) : 0;
            } catch (Exception e) {
                e.printStackTrace();
                return 0;
            }
        }

        /** Reads pageNum pages of new rows, starting after the rows already seen. */
        private void executeSql(int pageNum) {
            try {
                System.out.println("===================================页数:" + pageNum + "=========================================");
                for (int i = 0; i < pageNum; i++) {
                    int startIndex = totalSize + i * pageSize;
                    String pageSql = "select * from " + dbtable + " limit " + startIndex + "," + pageSize;
                    System.out.println("===================================执行语句:" + pageSql + "=========================================");
                    // try-with-resources closes every page's ResultSet; the
                    // original leaked each previous ResultSet on reassignment
                    // of the shared field.
                    try (ResultSet pageRs = st.executeQuery(pageSql)) {
                        storeData(pageRs);
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /** Pushes every row of the result set to Spark, one line per row. */
        private void storeData(ResultSet rs) {
            try {
                while (rs.next()) {
                    String line = getLine(rs);
                    store(line);
                    System.out.println("===================================存储数据:" + line + "=========================================");
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /** Joins the configured columns of the current row with commas. */
        private String getLine(ResultSet rs) {
            // Local builder: the shared StringBuilder field in the original
            // kept stale, partially-built content when getString() failed.
            StringBuilder line = new StringBuilder();
            try {
                for (int i = 0; i < fields.length; i++) {
                    if (i > 0) {
                        line.append(",");
                    }
                    line.append(rs.getString(fields[i]));
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            return line.toString();
        }
    }
}
如果需要从其他数据源获取数据,只需如法炮制即可。
之后在Application中使用如下方式调用:
// Build a streaming context with a 3-second batch interval.
SparkConf sparkConf = new SparkConf();
JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
JavaStreamingContext ssc = new JavaStreamingContext(sparkContext, Durations.seconds(3));
// Receiver configuration: jdbc url, credentials, table, fields, pageSize, driver class.
Map<String, Object> source = new HashMap<>();
JavaReceiverInputDStream<String> dbStream =
        ssc.receiverStream(new DataBaseReceiver(StorageLevel.MEMORY_ONLY(), source));
// For the available StorageLevel options, see the official Spark documentation.
ps:如果运行程序的时候采取的是 Local 模式,在指定 master 的时候,需指定大于 1 的线程数(即 --master local[n],其中 n>1)。这是因为 Receiver 本身会长期独占一个线程:若只分配一个线程,Spark 只能接收数据,而没有空闲线程来处理数据。