1.这篇文章记录我用redis管理kafka的Consumer的offsets,以sparkStreaming作为Consumer
版本:kafka 0-10,sparkStreaming 2.2.0,jedis 2.9.0,pom文件放在最后
2.主要的代码就两个
一个是Kafka010Demo03,
另一个就是RedisUtilsDemo。
然后其他就是连接redis读取配置文件的代码了。
我的是redis集群 3台机子6个端口
kafka三台节点
3.代码
ConsumerStrategies.Subscribe传的是集合topics
还有个地方getOffsetFromRedis传参的时候传的是单个topic,最好也能优化成传topics,有空改改
package Kafka010
import Kafka010.Utils.{MyKafkaUtils, RedisUtilsDemo}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Created by Shi shuai RollerQing on 2019/12/24 19:47
 * Kafka的API 0-10版本的Consumer测试
 */
//TODO : Kafka的API 0-10版本的Consumer测试
object Kafka010Demo03 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName(s"${this.getClass.getCanonicalName}")
    val ssc = new StreamingContext(conf, Seconds(5))

    val groupID = "SparkKafka010"
    val topics = List("topicB")
    // Auto-commit is disabled ("false"): offsets are managed manually in Redis below.
    val kafkaParams: Map[String, String] = MyKafkaUtils.getKafkaConsumerParams(groupID, "false")

    // Restore offsets from external storage (Redis).
    // FIX: use the topic from `topics` instead of repeating the "topicB" literal,
    // so the subscription and the offset lookup cannot drift apart.
    // NOTE(review): getOffsetFromRedis still takes a single topic, so only the
    // first topic's offsets are restored — extend it if more topics are added.
    val offsets: Map[TopicPartition, Long] = RedisUtilsDemo.getOffsetFromRedis(topics.head, groupID)

    val ds: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      // Subscribe accepts (topics, params, offsets) — exactly the types built above.
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsets)
    )

    ds.foreachRDD(rdd => {
      // Capture offset ranges BEFORE any transformation, while the RDD is still a KafkaRDD.
      val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      // Process the batch data.
      if (!rdd.isEmpty())
        println(rdd.count())

      // Log the consumed ranges, then persist them only after processing
      // (at-least-once semantics).
      ranges.foreach(offset => {
        println(s"${offset.partition}, ${offset.fromOffset}, ${offset.untilOffset}")
      })
      RedisUtilsDemo.saveOffsetToRedis(ranges, groupID)
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
// "SparkKafka010".hashCode % 50 = 48
// kafka 新版API中,自动管理offset,缺省情况下 offset 5s 提交
注意存入redis的offsets的格式
key(kafka:topic:groupid)
value(partition:offset;partition:offset)
package Kafka010.Utils
import cn.bigdata.antispider.common.util.jedis.JedisConnectionUtil
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
/**
* Created by Shi shuai RollerQing on 2019/12/25 21:37
* 最基础的
* getOffsetFromRedis => topic 集合类型;value => json 串; 没有考虑 key 不存在的情况
*
* 校验:kafka的消费者API
* 1、从kafka中获取offset合法值(partition: minoffset, maxoffset)(Kafka 消费者API => 010)【高】
* /2、校验
*/
object RedisUtilsDemo {
  private val jedis = JedisConnectionUtil.getJedisCluster

  /**
   * Read the saved offsets for one topic/group from Redis.
   * Key format:   kafka:<topic>:<groupID>
   * Value format: "partition:offset;partition:offset"
   *
   * FIX: on the very first run the key does not exist and `jedis.get` returns
   * null, which previously caused a NullPointerException on `.split`. We now
   * return an empty map, letting Kafka fall back to its `auto.offset.reset`
   * policy for the initial position.
   */
  def getOffsetFromRedis(topic: String, groupID: String): Map[TopicPartition, Long] = {
    val key = s"kafka:$topic:$groupID"
    val offsetStr: String = jedis.get(key)
    if (offsetStr == null || offsetStr.isEmpty) {
      // Nothing stored yet — first run for this topic/group.
      Map.empty[TopicPartition, Long]
    } else {
      offsetStr.split(";").map { str =>
        val fields = str.split(":")
        val partition: Int = fields.head.toInt
        val offset: Long = fields.last.toLong
        new TopicPartition(topic, partition) -> offset
      }.toMap
    }
  }

  /**
   * Persist the until-offsets of the consumed ranges to Redis, one key per topic.
   * Key format:   kafka:<topic>:<groupID>
   * Value format: "partition:offset;partition:offset"
   */
  def saveOffsetToRedis(ranges: Array[OffsetRange], groupID: String): Unit = {
    ranges.map(r => (r.topic, (r.partition, r.untilOffset)))
      .groupBy(_._1) // group by topic
      .foreach { case (topic, buffer) => // buffer: Array[(topic, (partition, untilOffset))]
        val key = s"kafka:$topic:$groupID"
        val value = buffer.map { case (_, (partition, untilOffset)) => s"$partition:$untilOffset" }.mkString(";")
        jedis.set(key, value)
      }
  }
}
注意:
第一次运行应该会抛NullPointerException空指针错误
因为第一次去redis读取offsets时,压根就没有相应记录。
所以如果要测试的话,要么根据格式手动存一下,
要么就跟Kafka0-10版本之02手动设置偏移量去读取kafka数据,然后将saveOffsetToRedis写上,保存消费后的offsets进redis再重新运行上面的demo
完了应该可以在redis找到相应记录
这样再次运行Kafka010Demo03
redis也变了
下面是pom文件和其他配置的代码文件
package cn.bigdata.antispider.common.util.jedis;
import redis.clients.jedis.*;
import java.io.IOException;
import java.util.*;
public class JedisConnectionUtil {

    /** Singleton jedis sentinel connection pool. */
    private static JedisSentinelPool jedisSentinelPool = null;

    /** Singleton jedis cluster connection (JedisCluster is pool-backed internally). */
    private static JedisCluster jedisCluster = null;

    /**
     * Read a boolean option from the properties: true unless the value is
     * literally "false" (matching the original intent of each flag).
     * BUG FIX: the original compared Strings with == (reference equality), so
     * every flag always evaluated to true regardless of the config file.
     */
    private static boolean boolProp(Properties prop, String key) {
        return !"false".equalsIgnoreCase(prop.getProperty(key));
    }

    /** Build the shared connection-pool configuration from the properties file. */
    private static JedisPoolConfig buildPoolConfig(Properties prop) {
        JedisPoolConfig config = new JedisPoolConfig();
        // maximum number of connections
        config.setMaxTotal(Integer.parseInt(prop.getProperty("maxTotal")));
        // maximum blocking time (milliseconds) when borrowing a connection
        config.setMaxWaitMillis(Integer.parseInt(prop.getProperty("maxWaitMillis")));
        // maximum / minimum idle connections
        config.setMaxIdle(Integer.parseInt(prop.getProperty("maxIdle")));
        config.setMinIdle(Integer.parseInt(prop.getProperty("minIdle")));
        // validate connections when borrowed
        config.setTestOnBorrow(boolProp(prop, "testOnBorrow"));
        // block (instead of failing) when the pool is exhausted
        config.setBlockWhenExhausted(boolProp(prop, "blockWhenExhausted"));
        // last-in-first-out connection reuse
        config.setLifo(boolProp(prop, "lifo"));
        // validate connections when returned
        config.setTestOnReturn(boolProp(prop, "testOnReturn"));
        // validate idle connections in the background
        config.setTestWhileIdle(boolProp(prop, "testWhileIdle"));
        // interval between idle-connection checks; effective when testWhileIdle=true
        config.setTimeBetweenEvictionRunsMillis(Integer.parseInt(prop.getProperty("timeBetweenEvictionRunsMillis")));
        return config;
    }

    /**
     * Create the JedisCluster. JedisCluster does not need a separate pool —
     * it is already implemented on top of one.
     */
    private static void createJedisCluster() {
        // load configuration
        Properties prop = PropertiesUtil.getProperties();
        String[] serverArray = prop.getProperty("servers").split(",");
        int connectionTimeout = Integer.parseInt(prop.getProperty("connectionTimeout"));
        int soTimeout = Integer.parseInt(prop.getProperty("soTimeout"));
        int maxAttempts = Integer.parseInt(prop.getProperty("maxAttempts"));

        JedisPoolConfig config = buildPoolConfig(prop);

        Set<HostAndPort> nodes = new HashSet<HostAndPort>();
        for (String ipPort : serverArray) {
            String[] ipPortPair = ipPort.split(":");
            nodes.add(new HostAndPort(ipPortPair[0].trim(), Integer.parseInt(ipPortPair[1].trim())));
        }
        // Note: keep the timeouts generous — the client has a retry-on-timeout mechanism.
        jedisCluster = new JedisCluster(nodes, connectionTimeout, soTimeout, maxAttempts, config);
    }

    /**
     * Create the sentinel connection pool.
     */
    private static void createJedisSentinelPool() {
        // load configuration
        Properties prop = PropertiesUtil.getProperties();
        JedisPoolConfig config = buildPoolConfig(prop);

        // String password = prop.getProperty("PASSWORD");
        String masterName = prop.getProperty("MASTER");
        Set<String> sentinels = new HashSet<String>();
        sentinels.add(prop.getProperty("SENTINEL_1"));
        sentinels.add(prop.getProperty("SENTINEL_2"));
        sentinels.add(prop.getProperty("SENTINEL_3"));
        jedisSentinelPool = new JedisSentinelPool(masterName, sentinels, config);
    }

    /** Synchronized lazy initialization of the cluster connection. */
    private static synchronized void JedisClusterInit() {
        if (jedisCluster == null)
            createJedisCluster();
    }

    /** Synchronized lazy initialization of the sentinel pool. */
    private static synchronized void sentinelPoolInit() {
        if (jedisSentinelPool == null)
            createJedisSentinelPool();
    }

    /**
     * Get a Jedis connection from the sentinel pool.
     * @return a pooled Jedis instance; release it with returnRes/returnBrokenRes
     */
    public static Jedis getJedis() {
        if (jedisSentinelPool == null)
            sentinelPoolInit();
        return jedisSentinelPool.getResource();
    }

    /**
     * Get the shared JedisCluster instance.
     * @return the singleton cluster client (do not close while still in use)
     */
    public static JedisCluster getJedisCluster() {
        if (jedisCluster == null)
            JedisClusterInit();
        return jedisCluster;
    }

    /**
     * Return a connection to the sentinel pool.
     * @param jedis the connection to release
     */
    public static void returnRes(Jedis jedis) {
        jedisSentinelPool.returnResource(jedis);
    }

    /**
     * Discard a broken connection.
     * @param jedis the connection to destroy
     */
    public static void returnBrokenRes(Jedis jedis) {
        jedisSentinelPool.returnBrokenResource(jedis);
    }

    /**
     * Collect all keys matching the pattern across every cluster node.
     * @param jc      the cluster client
     * @param pattern redis KEYS pattern, e.g. "*ChangeFlag"
     * @return sorted set of matching keys
     */
    public static TreeSet<String> keys(JedisCluster jc, String pattern) {
        TreeSet<String> keys = new TreeSet<>();
        Map<String, JedisPool> clusterNodes = jc.getClusterNodes();
        for (String k : clusterNodes.keySet()) {
            JedisPool jp = clusterNodes.get(k);
            Jedis jedis = jp.getResource();
            try {
                keys.addAll(jedis.keys(pattern));
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // Always release the per-node connection.
                jedis.close();
            }
        }
        return keys;
    }

    /**
     * Smoke test for the cluster connection.
     * BUG FIX: the original called jedisCluster.close() and then kept calling
     * get() on the closed client inside the loop (contradicting its own
     * comment); the client is now closed only after all reads complete.
     */
    public static void main(String[] args) throws IOException {
        JedisCluster jedisCluster = getJedisCluster();
        // jedisCluster.set("BlackChangeFlag", "false");
        // jedisCluster.set("ProcessChangeFlag", "false");
        // jedisCluster.set("FilterChangeFlag", "false");
        // jedisCluster.set("AnalyzeRuleChangeFlag", "false");
        // jedisCluster.set("QueryCriticalPagesChangeFlag", "false");
        // jedisCluster.set("BookCriticalPagesChangeFlag", "false");
        // jedisCluster.set("ClassifyRuleChangeFlag", "false");
        jedisCluster.set("ExtractDataChangeFlag", "true");
        // jedisCluster.set("dang", "no");
        TreeSet<String> keySets = keys(jedisCluster, "*ChangeFlag");
        for (String k : keySets) {
            System.out.println(k + " ============ " + jedisCluster.get(k));
        }
        // Close only after all use is finished.
        jedisCluster.close();
    }
}
package cn.bigdata.antispider.common.util.jedis;
import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
/* Utility class for loading configuration files and reading keyed values. */
public class PropertiesUtil {

    private static ResourceLoader loader = ResourceLoader.getInstance();

    // Cache of resolved key -> value pairs.
    // NOTE(review): keys from different property files share this single cache,
    // so identical key names in two files would collide — verify callers only
    // use unique key names across files.
    private static ConcurrentMap<String, String> configMap = new ConcurrentHashMap<String, String>();

    private static final String DEFAULT_CONFIG_FILE = "jedisConfig.properties";

    /**
     * Look up {@code key} in the named properties file, falling back to
     * {@code default_key} when the primary key is absent. The resolved value
     * is cached under {@code key}.
     *
     * FIX: the original stored the loaded Properties in a shared mutable
     * static field ({@code prop}) that was reassigned by several static
     * methods — a data race under concurrent use. A local variable is used
     * instead; the public interface is unchanged.
     *
     * @param key         primary key
     * @param default_key fallback key used when the primary key has no value
     * @param propName    properties file name
     * @return the resolved value, or null when neither key is present
     */
    public static String getStringByKey(String key, String default_key, String propName) {
        Properties prop = getProperties(propName);
        key = key.trim();
        if (!configMap.containsKey(key)) {
            if (prop.getProperty(key) != null) {
                configMap.put(key, prop.getProperty(key));
            } else if (prop.getProperty(default_key) != null) {
                configMap.put(key, prop.getProperty(default_key));
            }
        }
        return configMap.get(key);
    }

    /**
     * Look up {@code key} in the named properties file (value cached).
     * @param key      key to resolve
     * @param propName properties file name
     * @return the resolved value, or null when the key is absent
     */
    public static String getStringByKey(String key, String propName) {
        Properties prop = getProperties(propName);
        key = key.trim();
        if (!configMap.containsKey(key)) {
            if (prop.getProperty(key) != null) {
                configMap.put(key, prop.getProperty(key));
            }
        }
        return configMap.get(key);
    }

    /**
     * Look up {@code key} in the default configuration file.
     */
    public static String getStringByKey(String key) {
        return getStringByKey(key, DEFAULT_CONFIG_FILE);
    }

    /**
     * Load the default configuration file.
     * @return the Properties object, or null when loading fails
     *         (null return kept for backward compatibility with callers)
     */
    public static Properties getProperties() {
        try {
            return loader.getPropFromProperties(DEFAULT_CONFIG_FILE);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Load the given configuration file.
     * @param propName properties file name
     * @return the Properties object
     * @throws RuntimeException wrapping any loading failure
     */
    public static Properties getProperties(String propName) {
        try {
            return loader.getPropFromProperties(propName);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>cn.bigdata.spider</groupId>
<artifactId>spider-producer</artifactId>
<packaging>jar</packaging>
<version>1.0-SNAPSHOT</version>
<modelVersion>4.0.0</modelVersion>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<jdk.version>1.8</jdk.version>
<scala.binary.version>2.11</scala.binary.version>
<scala.version>2.11.8</scala.version>
<spark.version>2.1.3</spark.version>
<kafka.version>1.1.0</kafka.version>
<hadoop.version>2.6.2</hadoop.version>
<mysql.driver.version>5.1.35</mysql.driver.version>
<jedis.version>2.9.0</jedis.version>
<fastjson.version>1.2.4</fastjson.version>
<junit.version>4.12</junit.version>
<c3p0.version>0.9.1.2</c3p0.version>
<commons-lang.version>2.6</commons-lang.version>
</properties>
<dependencies>
<!-- 优化url的解析(暂未使用) -->
<dependency>
<groupId>io.lemonlabs</groupId>
<artifactId>scala-uri_2.11</artifactId>
<version>1.4.5</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- spark -->
<!-- spark-core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- spark-streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- spark-streaming-kafka-0-10 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- commons-lang -->
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>${commons-lang.version}</version>
</dependency>
<!-- logging -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
<version>2.11.0</version>
</dependency>
<!--注意:依赖不在Hadoop 环境中,集群运行需要上传依赖包并指定-->
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- mysql driver -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>${mysql.driver.version}</version>
</dependency>
<!--c3p0 数据库连接池-->
<dependency>
<groupId>c3p0</groupId>
<artifactId>c3p0</artifactId>
<version>${c3p0.version}</version>
</dependency>
<!--redis 缓存-->
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>${jedis.version}</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>${fastjson.version}</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.4</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
</dependency>
</dependencies>
</project>
jedisConfig.properties
#jedisCluster连接配置
#redisCluster实例地址
servers = 192.168.37.111:7001,192.168.37.111:7002,192.168.37.112:7003,192.168.37.112:7004,192.168.37.113:7005,192.168.37.113:7006
#连接redisCluster实例超时时间
connectionTimeout = 300000
#读写redisCluster实例超时时间
soTimeout = 300000
#连接redisCluster实例重试次数
maxAttempts = 6
#jedis连接池配置
#连接池最大连接数
maxTotal = 200
#获取连接池连接最大等待时间(毫秒)
maxWaitMillis = 15000
#最大空闲连接数
maxIdle = 50
#最小空闲连接数
minIdle = 10
#对拿到的connection进行validateObject校验
testOnBorrow = false
#从连接池获取不到连接则阻塞
blockWhenExhausted = true
#连接对象后进先出
lifo = true
#归还连接到池时测试连接
testOnReturn = false
#测试连接池空闲的连接
testWhileIdle = true
#测试连接池空闲连接的时间间隔,testWhileIdle=true时生效
timeBetweenEvictionRunsMillis = 30000
# 监控数据-键标识(分别是数据处理监控,查询监控,预订监控)
# 实际上是某个key的前缀
cluster.key.monitor.dataProcess = CSANTI_MONITOR_DP
cluster.key.monitor.linkProcess = CSANTI_MONITOR_LP
cluster.key.monitor.query = CSANTI_MONITOR_QUERY
cluster.key.monitor.book = CSANTI_MONITOR_BOOK
#监控数据有效期-单位秒 设置24小时过期
cluster.exptime.monitor = 86400
#反爬黑名单数据-键标识
cluster.key.anti_black_list = CSANTI_ANTI_BLACK
#反爬黑名单数据有效期-单位秒
cluster.exptime.anti_black_list = 3600
#反占座黑名单数据-键标识
cluster.key.ocp_black_list = CSANTI_OCP_BLACK
#反占座黑名单数据有效期-单位秒
cluster.exptime.ocp_black_list = 3600
package Kafka010.Utils
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
/**
* Created by Shi shuai RollerQing on 2019/12/24 19:20
*/
object MyKafkaUtils {

  /**
   * Build consumer parameters for the Kafka 0-10 direct stream using the
   * ConsumerConfig constants (avoids typos in the raw config keys).
   *
   * @param grouid     consumer group id (parameter name kept for source compatibility)
   * @param autoCommit "true"/"false" — whether Kafka auto-commits offsets
   */
  def getKafkaConsumerParams(grouid: String = "SparkStreaming010", autoCommit: String = "true"): Map[String, String] = {
    Map[String, String](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> autoCommit,
      //ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest", // earliest / none / latest
      ConsumerConfig.GROUP_ID_CONFIG -> grouid,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer].getName,
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer].getName
    )
  }

  /**
   * Plain-string variant as shown in the official docs.
   *
   * BUG FIX: the original wrote `Map[String, Object] { ... }` with a *block*
   * argument and newline-separated pairs. Each `a -> b` line was a discarded
   * statement and only the last pair ("enable.auto.commit" -> true) ended up
   * in the map. A normal comma-separated argument list keeps all entries.
   *
   * @return the full consumer configuration map
   */
  def getKafkaConsumerParams2(): Map[String, Object] = {
    Map[String, Object](
      "bootstrap.servers" -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "auto.offset.reset" -> "latest",
      "group.id" -> "topicA",
      "enable.auto.commit" -> (true: java.lang.Boolean)
    )
  }

  def main(args: Array[String]): Unit = {
    println(classOf[StringDeserializer].getName)  // org.apache.kafka.common.serialization.StringDeserializer
    println(classOf[StringDeserializer].getClass) // class java.lang.Class
    println(classOf[StringDeserializer])          // class org.apache.kafka.common.serialization.StringDeserializer
  }
}