一、现象
cdc创建任务的时候出现卡死的情况或者报错[CDC:ErrKafkaNewSaramaProducer]kafka: client has run out of available brokers to talk to (Is your cluster reachable?)
二、可能产生的原因
- kafka-version没有指定
- kafka开启sasl验证,cdc没有支持,https://github.com/pingcap/tiflow/issues/1106
- cdc版本兼容不了当前的kafka
- server.properties 中的 listeners 要配置为实际 IP,不要用 localhost,例如:listeners=PLAINTEXT://kafka_ip:9092
- cdc机器不能访问kafka的9092端口
- cdc-ctl的机器不能访问kafka的9092端口
三、排查方法
1.已经指定了kafka-version,并且kafka没有开启sasl验证,所以原因1、2排除
2.尝试了多个版本的kafka和tidb集群,也是一样的报错,原因3排除
3.listeners的配置已经调了,kafka的9092端口已经对cdc机器开了,所以原因4,5排除
4.偶然间在tidb集群部署的某一台机器上部署了kafka,这时可以正常创建任务,所以问题一定出在端口访问上。但是kafka的9092端口已经对cdc机器开放了,那么还有可能是什么呢?最终通过逐台遍历测试定位到:cdc-ctl在创建任务的时候同样需要访问kafka的9092端口。
四、最终结论
cdc同步到kafka的时候,kafka的9092端口不仅要对cdc机器开放,也要对cdc-ctl机器开放
下面的源码分析部分仅供参考,思路尚未完全理清
五、源码理解
1.源头
因为对ctl为什么需要访问kafka百思不得其解,所以想着从源码中寻找答案,看是否能找到原因
2.源码步骤理解
2.1 func newCmdCreateChangefeed:依次调用 complete、validate、run —— 对应 create changefeed 命令
2.2
2.3
2.4
2.5 NewKafkaSaramaSink 的 adminClient --用来创建一个新的kafka链接
2.6 var NewAdminClientImpl kafka.ClusterAdminClientCreator = kafka.NewSaramaAdminClient
2.7 NewSaramaAdminClient 建立kafka链接
https://github.com/pingcap/tiflow/blob/bc1c72da525844c7940ce2f33b5acd092b0ff713/pkg/cmd/cli/cli_changefeed_create.go
// newCmdCreateChangefeed creates the `cli changefeed create` command.
// NOTE(review): verbatim excerpt from pingcap/tiflow pkg/cmd/cli/cli_changefeed_create.go;
// indentation was lost when pasting.
func newCmdCreateChangefeed(f factory.Factory) *cobra.Command {
// Options shared with other changefeed subcommands (sink-uri, config, etc.).
commonChangefeedOptions := newChangefeedCommonOptions()
o := newCreateChangefeedOptions(commonChangefeedOptions)
command := &cobra.Command{
Use: "create",
Short: "Create a new replication task (changefeed)",
Args: cobra.NoArgs,
RunE: func(cmd *cobra.Command, args []string) error {
ctx := cmdcontext.GetDefaultContext()
// Step 1: complete() builds the required clients from CLI args (it
// constructs an etcd client — see the complete() excerpt elsewhere in this note).
err := o.complete(ctx, f, cmd)
if err != nil {
return err
}
// Step 2: validate() checks the options; presumably this reaches
// validateSink/sink.Validate, which is why the ctl host itself needs
// network access to the Kafka brokers — TODO confirm against full source.
err = o.validate(ctx, cmd)
if err != nil {
return err
}
// Step 3: run() actually submits the changefeed.
return o.run(ctx, cmd)
},
}
o.addFlags(command)
return command
}
// run the `cli changefeed create` command.
// NOTE(review): excerpt — the rest of run() is omitted in this note.
func (o *createChangefeedOptions) run(ctx context.Context, cmd *cobra.Command) error {
id := o.changefeedID
// Generate a random UUID as the changefeed ID when none was supplied.
if id == "" {
id = uuid.New().String()
}
// validate checks that the provided attach options are specified.
// NOTE(review): excerpt — truncated; a sink-uri is mandatory for creation.
func (o *createChangefeedOptions) validate(ctx context.Context, cmd *cobra.Command) error {
if o.commonChangefeedOptions.sinkURI == "" {
return errors.New("Creating changefeed without a sink-uri")
}
// complete adapts from the command line args to the data and client required.
// NOTE(review): excerpt — truncated; shown because it builds the etcd client.
func (o *createChangefeedOptions) complete(ctx context.Context, f factory.Factory, cmd *cobra.Command) error {
etcdClient, err := f.EtcdClient()
if err != nil {
return err
}
// validateSink will create a sink and verify that the configuration is correct.
// Key step for this investigation: per the conclusion above, for a kafka://
// sink-uri the cdc-ctl process itself ends up connecting to the brokers here.
func (o *createChangefeedOptions) validateSink(
ctx context.Context, cfg *config.ReplicaConfig, opts map[string]string,
) error {
return sink.Validate(ctx, o.commonChangefeedOptions.sinkURI, cfg, opts)
}
https://github.com/pingcap/tiflow/blob/da440ec90c4b63b07d0d6e85d579451187e5537c/pkg/cmd/factory/factory.go
// Factory defines the client-side construction factory.
// NOTE(review): excerpt from pkg/cmd/factory/factory.go — supplies the etcd/PD/KV
// clients used by complete() above.
type Factory interface {
ClientGetter
// EtcdClient returns the CDC etcd client used to talk to the cluster metadata store.
EtcdClient() (*etcd.CDCEtcdClient, error)
// PdClient returns a PD (placement driver) client.
PdClient() (pd.Client, error)
// KvStorage returns the TiKV storage handle.
KvStorage() (kv.Storage, error)
}
https://github.com/pingcap/tiflow/blob/20f4adecffcd0e3ae1c78799ca19833d9d6842f5/pkg/config/replica_config.go
// GetDefaultReplicaConfig returns the default replica config.
// A clone is returned so callers cannot mutate the shared default.
func GetDefaultReplicaConfig() *ReplicaConfig {
return defaultReplicaConfig.Clone()
}
// replicaConfig is the serialized form of a changefeed's replication settings.
// NOTE(review): excerpt — the exported ReplicaConfig wrapper around this
// unexported struct is not shown here; see the upstream file for the full picture.
type replicaConfig struct {
CaseSensitive bool `toml:"case-sensitive" json:"case-sensitive"`
EnableOldValue bool `toml:"enable-old-value" json:"enable-old-value"`
ForceReplicate bool `toml:"force-replicate" json:"force-replicate"`
CheckGCSafePoint bool `toml:"check-gc-safe-point" json:"check-gc-safe-point"`
Filter *FilterConfig `toml:"filter" json:"filter"`
Mounter *MounterConfig `toml:"mounter" json:"mounter"`
// Sink holds the sink-specific settings (see SinkConfig below in the upstream file).
Sink *SinkConfig `toml:"sink" json:"sink"`
Consistent *ConsistentConfig `toml:"consistent" json:"consistent"`
}
https://github.com/pingcap/tiflow/blob/20f4adecffcd0e3ae1c78799ca19833d9d6842f5/pkg/config/sink.go
// SinkConfig represents sink config for a changefeed
// NOTE(review): excerpt from pkg/config/sink.go.
type SinkConfig struct {
// DispatchRules controls how events are routed to topics/partitions.
DispatchRules []*DispatchRule `toml:"dispatchers" json:"dispatchers"`
// Protocol is the encoding protocol of messages written to the sink.
Protocol string `toml:"protocol" json:"protocol"`
ColumnSelectors []*ColumnSelector `toml:"column-selectors" json:"column-selectors"`
SchemaRegistry string `toml:"schema-registry" json:"schema-registry"`
}
https://github.com/pingcap/tiflow/blob/dd41f0f1b0335991b0c7afd33171c665528bd7ac/cdc/sink/sink.go
// Validate sink if given valid parameters.
// NOTE(review): excerpt — truncated; downstream it constructs a real sink via New(),
// which is how `cdc cli changefeed create` ends up dialing Kafka from the ctl host.
func Validate(ctx context.Context, sinkURI string, cfg *config.ReplicaConfig, opts map[string]string) error {
sinkFilter, err := filter.NewFilter(cfg)
if err != nil {
return err
}
// New creates a new sink with the sink-uri
func New(
ctx context.Context, changefeedID model.ChangeFeedID, sinkURIStr string,
filter *filter.Filter, config *config.ReplicaConfig, opts map[string]string,
errCh chan error,
) (Sink, error) {
// parse sinkURI as a URI
sinkURI, err := url.Parse(sinkURIStr)
if err != nil {
return nil, cerror.WrapError(cerror.ErrSinkURIInvalid, err)
}
// Dispatch on the URI scheme ("kafka", "blackhole", ...) via the registry
// populated in init() below.
if newSink, ok := sinkIniterMap[strings.ToLower(sinkURI.Scheme)]; ok {
return newSink(ctx, changefeedID, sinkURI, filter, config, opts, errCh)
}
return nil, cerror.ErrSinkURIInvalid.GenWithStack("the sink scheme (%s) is not supported", sinkURI.Scheme)
}
// init registers the built-in sink constructors by URI scheme.
// NOTE(review): excerpt — truncated (closing brace and further registrations omitted).
func init() {
// register blackhole sink
sinkIniterMap["blackhole"] = func(
ctx context.Context, changefeedID model.ChangeFeedID, sinkURI *url.URL,
filter *filter.Filter, config *config.ReplicaConfig, opts map[string]string,
errCh chan error,
) (Sink, error) {
return newBlackHoleSink(ctx), nil
}
// register kafka sink
sinkIniterMap["kafka"] = func(
ctx context.Context, changefeedID model.ChangeFeedID, sinkURI *url.URL,
filter *filter.Filter, config *config.ReplicaConfig, opts map[string]string,
errCh chan error,
) (Sink, error) {
// This is the path taken for kafka:// sink URIs — it leads to the
// sarama admin client below, i.e. a direct broker connection.
return mq.NewKafkaSaramaSink(ctx, sinkURI, filter, config, opts, errCh)
}
sinkIniterMap["kafka+ssl"] = sinkIniterMap["kafka"]
https://github.com/pingcap/tiflow/blob/dd41f0f1b0335991b0c7afd33171c665528bd7ac/pkg/filter/filter.go
// NewFilter creates a filter.
// NOTE(review): excerpt — truncated; only the rule-verification step is shown.
func NewFilter(cfg *config.ReplicaConfig) (*Filter, error) {
f, err := VerifyRules(cfg)
if err != nil {
return nil, cerror.WrapError(cerror.ErrFilterRuleInvalid, err)
}
https://github.com/pingcap/tiflow/blob/8e8fddb046754f148d76a3b263e33c2d06d38e44/cdc/owner/changefeed.go
// changefeed (owner side) — excerpt showing only the sink-factory field;
// the full struct in cdc/owner/changefeed.go has many more fields.
type changefeed struct {
// newSink lazily constructs the DDL sink for this changefeed.
newSink func() DDLSink
}
https://github.com/pingcap/tiflow/blob/0b7969deea495ff6462b9a980a5717cca2fbcec5/cdc/sink/mq/mq.go
// NewKafkaSaramaSink creates a new Kafka mqSink.
// NOTE(review): excerpt — truncated; baseConfig/saramaConfig are built in omitted
// lines, and this signature is from a different commit than the call site quoted
// earlier (which also passes filter/opts) — verify against the exact version in use.
func NewKafkaSaramaSink(ctx context.Context, sinkURI *url.URL,
replicaConfig *config.ReplicaConfig,
errCh chan error,
) (*mqSink, error) {
// The topic is the sink URI path with surrounding '/' stripped.
topic := strings.TrimFunc(sinkURI.Path, func(r rune) bool {
return r == '/'
})
if topic == "" {
return nil, cerror.ErrKafkaInvalidConfig.GenWithStack("no topic is specified in sink-uri")
}
// Connects to the brokers here; a failure surfaces as ErrKafkaNewSaramaProducer —
// the exact error observed in section 一 of this note.
adminClient, err := kafka.NewAdminClientImpl(baseConfig.BrokerEndpoints, saramaConfig)
if err != nil {
return nil, cerror.WrapError(cerror.ErrKafkaNewSaramaProducer, err)
}
https://github.com/pingcap/tiflow/blob/dd41f0f1b0335991b0c7afd33171c665528bd7ac/cdc/sink/mq/producer/kafka/kafka.go
// NewAdminClientImpl specifies the build method for the admin client.
// A package-level variable so tests can swap in a mock constructor.
var NewAdminClientImpl kafka.ClusterAdminClientCreator = kafka.NewSaramaAdminClient
https://github.com/pingcap/tiflow/blob/7fa1f2fb33e3685c3a45d27e5786f3075ae9fa41/pkg/kafka/cluster_admin_client.go
// NewSaramaAdminClient constructs a ClusterAdminClient with sarama.
// sarama.NewClusterAdmin opens real TCP connections to the listed brokers,
// which is the concrete reason cdc-ctl needs the 9092 port open to it.
func NewSaramaAdminClient(addrs []string, conf *sarama.Config) (ClusterAdminClient, error) {
return sarama.NewClusterAdmin(addrs, conf)
}