open-falcon源码阅读（四）——transfer源码阅读

最新推荐文章于 2021-08-10 11:31:34 发布

阿团团

最新推荐文章于 2021-08-10 11:31:34 发布

阅读量895

点赞数

分类专栏： Open-falcon学习文章标签： Open-falcon Go 监控集群监控

本文链接：https://blog.csdn.net/jiangxuege/article/details/99301826

版权

Open-falcon学习专栏收录该内容

17 篇文章 0 订阅

订阅专栏

本人水平：参加工作一年，刚看完一本《go实战》的菜鸡

代码版本：2019年1月15日使用go get github.com/open-falcon/falcon-plus拉下来的代码

1 概览

1.1 目录结构

g：全局共享信息的存放，包括配置信息等
http：http服务
proc：自监控相关
receiver：接收数据
sender：发送数据

1.2 main函数

// main parses the command-line flags, loads the configuration file and starts
// every transfer service, then parks the main goroutine forever.
func main() {
	cfg := flag.String("c", "cfg.json", "configuration file")
	version := flag.Bool("v", false, "show version")
	versionGit := flag.Bool("vg", false, "show version")
	flag.Parse()

	// Version flags short-circuit everything else; -v is checked before -vg.
	switch {
	case *version:
		fmt.Println(g.VERSION)
		os.Exit(0)
	case *versionGit:
		fmt.Println(g.VERSION, g.COMMIT)
		os.Exit(0)
	}

	// Load the configuration before any service reads it.
	g.ParseConfig(*cfg)

	// Self-monitoring first, then the sending side, then the receiving side.
	proc.Start()
	sender.Start()
	receiver.Start()

	// HTTP service.
	http.Start()

	// An empty select blocks forever and keeps the process alive.
	select {}
}

2 服务

2.1 服务启动

启动就打印一下日志

// Start marks the proc module as started; all it does is write one log line.
func Start() {
	// Emit the startup log entry.
	log.Println("proc.Start, ok")
}

另外这个文件里还定义了所有的自监控指标，没有注释的指标，其实并没有用到，只是初始化了一下，官网也没提到这些指标

// Self-monitoring counters of transfer.
// Descriptions marked "author's note" are carried over from the original
// annotations; the increments are not visible in the code shown here.
var (
	// Receive counters.
	RecvCnt       = nproc.NewSCounterQps("RecvCnt")  // received metric items (author's note)
	RpcRecvCnt    = nproc.NewSCounterQps("RpcRecvCnt")  // metric items reported over rpc (author's note)
	HttpRecvCnt   = nproc.NewSCounterQps("HttpRecvCnt")  // metric items reported over http (author's note)
	SocketRecvCnt = nproc.NewSCounterQps("SocketRecvCnt")

	SendToJudgeCnt = nproc.NewSCounterQps("SendToJudgeCnt")  // items sent to judge successfully (incremented by forward2JudgeTask)
	SendToTsdbCnt  = nproc.NewSCounterQps("SendToTsdbCnt")   // items sent to tsdb successfully (incremented by forward2TsdbTask)
	SendToGraphCnt = nproc.NewSCounterQps("SendToGraphCnt")  // presumably the graph counterpart of the two above — confirm

	SendToJudgeDropCnt = nproc.NewSCounterQps("SendToJudgeDropCnt")    // items that could not be queued for judge (author's note)
	SendToTsdbDropCnt  = nproc.NewSCounterQps("SendToTsdbDropCnt")   // items that could not be queued for tsdb (author's note)
	SendToGraphDropCnt = nproc.NewSCounterQps("SendToGraphDropCnt")  // items that could not be queued for graph (author's note)

	SendToJudgeFailCnt = nproc.NewSCounterQps("SendToJudgeFailCnt")    // items whose send to judge failed after all retries
	SendToTsdbFailCnt  = nproc.NewSCounterQps("SendToTsdbFailCnt")    // items whose send to tsdb failed after all retries
	SendToGraphFailCnt = nproc.NewSCounterQps("SendToGraphFailCnt")    // items whose send to graph failed (author's note)

	// Send-queue lengths, refreshed periodically.
	JudgeQueuesCnt = nproc.NewSCounterBase("JudgeSendCacheCnt")  // total length of the judge send queues
	TsdbQueuesCnt  = nproc.NewSCounterBase("TsdbSendCacheCnt")  // tsdb send queue length (author's note)
	GraphQueuesCnt = nproc.NewSCounterBase("GraphSendCacheCnt")  // total length of the graph send queues

	// HTTP request counters.
	HistoryRequestCnt = nproc.NewSCounterQps("HistoryRequestCnt")
	InfoRequestCnt    = nproc.NewSCounterQps("InfoRequestCnt")
	LastRequestCnt    = nproc.NewSCounterQps("LastRequestCnt")
	LastRawRequestCnt = nproc.NewSCounterQps("LastRawRequestCnt")

	// HTTP response counters.
	HistoryResponseCounterCnt = nproc.NewSCounterQps("HistoryResponseCounterCnt")
	HistoryResponseItemCnt    = nproc.NewSCounterQps("HistoryResponseItemCnt")
	LastRequestItemCnt        = nproc.NewSCounterQps("LastRequestItemCnt")
	LastRawRequestItemCnt     = nproc.NewSCounterQps("LastRawRequestItemCnt")
)

监控指标实现代码如下，分两种，一种就是普通的值，另一种是QPS

const (
	DefaultOtherMaxSize      = 100  // maximum number of keys PutOther accepts in a counter's Other map
	DefaultSCounterQpsPeriod = 1  // SCounterQps.Get recomputes QPS only when more than this many seconds have passed
)

// SCounterBase is a plain (non-QPS) counter; its methods take the embedded
// RWMutex before touching the fields.
type SCounterBase struct {
	sync.RWMutex  // read-write lock
	Name  string  // metric name
	Cnt   int64  // current value
	Time  string  // ts formatted with ntime.FormatTs
	ts    int64  // Unix timestamp, set at creation and by SetCnt
	Other map[string]interface{}  // free-form remarks, filled by PutOther
}

// NewSCounterBase returns a zero-valued counter called name, stamped with the
// current Unix time and carrying an empty remarks map.
func NewSCounterBase(name string) *SCounterBase {
	now := time.Now().Unix()
	counter := &SCounterBase{
		Name:  name,
		Cnt:   0,
		Time:  ntime.FormatTs(now),
		ts:    now,
		Other: make(map[string]interface{}),
	}
	return counter
}

// Get returns a snapshot of the counter taken under the read lock. The
// snapshot has its own Other map, so callers can use it without locking.
func (c *SCounterBase) Get() *SCounterBase {
	c.RLock()
	defer c.RUnlock()

	snapshot := new(SCounterBase)
	snapshot.Name = c.Name
	snapshot.Cnt = c.Cnt
	snapshot.Time = c.Time
	snapshot.ts = c.ts
	snapshot.Other = deepCopyMap(c.Other)
	return snapshot
}

// SetCnt overwrites the counter value and refreshes both timestamp fields.
func (c *SCounterBase) SetCnt(cnt int64) {
	c.Lock()
	defer c.Unlock()

	c.Cnt = cnt
	// Record when the value was last set.
	now := time.Now().Unix()
	c.ts = now
	c.Time = ntime.FormatTs(now)
}

// Incr adds one to the counter.
func (c *SCounterBase) Incr() {
	c.IncrBy(1)
}

// IncrBy adds incr to the counter under the write lock. The timestamp fields
// are left untouched.
func (c *SCounterBase) IncrBy(incr int64) {
	c.Lock()
	defer c.Unlock()
	c.Cnt += incr
}

// PutOther stores value under key in the remarks map. Existing keys are always
// overwritten; a new key is accepted only while the map holds fewer than
// DefaultOtherMaxSize entries. It reports whether the value was stored.
func (c *SCounterBase) PutOther(key string, value interface{}) bool {
	c.Lock()
	defer c.Unlock()

	// Reject only brand-new keys once the map is full.
	if _, known := c.Other[key]; !known && len(c.Other) >= DefaultOtherMaxSize {
		return false
	}
	c.Other[key] = value
	return true
}

// SCounterQps is a counter that also reports a rate: Get derives Qps from the
// change in Cnt since the previous computation.
type SCounterQps struct {
	sync.RWMutex  // read-write lock
	Name    string  // metric name
	Cnt     int64  // accumulated value
	Qps     int64  // most recently computed rate per second
	Time    string  // ts formatted with ntime.FormatTs
	ts      int64  // Unix timestamp of the latest Get
	lastTs  int64  // Unix timestamp of the last QPS computation
	lastCnt int64  // value of Cnt at the last QPS computation
	Other   map[string]interface{}  // free-form remarks, filled by PutOther
}

// NewSCounterQps returns a zero-valued QPS counter called name. Both the
// current and the "last computed" timestamps start at the creation time.
func NewSCounterQps(name string) *SCounterQps {
	now := time.Now().Unix()
	counter := &SCounterQps{
		Name:    name,
		Cnt:     0,
		Qps:     0,
		Time:    ntime.FormatTs(now),
		ts:      now,
		lastTs:  now,
		lastCnt: 0,
		Other:   make(map[string]interface{}),
	}
	return counter
}

// Get refreshes the timestamp, recomputes Qps when more than
// DefaultSCounterQpsPeriod seconds have passed since the last computation, and
// returns a snapshot of the counter. It takes the write lock because it
// mutates the receiver.
func (q *SCounterQps) Get() *SCounterQps {
	q.Lock()
	defer q.Unlock()

	now := time.Now().Unix()
	q.ts = now
	q.Time = ntime.FormatTs(now)

	// Within the minimum period the previously computed Qps is reused.
	if elapsed := now - q.lastTs; elapsed > DefaultSCounterQpsPeriod {
		q.Qps = (q.Cnt - q.lastCnt) / elapsed
		q.lastTs = now
		q.lastCnt = q.Cnt
	}

	snapshot := new(SCounterQps)
	snapshot.Name = q.Name
	snapshot.Cnt = q.Cnt
	snapshot.Qps = q.Qps
	snapshot.Time = q.Time
	snapshot.ts = q.ts
	snapshot.lastTs = q.lastTs
	snapshot.lastCnt = q.lastCnt
	snapshot.Other = deepCopyMap(q.Other)
	return snapshot
}

// Incr adds one to the counter.
func (q *SCounterQps) Incr() {
	q.IncrBy(1)
}

// IncrBy adds incr to the counter under the write lock.
func (q *SCounterQps) IncrBy(incr int64) {
	q.Lock()
	defer q.Unlock()
	q.incrBy(incr)
}

// PutOther stores value under key in the remarks map. Existing keys are always
// overwritten; a new key is accepted only while the map holds fewer than
// DefaultOtherMaxSize entries. It reports whether the value was stored.
func (q *SCounterQps) PutOther(key string, value interface{}) bool {
	q.Lock()
	defer q.Unlock()

	// Reject only brand-new keys once the map is full.
	if _, known := q.Other[key]; !known && len(q.Other) >= DefaultOtherMaxSize {
		return false
	}
	q.Other[key] = value
	return true
}

// incrBy adds incr to Cnt without locking; the caller must hold the write lock.
func (q *SCounterQps) incrBy(incr int64) {
	q.Cnt = q.Cnt + incr
}

// deepCopyMap returns a new map holding the same key/value pairs as src.
// Despite the name the copy is shallow: the values themselves are not cloned.
// A nil src yields an empty, non-nil map.
func deepCopyMap(src map[string]interface{}) map[string]interface{} {
	clone := make(map[string]interface{}, len(src))
	for k := range src {
		clone[k] = src[k]
	}
	return clone
}

以及一个数据追踪缓存和一个数据过滤器

// Trace cache for received data; the second argument is the maximum number of
// samples kept (see NewDataTrace).
var (
	RecvDataTrace = nproc.NewDataTrace("RecvDataTrace", 3)
)

// Filter for received data; the second argument is the maximum number of
// samples kept (see NewDataFilter).
var (
	RecvDataFilter = nproc.NewDataFilter("RecvDataFilter", 5)
)

数据追踪缓存，在transfer向graph发送数据之前，会检查这个数据是不是设置的追踪的数据（通过api设置），如果是，会把最新的数据插入缓存中，缓存的实现代码如下

// DataTrace keeps the most recent samples of one traced key.
type DataTrace struct {
	sync.RWMutex  // read-write lock
	MaxSize int  // maximum number of samples kept in L
	Name    string  // name of this trace cache
	PK      string  // key currently being traced
	L       *list.List  // cached samples, newest at the front
}

// NewDataTrace returns an empty trace cache called name that keeps at most
// maxSize samples. No key is traced until SetPK is called.
func NewDataTrace(name string, maxSize int) *DataTrace {
	trace := &DataTrace{
		Name:    name,
		MaxSize: maxSize,
		L:       list.New(),
	}
	return trace
}

// SetPK selects the key to trace. Switching to a different key discards the
// samples collected for the previous one.
func (d *DataTrace) SetPK(pk string) {
	d.Lock()
	defer d.Unlock()

	if d.PK == pk {
		// Same key as before: keep the cached samples.
		return
	}
	d.L = list.New()
	d.PK = pk
}

// Trace records v as the newest sample when pk equals the traced key.
//
// The key is first compared under the read lock so that untraced data never
// takes the write lock. It is compared again under the write lock: SetPK may
// change the key and reset the list between the two lock acquisitions, and
// without the second check a sample for the old key would be stored in the
// freshly reset list.
func (d *DataTrace) Trace(pk string, v interface{}) {
	d.RLock()
	traced := d.PK == pk
	d.RUnlock()
	if !traced {
		return
	}

	d.Lock()
	defer d.Unlock()
	if d.PK != pk {
		// The traced key changed while we were waiting for the write lock.
		return
	}

	d.L.PushFront(v)
	// Drop the oldest sample once the cache grows past MaxSize.
	if d.L.Len() > d.MaxSize {
		d.L.Remove(d.L.Back())
	}
}

// GetAllTraced returns the cached entries, newest first. The result is never
// nil.
//
// NOTE(review): each entry is the *list.Element itself, not its Value. This
// may be unintended, but callers that serialise the result see that shape, so
// it is preserved here.
func (d *DataTrace) GetAllTraced() []interface{} {
	d.RLock()
	defer d.RUnlock()

	entries := make([]interface{}, 0, d.L.Len())
	for node := d.L.Front(); node != nil; node = node.Next() {
		entries = append(entries, node)
	}
	return entries
}

过滤器也是类似的作用，配置好阈值和比较符后，在向graph发送数据前，会判断是否符合阈值，符合阈值的数值会保存在缓存中，阈值等也是通过api设置的，过滤器实现代码如下

// DataFilter keeps the most recent samples of one key whose value satisfies a
// threshold comparison.
type DataFilter struct {
	sync.RWMutex  // read-write lock
	MaxSize   int  // maximum number of samples kept in L
	Name      string  // name of this filter
	PK        string  // key currently being filtered
	Opt       string  // comparison operator applied against Threshold ("eq", "ne", "gt" or "lt")
	Threshold float64  // threshold the sample value is compared with
	L         *list.List  // cached samples, newest at the front
}

// NewDataFilter returns an empty filter called name that keeps at most maxSize
// samples. Nothing is filtered until SetFilter is called.
func NewDataFilter(name string, maxSize int) *DataFilter {
	filter := &DataFilter{
		Name:    name,
		MaxSize: maxSize,
		L:       list.New(),
	}
	return filter
}

// SetFilter configures the key, operator and threshold of the filter. It
// returns an error and leaves the filter unchanged when opt is not a supported
// operator. Switching to a different key discards the cached samples.
func (f *DataFilter) SetFilter(pk string, opt string, threshhold float64) error {
	f.Lock()
	defer f.Unlock()

	// Validate first so that a bad operator changes nothing.
	if ok := legalOpt(opt); !ok {
		return fmt.Errorf("bad opt: %s", opt)
	}

	if keyChanged := f.PK != pk; keyChanged {
		f.L = list.New()
	}
	f.PK, f.Opt, f.Threshold = pk, opt, threshhold
	return nil
}

// Filter records v as the newest sample when pk equals the filtered key and
// val satisfies the configured operator/threshold.
//
// The key is first compared under the read lock so that unrelated data never
// takes the write lock. It is compared again under the write lock: SetFilter
// may change the key and reset the list between the two lock acquisitions, and
// without the second check a sample for the old key would be evaluated against
// the new settings and stored in the freshly reset list.
func (f *DataFilter) Filter(pk string, val float64, v interface{}) {
	f.RLock()
	matched := f.PK == pk
	f.RUnlock()
	if !matched {
		return
	}

	f.Lock()
	defer f.Unlock()
	if f.PK != pk {
		// The filtered key changed while we were waiting for the write lock.
		return
	}
	if !compute(f.Opt, val, f.Threshold) {
		return
	}

	f.L.PushFront(v)
	// Drop the oldest sample once the cache grows past MaxSize.
	if f.L.Len() > f.MaxSize {
		f.L.Remove(f.L.Back())
	}
}

// GetAllFiltered returns the cached entries, newest first. The result is never
// nil.
//
// NOTE(review): each entry is the *list.Element itself, not its Value. This
// may be unintended, but callers that serialise the result see that shape, so
// it is preserved here.
func (f *DataFilter) GetAllFiltered() []interface{} {
	f.RLock()
	defer f.RUnlock()

	entries := make([]interface{}, 0, f.L.Len())
	for node := f.L.Front(); node != nil; node = node.Next() {
		entries = append(entries, node)
	}
	return entries
}

// Tolerances used by compute when comparing floating-point values.
const (
	MinPositiveFloat64 = 0.000001
	MaxNegativeFloat64 = -0.000001
)

// compute reports whether left <opt> right holds, using the tolerances above:
// "eq" and "ne" treat values closer than MinPositiveFloat64 as equal, "gt"
// requires left to exceed right by more than MinPositiveFloat64 and "lt"
// requires left to fall below right by more than that margin. Any other
// operator yields false.
func compute(opt string, left float64, right float64) bool {
	diff := left - right
	if opt == "eq" {
		return math.Abs(diff) < MinPositiveFloat64
	}
	if opt == "ne" {
		return math.Abs(diff) >= MinPositiveFloat64
	}
	if opt == "gt" {
		return diff > MinPositiveFloat64
	}
	if opt == "lt" {
		return diff < MaxNegativeFloat64
	}
	return false
}

// legalOpt reports whether opt is one of the operators compute understands:
// "eq", "ne", "gt" or "lt".
func legalOpt(opt string) bool {
	return opt == "eq" || opt == "ne" || opt == "gt" || opt == "lt"
}

2.2 数据发送服务

// Start boots the sending side of transfer: it reads MinStep, initialises the
// connection pools, send queues and hash rings, and then launches the
// background send and bookkeeping tasks.
func Start() {
	// Read the configured minimum report step. A value below 1 (for example an
	// unset field) is replaced by the default of 30; larger values are taken
	// as they are, so the code does not enforce a 30-second floor.
	MinStep = g.Config().MinStep
	if MinStep < 1 {
		MinStep = 30
	}

	// Initialisation.
	initConnPools()  // connection pools
	initSendQueues()  // send queues
	initNodeRings()  // consistent-hash rings

	// Background tasks.
	startSendTasks()  // start the goroutines that drain the send queues
	startSenderCron()  // start the periodic bookkeeping goroutines

	log.Println("send.Start, ok")
}

2.2.1 初始化连接池

针对judge、graph、tsdb创建三组连接池，每个IP对应一个连接池

// initConnPools builds the package-level connection pools for judge, tsdb
// (only when enabled) and graph from the configuration.
func initConnPools() {
	cfg := g.Config()

	// judge connection pools
	judgeInstances := nset.NewStringSet()  // set of judge addresses (it stores the map values, not the node names)
	for _, instance := range cfg.Judge.Cluster {  // collect every configured judge address
		judgeInstances.Add(instance)
	}
	JudgeConnPools = backend.CreateSafeRpcConnPools(cfg.Judge.MaxConns, cfg.Judge.MaxIdle,  // one RPC pool per judge address
		cfg.Judge.ConnTimeout, cfg.Judge.CallTimeout, judgeInstances.ToSlice())

	// tsdb connection pool
	if cfg.Tsdb.Enabled {
		TsdbConnPoolHelper = backend.NewTsdbConnPoolHelper(cfg.Tsdb.Address, cfg.Tsdb.MaxConns, cfg.Tsdb.MaxIdle, cfg.Tsdb.ConnTimeout, cfg.Tsdb.CallTimeout)
	}

	// graph connection pools
	graphInstances := nset.NewSafeSet()  // set of graph addresses
	for _, nitem := range cfg.Graph.ClusterList {  // a graph cluster entry can carry several addresses, hence the nested loop
		for _, addr := range nitem.Addrs {
			graphInstances.Add(addr)
		}
	}
	GraphConnPools = backend.CreateSafeRpcConnPools(cfg.Graph.MaxConns, cfg.Graph.MaxIdle,  // one RPC pool per graph address
		cfg.Graph.ConnTimeout, cfg.Graph.CallTimeout, graphInstances.ToSlice())
}

创建连接池的代码，连接judge和graph模块用的是rpc连接

// SafeRpcConnPools holds one RPC connection pool per backend address.
type SafeRpcConnPools struct {
	sync.RWMutex  // read-write lock
	M           map[string]*connp.ConnPool  // address -> connection pool
	MaxConns    int  // maximum connections per pool
	MaxIdle     int  // maximum idle connections per pool
	ConnTimeout int  // dial timeout in milliseconds
	CallTimeout int  // per-call timeout in milliseconds
}

// CreateSafeRpcConnPools builds one connection pool for every distinct address
// in cluster. connTimeout and callTimeout are in milliseconds.
func CreateSafeRpcConnPools(maxConns, maxIdle, connTimeout, callTimeout int, cluster []string) *SafeRpcConnPools {
	pools := &SafeRpcConnPools{
		M:           make(map[string]*connp.ConnPool),
		MaxConns:    maxConns,
		MaxIdle:     maxIdle,
		ConnTimeout: connTimeout,
		CallTimeout: callTimeout,
	}

	dialTimeout := time.Duration(pools.ConnTimeout) * time.Millisecond
	for _, addr := range cluster {
		// The address doubles as the pool name; repeated addresses share one pool.
		if _, seen := pools.M[addr]; !seen {
			pools.M[addr] = createOneRpcPool(addr, addr, dialTimeout, maxConns, maxIdle)
		}
	}
	return pools
}

// createOneRpcPool returns a connection pool for address whose New hook dials
// a TCP connection (bounded by connTimeout) and wraps it as an RPC client.
func createOneRpcPool(name string, address string, connTimeout time.Duration, maxConns int, maxIdle int) *connp.ConnPool {
	pool := connp.NewConnPool(name, address, int32(maxConns), int32(maxIdle))

	// New is invoked by the pool whenever it needs a fresh connection.
	pool.New = func(connName string) (connp.NConn, error) {
		// Resolve first so that a malformed address is reported before dialing.
		if _, err := net.ResolveTCPAddr("tcp", pool.Address); err != nil {
			return nil, err
		}

		tcpConn, err := net.DialTimeout("tcp", pool.Address, connTimeout)
		if err != nil {
			return nil, err
		}

		client := rpcpool.NewRpcClient(rpc.NewClient(tcpConn), connName)
		return client, nil
	}

	return pool
}

连接tsdb的连接池，和上面的差不多，不过tsdb只有一个实例，对应一个连接池

// TsdbConnPoolHelper wraps the single connection pool used to reach tsdb.
type TsdbConnPoolHelper struct {
	p           *connp.ConnPool  // underlying connection pool
	maxConns    int  // maximum connections
	maxIdle     int  // maximum idle connections
	connTimeout int  // dial timeout in milliseconds
	callTimeout int  // write timeout in milliseconds
	address     string  // tsdb address
}

// NewTsdbConnPoolHelper creates the tsdb connection pool for address and
// returns a helper that remembers the settings. Timeouts are in milliseconds.
func NewTsdbConnPoolHelper(address string, maxConns, maxIdle, connTimeout, callTimeout int) *TsdbConnPoolHelper {
	helper := new(TsdbConnPoolHelper)
	helper.p = newTsdbConnPool(address, maxConns, maxIdle, connTimeout)
	helper.maxConns = maxConns
	helper.maxIdle = maxIdle
	helper.connTimeout = connTimeout
	helper.callTimeout = callTimeout
	helper.address = address
	return helper
}

// newTsdbConnPool returns a pool named "tsdb" whose New hook dials address over
// TCP (bounded by connTimeout milliseconds) and wraps the socket as a
// TsdbClient.
func newTsdbConnPool(address string, maxConns int, maxIdle int, connTimeout int) *connp.ConnPool {
	tsdbPool := connp.NewConnPool("tsdb", address, int32(maxConns), int32(maxIdle))
	dialTimeout := time.Duration(connTimeout) * time.Millisecond

	// New is invoked by the pool whenever it needs a fresh connection.
	tsdbPool.New = func(name string) (connp.NConn, error) {
		// Resolve first so that a malformed address is reported before dialing.
		if _, err := net.ResolveTCPAddr("tcp", address); err != nil {
			return nil, err
		}

		sock, err := net.DialTimeout("tcp", address, dialTimeout)
		if err != nil {
			return nil, err
		}

		return TsdbClient{sock, name}, nil
	}

	return tsdbPool
}

连接池的底层实现还是比较简单易懂，线程安全靠读写锁实现

// ErrMaxConn is returned by Fetch when no idle connection is available and the
// pool already holds MaxConns active connections.
var ErrMaxConn = fmt.Errorf("maximum connections reached")

// NConn is a named, closable connection managed by ConnPool.
type NConn interface {
	io.Closer
	Name() string
	Closed() bool
}

// ConnPool is a connection pool for one address.
type ConnPool struct {
	sync.RWMutex  // read-write lock

	Name     string  // pool name
	Address  string  // backend address
	MaxConns int32  // maximum number of active connections
	MaxIdle  int32  // maximum number of idle connections kept for reuse
	Cnt      int64  // number of connections created so far

	New func(name string) (NConn, error)  // factory called when a new connection is needed

	active int32 // active connections, idle and in use
	free   []NConn  // idle connections
	all    map[string]NConn  // connection name -> connection
}

// NewConnPool returns an empty pool for address. The New hook must be set by
// the caller before Fetch is used.
func NewConnPool(name string, address string, maxConns int32, maxIdle int32) *ConnPool {
	pool := &ConnPool{
		Name:     name,
		Address:  address,
		MaxConns: maxConns,
		MaxIdle:  maxIdle,
		Cnt:      0,
		all:      make(map[string]NConn),
	}
	return pool
}

// Proc returns a one-line summary of the pool: name, connections created so
// far, active count, tracked connections and idle connections.
func (p *ConnPool) Proc() string {
	p.RLock()
	defer p.RUnlock()

	tracked, idle := len(p.all), len(p.free)
	return fmt.Sprintf("Name:%s,Cnt:%d,active:%d,all:%d,free:%d",
		p.Name, p.Cnt, p.active, tracked, idle)
}

// Fetch hands out a connection: an idle one when available, otherwise a newly
// created one. It returns ErrMaxConn when the pool is already at MaxConns, or
// the factory's error when creation fails.
func (p *ConnPool) Fetch() (NConn, error) {
	p.Lock()
	defer p.Unlock()

	// Reuse an idle connection when there is one.
	if idle := p.fetchFree(); idle != nil {
		return idle, nil
	}

	// No idle connection: a new one is allowed only below the limit.
	if p.overMax() {
		return nil, ErrMaxConn
	}

	fresh, err := p.newConn()
	if err != nil {
		return nil, err
	}

	p.increActive()
	return fresh, nil
}


// Release gives conn back to the pool. It is kept as an idle connection unless
// the idle list is already at MaxIdle, in which case it is closed and removed.
func (p *ConnPool) Release(conn NConn) {
	p.Lock()
	defer p.Unlock()

	if !p.overMaxIdle() {
		// Room left in the idle list: keep the connection for reuse.
		p.addFree(conn)
		return
	}

	// Idle list is full: close the connection and shrink the active count.
	p.deleteConn(conn)
	p.decreActive()
}

// ForceClose closes conn, removes it from the pool and lowers the active count
// by one.
func (p *ConnPool) ForceClose(conn NConn) {
	p.Lock()
	defer p.Unlock()

	p.deleteConn(conn)
	p.decreActive()
}

// Destroy closes every connection the pool knows about and resets its
// bookkeeping. Idle connections are also present in the all map (newConn adds
// them and addFree does not remove them), so the Closed check keeps the second
// pass from closing them twice.
func (p *ConnPool) Destroy() {
	p.Lock()
	defer p.Unlock()

	closeIfOpen := func(conn NConn) {
		if conn == nil || conn.Closed() {
			return
		}
		conn.Close()
	}

	// Idle connections first, then everything that is tracked.
	for _, idle := range p.free {
		closeIfOpen(idle)
	}
	for _, tracked := range p.all {
		closeIfOpen(tracked)
	}

	// Reset the internal state.
	p.active = 0
	p.free = []NConn{}
	p.all = map[string]NConn{}
}

// newConn creates a connection through the New hook, registers it in the all
// map and bumps the creation counter. The caller must hold the write lock.
func (p *ConnPool) newConn() (NConn, error) {
	// The name combines the pool name, the creation counter and the current time.
	connName := fmt.Sprintf("%s_%d_%d", p.Name, p.Cnt, time.Now().Unix())

	conn, err := p.New(connName)
	if err == nil {
		p.Cnt++
		p.all[conn.Name()] = conn
		return conn, nil
	}

	// The factory failed; close whatever it may have handed back.
	if conn != nil {
		conn.Close()
	}
	return nil, err
}

// deleteConn closes conn and removes it from the all map. The caller must hold
// the write lock. A nil conn is ignored: the previous version guarded Close
// against nil but then called conn.Name() unconditionally, which panics.
func (p *ConnPool) deleteConn(conn NConn) {
	if conn == nil {
		return
	}
	// The connection is discarded either way, so the Close error is not used.
	conn.Close()
	delete(p.all, conn.Name())
}

// addFree appends conn to the idle list. The caller must hold the write lock.
func (p *ConnPool) addFree(conn NConn) {
	idle := append(p.free, conn)
	p.free = idle
}

// fetchFree removes and returns the oldest idle connection, or nil when the
// idle list is empty. The caller must hold the write lock.
func (p *ConnPool) fetchFree() NConn {
	if len(p.free) < 1 {
		return nil
	}

	head, rest := p.free[0], p.free[1:]
	p.free = rest
	return head
}

// increActive raises the active-connection count by one.
func (p *ConnPool) increActive() {
	p.active++
}

// decreActive lowers the active-connection count by one.
func (p *ConnPool) decreActive() {
	p.active--
}

// overMax reports whether the active-connection count has reached MaxConns.
func (p *ConnPool) overMax() bool {
	return !(p.active < p.MaxConns)
}

// overMaxIdle reports whether the idle list has reached MaxIdle entries.
func (p *ConnPool) overMaxIdle() bool {
	idle := int32(len(p.free))
	return idle >= p.MaxIdle
}

2.2.2 初始化发送队列

比较简单，队列是用 nlist.NewSafeListLimited 创建的（并不是直接使用 go 标准库的 list），从名字看是线程安全且有容量限制的列表，容量上限为 DefaultSendQueueMaxSize

// initSendQueues creates the bounded send queues: one per judge node, one per
// graph node/address pair (keyed by node name + address) and, when tsdb is
// enabled, a single tsdb queue.
func initSendQueues() {
	cfg := g.Config()

	// judge: one queue per node name.
	for judgeNode := range cfg.Judge.Cluster {
		JudgeQueues[judgeNode] = nlist.NewSafeListLimited(DefaultSendQueueMaxSize)
	}

	// graph: one queue per address of every node.
	for graphNode, cluster := range cfg.Graph.ClusterList {
		for _, addr := range cluster.Addrs {
			GraphQueues[graphNode+addr] = nlist.NewSafeListLimited(DefaultSendQueueMaxSize)
		}
	}

	// tsdb: a single queue, only when enabled.
	if cfg.Tsdb.Enabled {
		TsdbQueue = nlist.NewSafeListLimited(DefaultSendQueueMaxSize)
	}
}

2.2.3 初始化一致性哈希环节点

一致性哈希算法可以参考这篇文章，讲得很清晰：《面试必备：什么是一致性Hash算法？》

对judge和graph集群各建立一个一致性哈希环，每个实例的名字就是配置文件里的节点名

// initNodeRings builds the consistent-hash rings for the judge and graph
// clusters. The ring members are the keys (node names) of the configured
// cluster maps.
func initNodeRings() {
	cfg := g.Config()

	// Ring for the judge cluster.
	JudgeNodeRing = rings.NewConsistentHashNodesRing(int32(cfg.Judge.Replicas), cutils.KeysOfMap(cfg.Judge.Cluster))
	// Ring for the graph cluster.
	GraphNodeRing = rings.NewConsistentHashNodesRing(int32(cfg.Graph.Replicas), cutils.KeysOfMap(cfg.Graph.Cluster))
}

一致性哈希环的代码

// ConsistentHashNodeRing is a thin wrapper around a consistent-hash ring.
type ConsistentHashNodeRing struct {
	ring *consistent.Consistent
}

// NewConsistentHashNodesRing returns a ring with numberOfReplicas virtual
// nodes per member, populated with nodes. The replica count is applied before
// the nodes are added because it determines how many virtual nodes each
// addition creates.
func NewConsistentHashNodesRing(numberOfReplicas int32, nodes []string) *ConsistentHashNodeRing {
	nodeRing := new(ConsistentHashNodeRing)
	nodeRing.ring = consistent.New()
	nodeRing.SetNumberOfReplicas(numberOfReplicas)
	nodeRing.SetNodes(nodes)
	return nodeRing
}

// GetNode returns the ring member responsible for pk, or the ring's error when
// no member can be found.
func (r *ConsistentHashNodeRing) GetNode(pk string) (string, error) {
	node, err := r.ring.Get(pk)
	return node, err
}

// SetNodes adds every entry of nodes to the ring. Members already on the ring
// are not removed.
func (r *ConsistentHashNodeRing) SetNodes(nodes []string) {
	for i := range nodes {
		r.ring.Add(nodes[i])
	}
}

// SetNumberOfReplicas sets how many virtual nodes each member gets on the ring.
func (r *ConsistentHashNodeRing) SetNumberOfReplicas(num int32) {
	replicas := int(num)
	r.ring.NumberOfReplicas = replicas
}

哈希环的底层代码如下

// uints is a slice of hash values that implements sort.Interface.
type uints []uint32

// Len returns the number of hash values.
func (x uints) Len() int {
	return len(x)
}

// Less reports whether the value at index i is smaller than the one at j.
func (x uints) Less(i, j int) bool {
	a, b := x[i], x[j]
	return a < b
}

// Swap exchanges the values at indexes i and j.
func (x uints) Swap(i, j int) {
	tmp := x[i]
	x[i] = x[j]
	x[j] = tmp
}

// ErrEmptyCircle is returned by the lookup methods when the ring has no nodes.
var ErrEmptyCircle = errors.New("empty circle")

// Consistent is a consistent-hash ring with virtual nodes.
type Consistent struct {
	circle           map[uint32]string  // virtual-node hash -> member name
	members          map[string]bool  // set of member names
	sortedHashes     uints  // all virtual-node hashes in ascending order
	NumberOfReplicas int  // virtual nodes per member
	count            int64  // number of members
	scratch          [64]byte  // not referenced by any method shown here; hashKey uses its own local buffer
	sync.RWMutex  // read-write lock
}

// New returns an empty ring that uses 20 virtual nodes per member by default.
func New() *Consistent {
	ring := &Consistent{
		NumberOfReplicas: 20,
		circle:           make(map[uint32]string),
		members:          make(map[string]bool),
	}
	return ring
}

// eltKey builds the name of the idx-th virtual node of elt (the decimal index
// followed by the member name); that name is what gets hashed onto the ring.
func (c *Consistent) eltKey(elt string, idx int) string {
	prefix := strconv.Itoa(idx)
	return prefix + elt
}

// Add puts the member elt on the ring under the write lock.
func (c *Consistent) Add(elt string) {
	c.Lock()
	c.add(elt)
	c.Unlock()
}

// add places NumberOfReplicas virtual nodes for elt on the ring, records elt as
// a member, rebuilds the sorted hash list and bumps the member count. The
// caller must hold the write lock.
func (c *Consistent) add(elt string) {
	for replica := 0; replica < c.NumberOfReplicas; replica++ {
		hash := c.hashKey(c.eltKey(elt, replica))
		c.circle[hash] = elt
	}
	c.members[elt] = true
	c.updateSortedHashes()
	c.count++
}

// Remove takes the member elt off the ring under the write lock.
func (c *Consistent) Remove(elt string) {
	c.Lock()
	c.remove(elt)
	c.Unlock()
}

// remove deletes the virtual nodes of elt from the ring, drops elt from the
// member set, rebuilds the sorted hash list and lowers the member count. The
// caller must hold the write lock.
func (c *Consistent) remove(elt string) {
	for replica := 0; replica < c.NumberOfReplicas; replica++ {
		hash := c.hashKey(c.eltKey(elt, replica))
		delete(c.circle, hash)
	}
	delete(c.members, elt)
	c.updateSortedHashes()
	c.count--
}

// Set makes the ring's membership equal to elts: members missing from elts are
// removed and entries of elts that are not yet members are added.
func (c *Consistent) Set(elts []string) {
	c.Lock()
	defer c.Unlock()

	wanted := make(map[string]struct{}, len(elts))
	for _, elt := range elts {
		wanted[elt] = struct{}{}
	}

	// Drop members that are no longer wanted.
	for member := range c.members {
		if _, keep := wanted[member]; !keep {
			c.remove(member)
		}
	}

	// Add the entries that are new.
	for _, elt := range elts {
		if _, present := c.members[elt]; !present {
			c.add(elt)
		}
	}
}

// Members returns the names of all members in unspecified order. The result is
// nil when the ring has no members.
func (c *Consistent) Members() []string {
	c.RLock()
	defer c.RUnlock()

	var names []string
	for name := range c.members {
		names = append(names, name)
	}
	return names
}

// Get returns the member that owns name: the one whose virtual node is the
// first with a hash greater than name's hash, wrapping around to the start of
// the ring. It returns ErrEmptyCircle when the ring has no nodes.
func (c *Consistent) Get(name string) (string, error) {
	c.RLock()
	defer c.RUnlock()

	if len(c.circle) == 0 {
		return "", ErrEmptyCircle
	}

	idx := c.search(c.hashKey(name))
	owner := c.circle[c.sortedHashes[idx]]
	return owner, nil
}

// search returns the index in sortedHashes of the first hash greater than key,
// or 0 when key is not smaller than any hash (wrap-around).
func (c *Consistent) search(key uint32) (i int) {
	total := len(c.sortedHashes)
	i = sort.Search(total, func(pos int) bool {
		return c.sortedHashes[pos] > key
	})
	if i >= total {
		// Past the last hash: wrap around to the first virtual node.
		i = 0
	}
	return i
}

// GetTwo returns the member that owns name plus the next different member
// found by walking the ring clockwise. The second result is "" when the ring
// has a single member. It returns ErrEmptyCircle when the ring has no nodes.
func (c *Consistent) GetTwo(name string) (string, string, error) {
	c.RLock()
	defer c.RUnlock()
	// An empty ring cannot answer.
	if len(c.circle) == 0 {
		return "", "", ErrEmptyCircle
	}
	// First result: the owner of name.
	key := c.hashKey(name)
	i := c.search(key)
	a := c.circle[c.sortedHashes[i]]

	// With a single member there is no second one to return.
	if c.count == 1 {
		return a, "", nil
	}

	// Walk forward from the owner's virtual node, wrapping at the end of the
	// slice, until a virtual node of a different member shows up.
	start := i
	var b string
	for i = start + 1; i != start; i++ {
		if i >= len(c.sortedHashes) {
			i = 0
		}
		b = c.circle[c.sortedHashes[i]]
		if b != a {
			break
		}
	}
	return a, b, nil
}

// GetN returns up to n distinct members, starting with the owner of name and
// continuing clockwise around the ring. n is capped at the member count. It
// returns ErrEmptyCircle when the ring has no nodes.
func (c *Consistent) GetN(name string, n int) ([]string, error) {
	c.RLock()
	defer c.RUnlock()

	// An empty ring cannot answer.
	if len(c.circle) == 0 {
		return nil, ErrEmptyCircle
	}

	// Never ask for more members than exist.
	if c.count < int64(n) {
		n = int(c.count)
	}

	var (
		key   = c.hashKey(name)
		i     = c.search(key)
		start = i
		res   = make([]string, 0, n)
		elem  = c.circle[c.sortedHashes[i]]
	)

	res = append(res, elem)

	if len(res) == n {
		return res, nil
	}

	// Walk forward from the owner's virtual node, wrapping at the end of the
	// slice, and collect members not seen yet until n of them are gathered.
	for i = start + 1; i != start; i++ {
		if i >= len(c.sortedHashes) {
			i = 0
		}
		elem = c.circle[c.sortedHashes[i]]
		if !sliceContainsMember(res, elem) {
			res = append(res, elem)
		}
		if len(res) == n {
			break
		}
	}

	return res, nil
}

// hashKey returns the CRC-32 (IEEE) checksum of key. The whole key is always
// hashed; keys shorter than 64 bytes are merely copied into a fixed-size local
// buffer first instead of being converted with []byte(key).
func (c *Consistent) hashKey(key string) uint32 {
	if len(key) >= 64 {
		return crc32.ChecksumIEEE([]byte(key))
	}

	var scratch [64]byte
	n := copy(scratch[:], key)
	return crc32.ChecksumIEEE(scratch[:n])
}

// updateSortedHashes rebuilds sortedHashes from the keys of circle. The caller
// must hold the write lock.
//
// The existing backing array is reused unless it has become far larger than
// needed (capacity divided by 4*NumberOfReplicas exceeds the number of virtual
// nodes), in which case a fresh slice is grown so the oversized array can be
// released. The shrink check is skipped when NumberOfReplicas is not positive,
// because the division would otherwise panic with a zero replica count.
func (c *Consistent) updateSortedHashes() {
	// Start from an empty slice that still shares the old backing array.
	hashes := c.sortedHashes[:0]

	// Give up the old array when it is much bigger than what is needed now.
	if c.NumberOfReplicas > 0 && cap(c.sortedHashes)/(c.NumberOfReplicas*4) > len(c.circle) {
		hashes = nil
	}

	for k := range c.circle {
		hashes = append(hashes, k)
	}
	sort.Sort(hashes)
	c.sortedHashes = hashes
}

// sliceContainsMember reports whether member occurs in set.
func sliceContainsMember(set []string, member string) bool {
	for i := 0; i < len(set); i++ {
		if set[i] == member {
			return true
		}
	}
	return false
}

2.2.4 数据发送任务开启

用一个for循环，不断从数据发送队列中拿一定量的数据，封装后通过连接池发送

// startSendTasks launches one forwarding goroutine per judge node, one per
// graph node/address pair and, when tsdb is enabled, one for tsdb. The
// configured MaxConns of each backend is used as its send concurrency, with a
// floor of 1.
func startSendTasks() {
	cfg := g.Config()

	atLeastOne := func(n int) int {
		if n < 1 {
			return 1
		}
		return n
	}
	judgeConcurrent := atLeastOne(cfg.Judge.MaxConns)
	graphConcurrent := atLeastOne(cfg.Graph.MaxConns)
	tsdbConcurrent := atLeastOne(cfg.Tsdb.MaxConns)

	// judge: one task per node.
	for judgeNode := range cfg.Judge.Cluster {
		go forward2JudgeTask(JudgeQueues[judgeNode], judgeNode, judgeConcurrent)
	}

	// graph: one task per address of every node.
	for graphNode, cluster := range cfg.Graph.ClusterList {
		for _, addr := range cluster.Addrs {
			go forward2GraphTask(GraphQueues[graphNode+addr], graphNode, addr, graphConcurrent)
		}
	}

	// tsdb: a single task, only when enabled.
	if cfg.Tsdb.Enabled {
		go forward2TsdbTask(tsdbConcurrent)
	}
}

向judge发送数据的代码，用了信号量来控制最大并发数（即同时在发送数据的 goroutine 数量，而不是进程数），这个上限就是配置文件里的maxConns，即连接池的最大连接数

// forward2JudgeTask loops forever, taking batches from Q and sending them to
// the judge instance configured for node. Each batch is sent from its own
// goroutine; the semaphore limits how many of them run at once.
func forward2JudgeTask(Q *list.SafeListLimited, node string, concurrent int) {
	batch := g.Config().Judge.Batch  // batch size passed to PopBackBy
	addr := g.Config().Judge.Cluster[node]  // address of this judge instance
	sema := nsema.NewSemaphore(concurrent)  // limits the number of concurrent send goroutines

	for {
		items := Q.PopBackBy(batch)  // take one batch from the queue
		count := len(items)
		if count == 0 {  // nothing to send: sleep before polling again
			time.Sleep(DefaultSendTaskSleepInterval)
			continue
		}

		judgeItems := make([]*cmodel.JudgeItem, count)  // typed view of the batch
		for i := 0; i < count; i++ {  // assert each queued value back to *cmodel.JudgeItem
			judgeItems[i] = items[i].(*cmodel.JudgeItem)
		}

		sema.Acquire()  // take a slot before starting the send goroutine
		go func(addr string, judgeItems []*cmodel.JudgeItem, count int) {
			defer sema.Release()

			resp := &cmodel.SimpleRpcResponse{}  // RPC reply holder
			var err error
			sendOk := false
			for i := 0; i < 3; i++ {  // at most 3 attempts
				err = JudgeConnPools.Call(addr, "Judge.Send", judgeItems, resp)  // one send attempt
				if err == nil {  // sent successfully: stop retrying
					sendOk = true
					break
				}
				time.Sleep(time.Millisecond * 10)  // wait 10ms before the next attempt
			}

			// Record the outcome; both counters grow by the batch size.
			if !sendOk {  // every attempt failed: log it and count the batch as failed
				log.Printf("send judge %s:%s fail: %v", node, addr, err)
				proc.SendToJudgeFailCnt.IncrBy(int64(count))
			} else {  // success: count the batch as sent
				proc.SendToJudgeCnt.IncrBy(int64(count))
			}
		}(addr, judgeItems, count)
	}
}

连接池底层的代码，前面已经看过连接池的底层代码，每个连接池创建连接的New函数不一样，judge连接池的New函数实际上创建的是一个rpc连接，rpc调用远程judge的Judge.Send方法，发送数据

// Call performs the RPC method on addr using a pooled connection and waits at
// most CallTimeout milliseconds for the reply. The connection is force-closed
// on timeout or error and released back to the pool on success.
func (this *SafeRpcConnPools) Call(addr, method string, args interface{}, resp interface{}) error {
	connPool, exists := this.Get(addr)  // look up the pool for this address
	if !exists {  // no pool was created for this address
		return fmt.Errorf("%s has no connection pool", addr)
	}

	conn, err := connPool.Fetch()  // borrow a connection
	if err != nil {  // pool exhausted or dialing failed
		return fmt.Errorf("%s get connection fail: conn %v, err %v. proc: %s", addr, conn, err, connPool.Proc())
	}

	rpcClient := conn.(*rpcpool.RpcClient)
	callTimeout := time.Duration(this.CallTimeout) * time.Millisecond  // CallTimeout is in milliseconds

	done := make(chan error, 1)  // buffered so the call goroutine can finish even after a timeout
	go func() {
		done <- rpcClient.Call(method, args, resp)  // perform the RPC
	}()

	select {
		case <-time.After(callTimeout):  // timed out
			connPool.ForceClose(conn)  // discard the connection
			return fmt.Errorf("%s, call timeout", addr)  // report the timeout
		case err = <-done:  // the call returned
			if err != nil {  // the call failed: discard the connection
				connPool.ForceClose(conn)
				err = fmt.Errorf("%s, call failed, err %v. proc: %s", addr, err, connPool.Proc())
			} else {
				connPool.Release(conn)  // success: give the connection back
			}
			return err
	}
}

// Get returns the connection pool registered for address and whether one
// exists.
func (s *SafeRpcConnPools) Get(address string) (*connp.ConnPool, bool) {
	s.RLock()
	pool, found := s.M[address]
	s.RUnlock()
	return pool, found
}

graph的代码和judge的代码近乎一致，略去

tsdb的也差不太多，唯一区别就在于不是rpc调用，发送数据是以字节的形式发送

// forward2TsdbTask loops forever, taking batches from TsdbQueue and writing
// them to tsdb as newline-separated text. Each batch is sent from its own
// goroutine; the semaphore limits how many of them run at once.
//
// NOTE(review): when the configured MaxRetry is 0 or negative the send loop
// never runs, err stays nil and the batch is dropped without touching either
// counter — confirm the configuration always sets a positive value.
func forward2TsdbTask(concurrent int) {
	batch := g.Config().Tsdb.Batch  // batch size passed to PopBackBy
	retry := g.Config().Tsdb.MaxRetry  // maximum number of send attempts
	sema := nsema.NewSemaphore(concurrent)  // limits the number of concurrent send goroutines

	for {
		items := TsdbQueue.PopBackBy(batch)  // take one batch from the queue
		if len(items) == 0 {  // nothing to send: sleep before polling again
			time.Sleep(DefaultSendTaskSleepInterval)
			continue
		}

		sema.Acquire()  // take a slot before starting the send goroutine
		go func(itemList []interface{}) {
			defer sema.Release()

			var tsdbBuffer bytes.Buffer  // payload: one TsdbString per line
			for i := 0; i < len(itemList); i++ {
				tsdbItem := itemList[i].(*cmodel.TsdbItem)
				tsdbBuffer.WriteString(tsdbItem.TsdbString())
				tsdbBuffer.WriteString("\n")
			}

			var err error
			for i := 0; i < retry; i++ {  // try up to retry times
				err = TsdbConnPoolHelper.Send(tsdbBuffer.Bytes())
				if err == nil {
					proc.SendToTsdbCnt.IncrBy(int64(len(itemList)))  // count the batch as sent
					break
				}
				time.Sleep(100 * time.Millisecond)  // wait 100ms before the next attempt
			}

			if err != nil {  // the last attempt failed as well
				proc.SendToTsdbFailCnt.IncrBy(int64(len(itemList)))  // count the batch as failed
				log.Println(err)
				return
			}
		}(items)
	}
}

连接池的代码，都差不多

// Send writes data to tsdb over a pooled connection and waits at most
// callTimeout milliseconds for the write to finish. The connection is
// force-closed on timeout or error and released back to the pool on success.
func (t *TsdbConnPoolHelper) Send(data []byte) (err error) {
	conn, err := t.p.Fetch()
	if err != nil {
		return fmt.Errorf("get connection fail: err %v. proc: %s", err, t.p.Proc())
	}

	cli := conn.(TsdbClient).cli

	// Buffered so the writer goroutine can finish even after a timeout.
	done := make(chan error, 1)
	go func() {
		// Use a goroutine-local error. Assigning the named result err from
		// here would race with the return statements below and could
		// overwrite the timeout error after the caller has given up.
		_, werr := cli.Write(data)
		done <- werr
	}()

	select {
	case <-time.After(time.Duration(t.callTimeout) * time.Millisecond):
		// Timed out: discard the connection.
		t.p.ForceClose(conn)
		return fmt.Errorf("%s, call timeout", t.address)
	case err = <-done:
		if err != nil {
			// The write failed: discard the connection and add context.
			t.p.ForceClose(conn)
			err = fmt.Errorf("%s, call failed, err %v. proc: %s", t.address, err, t.p.Proc())
		} else {
			t.p.Release(conn)
		}
		return err
	}
}

// Destroy tears down the underlying connection pool, if there is one.
func (t *TsdbConnPoolHelper) Destroy() {
	if t.p == nil {
		return
	}
	t.p.Destroy()
}

2.2.5 定时记录运行情况

transfer会定期把一些graph连接池的指标打印到日志里，这部分代码就是干这个的。另外这部分代码还定时获取缓存队列的长度

const (
	DefaultProcCronPeriod = time.Duration(5) * time.Second  // interval between queue-length refreshes: 5 seconds
	DefaultLogCronPeriod  = time.Duration(3600) * time.Second  // interval between connection-pool log lines: 1 hour
)

// startSenderCron starts the two periodic bookkeeping goroutines.
func startSenderCron() {
	go startProcCron()  // refreshes the queue-length counters
	go startLogCron()  // logs connection-pool statistics
}

// startProcCron refreshes the queue-length counters every
// DefaultProcCronPeriod; it never returns.
func startProcCron() {
	for {
		time.Sleep(DefaultProcCronPeriod)
		refreshSendingCacheSize()
	}
}

// startLogCron logs connection-pool statistics every DefaultLogCronPeriod; it
// never returns.
func startLogCron() {
	for {
		time.Sleep(DefaultLogCronPeriod)
		logConnPoolsProc()
	}
}

// refreshSendingCacheSize stores the current total length of the judge and
// graph send queues in their counters. The tsdb queue counter is not updated
// here.
func refreshSendingCacheSize() {
	proc.JudgeQueuesCnt.SetCnt(calcSendCacheSize(JudgeQueues))  // total length of all judge queues
	proc.GraphQueuesCnt.SetCnt(calcSendCacheSize(GraphQueues))  // total length of all graph queues
}

// calcSendCacheSize returns the summed length of every non-nil queue in
// mapList.
func calcSendCacheSize(mapList map[string]*list.SafeListLimited) int64 {
	var total int64
	for _, queue := range mapList {
		if queue == nil {
			continue
		}
		total += int64(queue.Len())
	}
	return total
}

// logConnPoolsProc logs the statistics of every graph connection pool, one
// pool per line.
func logConnPoolsProc() {
	log.Printf("connPools proc: \n%v", strings.Join(GraphConnPools.Proc(), "\n"))
}

具体的连接池指标如下，包括连接池名、新建连接计数、活跃连接数、所有连接数、空闲连接数

// Proc returns the Proc summary of every connection pool, in map iteration
// order. The result is never nil. The pool map is read under the read lock,
// matching Get.
func (s *SafeRpcConnPools) Proc() []string {
	s.RLock()
	defer s.RUnlock()

	procs := make([]string, 0, len(s.M))
	for _, cp := range s.M {
		procs = append(procs, cp.Proc())
	}
	return procs
}

// Proc returns a one-line summary of the pool: name, connections created so
// far, active count, tracked connections and idle connections.
func (p *ConnPool) Proc() string {
	p.RLock()
	defer p.RUnlock()

	tracked, idle := len(p.all), len(p.free)
	return fmt.Sprintf("Name:%s,Cnt:%d,active:%d,all:%d,free:%d",
		p.Name, p.Cnt, p.active, tracked, idle)
}

2.3 数据接收服务

transfer接收数据的方式有三种，主要是rpc，接收agent发来的数据走的就是rpc，另外一个socket方式即将被废弃，让用户可以通过telnet方式发送数据

还有一种是通过api，在2.4节

// Start launches the data-receiving services, each in its own goroutine.
func Start() {
	go rpc.StartRpc()  // RPC service
	go socket.StartSocket()  // socket (telnet-style) service, slated for removal per the original note
}

2.3.1 RPC服务

// StartRpc starts the JSON-RPC server on the configured listen address and
// serves every accepted connection in its own goroutine. It returns at once
// when RPC is disabled and otherwise never returns; a bad address or a failed
// listen terminates the process.
//
// NOTE(review): the error returned by server.Register is ignored, and a
// persistent Accept error makes the loop spin without any delay.
func StartRpc() {
	if !g.Config().Rpc.Enabled {
		return
	}

	addr := g.Config().Rpc.Listen
	tcpAddr, err := net.ResolveTCPAddr("tcp", addr)  // parse the listen address
	if err != nil {
		log.Fatalf("net.ResolveTCPAddr fail: %s", err)
	}

	listener, err := net.ListenTCP("tcp", tcpAddr)  // start listening
	if err != nil {
		log.Fatalf("listen %s fail: %s", addr, err)
	} else {
		log.Println("rpc listening", addr)
	}

	server := rpc.NewServer()  // create the RPC server
	server.Register(new(Transfer))  // expose the methods of Transfer

	for {
		conn, err := listener.Accept()
		if err != nil {
			log.Println("listener.Accept occur error:", err)
			continue
		}
		go server.ServeCodec(jsonrpc.NewServerCodec(conn))  // serve this connection with the JSON-RPC codec
	}
}

具体的rpc请求就两个方法，一个是ping，另一个是上报监控数据

// Transfer is the receiver type registered with the RPC server; its exported
// methods (Ping and Update) are the callable RPC endpoints.
type Transfer int

// TransferResp is the response to a metric upload.
type TransferResp struct {
	Msg        string // "ok" means success
	Total      int    // number of data points received
	ErrInvalid int    // number of invalid data points
	Latency    int64  // time spent handling the upload, in milliseconds
}

// String renders the response as a single log-friendly line. The msg part is
// appended only when Msg is non-empty.
func (t *TransferResp) String() string {
	const base = "TransferResp total=%d, err_invalid=%d, latency=%dms"
	if t.Msg == "" {
		return fmt.Sprintf(base, t.Total, t.ErrInvalid, t.Latency)
	}
	return fmt.Sprintf(base+", msg=%s", t.Total, t.ErrInvalid, t.Latency, t.Msg)
}

// Ping is a no-op RPC method: it ignores req, leaves resp untouched and
// always returns nil.
func (t *Transfer) Ping(req cmodel.NullRpcRequest, resp *cmodel.SimpleRpcResponse) error {
	return nil
}

// Update receives a batch of metric values over RPC and hands it to
// RecvMetricValues tagged with the "rpc" source; the outcome is written to
// reply.
func (t *Transfer) Update(args []*cmodel.MetricValue, reply *cmodel.TransferResponse) error {
	const source = "rpc"
	return RecvMetricValues(args, reply, source)
}

// RecvMetricValues validates a batch of reported metric values, converts the
// accepted ones to cmodel.MetaData and pushes them onto the graph, judge and
// tsdb send queues that are enabled in the configuration.
//
// Entries that are nil or fail validation are counted in reply.Invalid and
// skipped. An entry that passes the field checks but carries a non-positive
// timestamp, or one greater than twice the current Unix time, is stamped with
// the current time; this overwrites the Timestamp field of the entry in args.
// from selects the per-transport receive counter ("rpc" or "http"; any other
// value only updates the overall counter). On return reply holds Message
// "ok", Total (len(args)), Invalid and Latency in milliseconds. The returned
// error is always nil.
func RecvMetricValues(args []*cmodel.MetricValue, reply *cmodel.TransferResponse, from string) error {
	start := time.Now()
	now := start.Unix() // reference time for repairing bad timestamps
	reply.Invalid = 0

	items := make([]*cmodel.MetaData, 0, len(args))

	for _, v := range args {
		if !isAcceptableMetricValue(v) {
			reply.Invalid += 1
			continue
		}

		// Replace an abnormal timestamp with the current time.
		if v.Timestamp <= 0 || v.Timestamp > now*2 {
			v.Timestamp = now
		}

		// The value must be convertible to a float; otherwise it is invalid.
		value, ok := metricValueToFloat64(v.Value)
		if !ok {
			reply.Invalid += 1
			continue
		}

		items = append(items, &cmodel.MetaData{
			Metric:      v.Metric,
			Endpoint:    v.Endpoint,
			Timestamp:   v.Timestamp,
			Step:        v.Step,
			CounterType: v.Type,
			Tags:        cutils.DictedTagstring(v.Tags), // tag string -> map
			Value:       value,
		})
	}

	// Self-monitoring: all counters below count accepted data points, not
	// requests.
	cnt := int64(len(items))
	proc.RecvCnt.IncrBy(cnt)
	switch from {
	case "rpc":
		proc.RpcRecvCnt.IncrBy(cnt)
	case "http":
		proc.HttpRecvCnt.IncrBy(cnt)
	}

	cfg := g.Config()

	if cfg.Graph.Enabled {
		sender.Push2GraphSendQueue(items)
	}

	if cfg.Judge.Enabled {
		sender.Push2JudgeSendQueue(items)
	}

	if cfg.Tsdb.Enabled {
		sender.Push2TsdbSendQueue(items)
	}

	reply.Message = "ok"
	reply.Total = len(args)
	// time.Since uses the monotonic clock reading stored in start, so the
	// latency cannot be skewed (or go negative) when the wall clock is adjusted.
	reply.Latency = int64(time.Since(start) / time.Millisecond)

	return nil
}

// isAcceptableMetricValue reports whether v passes the field-level checks
// applied to every reported data point.
func isAcceptableMetricValue(v *cmodel.MetricValue) bool {
	switch {
	case v == nil:
		return false
	case v.Metric == "kernel.hostname":
		// Legacy: old agents reported kernel.hostname with a string value,
		// which is no longer supported, so it is filtered out by name.
		return false
	case v.Metric == "" || v.Endpoint == "":
		// Metric name and endpoint are both mandatory.
		return false
	case v.Type != g.COUNTER && v.Type != g.GAUGE && v.Type != g.DERIVE:
		// Unknown counter type.
		return false
	case v.Value == "":
		// No value reported.
		return false
	case v.Step <= 0:
		return false
	case len(v.Metric)+len(v.Tags) > 510:
		// Metric name plus tag string is too long.
		return false
	}
	return true
}

// metricValueToFloat64 converts a reported value to float64. Strings are
// parsed as floats; float64 and int64 are converted directly; every other
// dynamic type (including nil) is rejected.
func metricValueToFloat64(raw interface{}) (float64, bool) {
	switch val := raw.(type) {
	case string:
		f, err := strconv.ParseFloat(val, 64)
		return f, err == nil
	case float64:
		return val, true
	case int64:
		return float64(val), true
	default:
		return 0, false
	}
}

发送监控数据到缓存队列的代码，judge的

// Push2JudgeSendQueue routes each item to the judge node selected by
// JudgeNodeRing for the item's primary key and pushes it onto that node's
// send queue. Items for which no node can be found are logged and skipped;
// items the queue refuses are counted in proc.SendToJudgeDropCnt.
func Push2JudgeSendQueue(items []*cmodel.MetaData) {
	for _, item := range items {
		key := item.PK() // primary key string used for hashing
		node, err := JudgeNodeRing.GetNode(key)
		if err != nil {
			log.Println("E:", err)
			continue
		}

		// Never use a step below MinStep, then align the timestamp to it.
		step := int(item.Step)
		if step < MinStep {
			step = MinStep
		}
		alignedTs := alignTs(item.Timestamp, int64(step))

		queue := JudgeQueues[node]
		pushed := queue.PushFront(&cmodel.JudgeItem{
			Endpoint:  item.Endpoint,
			Metric:    item.Metric,
			Value:     item.Value,
			Timestamp: alignedTs,
			JudgeType: item.CounterType,
			Tags:      item.Tags,
		})
		if !pushed {
			// The queue rejected the item: record it as dropped.
			proc.SendToJudgeDropCnt.Incr()
		}
	}
}

发送到graph的缓存代码复杂一些，首先graph存储数据用的是rrd文件，所以数据的形式要做调整。其次还要做一些追踪、过滤的处理（具体功能在http那里讲）

// Push2GraphSendQueue converts each item to the graph format, feeds it to the
// receive tracer and filter, and pushes it onto the send queue of every
// address of the graph node selected by GraphNodeRing. Items that cannot be
// converted or routed are logged and skipped. If at least one queue refuses
// an item, proc.SendToGraphDropCnt is increased once for that item.
func Push2GraphSendQueue(items []*cmodel.MetaData) {
	graphCfg := g.Config().Graph

	for _, item := range items {
		graphItem, err := convert2GraphItem(item)
		if err != nil {
			log.Println("E:", err)
			continue
		}
		key := item.PK() // primary key of the metric

		// Let the tracer and the filter inspect every incoming item.
		proc.RecvDataTrace.Trace(key, item)
		proc.RecvDataFilter.Filter(key, item.Value, item)

		node, err := GraphNodeRing.GetNode(key)
		if err != nil {
			log.Println("E:", err)
			continue
		}

		// Every address of the node has its own queue, keyed by node+addr.
		dropped := false
		for _, addr := range graphCfg.ClusterList[node].Addrs {
			if !GraphQueues[node+addr].PushFront(graphItem) {
				dropped = true
			}
		}

		if dropped {
			proc.SendToGraphDropCnt.Incr()
		}
	}
}

// convert2GraphItem builds the graph representation of d. The step is raised
// to MinStep when smaller, the heartbeat is twice the step, and the counter
// type is mapped to a data-source type with its Min/Max bounds: GAUGE stays
// GAUGE with "U"/"U", while COUNTER and DERIVE both become DERIVE with
// "0"/"U". Any other counter type yields an error, returned together with the
// item as populated so far. On success the timestamp is aligned to the step.
func convert2GraphItem(d *cmodel.MetaData) (*cmodel.GraphItem, error) {
	step := int(d.Step)
	if step < MinStep {
		step = MinStep
	}

	item := &cmodel.GraphItem{
		Endpoint:  d.Endpoint,
		Metric:    d.Metric,
		Tags:      d.Tags,
		Timestamp: d.Timestamp,
		Value:     d.Value,
		Step:      step,
		Heartbeat: step * 2,
	}

	switch d.CounterType {
	case g.GAUGE:
		item.DsType = d.CounterType
		item.Min = "U"
		item.Max = "U"
	case g.COUNTER, g.DERIVE:
		item.DsType = g.DERIVE
		item.Min = "0"
		item.Max = "U"
	default:
		return item, fmt.Errorf("not_supported_counter_type")
	}

	item.Timestamp = alignTs(item.Timestamp, int64(item.Step))

	return item, nil
}

最后是tsdb，算比较简单了

// Push2TsdbSendQueue converts every item to the tsdb format and pushes it
// onto TsdbQueue; items the queue refuses are counted in
// proc.SendToTsdbDropCnt.
func Push2TsdbSendQueue(items []*cmodel.MetaData) {
	for _, item := range items {
		if !TsdbQueue.PushFront(convert2TsdbItem(item)) {
			proc.SendToTsdbDropCnt.Incr()
		}
	}
}

// convert2TsdbItem builds the tsdb representation of d. The tags are copied
// into a fresh map and the endpoint is stored under the "endpoint" tag,
// overriding any reported tag with that name.
func convert2TsdbItem(d *cmodel.MetaData) *cmodel.TsdbItem {
	tags := make(map[string]string, len(d.Tags)+1)
	for name, value := range d.Tags {
		tags[name] = value
	}
	tags["endpoint"] = d.Endpoint

	return &cmodel.TsdbItem{
		Metric:    d.Metric,
		Tags:      tags,
		Timestamp: d.Timestamp,
		Value:     d.Value,
	}
}

2.3.2 telnet 服务

// StartSocket runs the telnet-style socket server. It returns immediately
// when the socket service is disabled in the configuration; otherwise it
// listens on the configured TCP address and handles every accepted connection
// in its own goroutine. Failing to resolve or bind the address terminates the
// process.
func StartSocket() {
	if !g.Config().Socket.Enabled {
		return
	}

	listenAddr := g.Config().Socket.Listen
	resolved, err := net.ResolveTCPAddr("tcp", listenAddr)
	if err != nil {
		log.Fatalf("net.ResolveTCPAddr fail: %s", err)
	}

	ln, err := net.ListenTCP("tcp", resolved)
	if err != nil {
		log.Fatalf("listen %s fail: %s", listenAddr, err)
	}
	// log.Fatalf does not return, so reaching this line means the listener is up.
	log.Println("socket listening", listenAddr)

	defer ln.Close()

	for {
		c, acceptErr := ln.Accept()
		if acceptErr != nil {
			log.Println("listener.Accept occur error:", acceptErr)
			continue
		}

		go socketTelnetHandle(c)
	}
}

实际的处理代码，随便看看，反正都废弃了，transfer还有这种操作

// socketTelnetHandle serves one telnet-style connection. It reads
// newline-terminated lines until a read fails (including the per-read
// timeout) or the client sends "quit", keeps every well-formed
// "update <endpoint> <metric> <ts> <value> ..." line, and when the session
// ends pushes the collected items to the graph and judge send queues that are
// enabled. The connection is always closed on return.
func socketTelnetHandle(conn net.Conn) {
	defer conn.Close()

	items := []*cmodel.MetaData{}
	reader := bufio.NewReader(conn)

	cfg := g.Config()
	timeout := time.Duration(cfg.Socket.Timeout) * time.Second

	for {
		// Each read gets a fresh deadline; stop if it cannot be armed, since
		// the following read could otherwise block forever.
		if err := conn.SetReadDeadline(time.Now().Add(timeout)); err != nil {
			break
		}
		line, err := reader.ReadString('\n')
		if err != nil {
			break
		}

		// Telnet clients end lines with "\r\n". Trimming only "\n" left a
		// trailing "\r", so "quit" was never recognised; strip all
		// surrounding whitespace instead.
		line = strings.TrimSpace(line)

		if line == "quit" { // client asked to end the session
			break
		}

		if line == "" { // skip blank lines
			continue
		}

		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue
		}

		// Only the "update" command is supported; anything else is ignored.
		if fields[0] != "update" {
			continue
		}

		item, err := convertLine2MetaData(fields[1:])
		if err != nil {
			continue
		}

		items = append(items, item)
	}

	// Self-monitoring counters for data received over the socket.
	received := int64(len(items))
	proc.SocketRecvCnt.IncrBy(received)
	proc.RecvCnt.IncrBy(received)

	// NOTE(review): unlike RecvMetricValues, items received here are not
	// pushed to the tsdb queue — confirm whether that is intentional.
	if cfg.Graph.Enabled {
		sender.Push2GraphSendQueue(items)
	}

	if cfg.Judge.Enabled {
		sender.Push2JudgeSendQueue(items)
	}
}

// convertLine2MetaData turns the arguments of a socket "update" command into
// a MetaData item. fields must hold 4 to 6 entries:
//
//	endpoint metric timestamp value [counterType [step]]
//
// counterType defaults to g.COUNTER and must be one of DERIVE, GAUGE or
// COUNTER. The optional sixth field is either a plain integer step or a
// four-part colon-separated value whose last part is the step
// (heartbeat:min:max:step according to the original author's note); without
// it the step is g.DEFAULT_STEP. On any error the returned item is nil.
func convertLine2MetaData(fields []string) (item *cmodel.MetaData, err error) {
	switch len(fields) {
	case 4, 5, 6:
		// accepted shapes
	default:
		return nil, fmt.Errorf("not_enough_fileds")
	}

	ts, err := strconv.ParseInt(fields[2], 10, 64)
	if err != nil {
		return nil, err
	}

	value, err := strconv.ParseFloat(fields[3], 64)
	if err != nil {
		return nil, err
	}

	counterType := g.COUNTER
	if len(fields) >= 5 {
		counterType = fields[4]
	}
	switch counterType {
	case g.DERIVE, g.GAUGE, g.COUNTER:
		// supported
	default:
		return nil, fmt.Errorf("invalid_counter_type")
	}

	var step int64 = g.DEFAULT_STEP
	if len(fields) == 6 {
		parts := strings.Split(fields[5], ":")

		var rawStep string
		switch len(parts) {
		case 1: // the field is just the step
			rawStep = parts[0]
		case 4: // the step is the last of four parts
			rawStep = parts[3]
		default:
			return nil, fmt.Errorf("invalid_counter_step")
		}

		if step, err = strconv.ParseInt(rawStep, 10, 64); err != nil {
			return nil, err
		}
	}

	return &cmodel.MetaData{
		Metric:      fields[1],
		Endpoint:    fields[0],
		Timestamp:   ts,
		Step:        step,
		Value:       value,
		CounterType: counterType,
		Tags:        make(map[string]string),
	}, nil
}

3 http

四种api

// Start launches the HTTP server in a background goroutine and returns
// immediately.
func Start() {
	go startHttpServer()
}

// startHttpServer registers all HTTP routes and serves them on the configured
// address. It returns without doing anything when HTTP is disabled or no
// listen address is configured. ListenAndServe blocks; when it returns, its
// error is logged and the process exits.
func startHttpServer() {
	if !g.Config().Http.Enabled {
		return
	}

	addr := g.Config().Http.Listen
	if addr == "" {
		return
	}

	registrars := []func(){
		configCommonRoutes,    // common routes
		configProcHttpRoutes,  // self-monitoring routes
		configDebugHttpRoutes, // connection pool routes
		configApiRoutes,       // metric push route
	}
	for _, register := range registrars {
		register()
	}

	// NOTE(review): 1 << 30 allows request headers of up to 1 GiB — confirm
	// that such a large limit is intended.
	srv := &http.Server{
		Addr:           addr,
		MaxHeaderBytes: 1 << 30,
	}

	log.Println("http.startHttpServer ok, listening", addr)
	serveErr := srv.ListenAndServe()
	log.Fatalln(serveErr)
}

3.1 常规api

就每个组件都有的那些api

// configCommonRoutes registers the routes shared by every component:
// /health, /version, /workdir, /config and /config/reload.
func configCommonRoutes() {
	// Liveness check: always answers "ok".
	http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, "ok\n")
	})

	// Component version.
	http.HandleFunc("/version", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintf(w, "%s\n", g.VERSION)
	})

	// Working directory of the process.
	http.HandleFunc("/workdir", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintf(w, "%s\n", file.SelfDir())
	})

	// Current configuration as JSON.
	http.HandleFunc("/config", func(w http.ResponseWriter, r *http.Request) {
		RenderDataJson(w, g.Config())
	})

	// Re-read the configuration file. Only requests whose remote address
	// starts with "127.0.0.1" are allowed; everything else (including the
	// IPv6 loopback address) is refused.
	http.HandleFunc("/config/reload", func(w http.ResponseWriter, r *http.Request) {
		if !strings.HasPrefix(r.RemoteAddr, "127.0.0.1") {
			RenderDataJson(w, "no privilege")
			return
		}
		g.ParseConfig(g.ConfigFile)
		RenderDataJson(w, "ok")
	})
}

3.2 自监控api

包括两个返回所有自监控指标的 api、一个查询最小 step 的 api，以及一个追踪器和一个过滤器

追踪器和过滤器这两个官网都没介绍的功能，我试了一下，真的可以用，但是感觉没啥太大用处，都只能设置一个监控项，还不如直接在告警层面上操作

// configProcHttpRoutes registers the self-monitoring routes: the counter
// dumps, the minimum step query, the metric tracer and the metric filter.
func configProcHttpRoutes() {
	// All self-monitoring counters.
	http.HandleFunc("/counter/all", func(w http.ResponseWriter, r *http.Request) {
		RenderDataJson(w, proc.GetAll())
	})

	// All self-monitoring counters; this route is about to be deprecated.
	http.HandleFunc("/statistics/all", func(w http.ResponseWriter, r *http.Request) {
		RenderDataJson(w, proc.GetAll())
	})

	// Minimum step used by the sender.
	http.HandleFunc("/proc/step", func(w http.ResponseWriter, r *http.Request) {
		RenderDataJson(w, map[string]interface{}{"min_step": sender.MinStep})
	})

	// Trace the latest values of one metric.
	// URL format: /trace/<endpoint>/<metric>[/<k=v,k=v>]
	// Each request (re)sets the traced metric and returns the values traced
	// so far.
	http.HandleFunc("/trace/", func(w http.ResponseWriter, r *http.Request) {
		urlParam := r.URL.Path[len("/trace/"):]
		args := strings.Split(urlParam, "/")

		// endpoint and metric are mandatory; without this guard a short URL
		// such as /trace/host indexed past the end of args and panicked.
		if len(args) < 2 {
			RenderDataJson(w, "bad args")
			return
		}
		endpoint, metric := args[0], args[1]

		tags := make(map[string]string)
		if len(args) > 2 {
			tags = parseTagPairs(args[2])
		}

		proc.RecvDataTrace.SetPK(cutils.PK(endpoint, metric, tags))
		RenderDataJson(w, proc.RecvDataTrace.GetAllTraced())
	})

	// Filter the values of one metric by a threshold condition.
	// URL format: /filter/<endpoint>/<metric>/<operator>/<threshold>[/<k=v,k=v>]
	// Each request (re)sets the filter and returns the values filtered so
	// far. Supported operators per the original author's note: eq, ne, gt, lt.
	http.HandleFunc("/filter/", func(w http.ResponseWriter, r *http.Request) {
		urlParam := r.URL.Path[len("/filter/"):]
		args := strings.Split(urlParam, "/")

		// endpoint, metric, operator and threshold are mandatory; without
		// this guard a short URL indexed past the end of args and panicked.
		if len(args) < 4 {
			RenderDataJson(w, "bad args")
			return
		}
		endpoint, metric, opt := args[0], args[1], args[2]

		threadhold, err := strconv.ParseFloat(args[3], 64)
		if err != nil {
			RenderDataJson(w, "bad threadhold")
			return
		}

		tags := make(map[string]string)
		if len(args) > 4 {
			tags = parseTagPairs(args[4])
		}

		err = proc.RecvDataFilter.SetFilter(cutils.PK(endpoint, metric, tags), opt, threadhold)
		if err != nil {
			RenderDataJson(w, err.Error())
			return
		}

		RenderDataJson(w, proc.RecvDataFilter.GetAllFiltered())
	})
}

// parseTagPairs parses a comma-separated "k=v,k2=v2" URL segment into a map.
// Entries that do not split into exactly two parts on "=" are ignored.
func parseTagPairs(raw string) map[string]string {
	tags := make(map[string]string)
	for _, pair := range strings.Split(raw, ",") {
		kv := strings.Split(pair, "=")
		if len(kv) == 2 {
			tags[kv[0]] = kv[1]
		}
	}
	return tags
}

3.3 连接池api

查询各个连接池的情况，活跃连接数空闲连接数这些

// Copyright 2017 Xiaomi, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package http

import (
	"fmt"
	"github.com/open-falcon/falcon-plus/modules/transfer/sender"
	"net/http"
	"strings"
)

// configDebugHttpRoutes registers /debug/connpool/<module>, which prints the
// connection pool statistics of the given module, one pool per line.
// Supported modules are "judge" and "graph"; any other value (including an
// empty one) yields "bad args, module not exist".
func configDebugHttpRoutes() {
	http.HandleFunc("/debug/connpool/", func(w http.ResponseWriter, r *http.Request) {
		urlParam := r.URL.Path[len("/debug/connpool/"):]

		// strings.Split always returns at least one element, so the first
		// path segment can be taken directly; the former "len < 1" guard
		// could never trigger and has been removed.
		receiver := strings.Split(urlParam, "/")[0]

		var result string
		switch receiver {
		case "judge":
			result = strings.Join(sender.JudgeConnPools.Proc(), "\n")
		case "graph":
			result = strings.Join(sender.GraphConnPools.Proc(), "\n")
		default:
			result = "bad args, module not exist\n"
		}
		fmt.Fprint(w, result)
	})
}

3.4 监控数据上报

通过api也是可以上报监控数据的

// api_push_datapoints handles a metric push over HTTP. The request body must
// be a JSON array of metric values; an empty body (Content-Length 0) or
// undecodable JSON is answered with 400. Otherwise the batch is processed by
// RecvMetricValues with source "http" and the resulting reply is rendered as
// JSON.
func api_push_datapoints(rw http.ResponseWriter, req *http.Request) {
	if req.ContentLength == 0 {
		http.Error(rw, "blank body", http.StatusBadRequest)
		return
	}

	var metrics []*cmodel.MetricValue
	if err := json.NewDecoder(req.Body).Decode(&metrics); err != nil {
		http.Error(rw, "decode error", http.StatusBadRequest)
		return
	}

	reply := new(cmodel.TransferResponse)
	prpc.RecvMetricValues(metrics, reply, "http")

	RenderDataJson(rw, reply)
}

// configApiRoutes registers the metric push endpoint: /api/push is handled by
// api_push_datapoints.
func configApiRoutes() {
	http.HandleFunc("/api/push", api_push_datapoints)
}