目录
1. 代码准备
clone telegraf和influxdb的代码,我这都是clone的默认的master分支
// telegraf
git clone https://github.com/influxdata/telegraf.git
// influxdb
git clone https://github.com/influxdata/influxdb.git
clone的telegraf代码里,在/influxdata/telegraf/plugins/outputs下有influxdb和influxdb_v2目录,这里对应的是influxdb的不同版本,目前我clone的master分支的influxdb代码就是v2的版本,所以等下连接到influxdb的时候要注意对应的url为/v2/write
2. 先看influxdb写入数据的逻辑
代码位置:/github.com/influxdata/influxdb/http/api_handler.go
api_handler.go里面都是定义的http api的操作,这里我们只看写入操作:NewWriteHandler
... 省略其他handler操作
writeBackend := NewWriteBackend(b)
h.WriteHandler = NewWriteHandler(writeBackend)
...
进入write_handler.go查看NewWriteHandler具体代码
// NewWriteHandler creates a new handler at /api/v2/write to receive line protocol.
func NewWriteHandler(b *WriteBackend) *WriteHandler {
h := &WriteHandler{
Router: NewRouter(b.HTTPErrorHandler),
HTTPErrorHandler: b.HTTPErrorHandler,
Logger: b.Logger,
PointsWriter: b.PointsWriter,
BucketService: b.BucketService,
OrganizationService: b.OrganizationService,
EventRecorder: b.WriteEventRecorder,
}
// Route POST requests on writePath ("/api/v2/write") to handleWrite.
h.HandlerFunc("POST", writePath, h.handleWrite)
return h
}
这里注册的是处理http post请求的handler,具体的writePath为:
writePath = "/api/v2/write"
write的具体处理逻辑在handleWrite里面
// handleWrite handles an incoming write request (excerpt: only the key
// error-handling path is shown; parsing of the request body is elided).
func (h *WriteHandler) handleWrite(w http.ResponseWriter, r *http.Request) {
// Other code omitted; only the key part is kept.
...
// Hand the parsed points to the storage layer; on failure, log the error
// and return an internal HTTP error to the client.
if err := h.PointsWriter.WritePoints(ctx, points); err != nil {
logger.Error("Error writing points", zap.Error(err))
h.HandleHTTPError(ctx, &platform.Error{
Code: platform.EInternal,
Op: "http/handleWrite",
Msg: fmt.Sprintf("unable to write points to database: %v", err),
Err: err,
}, w)
return
}
...
}
核心逻辑位于/github.com/influxdata/influxdb/storage/engine.go的WritePoints
// WritePoints persists the given points (excerpt: only the WAL step is shown).
func (e *Engine) WritePoints(ctx context.Context, points []models.Point) error {
// ...
// Add the write to the WAL to be replayed if there is a crash or shutdown.
if _, err := e.wal.WriteMulti(ctx, values); err != nil {
return err
}
// ....
}
核心逻辑在 /github.com/influxdata/influxdb/storage/wal/wal.go的WriteMulti
// WriteMulti appends a batch of values to the WAL (excerpt).
func (l *WAL) WriteMulti(ctx context.Context, values map[string][]value.Value) (int, error) {
// ...
// Append the entry to the log; on failure, bump the write-error counter
// and report -1 as the id.
id, err := l.writeToLog(entry)
if err != nil {
l.tracker.IncWritesErr()
return -1, err
}
// ...
}
// writeToLog writes a single WAL entry to the log (excerpt).
func (l *WAL) writeToLog(entry WALEntry) (int, error) {
// ...
// (Author's note below: snappy compression is applied to the entry data
// here, per the upstream implementation; the body is elided in this excerpt.)
这里会启用snappy对数据进行压缩
// ...
}
3.telegraf如何收集数据写入influxdb
3.1 启动telegraf agent
代码位置 /github.com/influxdata/telegraf/cmd/telegraf/telegraf.go
main()里面会区分当前的运行环境
// main distinguishes the current runtime environment before starting the
// agent's reload loop (excerpt).
func main() {
// ...
// On Windows, when running as a service, take the Windows-specific path;
// otherwise fall through to the generic run loop below.
if runtime.GOOS == "windows" && windowsRunAsService() {
// Windows environment (service path elided in this excerpt).
}else {
// Non-Windows environment.
stop = make(chan struct{})
reloadLoop(
stop,
inputFilters,
outputFilters,
aggregatorFilters,
processorFilters,
)
}
}
// reloadLoop runs the agent with the given inputFilters, outputFilters,
// aggregatorFilters and processorFilters. Note: the last two parameters
// (aggregatorFilters, processorFilters) are not actually used.
func reloadLoop(
stop chan struct{},
inputFilters []string,
outputFilters []string,
aggregatorFilters []string,
processorFilters []string,
) {
// ...
// Run the agent; context.Canceled is treated as a normal shutdown rather
// than a fatal error.
err := runAgent(ctx, inputFilters, outputFilters)
if err != nil && err != context.Canceled {
log.Fatalf("E! [telegraf] Error running agent: %v", err)
}
// ....
}
// runAgent builds the agent from the given filters (setup elided in this
// excerpt) and runs it until the context is done.
func runAgent(ctx context.Context,
inputFilters []string,
outputFilters []string,
) error {
// ...
return ag.Run(ctx)
}
3.2 看下agent run具体都干了些啥
代码位置 /github.com/influxdata/telegraf/agent/agent.go
1.初始化所有的input/output插件
2.连接到outputs
3.启动inputs服务
4.goroutine运行inputs,运行成功后关闭inputs服务
5.如果存在processor,则运行processor
6.如果存在aggregrator,则运行aggregator
7.运行outputs
8.关闭outputs
// Run wires up and drives the whole metric pipeline:
// inputs -> (processors) -> (aggregators) -> outputs (excerpt).
func (a *Agent) Run(ctx context.Context) error {
// ...
// 1. Initialize all input/output plugins.
log.Printf("D! [agent] Initializing plugins")
err := a.initPlugins()
if err != nil {
return err
}
// 2. Connect to all configured output plugins.
log.Printf("D! [agent] Connecting outputs")
err = a.connectOutputs(ctx)
if err != nil {
return err
}
...
// 3. Start the service inputs (inputs that run their own listeners).
log.Printf("D! [agent] Starting service inputs")
err = a.startServiceInputs(ctx, inputC)
if err != nil {
return err
}
...
// 4. Run the inputs in a goroutine; once they finish, stop the service
// inputs and close the metric channel so downstream stages can drain.
wg.Add(1)
go func(dst chan telegraf.Metric) {
defer wg.Done()
// Run the input plugins.
err := a.runInputs(ctx, startTime, dst)
if err != nil {
log.Printf("E! [agent] Error running inputs: %v", err)
}
// Stop the service inputs.
log.Printf("D! [agent] Stopping service inputs")
a.stopServiceInputs()
close(dst)
log.Printf("D! [agent] Input channel closed")
}(dst)
...
// 5. If processors are configured, run them as the next pipeline stage.
if len(a.Config.Processors) > 0 {
dst = procC
wg.Add(1)
go func(src, dst chan telegraf.Metric) {
defer wg.Done()
err := a.runProcessors(src, dst)
if err != nil {
log.Printf("E! [agent] Error running processors: %v", err)
}
close(dst)
log.Printf("D! [agent] Processor channel closed")
}(src, dst)
// Chain the stages: this stage's output feeds the next stage.
src = dst
}
...
// 6. If aggregators are configured, run them as the next pipeline stage.
if len(a.Config.Aggregators) > 0 {
dst = outputC
wg.Add(1)
go func(src, dst chan telegraf.Metric) {
defer wg.Done()
err := a.runAggregators(startTime, src, dst)
if err != nil {
log.Printf("E! [agent] Error running aggregators: %v", err)
}
close(dst)
log.Printf("D! [agent] Output channel closed")
}(src, dst)
src = dst
}
...
// 7. Run the outputs, consuming the end of the pipeline.
go func(src chan telegraf.Metric) {
defer wg.Done()
err := a.runOutputs(startTime, src)
if err != nil {
log.Printf("E! [agent] Error running outputs: %v", err)
}
}(src)
....
// 8. Finally, close the outputs.
a.closeOutputs()
}
看下运行outputs里面都干了啥
// runOutputs starts a flush loop for every configured output, each on its
// own goroutine, using the per-plugin flush interval when one is set.
func (a *Agent) runOutputs(
startTime time.Time,
src <-chan telegraf.Metric,
) error {
// ...
// Iterate over every output and start its flush loop.
var wg sync.WaitGroup
for _, output := range a.Config.Outputs {
// Re-declare interval per iteration so a per-plugin override does
// not leak into the other outputs' goroutines.
interval := interval
// Overwrite agent flush_interval if this plugin has its own.
if output.Config.FlushInterval != 0 {
interval = output.Config.FlushInterval
}
wg.Add(1)
go func(output *models.RunningOutput) {
defer wg.Done()
// Optionally delay the first flush so it aligns with the interval boundary.
if a.Config.Agent.RoundInterval {
err := internal.SleepContext(
ctx, internal.AlignDuration(startTime, interval))
if err != nil {
return
}
}
a.flush(ctx, output, interval, jitter)
}(output)
}
}
// flush drives one output's flush loop until ctx is cancelled: it flushes on
// every ticker tick and on batch-ready signals, and performs a final flush
// on shutdown (excerpt).
func (a *Agent) flush(
ctx context.Context,
output *models.RunningOutput,
interval time.Duration,
jitter time.Duration,
) {
// ...
for {
// Favor shutdown over other methods.
select {
case <-ctx.Done():
logError(a.flushOnce(output, interval, output.Write))
return
default:
}
select {
case <-ticker.C:
logError(a.flushOnce(output, interval, output.Write))
case <-output.BatchReady:
// Favor the ticker over batch ready
select {
case <-ticker.C:
logError(a.flushOnce(output, interval, output.Write))
default:
logError(a.flushOnce(output, interval, output.WriteBatch))
}
case <-ctx.Done():
logError(a.flushOnce(output, interval, output.Write))
return
}
}
// ....
}
写入到influxdb的核心方法是output.Write
代码位置:/github.com/influxdata/telegraf/internal/models/running_output.go
// Write flushes a batch through the underlying output plugin (excerpt).
func (ro *RunningOutput) Write() error {
// ...
// ro.write() is an interface method; since we are writing to InfluxDB,
// see the InfluxDB plugin's implementation of Write() below.
err := ro.write(batch)
if err != nil {
// presumably Reject returns the batch to the buffer for a later
// retry — verify against the buffer implementation.
ro.buffer.Reject(batch)
return err
}
ro.buffer.Accept(batch)
// ...
}
查看influxdb里面实现的write()
// Write dispatches the metrics to one of the configured clients (excerpt).
func (i *InfluxDB) Write(metrics []telegraf.Metric) error {
// ...
client := i.clients[n]
// client.Write() is an interface method with udpClient, httpClient and
// MockClient implementations; here we follow the httpClient one.
err = client.Write(ctx, metrics)
if err == nil {
return nil
}
// ...
}
httpClient对write()的实现
代码位置:/github.com/influxdata/telegraf/plugins/outputs/influxdb_v2/http.go
// Write sends the metrics to InfluxDB
func (c *httpClient) Write(ctx context.Context, metrics []telegraf.Metric) error {
// ...
// If database_tag is not set in telegraf.conf, everything is written to
// the default database; otherwise the target database is chosen per
// metric via the database_tag value.
if c.config.DatabaseTag == "" {
err := c.writeBatch(ctx, c.config.Database, metrics)
if err != nil {
return err
}
} else {
// ...
for db, batch := range batches {
// Lazily create each target database once, unless creation is
// explicitly skipped; a creation failure is only logged, not fatal.
if !c.config.SkipDatabaseCreation && !c.createdDatabases[db] {
err := c.CreateDatabase(ctx, db)
if err != nil {
log.Printf("W! [outputs.influxdb] when writing to [%s]: database %q creation failed: %v",
c.config.URL, db, err)
}
}
err := c.writeBatch(ctx, db, batch)
if err != nil {
return err
}
}
// ...
}
}
核心代码是writeBatch()
// writeBatch serializes one batch of metrics and posts it to InfluxDB.
func (c *httpClient) writeBatch(ctx context.Context, db string, metrics []telegraf.Metric) error {
// Build the write URL from the configured URL, the retention policy and
// the given database name.
url, err := makeWriteURL(c.config.URL, db, c.config.RetentionPolicy, c.config.Consistency)
if err != nil {
return err
}
// ...
// Serialize the metrics and send the actual write request to InfluxDB —
// this is where the data finally leaves telegraf.
reader := influx.NewReader(metrics, c.config.Serializer)
req, err := c.makeWriteRequest(url, reader)
if err != nil {
return err
}
resp, err := c.client.Do(req.WithContext(ctx))
if err != nil {
return err
}
defer resp.Body.Close()
// ....
}
// makeWriteURL builds the write URL per scheme; from u.Path you can see the
// request will hit InfluxDB's /api/v2/write endpoint.
func makeWriteURL(loc *url.URL, db, rp, consistency string) (string, error) {
// ...
switch u.Scheme {
case "unix":
// Unix-socket locations are rewritten to a loopback HTTP address.
u.Scheme = "http"
u.Host = "127.0.0.1"
u.Path = "/api/v2/write"
case "http", "https":
u.Path = path.Join(u.Path, "/api/v2/write")
default:
return "", fmt.Errorf("unsupported scheme: %q", loc.Scheme)
}
u.RawQuery = params.Encode()
// ...
}
// makeWriteRequest builds the HTTP POST request that carries the write
// payload to InfluxDB.
func (c *httpClient) makeWriteRequest(url string, body io.Reader) (*http.Request, error) {
var err error
// If content_encoding is set to "gzip" in telegraf.conf, compress the body.
if c.config.ContentEncoding == "gzip" {
body, err = internal.CompressWithGzip(body)
if err != nil {
return nil, err
}
}
// Build the POST request to InfluxDB.
req, err := http.NewRequest("POST", url, body)
if err != nil {
return nil, err
}
// Set the Content-Type request header.
req.Header.Set("Content-Type", "text/plain; charset=utf-8")
c.addHeaders(req)
// If gzip compression is enabled, advertise it via Content-Encoding.
if c.config.ContentEncoding == "gzip" {
req.Header.Set("Content-Encoding", "gzip")
}
return req, nil
}
到此,终于把数据写入到influxdb了。。。。