database/sql: rows.Next panic from concurrent map writes

版权声明:本文为Doctorq原创文章,未经博主允许不得转载。 https://blog.csdn.net/qhshiniba/article/details/82347851

问题描述

我们最近用golang重构了以前c++的系统,在最后上线前的压测环节,我负责的模块偶现如下错误:

[mysql] 2018/09/02 23:17:35 packets.go:72: read tcp xx:39151->xx:3306: i/o timeout
 [mysql] 2018/09/02 23:17:35 packets.go:408: busy buffer
 [mysql] 2018/09/02 23:17:35 packets.go:72: read tcp xx:39151->xx:3306: i/o timeout
 [mysql] 2018/09/02 23:17:35 packets.go:408: busy buffer
 [mysql] 2018/09/02 23:17:35 connection.go:372: invalid connection
 [mysql] 2018/09/02 23:17:35 connection.go:372: invalid connection
 [mysql] 2018/09/02 23:17:35 packets.go:72: read tcp xx:39151->xx:3306: i/o timeout
 [mysql] 2018/09/02 23:17:35 packets.go:408: busy buffer
 [mysql] 2018/09/02 23:17:35 connection.go:372: invalid connection
 fatal error: concurrent map writes

   goroutine 4932 [running]:
    runtime.throw(0xcadf03, 0x15)
         /home/doctorq/soft/sf-go/src/runtime/panic.go:605 +0x95 fp=0xc422162758 sp=0xc422162738 pc=0x42d415
     runtime.mapdelete_fast64(0xb7d760, 0xc420180ff0, 0x91d)
         /home/doctorq/soft/sf-go/src/runtime/hashmap_fast.go:755 +0x212 fp=0xc4221627a0 sp=0xc422162758 pc=0x40ed12
     database/sql.(*DB).putConnDBLocked(0xc4201beaa0, 0xc4201880e0, 0x0, 0x0, 0x0)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:1109 +0x258 fp=0xc422162888 sp=0xc4221627a0 pc=0x919798
     database/sql.(*DB).putConn(0xc4201beaa0, 0xc4201880e0, 0x0, 0x0)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:1079 +0x10d fp=0xc4221628f0 sp=0xc422162888 pc=0x91943d
     database/sql.(*driverConn).releaseConn(0xc4201880e0, 0x0, 0x0)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:380 +0x47 fp=0xc422162920 sp=0xc4221628f0 pc=0x915897
     database/sql.(*driverConn).(database/sql.releaseConn)-fm(0x0, 0x0)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:643 +0x3e fp=0xc422162948 sp=0xc422162920 pc=0x92437e
     database/sql.(*Rows).close(0xc4213c1080, 0x0, 0x0, 0x0, 0x0)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:2767 +0x15c fp=0xc422162998 sp=0xc422162948 pc=0x92261c
     database/sql.(*Rows).Close(0xc4213c1080, 0xc4213c10b0, 0xc4221629f0)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:2738 +0x3d fp=0xc4221629d0 sp=0xc422162998 pc=0x92248d
     database/sql.(*Rows).Next(0xc4213c1080, 0xc42733f780)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:2445 +0xa4 fp=0xc422162a20 sp=0xc4221629d0 pc=0x920fa4
     (*MysqlAdapter).ParserRows(0xc4200115c0, 0x11ca6a0, 0xc4263eab10, 0xc4213c1080, 0x0, 0x0, 0x0, 0x0, 0x0)

报错信息显示,我不正确的使用了非安全的map导致的。但是错误信息指向了database/sql库,我觉得不太可能是golang基础库的问题,所以我从自身代码找问题。

解决过程

问题指向的代码如下:


func (p *MysqlAdapter) ParserRows(ctx context.Context, rows *sql.Rows) ([]map[string]interface{}, error) {
    if rows == nil {
        errStr := "rows is nil"
        return nil, errors.New(errStr)
    }
    defer rows.Close()
    columns, err := rows.Columns()
    if err != nil {
        errStr := fmt.Sprintf("rows.Columns error:[%s]", err.Error())
        log.Error(ctx, errStr)
        return nil, errors.New(errStr)
    }
    count := len(columns)
    values := make([]interface{}, count)
    valuePtrs := make([]interface{}, count)
    var resultRows []map[string]interface{}
    for rows.Next() {//panic发生的点
        for i, _ := range columns {
            valuePtrs[i] = &values[i]
        }
        rows.Scan(valuePtrs...)

        resultMap := make(map[string]interface{}, count)
        for i, col := range columns {
            var v interface{}
            val := values[i]
            b, ok := val.([]byte)
            if ok {
                v = string(b)
            } else {
                v = val
            }
            resultMap[col] = v
        }
        resultRows = append(resultRows, resultMap)
    }
    log.Infof(ctx, "row size:[%d]", len(resultRows))
    return resultRows, nil
}

rows.Next()执行的时候报的错,首先怀疑我自己的resultRows这个map使用不当,所以我把普通map更换成了sync.Map,替换后的代码如下:

func (p *MysqlAdapter) ParserRows(ctx context.Context, rows *sql.Rows) ([]sync.Map, error) {
    if rows == nil {
        errStr := "rows is nil"
        return nil, errors.New(errStr)
    }
    defer rows.Close()
    columns, err := rows.Columns()
    if err != nil {
        errStr := fmt.Sprintf("rows.Columns error:[%s]", err.Error())
        log.Error(ctx, errStr)
        return nil, errors.New(errStr)
    }
    count := len(columns)
    values := make([][]byte, count)
    scans := make([]interface{}, count)
    for i, _ := range columns {
        scans[i] = &values[i]
    }

    var syncMaps []sync.Map
    for rows.Next() {
        if err := rows.Scan(scans...); err != nil { //query.Scan查询出来的不定长值放到scans[i] = &values[i],也就是每行都放在values里
            errStr := "mysql scan failed"
            log.Errorf(ctx, errStr)
            return nil, errors.New(errStr)
        }
        var syncMap sync.Map
        for k, v := range values {
            key := columns[k]
            syncMap.Store(key, string(v))
        }
        syncMaps = append(syncMaps, syncMap)
    }
    //  log.Infof(ctx, "row size:[%d]", len(syncMaps))
    return syncMaps, nil
}

然后继续压测,仍然偶现(有时运行1小时就挂了,有时候运行6个小时也没事)。在经历了3天都搞到晚上2点都失败了,仍然没解决这个问题后,我求助golang开发人员了。

过程中我还重构了代码,任然没解决。最后我放弃了,求助于golang/go和go-sql-driver开发人员

golang/go的开发人员很快回复了我。让我用race和vet工具辅助定位问题,

继续-战

我用race工具确实查出了有同时读写map的问题。

==================
WARNING: DATA RACE
Write at 0x00c4202b8008 by goroutine 306:
  database/sql.(*DB).putConnDBLocked()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:1119 +0x214
  database/sql.(*DB).putConn()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:1079 +0x1a8
  database/sql.(*driverConn).releaseConn()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:380 +0x65
  database/sql.(*driverConn).(database/sql.releaseConn)-fm()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:643 +0x55
  database/sql.(*Rows).close()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:2767 +0x287
  database/sql.(*Rows).Close()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:2738 +0x4a
  database/sql.(*Rows).Next()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:2445 +0x120
  gitlab.sftcwl.com/dispatch/data/handler/db.(*MysqlAdapter).ParserRows()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/db/mysql_adapter.go:87 +0x52e
  gitlab.sftcwl.com/dispatch/data/handler/db.(*MysqlAdapter).OrderQuery()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/db/mysql_adapter.go:65 +0x47c
  gitlab.sftcwl.com/dispatch/data/handler/db.(*MysqlProxy).GetOrderId()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/db/mysql_proxy.go:179 +0xe03
  gitlab.sftcwl.com/dispatch/data/handler/proxy.(*OrderProxy).GetUndistributedOrder()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/proxy/order.go:76 +0x112
  gitlab.sftcwl.com/dispatch/data/handler/service.(*GetOrder).Access()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/service/get_order.go:35 +0x32c
  gitlab.sftcwl.com/dispatch/data/handler.(*DataServiceHandler).GetOrderDetail()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/DataService_handler.go:45 +0xcb
  gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto._DataService_GetOrderDetail_Handler.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto/data.pb.go:2709 +0xa1
  gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:31 +0x2c7
  gitlab.sftcwl.com/dispatch/data/framework/interceptor/perfcounter.UnaryServerInterceptor.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/framework/interceptor/perfcounter/perfcounter.go:32 +0x142
  gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:34 +0x1ca
  gitlab.sftcwl.com/dispatch/data/framework/interceptor/header.UnaryServerInterceptor.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/framework/interceptor/header/header.go:59 +0x2ae
  gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:39 +0x367
  gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto._DataService_GetOrderDetail_Handler()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto/data.pb.go:2711 +0x1ec
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).processUnaryRPC()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:1026 +0xbb7
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).handleStream()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:1252 +0x14a6
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).serveStreams.func1.1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:699 +0xac

Previous read at 0x00c4202b8008 by goroutine 309:
  runtime.growslice()
      /home/doctorq/soft/sf-go/src/runtime/slice.go:82 +0x0
  database/sql.(*DB).putConnDBLocked()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:1119 +0x30c
  database/sql.(*DB).putConn()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:1079 +0x1a8
  database/sql.(*driverConn).releaseConn()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:380 +0x65
  database/sql.(*driverConn).(database/sql.releaseConn)-fm()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:643 +0x55
  database/sql.(*Rows).close()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:2767 +0x287
  database/sql.(*Rows).Close()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:2738 +0x4a
  database/sql.(*Rows).Next()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:2445 +0x120
  gitlab.sftcwl.com/dispatch/data/handler/db.(*MysqlAdapter).ParserRows()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/db/mysql_adapter.go:87 +0x52e
  gitlab.sftcwl.com/dispatch/data/handler/db.(*MysqlAdapter).OrderQuery()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/db/mysql_adapter.go:65 +0x47c
  gitlab.sftcwl.com/dispatch/data/handler/db.(*MysqlProxy).GetOrderId()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/db/mysql_proxy.go:179 +0xe03
  gitlab.sftcwl.com/dispatch/data/handler/proxy.(*OrderProxy).GetUndistributedOrder()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/proxy/order.go:76 +0x112
  gitlab.sftcwl.com/dispatch/data/handler/service.(*GetOrder).Access()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/service/get_order.go:35 +0x32c
  gitlab.sftcwl.com/dispatch/data/handler.(*DataServiceHandler).GetOrderDetail()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/DataService_handler.go:45 +0xcb
  gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto._DataService_GetOrderDetail_Handler.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto/data.pb.go:2709 +0xa1
  gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:31 +0x2c7
  gitlab.sftcwl.com/dispatch/data/framework/interceptor/perfcounter.UnaryServerInterceptor.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/framework/interceptor/perfcounter/perfcounter.go:32 +0x142
  gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:34 +0x1ca
  gitlab.sftcwl.com/dispatch/data/framework/interceptor/header.UnaryServerInterceptor.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/framework/interceptor/header/header.go:59 +0x2ae
  gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:39 +0x367
  gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto._DataService_GetOrderDetail_Handler()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto/data.pb.go:2711 +0x1ec
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).processUnaryRPC()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:1026 +0xbb7
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).handleStream()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:1252 +0x14a6
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).serveStreams.func1.1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:699 +0xac

Goroutine 306 (running) created at:
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).serveStreams.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:697 +0xb8
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport.(*http2Server).operateHeaders()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport/http2_server.go:432 +0x14f9
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport.(*http2Server).HandleStreams()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport/http2_server.go:472 +0x8da
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).serveStreams()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:695 +0x183
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).handleRawConn.func2()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:649 +0x53
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).handleRawConn.func3()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:657 +0x34

Goroutine 309 (running) created at:
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).serveStreams.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:697 +0xb8
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport.(*http2Server).operateHeaders()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport/http2_server.go:432 +0x14f9
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport.(*http2Server).HandleStreams()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport/http2_server.go:472 +0x8da
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).serveStreams()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:695 +0x183
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).handleRawConn.func2()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:649 +0x53
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).handleRawConn.func3()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:657 +0x34
==================
race: limit on 8192 simultaneously alive goroutines is exceeded, dying
exit status 66

但是很奇怪的是,这个问题仍然定位到database/sql库。然后我尝试用第二个工具vet,静态代码检测工具来定位代码问题,果然发现了问题所在。

handler/db/mysql_adapter.go:98: call of append copies lock value: sync.Map contains sync.Mutex
framework/app/mysql.go:130: literal copies lock value from *client: database/sql.DB contains sync.Mutex

这个地方是说这些对象,不能复制,应该用指针,所以我修改了我原先的方法改成指针传递。

func (p *MysqlAdapter) ParserRows(ctx context.Context, rows *sql.Rows) ([]*sync.Map, error) {
    if rows == nil {
        errStr := "rows is nil"
        return nil, errors.New(errStr)
    }
    defer rows.Close()
    columns, err := rows.Columns()
    if err != nil {
        errStr := fmt.Sprintf("rows.Columns error:[%s]", err.Error())
        log.Error(ctx, errStr)
        return nil, errors.New(errStr)
    }
    count := len(columns)
    values := make([][]byte, count)
    scans := make([]interface{}, count)
    for i, _ := range columns {
        scans[i] = &values[i]
    }

    var syncMaps []*sync.Map
    for rows.Next() {
        if err := rows.Scan(scans...); err != nil { //query.Scan查询出来的不定长值放到scans[i] = &values[i],也就是每行都放在values里
            errStr := "mysql scan failed"
            log.Errorf(ctx, errStr)
            return nil, errors.New(errStr)
        }
        var syncMap sync.Map
        for k, v := range values {
            key := columns[k]
            syncMap.Store(key, string(v))
        }
        syncMaps = append(syncMaps, &syncMap)
    }
    //  log.Infof(ctx, "row size:[%d]", len(syncMaps))
    return syncMaps, nil
}

根本原因

根本原因还是mysql client的创建过程,也用了复制:


type DpMysql struct {
    mysqlclient.Client
}


var client *mysqlclient.Client
    client, err = mysqlclient.NewClient(mysqlConfig)
    if err != nil {
        return nil, err
    }

    MysqlNS.config = mysqlConfig

    return &DpMysql{*client}, nil

因为复制的过程,会引起sync.Mutex传递出错,所以用指针来传递。修改后的代码如下:

type DpMysql struct {
    *mysqlclient.Client
}
var client *mysqlclient.Client
    client, err = mysqlclient.NewClient(mysqlConfig)
    if err != nil {
        return nil, err
    }

    MysqlNS.config = mysqlConfig

    return &DpMysql{client}, nil

至此,问题解决。

总结

  • race工具能直接定位到进程之间同步锁的问题。
  • vet工具能很准确的定位到语法问题,很有用。
  • 新人入行,踩坑了。
  • 多进程尽量使用sync.Map
  • 对象用指针向上传递

没有更多推荐了,返回首页