问题场景:
应用程序初始化阶段,从数据库读取记录作为输入,调用OpenSSL函数进行加密计算。数据库记录数直接影响计算总耗时,为加快运算进度,默认采用4线程并发处理。
问题描述:
最近测试反馈,应用程序的core dump目录下出现core文件。登录测试坏境,由于系统采用容器部署应用程序,而宿主机的标准库版本可能与应用程序使用的库版本不一致,执行如下命令创建容器用于调试:
docker run -it --name app_gdb -v /opt/core-docker/app/app-38:/opt/local/tmp/core localhost/service/app:V1.0 bash
如下是gdb调试core文件结果:
[root@localhost ~]# gdb app core_G_7f558e7fa700_25_1604933120
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `/opt/demo/bin/app'.
Program terminated with signal 11, Segmentation fault.
#0 sha1_block_data_order_avx () at sha1-x86_64.s:3456
3456 sha1-x86_64.s: No such file or directory.
warning: Missing auto-load scripts referenced in section .debug_gdb_scripts
of file /opt/demo/lib/libhttp.so
Use `info auto-load python [REGEXP]' to list them.
Missing separate debuginfos, use: debuginfo-install glibc-2.17-157.el7_3.1.x86_64 libgcc-4.8.5-11.el7.x86_64 libstdc++-4.8.5-11.el7.x86_64 zlib-1.2.7-17.el7.x86_ 64
(gdb) bt
#0 sha1_block_data_order_avx () at sha1-x86_64.s:3456
#1 0xca62c1d6ca62c1d6 in ?? ()
#2 0xca62c1d6ca62c1d6 in ?? ()
#3 0xca62c1d6ca62c1d6 in ?? ()
#4 0xca62c1d6ca62c1d6 in ?? ()
#5 0xca62c1d6ca62c1d6 in ?? ()
#6 0xca62c1d6ca62c1d6 in ?? ()
#7 0xca62c1d6ca62c1d6 in ?? ()
#8 0xca62c1d6ca62c1d6 in ?? ()
#9 0x00007fd8bc87c900 in ?? ()
#10 0x0000000000000018 in ?? ()
#11 0x00007fd8b8001568 in ?? ()
#12 0x00007fd923ffe630 in ?? ()
#13 0x00000000000001ca in ?? ()
#14 0x000000000047db5e in SHA1_Update (c=<error reading variable: Cannot access memory at address 0x1a5e13f>,
data_=<error reading variable: Cannot access memory at address 0x1a5e137>, len=<error reading variable: Cannot access memory at address 0x1a5e12f>)
at ../md32_common.h:350
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
从bt的堆栈结果看: 线程7f558e7fa700(core文件名包含线程id)调用堆栈损坏,而且根本没有业务应用的调用代码信息。这个问题比较棘手啦!
原因分析:
看到毫无头绪的堆栈,咨询旁边的高手,有两方面的怀疑:
- 栈踩踏:线程栈空间存在大块内存申请,造成栈溢出;
- 多线程:多线程访问全局对象不互斥,导致线程执行的不确定性。
结合这两个怀疑点,走查代码发现疑点1:栈空间存在大块内存申请。示例如下:
rapidjson::StringBuffer buffer;
Element stKey; // 栈上的临时变量
stKey.reqid = reqid;
stKey.rsp = buffer.GetString(); // 按照测试模型 buffer大小接近900KB; 该条语句存在两次深拷贝: 1. GetString返回值构造string对象; 2. buffer内容拷贝至rsp字段
KeysHandler::GetInstance().AddElement(stKey); // 函数AddElement参数为值传递,存在一次深拷贝
这是由于前期不熟悉业务,未考虑到后续数据指标提高。将使用栈内存修改为堆内存,如下:
rapidjson::StringBuffer buffer;
Element_Ptr pkey = Element_Ptr(new Element(reqId)); // new 申请堆内存,采用智能指针托管
pkey->rsp = string(buffer.GetString());
Handler::GetInstance().AddElement(pkey);
针对疑点2走读排查,业务代码在公共对象上都已增加互斥锁。修改疑点1后,接着在此修复版本上复现。按照测试的数据模型,用例多次执行后,core dump依旧。看到结果,倒吸一口凉气。
静下来捋一捋整个问题:
- 测试反馈用例未执行时段,同样出现core。查看日志应用存在加密计算,与测试描述不符。要对周边反馈保持怀疑,想办法通过其他手段来证实。
- 业务功能近期没有大改,加密计算的代码与旧版本一致。而旧版本经过多轮测试从未出现过core。
- 数据规模的变化:新系统的数据指标 高于 旧版本的数据模型,线程的运算压力要高于旧版本。难道还是多线程并发处理造成的?
结合bt堆栈,只有OpenSSL的“足迹”,难不成是OpenSSL的问题?久经“码”场的高手从旁敲打:
- 增加运算线程数复现
- 会不会是OpenSSL的bug?
将默认4线程调整为16线程,问题很快复现,更加增加对多线程的怀疑。(之前自身考虑过增加线程复现,限于验证环境并非独享,未避免影响他人测试,在协调环境上未坚决协调)
OpenSSL是否有Bug,找度娘,问谷哥(咨询同事如何科学上网,blablabla…着实耗费精力)。
检索关键词:openssl sha1_block_data_order_avx
果然是 OpenSSL,还是谷哥靠谱,有问必有解啊!
解决方案:
更准确地说,是OpenSSL多线程使用的问题。正确方式是这样的:
详见:
https://www.openssl.org/docs/man1.0.2/man3/threads.html
https://www.openssl.org/blog/blog/2017/02/21/threads/
应用程序使用的OpenSSL版本:1.0.2q;结合官方描述增加两个回调,代码如下(linux版本):
static pthread_mutex_t *lock_cs;
static long *lock_count;
void CRYPTO_thread_setup(void)
{
int i;
// CRYPTO_num_locks返回值41,此处申请41把锁pthread_mutex_t
lock_cs=(pthread_mutex_t *)OPENSSL_malloc(CRYPTO_num_locks() * sizeof(pthread_mutex_t));
lock_count=(long *)OPENSSL_malloc(CRYPTO_num_locks() * sizeof(long));
for (i=0; i<CRYPTO_num_locks(); i++)
{
lock_count[i]=0;
pthread_mutex_init(&(lock_cs[i]),NULL);
}
CRYPTO_set_id_callback((unsigned long (*)())pthreads_thread_id);
CRYPTO_set_locking_callback((void (*)(int, int, const char *, int))pthreads_locking_callback);
}
// 在确认工作线程退出时,调用
void CRYPTO_thread_cleanup(void)
{
int i;
LOG_INFO("CRYPTO_thread_cleanup begin.");
CRYPTO_set_locking_callback(NULL);
for (i=0; i<CRYPTO_num_locks(); i++)
{
pthread_mutex_destroy(&(lock_cs[i]));
LOG_INFO("pthread_mutex_destroy %8ld:%s.", lock_count[i], CRYPTO_get_lock_name(i));
}
OPENSSL_free(lock_cs);
OPENSSL_free(lock_count);
LOG_INFO("CRYPTO_thread_cleanup end.");
}
// 参数mode确定加锁 or 解锁;参数type指向锁Id,确定加/解哪把锁
// 参数file、line表示加解锁的文件名称和行号
void pthreads_locking_callback(int mode, int type, const char *file, int line)
{
// 增加日志,观察OpenSSL哪些地方需要加锁保证互斥访问
LOG_INFO("thread=%4d mode=%s lock=%s %s:%d.", CRYPTO_thread_id(), (mode&CRYPTO_LOCK)?"lock":"unlock", (type&CRYPTO_READ)?"read":"write", file, line);
if (CRYPTO_LOCK_SSL_CERT == type)
{
LOG_INFO("(t,m,f,l) %ld %d %s %d.", CRYPTO_thread_id(), mode, file, line);
}
if (mode & CRYPTO_LOCK)
{
pthread_mutex_lock(&(lock_cs[type]));
lock_count[type]++;
}
else
{
pthread_mutex_unlock(&(lock_cs[type]));
}
}
// 获取线程Id
unsigned long pthreads_thread_id(void)
{
unsigned long ret;
ret=(unsigned long)pthread_self();
return(ret);
}
官方示例见源码:crypto/threads/mttest.c
OpenSSL源码解析
crypto\crypto.h
// 锁索引Id
# define CRYPTO_LOCK_ERR 1
# define CRYPTO_LOCK_EX_DATA 2
# define CRYPTO_LOCK_X509 3
# define CRYPTO_LOCK_X509_INFO 4
# define CRYPTO_LOCK_X509_PKEY 5
# define CRYPTO_LOCK_X509_CRL 6
# define CRYPTO_LOCK_X509_REQ 7
# define CRYPTO_LOCK_DSA 8
# define CRYPTO_LOCK_RSA 9
# define CRYPTO_LOCK_EVP_PKEY 10
# define CRYPTO_LOCK_X509_STORE 11
# define CRYPTO_LOCK_SSL_CTX 12
# define CRYPTO_LOCK_SSL_CERT 13
# define CRYPTO_LOCK_SSL_SESSION 14
# define CRYPTO_LOCK_SSL_SESS_CERT 15
# define CRYPTO_LOCK_SSL 16
# define CRYPTO_LOCK_SSL_METHOD 17
# define CRYPTO_LOCK_RAND 18
# define CRYPTO_LOCK_RAND2 19
# define CRYPTO_LOCK_MALLOC 20
# define CRYPTO_LOCK_BIO 21
# define CRYPTO_LOCK_GETHOSTBYNAME 22
# define CRYPTO_LOCK_GETSERVBYNAME 23
# define CRYPTO_LOCK_READDIR 24
# define CRYPTO_LOCK_RSA_BLINDING 25
# define CRYPTO_LOCK_DH 26
# define CRYPTO_LOCK_MALLOC2 27
# define CRYPTO_LOCK_DSO 28
# define CRYPTO_LOCK_DYNLOCK 29
# define CRYPTO_LOCK_ENGINE 30
# define CRYPTO_LOCK_UI 31
# define CRYPTO_LOCK_ECDSA 32
# define CRYPTO_LOCK_EC 33
# define CRYPTO_LOCK_ECDH 34
# define CRYPTO_LOCK_BN 35
# define CRYPTO_LOCK_EC_PRE_COMP 36
# define CRYPTO_LOCK_STORE 37
# define CRYPTO_LOCK_COMP 38
# define CRYPTO_LOCK_FIPS 39
# define CRYPTO_LOCK_FIPS2 40
# define CRYPTO_NUM_LOCKS 41
crypto\cryptlib.c
int CRYPTO_num_locks(void)
{
return CRYPTO_NUM_LOCKS;
}
// 锁名称
/* real #defines in crypto.h, keep these upto date */
static const char *const lock_names[CRYPTO_NUM_LOCKS] = {
"<<ERROR>>",
"err",
"ex_data",
"x509",
"x509_info",
"x509_pkey",
"x509_crl",
"x509_req",
"dsa",
"rsa",
"evp_pkey",
"x509_store",
"ssl_ctx",
"ssl_cert",
"ssl_session",
"ssl_sess_cert",
"ssl",
"ssl_method",
"rand",
"rand2",
"debug_malloc",
"BIO",
"gethostbyname",
"getservbyname",
"readdir",
"RSA_blinding",
"dh",
"debug_malloc2",
"dso",
"dynlock",
"engine",
"ui",
"ecdsa",
"ec",
"ecdh",
"bn",
"ec_pre_comp",
"store",
"comp",
"fips",
"fips2",
#if CRYPTO_NUM_LOCKS != 41
# error "Inconsistency between crypto.h and cryptlib.c"
#endif
};
OpenSSL锁使用示例,截取至源文件md_rand.c
static void ssleay_rand_add(const void *buf, int num, double add)
{
int do_not_lock;
/* check if we already have the lock */
if (crypto_lock_rand) {
CRYPTO_THREADID cur;
CRYPTO_THREADID_current(&cur);
CRYPTO_r_lock(CRYPTO_LOCK_RAND2);
do_not_lock = !CRYPTO_THREADID_cmp(&locking_threadid, &cur);
CRYPTO_r_unlock(CRYPTO_LOCK_RAND2);
} else
do_not_lock = 0; // do_not_lock 表示不加锁:0-加锁,非0-解锁
if (!do_not_lock)
CRYPTO_w_lock(CRYPTO_LOCK_RAND); // 宏定义见下方定义,参数CRYPTO_LOCK_RAND 为type
/* 省略中间代码 */
if (!do_not_lock)
CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
}
# define CRYPTO_w_lock(type) \
CRYPTO_lock(CRYPTO_LOCK|CRYPTO_WRITE,type,__FILE__,__LINE__)
# define CRYPTO_w_unlock(type) \
CRYPTO_lock(CRYPTO_UNLOCK|CRYPTO_WRITE,type,__FILE__,__LINE__)
# define CRYPTO_r_lock(type) \
CRYPTO_lock(CRYPTO_LOCK|CRYPTO_READ,type,__FILE__,__LINE__)
# define CRYPTO_r_unlock(type) \
CRYPTO_lock(CRYPTO_UNLOCK|CRYPTO_READ,type,__FILE__,__LINE__)
# define CRYPTO_add(addr,amount,type) \
CRYPTO_add_lock(addr,amount,type,__FILE__,__LINE__)