多线程调用OpenSSL，应用程序偶现core dump

最新推荐文章于 2024-08-05 17:47:48 发布

迪司匹林

最新推荐文章于 2024-08-05 17:47:48 发布

阅读量1.8k

点赞数 2

文章标签： c++ segmentfault 安全多线程

本文链接：https://blog.csdn.net/ailoong/article/details/109687745

版权

问题场景：

应用程序初始化阶段，从数据库读取记录作为输入，调用OpenSSL函数进行加密计算。数据库记录数直接影响计算总耗时，为加快运算进度，默认采用4线程并发处理。

问题描述：

最近测试反馈，应用程序的core dump目录下出现core文件。登录测试坏境，由于系统采用容器部署应用程序，而宿主机的标准库版本可能与应用程序使用的库版本不一致，执行如下命令创建容器用于调试：

docker run -it --name app_gdb -v /opt/core-docker/app/app-38:/opt/local/tmp/core localhost/service/app:V1.0 bash

如下是gdb调试core文件结果：

[root@localhost ~]# gdb app core_G_7f558e7fa700_25_1604933120
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `/opt/demo/bin/app'.
Program terminated with signal 11, Segmentation fault.
#0  sha1_block_data_order_avx () at sha1-x86_64.s:3456
3456    sha1-x86_64.s: No such file or directory.
warning: Missing auto-load scripts referenced in section .debug_gdb_scripts
of file /opt/demo/lib/libhttp.so
Use `info auto-load python [REGEXP]' to list them.
Missing separate debuginfos, use: debuginfo-install glibc-2.17-157.el7_3.1.x86_64 libgcc-4.8.5-11.el7.x86_64 libstdc++-4.8.5-11.el7.x86_64 zlib-1.2.7-17.el7.x86_                       64
(gdb) bt
#0  sha1_block_data_order_avx () at sha1-x86_64.s:3456
#1  0xca62c1d6ca62c1d6 in ?? ()
#2  0xca62c1d6ca62c1d6 in ?? ()
#3  0xca62c1d6ca62c1d6 in ?? ()
#4  0xca62c1d6ca62c1d6 in ?? ()
#5  0xca62c1d6ca62c1d6 in ?? ()
#6  0xca62c1d6ca62c1d6 in ?? ()
#7  0xca62c1d6ca62c1d6 in ?? ()
#8  0xca62c1d6ca62c1d6 in ?? ()
#9  0x00007fd8bc87c900 in ?? ()
#10 0x0000000000000018 in ?? ()
#11 0x00007fd8b8001568 in ?? ()
#12 0x00007fd923ffe630 in ?? ()
#13 0x00000000000001ca in ?? ()
#14 0x000000000047db5e in SHA1_Update (c=<error reading variable: Cannot access memory at address 0x1a5e13f>,
    data_=<error reading variable: Cannot access memory at address 0x1a5e137>, len=<error reading variable: Cannot access memory at address 0x1a5e12f>)
    at ../md32_common.h:350
Backtrace stopped: previous frame inner to this frame (corrupt stack?)

从bt的堆栈结果看：线程7f558e7fa700（core文件名包含线程id）调用堆栈损坏，而且根本没有业务应用的调用代码信息。这个问题比较棘手啦！

原因分析：

看到毫无头绪的堆栈，咨询旁边的高手，有两方面的怀疑：

栈踩踏：线程栈空间存在大块内存申请，造成栈溢出；
多线程：多线程访问全局对象不互斥，导致线程执行的不确定性。

结合这两个怀疑点，走查代码发现疑点1：栈空间存在大块内存申请。示例如下：

rapidjson::StringBuffer buffer;		
Element stKey;	// 栈上的临时变量
stKey.reqid = reqid;
stKey.rsp = buffer.GetString();	// 按照测试模型 buffer大小接近900KB; 该条语句存在两次深拷贝: 1. GetString返回值构造string对象; 2. buffer内容拷贝至rsp字段
KeysHandler::GetInstance().AddElement(stKey);  // 函数AddElement参数为值传递，存在一次深拷贝

这是由于前期不熟悉业务，未考虑到后续数据指标提高。将使用栈内存修改为堆内存，如下：

rapidjson::StringBuffer buffer;	
Element_Ptr pkey = Element_Ptr(new Element(reqId));	 // new 申请堆内存，采用智能指针托管	 
pkey->rsp = string(buffer.GetString());
Handler::GetInstance().AddElement(pkey);

针对疑点2走读排查，业务代码在公共对象上都已增加互斥锁。修改疑点1后，接着在此修复版本上复现。按照测试的数据模型，用例多次执行后，core dump依旧。看到结果，倒吸一口凉气。

静下来捋一捋整个问题：

测试反馈用例未执行时段，同样出现core。查看日志应用存在加密计算，与测试描述不符。要对周边反馈保持怀疑，想办法通过其他手段来证实。
业务功能近期没有大改，加密计算的代码与旧版本一致。而旧版本经过多轮测试从未出现过core。
数据规模的变化：新系统的数据指标高于旧版本的数据模型，线程的运算压力要高于旧版本。难道还是多线程并发处理造成的？

结合bt堆栈，只有OpenSSL的“足迹”，难不成是OpenSSL的问题？久经“码”场的高手从旁敲打：

增加运算线程数复现
会不会是OpenSSL的bug？

将默认4线程调整为16线程，问题很快复现，更加增加对多线程的怀疑。（之前自身考虑过增加线程复现，限于验证环境并非独享，未避免影响他人测试，在协调环境上未坚决协调）

OpenSSL是否有Bug，找度娘，问谷哥（咨询同事如何科学上网，blablabla…着实耗费精力）。
检索关键词：openssl sha1_block_data_order_avx

果然是 OpenSSL，还是谷哥靠谱，有问必有解啊！

解决方案：

更准确地说，是OpenSSL多线程使用的问题。正确方式是这样的：
在这里插入图片描述
详见：
https://www.openssl.org/docs/man1.0.2/man3/threads.html
https://www.openssl.org/blog/blog/2017/02/21/threads/

应用程序使用的OpenSSL版本：1.0.2q；结合官方描述增加两个回调，代码如下（linux版本）：

static pthread_mutex_t *lock_cs;
static long *lock_count;

void CRYPTO_thread_setup(void)
{
	int i;

	// CRYPTO_num_locks返回值41，此处申请41把锁pthread_mutex_t
	lock_cs=(pthread_mutex_t *)OPENSSL_malloc(CRYPTO_num_locks() * sizeof(pthread_mutex_t));
	lock_count=(long *)OPENSSL_malloc(CRYPTO_num_locks() * sizeof(long));
	for (i=0; i<CRYPTO_num_locks(); i++)
	{
		lock_count[i]=0;
		pthread_mutex_init(&(lock_cs[i]),NULL);
	}

	CRYPTO_set_id_callback((unsigned long (*)())pthreads_thread_id);
	CRYPTO_set_locking_callback((void (*)(int, int, const char *, int))pthreads_locking_callback);
}

// 在确认工作线程退出时，调用
void CRYPTO_thread_cleanup(void)
{
	int i;
	LOG_INFO("CRYPTO_thread_cleanup begin.");
	CRYPTO_set_locking_callback(NULL);
	for (i=0; i<CRYPTO_num_locks(); i++)
	{
		pthread_mutex_destroy(&(lock_cs[i]));
		LOG_INFO("pthread_mutex_destroy %8ld:%s.", lock_count[i], CRYPTO_get_lock_name(i));
	}
	OPENSSL_free(lock_cs);
	OPENSSL_free(lock_count);
	LOG_INFO("CRYPTO_thread_cleanup end.");	
}

// 参数mode确定加锁 or 解锁；参数type指向锁Id，确定加/解哪把锁
// 参数file、line表示加解锁的文件名称和行号
void pthreads_locking_callback(int mode, int type, const char *file, int line)
{
	// 增加日志，观察OpenSSL哪些地方需要加锁保证互斥访问
	LOG_INFO("thread=%4d mode=%s lock=%s %s:%d.", CRYPTO_thread_id(), (mode&CRYPTO_LOCK)?"lock":"unlock", (type&CRYPTO_READ)?"read":"write", file, line);

	if (CRYPTO_LOCK_SSL_CERT == type)
	{
		LOG_INFO("(t,m,f,l) %ld %d %s %d.", CRYPTO_thread_id(), mode, file, line);
	}

	if (mode & CRYPTO_LOCK)
	{
		pthread_mutex_lock(&(lock_cs[type]));
		lock_count[type]++;
	}
	else
	{
		pthread_mutex_unlock(&(lock_cs[type]));
	}
}

// 获取线程Id
unsigned long pthreads_thread_id(void)	
{
	unsigned long ret;

	ret=(unsigned long)pthread_self();
	return(ret);
}

官方示例见源码：crypto/threads/mttest.c

OpenSSL源码解析

crypto\crypto.h

// 锁索引Id
# define CRYPTO_LOCK_ERR                 1
# define CRYPTO_LOCK_EX_DATA             2
# define CRYPTO_LOCK_X509                3
# define CRYPTO_LOCK_X509_INFO           4
# define CRYPTO_LOCK_X509_PKEY           5
# define CRYPTO_LOCK_X509_CRL            6
# define CRYPTO_LOCK_X509_REQ            7
# define CRYPTO_LOCK_DSA                 8
# define CRYPTO_LOCK_RSA                 9
# define CRYPTO_LOCK_EVP_PKEY            10
# define CRYPTO_LOCK_X509_STORE          11
# define CRYPTO_LOCK_SSL_CTX             12
# define CRYPTO_LOCK_SSL_CERT            13
# define CRYPTO_LOCK_SSL_SESSION         14
# define CRYPTO_LOCK_SSL_SESS_CERT       15
# define CRYPTO_LOCK_SSL                 16
# define CRYPTO_LOCK_SSL_METHOD          17
# define CRYPTO_LOCK_RAND                18
# define CRYPTO_LOCK_RAND2               19
# define CRYPTO_LOCK_MALLOC              20
# define CRYPTO_LOCK_BIO                 21
# define CRYPTO_LOCK_GETHOSTBYNAME       22
# define CRYPTO_LOCK_GETSERVBYNAME       23
# define CRYPTO_LOCK_READDIR             24
# define CRYPTO_LOCK_RSA_BLINDING        25
# define CRYPTO_LOCK_DH                  26
# define CRYPTO_LOCK_MALLOC2             27
# define CRYPTO_LOCK_DSO                 28
# define CRYPTO_LOCK_DYNLOCK             29
# define CRYPTO_LOCK_ENGINE              30
# define CRYPTO_LOCK_UI                  31
# define CRYPTO_LOCK_ECDSA               32
# define CRYPTO_LOCK_EC                  33
# define CRYPTO_LOCK_ECDH                34
# define CRYPTO_LOCK_BN                  35
# define CRYPTO_LOCK_EC_PRE_COMP         36
# define CRYPTO_LOCK_STORE               37
# define CRYPTO_LOCK_COMP                38
# define CRYPTO_LOCK_FIPS                39
# define CRYPTO_LOCK_FIPS2               40
# define CRYPTO_NUM_LOCKS                41

crypto\cryptlib.c

int CRYPTO_num_locks(void)
{
    return CRYPTO_NUM_LOCKS;
}

// 锁名称
/* real #defines in crypto.h, keep these upto date */
static const char *const lock_names[CRYPTO_NUM_LOCKS] = {  
    "<<ERROR>>",
    "err",
    "ex_data",
    "x509",
    "x509_info",
    "x509_pkey",
    "x509_crl",
    "x509_req",
    "dsa",
    "rsa",
    "evp_pkey",
    "x509_store",
    "ssl_ctx",
    "ssl_cert",
    "ssl_session",
    "ssl_sess_cert",
    "ssl",
    "ssl_method",
    "rand",
    "rand2",
    "debug_malloc",
    "BIO",
    "gethostbyname",
    "getservbyname",
    "readdir",
    "RSA_blinding",
    "dh",
    "debug_malloc2",
    "dso",
    "dynlock",
    "engine",
    "ui",
    "ecdsa",
    "ec",
    "ecdh",
    "bn",
    "ec_pre_comp",
    "store",
    "comp",
    "fips",
    "fips2",
#if CRYPTO_NUM_LOCKS != 41
# error "Inconsistency between crypto.h and cryptlib.c"
#endif
};

OpenSSL锁使用示例，截取至源文件md_rand.c

static void ssleay_rand_add(const void *buf, int num, double add)
{
    int do_not_lock;

    /* check if we already have the lock */
    if (crypto_lock_rand) {
        CRYPTO_THREADID cur;
        CRYPTO_THREADID_current(&cur);
        CRYPTO_r_lock(CRYPTO_LOCK_RAND2);
        do_not_lock = !CRYPTO_THREADID_cmp(&locking_threadid, &cur);
        CRYPTO_r_unlock(CRYPTO_LOCK_RAND2);
    } else
        do_not_lock = 0;	// do_not_lock 表示不加锁：0-加锁，非0-解锁

    if (!do_not_lock)
        CRYPTO_w_lock(CRYPTO_LOCK_RAND);	// 宏定义见下方定义，参数CRYPTO_LOCK_RAND 为type

	/* 省略中间代码 */	

    if (!do_not_lock)
        CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
}


#   define CRYPTO_w_lock(type)     \
        CRYPTO_lock(CRYPTO_LOCK|CRYPTO_WRITE,type,__FILE__,__LINE__)
        
#   define CRYPTO_w_unlock(type)   \
        CRYPTO_lock(CRYPTO_UNLOCK|CRYPTO_WRITE,type,__FILE__,__LINE__)
        
#   define CRYPTO_r_lock(type)     \
        CRYPTO_lock(CRYPTO_LOCK|CRYPTO_READ,type,__FILE__,__LINE__)
        
#   define CRYPTO_r_unlock(type)   \
        CRYPTO_lock(CRYPTO_UNLOCK|CRYPTO_READ,type,__FILE__,__LINE__)
        
#   define CRYPTO_add(addr,amount,type)    \
        CRYPTO_add_lock(addr,amount,type,__FILE__,__LINE__)