一、TensorFlow 离线训练导出格式
- TensorFlow 离线训练的结果可以用多种格式导出,如 SessionBundle、SavedModel、Frozen Graph。本次使用 SavedModel 方式保存模型,并用于在线预估。
二、在线预估代码
- 模型加载逻辑
bool TensorModel::LoadModel(const std::string& model_path) {
tensorflow::SessionOptions session_options;
tensorflow::RunOptions run_options;
std::unordered_set<std::string> saved_model_tags;
saved_model_tags.insert(tensorflow::kSavedModelTagServe);
tensorflow::Status status = tensorflow::LoadSavedModel(session_options, run_options, model_path,
saved_model_tags, &model_bundle_);
if (!status.ok()) {
LOG(ERROR) << "LoadSavedModel Failed: " << status.ToString();
return false;
}
return true;
}
- 在线预估逻辑:预估时请求在下面这行调用处 hang 住,一直没有返回
// Runs inference on the loaded session: feeds tf_feature.input, fetches the
// single tensor named by out_tname, passes no target nodes, and writes the
// fetched tensors into outputs.
// NOTE(review): no RunOptions are passed to this call, so no timeout is
// configured here; this is the call the note reports as hanging. Consider the
// Run overload that accepts RunOptions (timeout_in_ms) -- TODO confirm against
// the TensorFlow version in use.
tensorflow::Status status = model_bundle_.session->Run(tf_feature.input, {out_tname}, {}, &outputs);
- 具体堆栈(从栈帧中的 tensorflow::serving 符号看,该堆栈是用 tf_serving 复现时抓取的;Thread 4 阻塞在 BatchingSession::Run 的条件变量等待上)
Thread 14 (Thread 0x7f738b7ce700 (LWP 7641)):
#0 0x00007f74888bf995 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f748835a55c in __gthread_cond_wait (__mutex=<optimized out>, __cond=<optimized out>) at /tmp/gcc-4.9.2/x86_64-unknown-linux-gnu/libstdc++-v3/include/x86_64-unknown-linux-gnu/bits/gthr-default.h:864
#2 std::condition_variable::wait (this=<optimized out>, __lock=...) at ../../../.././libstdc++-v3/src/c++11/condition_variable.cc:52
#3 0x0000000002ba4874 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WaitForWork(Eigen::EventCount::Waiter*, tensorflow::thread::EigenEnvironment::Task*) ()
#4 0x0000000002ba52a2 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WorkerLoop(int) ()
#5 0x0000000002ba4208 in std::_Function_handler<void (), tensorflow::thread::EigenEnvironment::CreateThread(std::function<void ()>)::{lambda()#1}>::_M_invoke(std::_Any_data const&) ()
#6 0x00007f748835e1e0 in std::(anonymous namespace)::execute_native_thread_routine (__p=<optimized out>) at ../../../.././libstdc++-v3/src/c++11/thread.cc:84
#7 0x00007f74888bbe25 in start_thread () from /lib64/libpthread.so.0
#8 0x00007f7487dc6bad in clone () from /lib64/libc.so.6
Thread 13 (Thread 0x7f738afcd700 (LWP 7642)):
#0 0x00007f74888bf995 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f748835a55c in __gthread_cond_wait (__mutex=<optimized out>, __cond=<optimized out>) at /tmp/gcc-4.9.2/x86_64-unknown-linux-gnu/libstdc++-v3/include/x86_64-unknown-linux-gnu/bits/gthr-default.h:864
#2 std::condition_variable::wait (this=<optimized out>, __lock=...) at ../../../.././libstdc++-v3/src/c++11/condition_variable.cc:52
#3 0x0000000002ba4874 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WaitForWork(Eigen::EventCount::Waiter*, tensorflow::thread::EigenEnvironment::Task*) ()
#4 0x0000000002ba52a2 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WorkerLoop(int) ()
#5 0x0000000002ba4208 in std::_Function_handler<void (), tensorflow::thread::EigenEnvironment::CreateThread(std::function<void ()>)::{lambda()#1}>::_M_invoke(std::_Any_data const&) ()
#6 0x00007f748835e1e0 in std::(anonymous namespace)::execute_native_thread_routine (__p=<optimized out>) at ../../../.././libstdc++-v3/src/c++11/thread.cc:84
#7 0x00007f74888bbe25 in start_thread () from /lib64/libpthread.so.0
#8 0x00007f7487dc6bad in clone () from /lib64/libc.so.6
Thread 12 (Thread 0x7f738a7cc700 (LWP 7643)):
#0 0x00007f74888bf995 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f748835a55c in __gthread_cond_wait (__mutex=<optimized out>, __cond=<optimized out>) at /tmp/gcc-4.9.2/x86_64-unknown-linux-gnu/libstdc++-v3/include/x86_64-unknown-linux-gnu/bits/gthr-default.h:864
#2 std::condition_variable::wait (this=<optimized out>, __lock=...) at ../../../.././libstdc++-v3/src/c++11/condition_variable.cc:52
#3 0x0000000002ba4874 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WaitForWork(Eigen::EventCount::Waiter*, tensorflow::thread::EigenEnvironment::Task*) ()
#4 0x0000000002ba52a2 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WorkerLoop(int) ()
#5 0x0000000002ba4208 in std::_Function_handler<void (), tensorflow::thread::EigenEnvironment::CreateThread(std::function<void ()>)::{lambda()#1}>::_M_invoke(std::_Any_data const&) ()
#6 0x00007f748835e1e0 in std::(anonymous namespace)::execute_native_thread_routine (__p=<optimized out>) at ../../../.././libstdc++-v3/src/c++11/thread.cc:84
#7 0x00007f74888bbe25 in start_thread () from /lib64/libpthread.so.0
#8 0x00007f7487dc6bad in clone () from /lib64/libc.so.6
Thread 11 (Thread 0x7f7389fcb700 (LWP 7644)):
#0 0x00007f74888bf995 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f748835a55c in __gthread_cond_wait (__mutex=<optimized out>, __cond=<optimized out>) at /tmp/gcc-4.9.2/x86_64-unknown-linux-gnu/libstdc++-v3/include/x86_64-unknown-linux-gnu/bits/gthr-default.h:864
#2 std::condition_variable::wait (this=<optimized out>, __lock=...) at ../../../.././libstdc++-v3/src/c++11/condition_variable.cc:52
#3 0x0000000002ba4874 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WaitForWork(Eigen::EventCount::Waiter*, tensorflow::thread::EigenEnvironment::Task*) ()
#4 0x0000000002ba52a2 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WorkerLoop(int) ()
#5 0x0000000002ba4208 in std::_Function_handler<void (), tensorflow::thread::EigenEnvironment::CreateThread(std::function<void ()>)::{lambda()#1}>::_M_invoke(std::_Any_data const&) ()
#6 0x00007f748835e1e0 in std::(anonymous namespace)::execute_native_thread_routine (__p=<optimized out>) at ../../../.././libstdc++-v3/src/c++11/thread.cc:84
#7 0x00007f74888bbe25 in start_thread () from /lib64/libpthread.so.0
#8 0x00007f7487dc6bad in clone () from /lib64/libc.so.6
Thread 10 (Thread 0x7f73897ca700 (LWP 7645)):
#0 0x00007f74888bf995 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f748835a55c in __gthread_cond_wait (__mutex=<optimized out>, __cond=<optimized out>) at /tmp/gcc-4.9.2/x86_64-unknown-linux-gnu/libstdc++-v3/include/x86_64-unknown-linux-gnu/bits/gthr-default.h:864
#2 std::condition_variable::wait (this=<optimized out>, __lock=...) at ../../../.././libstdc++-v3/src/c++11/condition_variable.cc:52
#3 0x0000000002ba4874 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WaitForWork(Eigen::EventCount::Waiter*, tensorflow::thread::EigenEnvironment::Task*) ()
#4 0x0000000002ba52a2 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WorkerLoop(int) ()
#5 0x0000000002ba4208 in std::_Function_handler<void (), tensorflow::thread::EigenEnvironment::CreateThread(std::function<void ()>)::{lambda()#1}>::_M_invoke(std::_Any_data const&) ()
#6 0x00007f748835e1e0 in std::(anonymous namespace)::execute_native_thread_routine (__p=<optimized out>) at ../../../.././libstdc++-v3/src/c++11/thread.cc:84
#7 0x00007f74888bbe25 in start_thread () from /lib64/libpthread.so.0
#8 0x00007f7487dc6bad in clone () from /lib64/libc.so.6
Thread 9 (Thread 0x7f7388fc9700 (LWP 7646)):
#0 0x00007f74888bf995 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f748835a55c in __gthread_cond_wait (__mutex=<optimized out>, __cond=<optimized out>) at /tmp/gcc-4.9.2/x86_64-unknown-linux-gnu/libstdc++-v3/include/x86_64-unknown-linux-gnu/bits/gthr-default.h:864
#2 std::condition_variable::wait (this=<optimized out>, __lock=...) at ../../../.././libstdc++-v3/src/c++11/condition_variable.cc:52
#3 0x0000000002ba4874 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WaitForWork(Eigen::EventCount::Waiter*, tensorflow::thread::EigenEnvironment::Task*) ()
#4 0x0000000002ba52a2 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WorkerLoop(int) ()
#5 0x0000000002ba4208 in std::_Function_handler<void (), tensorflow::thread::EigenEnvironment::CreateThread(std::function<void ()>)::{lambda()#1}>::_M_invoke(std::_Any_data const&) ()
#6 0x00007f748835e1e0 in std::(anonymous namespace)::execute_native_thread_routine (__p=<optimized out>) at ../../../.././libstdc++-v3/src/c++11/thread.cc:84
#7 0x00007f74888bbe25 in start_thread () from /lib64/libpthread.so.0
#8 0x00007f7487dc6bad in clone () from /lib64/libc.so.6
Thread 8 (Thread 0x7f73887c8700 (LWP 7647)):
#0 0x00007f74888bf995 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f748835a55c in __gthread_cond_wait (__mutex=<optimized out>, __cond=<optimized out>) at /tmp/gcc-4.9.2/x86_64-unknown-linux-gnu/libstdc++-v3/include/x86_64-unknown-linux-gnu/bits/gthr-default.h:864
#2 std::condition_variable::wait (this=<optimized out>, __lock=...) at ../../../.././libstdc++-v3/src/c++11/condition_variable.cc:52
#3 0x0000000002ba4874 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WaitForWork(Eigen::EventCount::Waiter*, tensorflow::thread::EigenEnvironment::Task*) ()
#4 0x0000000002ba52a2 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WorkerLoop(int) ()
#5 0x0000000002ba4208 in std::_Function_handler<void (), tensorflow::thread::EigenEnvironment::CreateThread(std::function<void ()>)::{lambda()#1}>::_M_invoke(std::_Any_data const&) ()
#6 0x00007f748835e1e0 in std::(anonymous namespace)::execute_native_thread_routine (__p=<optimized out>) at ../../../.././libstdc++-v3/src/c++11/thread.cc:84
#7 0x00007f74888bbe25 in start_thread () from /lib64/libpthread.so.0
#8 0x00007f7487dc6bad in clone () from /lib64/libc.so.6
Thread 7 (Thread 0x7f7387fc7700 (LWP 7648)):
#0 0x00007f74888bf995 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f748835a55c in __gthread_cond_wait (__mutex=<optimized out>, __cond=<optimized out>) at /tmp/gcc-4.9.2/x86_64-unknown-linux-gnu/libstdc++-v3/include/x86_64-unknown-linux-gnu/bits/gthr-default.h:864
#2 std::condition_variable::wait (this=<optimized out>, __lock=...) at ../../../.././libstdc++-v3/src/c++11/condition_variable.cc:52
#3 0x0000000002ba4874 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WaitForWork(Eigen::EventCount::Waiter*, tensorflow::thread::EigenEnvironment::Task*) ()
#4 0x0000000002ba52a2 in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WorkerLoop(int) ()
#5 0x0000000002ba4208 in std::_Function_handler<void (), tensorflow::thread::EigenEnvironment::CreateThread(std::function<void ()>)::{lambda()#1}>::_M_invoke(std::_Any_data const&) ()
#6 0x00007f748835e1e0 in std::(anonymous namespace)::execute_native_thread_routine (__p=<optimized out>) at ../../../.././libstdc++-v3/src/c++11/thread.cc:84
#7 0x00007f74888bbe25 in start_thread () from /lib64/libpthread.so.0
#8 0x00007f7487dc6bad in clone () from /lib64/libc.so.6
Thread 6 (Thread 0x7f73b0f81700 (LWP 7655)):
#0 0x00007f74888bf995 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x000000000072a5e7 in gpr_cv_wait ()
#2 0x000000000071b1b4 in executor_thread(void*) ()
#3 0x000000000072a08d in thread_body(void*) ()
#4 0x00007f74888bbe25 in start_thread () from /lib64/libpthread.so.0
#5 0x00007f7487dc6bad in clone () from /lib64/libc.so.6
Thread 5 (Thread 0x7f73b1782700 (LWP 7656)):
#0 0x00007f74888bfd42 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x000000000072a637 in gpr_cv_wait ()
#2 0x0000000000710165 in timer_thread(void*) ()
#3 0x000000000072a08d in thread_body(void*) ()
#4 0x00007f74888bbe25 in start_thread () from /lib64/libpthread.so.0
#5 0x00007f7487dc6bad in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7f73b1f83700 (LWP 7659)):
#0 0x00007f74888bf995 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f748835a55c in __gthread_cond_wait (__mutex=<optimized out>, __cond=<optimized out>) at /tmp/gcc-4.9.2/x86_64-unknown-linux-gnu/libstdc++-v3/include/x86_64-unknown-linux-gnu/bits/gthr-default.h:864
#2 std::condition_variable::wait (this=<optimized out>, __lock=...) at ../../../.././libstdc++-v3/src/c++11/condition_variable.cc:52
#3 0x0000000002d3bd64 in nsync::nsync_mu_semaphore_p_with_deadline(nsync::nsync_semaphore_s_*, timespec) ()
#4 0x0000000002d3c191 in nsync::nsync_sem_wait_with_cancel_(nsync::waiter*, timespec, nsync::nsync_note_s_*) ()
#5 0x0000000002d3aa14 in nsync::nsync_cv_wait_with_deadline_generic(nsync::nsync_cv_s_*, void*, void (*)(void*), void (*)(void*), timespec, nsync::nsync_note_s_*) ()
#6 0x0000000002d3af4d in nsync::nsync_cv_wait(nsync::nsync_cv_s_*, nsync::nsync_mu_s_*) ()
#7 0x00000000007b5fab in tensorflow::serving::BatchingSession::Run(tensorflow::RunOptions const&, std::vector<std::pair<std::string, tensorflow::Tensor>, std::allocator<std::pair<std::string, tensorflow::Tensor> > > const&, std::vector<std::string, std::allocator<std::string> > const&, std::vector<std::string, std::allocator<std::string> > const&, std::vector<tensorflow::Tensor, std::allocator<tensorflow::Tensor> >*, tensorflow::RunMetadata*) ()
#8 0x0000000000753678 in tensorflow::serving::(anonymous namespace)::SavedModelPredict(tensorflow::RunOptions const&, tensorflow::serving::ServerCore*, tensorflow::serving::PredictRequest const&, tensorflow::serving::PredictResponse*) ()
#9 0x0000000000756a66 in tensorflow::serving::TensorflowPredictor::Predict(tensorflow::RunOptions const&, tensorflow::serving::ServerCore*, tensorflow::serving::PredictRequest const&, tensorflow::serving::PredictResponse*) ()
#10 0x00000000004f0944 in (anonymous namespace)::PredictionServiceImpl::Predict(grpc::ServerContext*, tensorflow::serving::PredictRequest const*, tensorflow::serving::PredictResponse*) ()
#11 0x00000000006b3b96 in std::_Function_handler<grpc::Status (tensorflow::serving::PredictionService::Service*, grpc::ServerContext*, tensorflow::serving::PredictRequest const*, tensorflow::serving::PredictResponse*), std::_Mem_fn<grpc::Status (tensorflow::serving::PredictionService::Service::*)(grpc::ServerContext*, tensorflow::serving::PredictRequest const*, tensorflow::serving::PredictResponse*)> >::_M_invoke(std::_Any_data const&, tensorflow::serving::PredictionService::Service*, grpc::ServerContext*, tensorflow::serving::PredictRequest const*, tensorflow::serving::PredictResponse*) ()
#12 0x00000000006b8150 in grpc::internal::RpcMethodHandler<tensorflow::serving::PredictionService::Service, tensorflow::serving::PredictRequest, tensorflow::serving::PredictResponse>::RunHandler(grpc::internal::MethodHandler::HandlerParameter const&) ()
#13 0x00000000006c2839 in grpc::Server::SyncRequestThreadManager::DoWork(void*, bool) ()
#14 0x00000000006c6f07 in grpc::ThreadManager::MainWorkLoop() ()
#15 0x00000000006c7024 in grpc::ThreadManager::WorkerThread::Run() ()
#16 0x00007f748835e1e0 in std::(anonymous namespace)::execute_native_thread_routine (__p=<optimized out>) at ../../../.././libstdc++-v3/src/c++11/thread.cc:84
#17 0x00007f74888bbe25 in start_thread () from /lib64/libpthread.so.0
#18 0x00007f7487dc6bad in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7f73b2784700 (LWP 7765)):
#0 0x00007f7487dc7183 in epoll_wait () from /lib64/libc.so.6
#1 0x0000000000724ca8 in pollset_work ()
#2 0x0000000000701aa6 in cq_next(grpc_completion_queue*, gpr_timespec, void*) ()
#3 0x00000000007022bb in grpc_completion_queue_next ()
#4 0x00000000006c7a11 in grpc::CompletionQueue::AsyncNextInternal(void**, bool*, gpr_timespec) ()
#5 0x00000000006bc999 in grpc::Server::SyncRequestThreadManager::PollForWork(void**, bool*) ()
#6 0x00000000006c6e41 in grpc::ThreadManager::MainWorkLoop() ()
#7 0x00000000006c7024 in grpc::ThreadManager::WorkerThread::Run() ()
#8 0x00007f748835e1e0 in std::(anonymous namespace)::execute_native_thread_routine (__p=<optimized out>) at ../../../.././libstdc++-v3/src/c++11/thread.cc:84
#9 0x00007f74888bbe25 in start_thread () from /lib64/libpthread.so.0
#10 0x00007f7487dc6bad in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7f740cff9700 (LWP 7766)):
#0 0x00007f74888bf995 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x000000000072a5e7 in gpr_cv_wait ()
#2 0x0000000000710165 in timer_thread(void*) ()
#3 0x000000000072a08d in thread_body(void*) ()
#4 0x00007f74888bbe25 in start_thread () from /lib64/libpthread.so.0
#5 0x00007f7487dc6bad in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f74890ec8c0 (LWP 7422)):
#0 0x00007f74888bf995 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f748835a55c in __gthread_cond_wait (__mutex=<optimized out>, __cond=<optimized out>) at /tmp/gcc-4.9.2/x86_64-unknown-linux-gnu/libstdc++-v3/include/x86_64-unknown-linux-gnu/bits/gthr-default.h:864
#2 std::condition_variable::wait (this=<optimized out>, __lock=...) at ../../../.././libstdc++-v3/src/c++11/condition_variable.cc:52
#3 0x00000000006c1eaa in grpc::Server::Wait() ()
#4 0x0000000000420f45 in main ()
三、问题总结
由于请求 hang 住时没有任何报错信息,尝试看堆栈、看源码、打日志都无法定位问题,感觉很无助。最开始一直怀疑是 input 的问题,于是改用 tf_serving 加载同一个模型进行预估,结果也存在同样的问题。
最终定位到问题出在离线侧:使用 SavedModel 保存模型的方式不对,导出的网络图结构不正确,没有保存正确的 output 图节点,导致在线预估时拿不到结果。