概述
在分析一些Android问题,比如ANR或Watchdog冻屏时,需要拿到相关进程的Java trace,然后分析是哪里出了问题。但是这个Java trace是怎么生成的呢?在Android中的Java进程一般都是运行在art虚拟机之上的,而要拿到相关进程的Java trace,则需要它来完成相关dump操作。根据代码实现也能说明这一点,在art里面运行了一个 SignalCatcher 线程,专门用来处理这个逻辑。
SignalCatcher 线程启动后,会循环等待 SIGQUIT (信号3)的发生,当收到SIGQUIT后,会触发art去执行dump操作。当完成dump后,会通过socket连接到tombstoned,将trace输出到指定路径(通常在/data/anr下),接下来进行下一轮等待。因为art拦截了SIGQUIT并用它来输出trace,所以进程收到该信号后并不会像普通Linux进程那样按默认行为退出。
在Watchdog(4) Trace生成过程一文中,我们分析了发生Watchdog时抓Java trace的流程,从实现看是system_server发送了SIGQUIT到目标进程,然后触发SignalCatcher去进行dump,后者完成dump后会连接tombstoned去输出trace内容。本篇主要来讲SignalCatcher接收SIGQUIT并生成Java trace的流程。
生成trace命令
在问题现场的机器还保留着的时候,我们可能会再抓一个Java trace,看看最新的状态。如下命令是比较常见的,不过通常需要root权限:
- debuggerd -j $pid
输出指定进程的 java traces,可以重定向输出
usage: debuggerd [-bj] PID
-b, --backtrace just a backtrace rather than a full tombstone
-j collect java traces
- kill -3 $pid
直接发送 SIGQUIT , 在 /data/anr/ 生成trace
示例:
$ adb root
restarting adbd as root
$ adb shell pidof system_server
516
$ adb shell debuggerd -j 516 > system_server_trace.txt
$ adb shell kill -3 516
$ adb shell ls /data/anr
trace_02
$ adb pull /data/anr/trace_02
trace 生成流程
从之前描述可知,SignalCatcher是生成Java trace的关键一环,我们首先来分析它。
SignalCatcher的启动
在Android中,应用/system_server是由zygote进程启动的。在启动进程之后,会执行SpecializeCommon对进程进行专门化处理,而在此流程里会去启动 SignalCatcher。下面来看看这个流程:
/// @frameworks/base/core/jni/com_android_internal_os_Zygote.cpp
// Utility routine to specialize a zygote child process.
//
// This is an excerpt: every line consisting of "..." marks code elided from the original
// function. In the order visible here it sets gids and rlimits, switches the gid, installs the
// seccomp filter and scheduler policy, switches the uid, adjusts the "dumpable" flag, enables
// the debugger when requested, configures GWP-ASan, sets capabilities and the SELinux context,
// names the main thread, unsets the SIGCHLD handler, calls back into the Java-side
// callPostForkChildHooks and finally resets the process priority.
// Failures of the set*id and SELinux calls are reported through fail_fn; prctl failures abort
// via RuntimeAbort.
static void SpecializeCommon(JNIEnv* env, uid_t uid, gid_t gid, jintArray gids, jint runtime_flags,
jobjectArray rlimits, jlong permitted_capabilities,
jlong effective_capabilities, jint mount_external,
jstring managed_se_info, jstring managed_nice_name,
bool is_system_server, bool is_child_zygote,
jstring managed_instruction_set, jstring managed_app_data_dir,
bool is_top_app, jobjectArray pkg_data_info_list,
jobjectArray allowlisted_data_info_list, bool mount_data_dirs,
bool mount_storage_dirs) {
const char* process_name = is_system_server ? "system_server" : "zygote";
auto fail_fn = std::bind(ZygoteFailure, env, process_name, managed_nice_name, _1);
auto extract_fn = std::bind(ExtractJString, env, process_name, managed_nice_name, _1);
...
SetGids(env, gids, is_child_zygote, fail_fn);
SetRLimits(env, rlimits, fail_fn);
...
if (setresgid(gid, gid, gid) == -1) {
fail_fn(CREATE_ERROR("setresgid(%d) failed: %s", gid, strerror(errno)));
}
// Must be called when the new process still has CAP_SYS_ADMIN, in this case,
// before changing uid from 0, which clears capabilities. The other
// alternative is to call prctl(PR_SET_NO_NEW_PRIVS, 1) afterward, but that
// breaks SELinux domain transition (see b/71859146). As the result,
// privileged syscalls used below still need to be accessible in app process.
SetUpSeccompFilter(uid, is_child_zygote);
// Must be called before losing the permission to set scheduler policy.
SetSchedulerPolicy(fail_fn, is_top_app);
if (setresuid(uid, uid, uid) == -1) {
fail_fn(CREATE_ERROR("setresuid(%d) failed: %s", uid, strerror(errno)));
}
// The "dumpable" flag is related to core dump collection.
// The "dumpable" flag of a process, which controls core dump generation, is
// overwritten by the value in /proc/sys/fs/suid_dumpable when the effective
// user or group ID changes. See proc(5) for possible values. In most cases,
// the value is 0, so core dumps are disabled for zygote children. However,
// when running in a Chrome OS container, the value is already set to 2,
// which allows the external crash reporter to collect all core dumps. Since
// only system crashes are interested, core dump is disabled for app
// processes. This also ensures compliance with CTS.
int dumpable = prctl(PR_GET_DUMPABLE);
if (dumpable == -1) {
ALOGE("prctl(PR_GET_DUMPABLE) failed: %s", strerror(errno));
RuntimeAbort(env, __LINE__, "prctl(PR_GET_DUMPABLE) failed");
}
if (dumpable == 2 && uid >= AID_APP) {
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) == -1) {
ALOGE("prctl(PR_SET_DUMPABLE, 0) failed: %s", strerror(errno));
RuntimeAbort(env, __LINE__, "prctl(PR_SET_DUMPABLE, 0) failed");
}
}
// Set process properties to enable debugging if required.
if ((runtime_flags & RuntimeFlags::DEBUG_ENABLE_JDWP) != 0) { // JDWP
EnableDebugger();
}
if ((runtime_flags & RuntimeFlags::PROFILE_FROM_SHELL) != 0) {
// simpleperf needs the process to be dumpable to profile it.
if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) == -1) {
ALOGE("prctl(PR_SET_DUMPABLE) failed: %s", strerror(errno));
RuntimeAbort(env, __LINE__, "prctl(PR_SET_DUMPABLE, 1) failed");
}
}
HeapTaggingLevel heap_tagging_level;
...
// GWP-ASan setup: ALWAYS forces it on and falls through to the same initialization call
// that LOTTERY uses; NEVER (and any unknown level) does nothing.
bool forceEnableGwpAsan = false;
switch (runtime_flags & RuntimeFlags::GWP_ASAN_LEVEL_MASK) {
default:
case RuntimeFlags::GWP_ASAN_LEVEL_NEVER:
break;
case RuntimeFlags::GWP_ASAN_LEVEL_ALWAYS:
forceEnableGwpAsan = true;
[[fallthrough]];
case RuntimeFlags::GWP_ASAN_LEVEL_LOTTERY:
android_mallopt(M_INITIALIZE_GWP_ASAN, &forceEnableGwpAsan, sizeof(forceEnableGwpAsan));
}
// Now that we've used the flag, clear it so that we don't pass unknown flags to the ART
// runtime.
runtime_flags &= ~RuntimeFlags::GWP_ASAN_LEVEL_MASK;
...
SetCapabilities(permitted_capabilities, effective_capabilities, permitted_capabilities,
fail_fn);
...
const char* se_info_ptr = se_info.has_value() ? se_info.value().c_str() : nullptr;
const char* nice_name_ptr = nice_name.has_value() ? nice_name.value().c_str() : nullptr;
// Apply the SELinux context.
if (selinux_android_setcontext(uid, is_system_server, se_info_ptr, nice_name_ptr) == -1) {
fail_fn(CREATE_ERROR("selinux_android_setcontext(%d, %d, \"%s\", \"%s\") failed", uid,
is_system_server, se_info_ptr, nice_name_ptr));
}
// Set the thread name.
// Make it easier to debug audit logs by setting the main thread's name to the
// nice name rather than "app_process".
if (nice_name.has_value()) {
SetThreadName(nice_name.value());
} else if (is_system_server) {
SetThreadName("system_server");
}
// Unset the SIGCHLD handler, but keep ignoring SIGHUP (rationale in SetSignalHandlers).
UnsetChldSignalHandler(); // Sets the SIGCHLD handler back to default behavior in zygote children.
...
// Call Zygote's callPostForkChildHooks (this is the path that leads to the SignalCatcher
// being started).
env->CallStaticVoidMethod(gZygoteClass, gCallPostForkChildHooks, runtime_flags,
is_system_server, is_child_zygote, managed_instruction_set);
// Reset the process priority to the default value.
setpriority(PRIO_PROCESS, 0, PROCESS_PRIORITY_DEFAULT);
...
}
Zygote#callPostForkChildHooks
/// @frameworks/base/core/java/com/android/internal/os/Zygote.java
// This function is called from native code in com_android_internal_os_Zygote.cpp
// It is a thin forwarder: all four arguments are passed unchanged to
// ZygoteHooks.postForkChild.
@SuppressWarnings("unused")
private static void callPostForkChildHooks(int runtimeFlags, boolean isSystemServer,
boolean isZygote, String instructionSet) {
ZygoteHooks.postForkChild(runtimeFlags, isSystemServer, isZygote, instructionSet);
}
ZygoteHooks#postForkChild
/// @libcore/dalvik/src/main/java/dalvik/system/ZygoteHooks.java
/**
 * Hook run by the zygote inside the child process after each fork.
 *
 * <p>Delegates to the native post-fork hook, reseeds {@code Math}'s random generator from the
 * current wall-clock time and, for processes other than system server, invokes the
 * memory-mapped coverage method when one has been resolved.
 *
 * @param runtimeFlags The runtime flags to apply to the child process.
 * @param isSystemServer Whether the child process is system server.
 * @param isChildZygote Whether the child process is a child zygote.
 * @param instructionSet The instruction set of the child, used to determine
 *                       whether to use a native bridge.
 * @throws RuntimeException wrapping any ReflectiveOperationException raised while invoking
 *                          the coverage method.
 *
 * @hide
 */
@SystemApi(client = MODULE_LIBRARIES)
@libcore.api.CorePlatformApi(status = libcore.api.CorePlatformApi.Status.STABLE)
public static void postForkChild(int runtimeFlags, boolean isSystemServer,
        boolean isChildZygote, String instructionSet) {
    // Enter the native hook first.
    nativePostForkChild(token, runtimeFlags, isSystemServer, isChildZygote, instructionSet);

    Math.setRandomSeedInternal(System.currentTimeMillis());

    // Memory-mapped coverage is enabled only if JaCoCo is in the boot classpath. system_server
    // is skipped because it is persistent and has its own coverage writing mechanism.
    if (isSystemServer || enableMemoryMappedDataMethod == null) {
        return;
    }
    try {
        enableMemoryMappedDataMethod.invoke(null);
    } catch (ReflectiveOperationException e) {
        throw new RuntimeException(e);
    }
}
// Hook for all child processes post forking.
// Native method; postForkChild passes its own arguments through, preceded by the stored token.
private static native void nativePostForkChild(long token, int runtimeFlags,
boolean isSystemServer, boolean isZygote,
String instructionSet);
ZygoteHooks_nativePostForkChild
/// @art/runtime/native/dalvik_system_ZygoteHooks.cc
// Native side of the post-fork child hook (excerpt; lines consisting of "..." mark elided code).
// Both branches forward to Runtime::InitNonZygoteOrPostFork:
//  - with an instruction set and a non-system_server child, the native bridge action is
//    kInitialize when the parsed ISA is not kNone and differs from kRuntimeISA, else kUnload;
//  - otherwise kUnload is used with a null ISA and profile_system_server is passed along
//    (profile_system_server is defined in the elided part of this excerpt).
static void ZygoteHooks_nativePostForkChild(JNIEnv* env,
jclass,
jlong token,
jint runtime_flags,
jboolean is_system_server,
jboolean is_zygote,
jstring instruction_set) {
...
Runtime* runtime = Runtime::Current();
...
if (instruction_set != nullptr && !is_system_server) {
ScopedUtfChars isa_string(env, instruction_set);
InstructionSet isa = GetInstructionSetFromString(isa_string.c_str());
Runtime::NativeBridgeAction action = Runtime::NativeBridgeAction::kUnload;
if (isa != InstructionSet::kNone && isa != kRuntimeISA) {
action = Runtime::NativeBridgeAction::kInitialize;
}
runtime->InitNonZygoteOrPostFork(env, is_system_server, is_zygote, action, isa_string.c_str());
} else { // Call the runtime's InitNonZygoteOrPostFork.
runtime->InitNonZygoteOrPostFork(
env,
is_system_server,
is_zygote,
Runtime::NativeBridgeAction::kUnload,
/*isa=*/ nullptr,
profile_system_server);
}
}
Runtime::InitNonZygoteOrPostFork
// Runtime-side initialization reached from the post-fork hook (excerpt; lines consisting of
// "..." mark elided code). Visible steps, in order: create the heap thread pool, create and
// start the "Runtime" worker pool unless this is system_server, reset GC performance data and
// metrics, start the signal catcher, and finally start the debugger through the runtime
// callbacks.
void Runtime::InitNonZygoteOrPostFork(JNIEnv* env, bool is_system_server,
// This is true when we are initializing a child-zygote. It requires
// native bridge initialization to be able to run guest native code in
// doPreload().
bool is_child_zygote, NativeBridgeAction action, const char* isa, bool profile_system_server) {
...
// Create the thread pools.
heap_->CreateThreadPool();
// Avoid creating the runtime thread pool for system server since it will not be used and would
// waste memory.
if (!is_system_server) {
ScopedTrace timing("CreateThreadPool");
constexpr size_t kStackSize = 64 * KB;
constexpr size_t kMaxRuntimeWorkers = 4u;
// Worker count is the hardware concurrency capped at kMaxRuntimeWorkers.
const size_t num_workers =
std::min(static_cast<size_t>(std::thread::hardware_concurrency()), kMaxRuntimeWorkers);
MutexLock mu(Thread::Current(), *Locks::runtime_thread_pool_lock_);
CHECK(thread_pool_ == nullptr);
thread_pool_.reset(new ThreadPool("Runtime", num_workers, /*create_peers=*/false, kStackSize));
thread_pool_->StartWorkers(Thread::Current());
}
// Reset the gc performance data and metrics at zygote fork so that the events from
// before fork aren't attributed to an app.
heap_->ResetGcPerformanceInfo();
GetMetrics()->Reset();
...
StartSignalCatcher(); // Start the SignalCatcher.
...
// Start the JDWP thread. If the command-line debugger flags specified "suspend=y",
// this will pause the runtime (in the internal debugger implementation), so we probably want
// this to come last.
GetRuntimeCallbacks()->StartDebugger();
}
Runtime::StartSignalCatcher
/// @art/runtime/runtime.cc
// Creates the SignalCatcher and stores it in signal_catcher_.
// Does nothing when is_zygote_ is set.
void Runtime::StartSignalCatcher() {
  if (is_zygote_) {
    return;
  }
  signal_catcher_ = new SignalCatcher();
}
SignalCatcher
/// @art/runtime/signal_catcher.cc
// Spawns the signal catcher thread and blocks until thread_ has been set.
SignalCatcher::SignalCatcher()
    : lock_("SignalCatcher lock"),
      cond_("SignalCatcher::cond_", lock_),
      thread_(nullptr) {
  SetHaltFlag(false);

  // The catcher is a raw pthread whose start routine is Run(); that routine is responsible
  // for attaching itself to the runtime.
  CHECK_PTHREAD_CALL(pthread_create, (&pthread_, nullptr, &Run, this), "signal catcher thread");

  // Wait on cond_ until thread_ becomes non-null.
  Thread* const creator = Thread::Current();
  MutexLock guard(creator, lock_);
  while (thread_ == nullptr) {
    cond_.Wait(creator);
  }
}
SignalCatcher::Run
/// @art/runtime/signal_catcher.cc
void* SignalCatcher::Run(void* arg) {
SignalCatcher* signal_catcher = reinterpret_cast<SignalCatcher*>(arg);
CHECK(signal_catcher != nullptr);
Runtime* runtime = Runtime::Current();
CHECK(runtime->AttachCurrentThread("Signal Catcher", true, runtime->GetSystemThreadGroup(),
!runtime->IsAotCompiler())); // attach到runtime
Thread* self = Thread::Current();
DCHECK_NE(self->GetState(), kRunnable);
{
MutexLock mu(self, signal_catcher->lock_);
signal_catcher->thread_ = self;
signal_catcher->cond_.Broadcast(self);
}
// Set up mask with signals we want to handle.
SignalSet signals;
signals.Add(SIGQUIT); // 将 SIGQUIT 信号加入信号集
signals.Add(SIGUSR1);
while (true) {
// 等待信号集的信号发生
int signal_number = signal_catcher->WaitForSignal(self, signals);
if (signal_catcher->ShouldHalt()) {
runtime->DetachCurrentThread();
return nullptr;
}
switch (signal_number) {
case SIGQUIT:
signal_catcher->HandleSigQuit(); // 处理 quit 信号 3
break;
case SIGUSR1:
signal_catcher->HandleSigUsr1();
break;
default:
LOG(ERROR) << "Unexpected signal %d" << signal_number;
break;
}
}
}
SignalCatcher处理 SIGQUIT
当 SignalCatcher 接收到信号 SIGQUIT 时,会去调用signal_catcher->HandleSigQuit执行dump操作
SignalCatcher::HandleSigQuit
/// @art/runtime/signal_catcher.cc
void SignalCatcher::HandleSigQuit() {
Runtime* runtime = Runtime::Current();
std::ostringstream os;
os << "\n" << "----- pid " << getpid() << " at " << GetIsoDate() << " -----\n";
DumpCmdLine(os); // 输出 /proc/self/cmdline
// Note: The strings "Build fingerprint:" and "ABI:" are chosen to match the format used by
// debuggerd. This allows, for example, the stack tool to work.
std::string fingerprint = runtime->GetFingerprint(); // 输出 fingerprint
os << "Build fingerprint: '" << (fingerprint.empty() ? "unknown" : fingerprint) << "'\n";
os << "ABI: '" << GetInstructionSetString(runtime->GetInstructionSet()) << "'\n";
os << "Build type: " << (kIsDebugBuild ? "debug" : "optimized") << "\n";
runtime->DumpForSigQuit(os); // runtime dump 操作
if ((false)) {
std::string maps;
if (android::base::ReadFileToString("/proc/self/maps", &maps)) {
os << "/proc/self/maps:\n" << maps;
}
}
os << "----- end " << getpid() << " -----\n";
Output(os.str());
}
DumpCmdLine
android基于linux,读取/proc/self/cmdline
// Writes the process command line to os as a "Cmd line:" entry.
//
// On Linux the current content of /proc/self/cmdline is printed, with trailing NULs trimmed
// and the remaining NUL separators turned into spaces. If the stashed command line from
// GetCmdLine() is non-null, differs from the current one and is not "<unset>", it is printed
// as well under "Original command line:". Nothing is written if the proc file cannot be read.
// On other platforms only GetCmdLine() is printed.
static void DumpCmdLine(std::ostream& os) {
#if defined(__linux__)
  // Show the original command line, and the current command line too if it's changed.
  // On Android, /proc/self/cmdline will have been rewritten to something like "system_server".
  // Note: The string "Cmd line:" is chosen to match the format used by debuggerd.
  std::string current_cmd_line;
  if (android::base::ReadFileToString("/proc/self/cmdline", &current_cmd_line)) {
    // find_last_not_of returns npos when the string is empty or all NULs; npos + 1 wraps to 0,
    // so the string is simply emptied in that case.
    current_cmd_line.resize(current_cmd_line.find_last_not_of('\0') + 1);  // trim trailing '\0's
    std::replace(current_cmd_line.begin(), current_cmd_line.end(), '\0', ' ');

    os << "Cmd line: " << current_cmd_line << "\n";
    const char* stashed_cmd_line = GetCmdLine();
    if (stashed_cmd_line != nullptr && current_cmd_line != stashed_cmd_line
            && strcmp(stashed_cmd_line, "<unset>") != 0) {
      os << "Original command line: " << stashed_cmd_line << "\n";
    }
  }
#else
  os << "Cmd line: " << GetCmdLine() << "\n";
#endif
}
Runtime::DumpForSigQuit
signal quit 时 dump runtime 内部的一些状态
/// @art/runtime/runtime.cc
// Writes the runtime's internal state to os for a SIGQUIT dump by delegating to each
// subsystem's own dump routine in a fixed order, then notifies the registered callbacks.
void Runtime::DumpForSigQuit(std::ostream& os) {
  // Subsystem summaries.
  GetClassLinker()->DumpForSigQuit(os);
  GetInternTable()->DumpForSigQuit(os);
  GetJavaVM()->DumpForSigQuit(os);
  GetHeap()->DumpForSigQuit(os);
  oat_file_manager_->DumpForSigQuit(os);

  // JIT section, or a marker line when there is no JIT.
  if (GetJit() == nullptr) {
    os << "Running non JIT\n";
  } else {
    GetJit()->DumpForSigQuit(os);
  }

  DumpDeoptimizations(os);
  TrackedAllocators::Dump(os);
  GetMetrics()->DumpForSigQuit(os);
  os << "\n";

  // Thread list and mutex state.
  thread_list_->DumpForSigQuit(os);
  BaseMutex::DumpAll(os);

  // Inform anyone else who is interested in SigQuit.
  {
    ScopedObjectAccess soa(Thread::Current());
    callbacks_->SigQuit();
  }
}
ClassLinker::DumpForSigQuit
/// @art/runtime/class_linker.cc
// Writes class-linker statistics to os: zygote / post-zygote class counts, one line per
// registered class loader whose weak root still decodes to a live object (with the locations
// of its dex files and a reference to its parent), and the global class-initialization
// count and time.
// Takes classlinker_classes_lock_ and then dex_lock_ as a reader for the rest of the function.
void ClassLinker::DumpForSigQuit(std::ostream& os) {
ScopedObjectAccess soa(Thread::Current());
ReaderMutexLock mu(soa.Self(), *Locks::classlinker_classes_lock_);
os << "Zygote loaded classes=" << NumZygoteClasses() << " post zygote classes="
<< NumNonZygoteClasses() << "\n";
ReaderMutexLock mu2(soa.Self(), *Locks::dex_lock_);
os << "Dumping registered class loaders\n";
size_t class_loader_index = 0; // Print the class loaders; the index only advances for live loaders.
for (const ClassLoaderData& class_loader : class_loaders_) {
ObjPtr<mirror::ClassLoader> loader =
ObjPtr<mirror::ClassLoader>::DownCast(soa.Self()->DecodeJObject(class_loader.weak_root));
if (loader != nullptr) {
os << "#" << class_loader_index++ << " " << loader->GetClass()->PrettyDescriptor() << ": [";
// List the dex files whose class table matches this loader, separated by ':'.
bool saw_one_dex_file = false;
for (const DexCacheData& dex_cache : dex_caches_) {
if (dex_cache.IsValid() && dex_cache.class_table == class_loader.class_table) {
if (saw_one_dex_file) {
os << ":";
}
saw_one_dex_file = true;
os << dex_cache.dex_file->GetLocation();
}
}
os << "]";
bool found_parent = false;
if (loader->GetParent() != nullptr) {
// Look the parent up by position in class_loaders_. Note that parent_index counts every
// entry, while class_loader_index above skips entries that decoded to null.
size_t parent_index = 0;
for (const ClassLoaderData& class_loader2 : class_loaders_) {
ObjPtr<mirror::ClassLoader> loader2 = ObjPtr<mirror::ClassLoader>::DownCast(
soa.Self()->DecodeJObject(class_loader2.weak_root));
if (loader2 == loader->GetParent()) {
os << ", parent #" << parent_index;
found_parent = true;
break;
}
parent_index++;
}
if (!found_parent) {
os << ", unregistered parent of type "
<< loader->GetParent()->GetClass()->PrettyDescriptor();
}
} else {
os << ", no parent";
}
os << "\n";
}
}
os << "Done dumping class loaders\n";
Runtime* runtime = Runtime::Current();
os << "Classes initialized: " << runtime->GetStat(KIND_GLOBAL_CLASS_INIT_COUNT) << " in "
<< PrettyDuration(runtime->GetStat(KIND_GLOBAL_CLASS_INIT_TIME)) << "\n";
}
InternTable::DumpForSigQuit
/// @art/runtime/intern_table.cc
// Writes a one-line summary of the table's strong and weak sizes to os.
void InternTable::DumpForSigQuit(std::ostream& os) const {
  os << "Intern table: " << StrongSize() << " strong; ";
  os << WeakSize() << " weak\n";
}
JavaVM::DumpForSigQuit
/// @art/runtime/jni/java_vm_ext.cc
void JavaVMExt::DumpForSigQuit(std::ostream& os) {
os << "JNI: CheckJNI is " << (check_jni_ ? "on" : "off");
if (force_copy_) {
os << " (with forcecopy)";
}
Thread* self = Thread::Current();
{
ReaderMutexLock mu(self, *Locks::jni_globals_lock_);
os << "; globals=" << globals_.Capacity();
}
{
MutexLock mu(self, *Locks::jni_weak_globals_lock_);
if (weak_globals_.Capacity() > 0) {
os << " (plus " << weak_globals_.Capacity() << " weak)";
}
}
os << '\n';
{
MutexLock mu(self, *Locks::jni_libraries_lock_);
os << "Libraries: " << Dumpable<Libraries>(*libraries_) << " (" << libraries_->size() << ")\n";
}
}
Heap::DumpForSigQuit
/// @art/runtime/gc/heap.cc
// Writes a one-line heap summary (percent free, allocated/total bytes, object count) to os,
// followed by the GC performance section.
void Heap::DumpForSigQuit(std::ostream& os) {
  os << "Heap: " << GetPercentFree() << "% free, ";
  os << PrettySize(GetBytesAllocated()) << "/" << PrettySize(GetTotalMemory()) << "; ";
  os << GetObjectsAllocated() << " objects\n";

  DumpGcPerformanceInfo(os);
}
打印gc 信息
// Writes cumulative GC statistics to os: per-collector performance info, overall throughput
// (only when the summed GC duration is non-zero), allocation and free totals, memory sizes,
// pause and wait times, GC counts, the GC-count-rate histograms when they have samples,
// optional RosAlloc stats, and native byte counters. Ends with BaseMutex::DumpAll(os).
void Heap::DumpGcPerformanceInfo(std::ostream& os) {
// Dump cumulative timings.
os << "Dumping cumulative Gc timings\n";
uint64_t total_duration = 0;
// Dump cumulative loggers for each GC type.
uint64_t total_paused_time = 0;
for (auto* collector : garbage_collectors_) {
total_duration += collector->GetCumulativeTimings().GetTotalNs();
total_paused_time += collector->GetTotalPausedTimeNs();
collector->DumpPerformanceInfo(os);
}
// Throughput figures divide by the total duration, so they are skipped when it is zero.
if (total_duration != 0) {
const double total_seconds = total_duration / 1.0e9;
const double total_cpu_seconds = GetTotalGcCpuTime() / 1.0e9;
os << "Total time spent in GC: " << PrettyDuration(total_duration) << "\n";
os << "Mean GC size throughput: "
<< PrettySize(GetBytesFreedEver() / total_seconds) << "/s"
<< " per cpu-time: "
<< PrettySize(GetBytesFreedEver() / total_cpu_seconds) << "/s\n";
os << "Mean GC object throughput: "
<< (GetObjectsFreedEver() / total_seconds) << " objects/s\n";
}
uint64_t total_objects_allocated = GetObjectsAllocatedEver();
os << "Total number of allocations " << total_objects_allocated << "\n";
os << "Total bytes allocated " << PrettySize(GetBytesAllocatedEver()) << "\n";
os << "Total bytes freed " << PrettySize(GetBytesFreedEver()) << "\n";
os << "Free memory " << PrettySize(GetFreeMemory()) << "\n";
os << "Free memory until GC " << PrettySize(GetFreeMemoryUntilGC()) << "\n";
os << "Free memory until OOME " << PrettySize(GetFreeMemoryUntilOOME()) << "\n";
os << "Total memory " << PrettySize(GetTotalMemory()) << "\n";
os << "Max memory " << PrettySize(GetMaxMemory()) << "\n";
if (HasZygoteSpace()) {
os << "Zygote space size " << PrettySize(zygote_space_->Size()) << "\n";
}
os << "Total mutator paused time: " << PrettyDuration(total_paused_time) << "\n";
os << "Total time waiting for GC to complete: " << PrettyDuration(total_wait_time_) << "\n";
os << "Total GC count: " << GetGcCount() << "\n";
os << "Total GC time: " << PrettyDuration(GetGcTime()) << "\n";
os << "Total blocking GC count: " << GetBlockingGcCount() << "\n";
os << "Total blocking GC time: " << PrettyDuration(GetBlockingGcTime()) << "\n";
{
// Both histograms are read while holding gc_complete_lock_.
MutexLock mu(Thread::Current(), *gc_complete_lock_);
if (gc_count_rate_histogram_.SampleSize() > 0U) {
os << "Histogram of GC count per " << NsToMs(kGcCountRateHistogramWindowDuration) << " ms: ";
gc_count_rate_histogram_.DumpBins(os);
os << "\n";
}
if (blocking_gc_count_rate_histogram_.SampleSize() > 0U) {
os << "Histogram of blocking GC count per "
<< NsToMs(kGcCountRateHistogramWindowDuration) << " ms: ";
blocking_gc_count_rate_histogram_.DumpBins(os);
os << "\n";
}
}
if (kDumpRosAllocStatsOnSigQuit && rosalloc_space_ != nullptr) {
rosalloc_space_->DumpStats(os);
}
os << "Native bytes total: " << GetNativeBytes()
<< " registered: " << native_bytes_registered_.load(std::memory_order_relaxed) << "\n";
os << "Total native bytes at last GC: "
<< old_native_bytes_allocated_.load(std::memory_order_relaxed) << "\n";
BaseMutex::DumpAll(os);
}
OatFileManager::DumpForSigQuit
/// @art/runtime/oat_file_manager.cc
void OatFileManager::DumpForSigQuit(std::ostream& os) {
ReaderMutexLock mu(Thread::Current(), *Locks::oat_file_manager_lock_);
std::vector<const OatFile*> boot_oat_files = GetBootOatFiles();
for (const std::unique_ptr<const OatFile>& oat_file : oat_files_) {
if (ContainsElement(boot_oat_files, oat_file.get())) {
continue;
}
// 打印 oat location
os << oat_file->GetLocation() << ": " << oat_file->GetCompilerFilter() << "\n";
}
}
Jit::DumpForSigQuit
打印jit信息
/// @art/runtime/jit/jit.cc
// Writes the JIT section of a SIGQUIT dump to os: this Jit's own info first, then the
// ProfileSaver instance info.
void Jit::DumpForSigQuit(std::ostream& os) {
DumpInfo(os);
ProfileSaver::DumpInstanceInfo(os);
}
// Writes the code cache dump, the cumulative timings and the memory-use histogram to os.
// Only the memory-use histogram is printed while holding lock_; the first two calls run
// before the lock is taken.
void Jit::DumpInfo(std::ostream& os) {
code_cache_->Dump(os);
cumulative_timings_.Dump(os);
MutexLock mu(Thread::Current(), lock_);
memory_use_.PrintMemoryUse(os);
}
JitCodeCache::Dump
// Writes JIT code cache statistics to os while holding jit_lock_: used/resident code and data
// sizes of the current region (and of the shared region when the runtime is not the zygote),
// mini-debug-info size, capacity, entry counts, compilation and collection counters, and the
// three memory-use histograms.
void JitCodeCache::Dump(std::ostream& os) {
  MutexLock guard(Thread::Current(), *Locks::jit_lock_);

  // Current region, reported in KB.
  os << "Current JIT code cache size (used / resident): "
     << GetCurrentRegion()->GetUsedMemoryForCode() / KB << "KB / "
     << GetCurrentRegion()->GetResidentMemoryForCode() / KB << "KB\n";
  os << "Current JIT data cache size (used / resident): "
     << GetCurrentRegion()->GetUsedMemoryForData() / KB << "KB / "
     << GetCurrentRegion()->GetResidentMemoryForData() / KB << "KB\n";

  // The shared region is only reported for non-zygote runtimes.
  const bool is_zygote = Runtime::Current()->IsZygote();
  if (!is_zygote) {
    os << "Zygote JIT code cache size (at point of fork): "
       << shared_region_.GetUsedMemoryForCode() / KB << "KB / "
       << shared_region_.GetResidentMemoryForCode() / KB << "KB\n";
    os << "Zygote JIT data cache size (at point of fork): "
       << shared_region_.GetUsedMemoryForData() / KB << "KB / "
       << shared_region_.GetResidentMemoryForData() / KB << "KB\n";
  }

  os << "Current JIT mini-debug-info size: " << PrettySize(GetJitMiniDebugInfoMemUsage()) << "\n";
  os << "Current JIT capacity: " << PrettySize(GetCurrentRegion()->GetCurrentCapacity()) << "\n";
  os << "Current number of JIT JNI stub entries: " << jni_stubs_map_.size() << "\n";
  os << "Current number of JIT code cache entries: " << method_code_map_.size() << "\n";
  os << "Total number of JIT baseline compilations: " << number_of_baseline_compilations_ << "\n";
  os << "Total number of JIT optimized compilations: " << number_of_optimized_compilations_
     << "\n";
  os << "Total number of JIT compilations for on stack replacement: "
     << number_of_osr_compilations_ << "\n";
  os << "Total number of JIT code cache collections: " << number_of_collections_ << std::endl;

  histogram_stack_map_memory_use_.PrintMemoryUse(os);
  histogram_code_memory_use_.PrintMemoryUse(os);
  histogram_profiling_info_memory_use_.PrintMemoryUse(os);
}
DumpDeoptimizations
// Writes one "Number of <kind> deoptimizations: <count>" line to os for every
// DeoptimizationKind (up to and including kLast) whose counter is non-zero.
void Runtime::DumpDeoptimizations(std::ostream& os) {
  const size_t last_kind = static_cast<size_t>(DeoptimizationKind::kLast);
  for (size_t kind = 0; kind <= last_kind; ++kind) {
    if (deoptimization_counts_[kind] == 0) {
      continue;  // Kinds that never occurred are not listed.
    }
    os << "Number of " << GetDeoptimizationKindName(static_cast<DeoptimizationKind>(kind));
    os << " deoptimizations: " << deoptimization_counts_[kind] << "\n";
  }
}
TrackedAllocators::Dump
void Dump(std::ostream& os) {
if (kEnableTrackingAllocator) {
os << "Dumping native memory usage\n";
for (size_t i = 0; i < kAllocatorTagCount; ++i) {
uint64_t bytes_used = g_bytes_used[i].load(std::memory_order_relaxed);
uint64_t max_bytes_used = g_max_bytes_used[i].load(std::memory_order_relaxed);
uint64_t total_bytes_used = g_total_bytes_used[i].load(std::memory_order_relaxed);
if (total_bytes_used != 0) {
os << static_cast<AllocatorTag>(i) << " active=" << bytes_used << " max="
<< max_bytes_used << " total=" << total_bytes_used << "\n";
}
}
}
}
thread_list_->DumpForSigQuit
void ThreadList::DumpForSigQuit(std::ostream& os) {
{
ScopedObjectAccess soa(Thread::Current());
// Only print if we have samples.
if (suspend_all_historam_.SampleSize() > 0) { // 打印 suspend histogram
Histogram<uint64_t>::CumulativeData data;
suspend_all_historam_.CreateHistogram(&data);
suspend_all_historam_.PrintConfidenceIntervals(os, 0.99, data); // Dump time to suspend.
}
}
bool dump_native_stack = Runtime::Current()->GetDumpNativeStackOnSigQuit();
Dump(os, dump_native_stack); // dump 虚拟机线程
// dump 没有attach到VM的线程
DumpUnattachedThreads(os, dump_native_stack && kDumpUnattachedThreadNativeStackForSigQuit);
}
ThreadList::Dump
主要用来dump虚拟机线程
void ThreadList::Dump(std::ostream& os, bool dump_native_stack) {
Thread* self = Thread::Current();
{
MutexLock mu(self, *Locks::thread_list_lock_);
os << "DALVIK THREADS (" << list_.size() << "):\n"; // 打印虚拟机线程数
}
if (self != nullptr) {
DumpCheckpoint checkpoint(&os, dump_native_stack);
size_t threads_running_checkpoint;
{
// Use SOA to prevent deadlocks if multiple threads are calling Dump() at the same time.
ScopedObjectAccess soa(self);
// 通过checkpoint去dump线程信息
threads_running_checkpoint = RunCheckpoint(&checkpoint); // 注意此处的参数是checkpoint
}
if (threads_running_checkpoint != 0) { // 等待dump完成
checkpoint.WaitForThreadsToRunThroughCheckpoint(threads_running_checkpoint);
}
} else {
DumpUnattachedThreads(os, dump_native_stack);
}
}
ThreadList::RunCheckpoint
此处checkpoint_function是DumpCheckpoint对象,callback为nullptr
// Runs checkpoint_function once for every thread in list_ and returns the number of threads
// that were in the list when the requests were issued (the caller included).
//
// For each other thread a checkpoint request is attempted; if that fails because the thread is
// not runnable, its suspend count is raised so it stays suspended and the function is later
// run on its behalf by the calling thread. The calling thread runs the function on itself in
// between. callback, if non-null, is run while thread_list_lock_ and
// thread_suspend_count_lock_ are still held.
size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, Closure* callback) {
Thread* self = Thread::Current();
Locks::mutator_lock_->AssertNotExclusiveHeld(self);
Locks::thread_list_lock_->AssertNotHeld(self);
Locks::thread_suspend_count_lock_->AssertNotHeld(self);
std::vector<Thread*> suspended_count_modified_threads;
size_t count = 0;
{
// Issue a checkpoint request to every thread so that it runs checkpoint_function; for
// threads that are already suspended, checkpoint_function is invoked on their behalf below.
// Call a checkpoint function for each thread, threads which are suspended get their checkpoint
// manually called.
MutexLock mu(self, *Locks::thread_list_lock_);
MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
count = list_.size();
for (const auto& thread : list_) {
if (thread != self) {
bool requested_suspend = false;
while (true) {
// Request a checkpoint on a runnable thread; the thread is expected to call
// checkpoint_function->Run when it services the request.
if (thread->RequestCheckpoint(checkpoint_function)) {
// This thread will run its checkpoint some time in the near future.
if (requested_suspend) { // A suspend was requested earlier: drop the count again.
// The suspend request is now unnecessary.
bool updated =
thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal);
DCHECK(updated);
requested_suspend = false;
}
break;
} else {
// The thread is probably suspended, try to make sure that it stays suspended.
if (thread->GetState() == kRunnable) {// Runnable after all: retry the checkpoint request.
// Spurious fail, try again.
continue;
}
// Not runnable.
if (!requested_suspend) {// No suspend requested yet.
bool updated = // Raise the suspend count so the thread's state cannot change under us.
thread->ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal);
DCHECK(updated);
requested_suspend = true; // Remember that a suspend was requested.
if (thread->IsSuspended()) { // Already suspended: done with this thread.
break;
}
// The thread raced us to become Runnable. Try to RequestCheckpoint() again.
} else { // A suspend was requested before and the thread is now suspended.
// The thread previously raced our suspend request to become Runnable but
// since it is suspended again, it must honor that suspend request now.
DCHECK(thread->IsSuspended());
break;
}
}
}
if (requested_suspend) { // Suspend count still raised: queue the thread for a manual run.
suspended_count_modified_threads.push_back(thread);
}
}
}
// Run the callback to be called inside this critical section.
if (callback != nullptr) {
callback->Run(self);
}
}
// Run the checkpoint on ourself while we wait for threads to suspend.
checkpoint_function->Run(self); // Dump the calling thread itself (the SignalCatcher in the SIGQUIT path).
// Run the checkpoint on the suspended threads.
for (const auto& thread : suspended_count_modified_threads) { // Run the checkpoint manually for each suspended thread.
// We know for sure that the thread is suspended at this point.
DCHECK(thread->IsSuspended());
checkpoint_function->Run(thread); // Key step: for a DumpCheckpoint this dumps the thread.
{
MutexLock mu2(self, *Locks::thread_suspend_count_lock_); // Lower the suspend count again.
bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal);
DCHECK(updated);
}
}
{
// Imitate ResumeAll, threads may be waiting on Thread::resume_cond_ since we raised their
// suspend count. Now the suspend_count_ is lowered so we must do the broadcast.
MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
Thread::resume_cond_->Broadcast(self); // Wake threads waiting on resume_cond_ so they can resume.
}
return count;
}
上面看到了线程的kRunnable状态,还有很多状态,描述如下
ThreadState
/// @art/runtime/thread_state.h
// Thread state as stored in the C++ Thread class.
//
// "A suspended state" (and "ToSuspended"/"FromSuspended" in function names) refers to every
// state other than kRunnable, i.e. every state in which the thread is guaranteed not to access
// the Java heap; kSuspended is just one of them.
//
// Each trailing comment reads: Java Thread.State | JDWP state | meaning.
enum ThreadState {
  kTerminated = 66,                 // TERMINATED    | TS_ZOMBIE   | Thread.run has returned, but Thread* still around
  kRunnable,                        // RUNNABLE      | TS_RUNNING  | runnable
  kTimedWaiting,                    // TIMED_WAITING | TS_WAIT     | in Object.wait() with a timeout
  kSleeping,                        // TIMED_WAITING | TS_SLEEPING | in Thread.sleep()
  kBlocked,                         // BLOCKED       | TS_MONITOR  | blocked on a monitor
  kWaiting,                         // WAITING       | TS_WAIT     | in Object.wait()
  kWaitingForLockInflation,         // WAITING       | TS_WAIT     | blocked inflating a thin-lock
  kWaitingForTaskProcessor,         // WAITING       | TS_WAIT     | blocked waiting for taskProcessor
  kWaitingForGcToComplete,          // WAITING       | TS_WAIT     | blocked waiting for GC
  kWaitingForCheckPointsToRun,      // WAITING       | TS_WAIT     | GC waiting for checkpoints to run
  kWaitingPerformingGc,             // WAITING       | TS_WAIT     | performing GC
  kWaitingForDebuggerSend,          // WAITING       | TS_WAIT     | blocked waiting for events to be sent
  kWaitingForDebuggerToAttach,      // WAITING       | TS_WAIT     | blocked waiting for debugger to attach
  kWaitingInMainDebuggerLoop,       // WAITING       | TS_WAIT     | blocking/reading/processing debugger events
  kWaitingForDebuggerSuspension,    // WAITING       | TS_WAIT     | waiting for debugger suspend all
  kWaitingForJniOnLoad,             // WAITING       | TS_WAIT     | waiting for execution of dlopen and JNI on load code
  kWaitingForSignalCatcherOutput,   // WAITING       | TS_WAIT     | waiting for signal catcher IO to complete
  kWaitingInMainSignalCatcherLoop,  // WAITING       | TS_WAIT     | blocking/reading/processing signals
  kWaitingForDeoptimization,        // WAITING       | TS_WAIT     | waiting for deoptimization suspend all
  kWaitingForMethodTracingStart,    // WAITING       | TS_WAIT     | waiting for method tracing to start
  kWaitingForVisitObjects,          // WAITING       | TS_WAIT     | waiting for visiting objects
  kWaitingForGetObjectsAllocated,   // WAITING       | TS_WAIT     | waiting for getting the number of allocated objects
  kWaitingWeakGcRootRead,           // WAITING       | TS_WAIT     | waiting on the GC to read a weak root
  kWaitingForGcThreadFlip,          // WAITING       | TS_WAIT     | waiting on the GC thread flip (CC collector) to finish
  kNativeForAbort,                  // WAITING       | TS_WAIT     | checking other threads are not run on abort.
  kStarting,                        // NEW           | TS_WAIT     | native thread started, not yet ready to run managed code
  kNative,                          // RUNNABLE      | TS_RUNNING  | running in a JNI native method
  kSuspended,                       // RUNNABLE      | TS_RUNNING  | suspended by GC or debugger
};
Thread::RequestCheckpoint
用于请求checkpoint,设置一个 checkpoint function
/// art/runtime/thread.cc
// Asks this thread to run `function` as a checkpoint.
// Returns false without queuing anything when the thread's state is not kRunnable or when the
// compare-and-set on state_and_flags fails; returns true once the kCheckpointRequest flag has
// been set and `function` has been stored.
bool Thread::RequestCheckpoint(Closure* function) {
union StateAndFlags old_state_and_flags;
old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
if (old_state_and_flags.as_struct.state != kRunnable) { // not kRunnable: return immediately
return false; // Fail, thread is suspended and so can't run a checkpoint.
}
// We must be runnable to request a checkpoint.
DCHECK_EQ(old_state_and_flags.as_struct.state, kRunnable);
union StateAndFlags new_state_and_flags;
new_state_and_flags.as_int = old_state_and_flags.as_int;
new_state_and_flags.as_struct.flags |= kCheckpointRequest; // set the kCheckpointRequest flag
// The swap only succeeds if state_and_flags still equals the snapshot taken above.
bool success = tls32_.state_and_flags.as_atomic_int.CompareAndSetStrongSequentiallyConsistent(
old_state_and_flags.as_int, new_state_and_flags.as_int);
if (success) {
// Succeeded setting checkpoint flag, now insert the actual checkpoint.
if (tlsPtr_.checkpoint_function == nullptr) { // first slot is free: install the checkpoint function
tlsPtr_.checkpoint_function = function;
} else {
// A checkpoint function is already installed; queue this one in the overflow list.
checkpoint_overflow_.push_back(function);
}
CHECK_EQ(ReadFlag(kCheckpointRequest), true);
TriggerSuspend(); // Trigger a suspend check by making the suspend_trigger_ TLS value an invalid pointer
}
return success;
}
上面的checkpoint_function即是DumpCheckpoint对象,它的Run方法如下:
DumpCheckpoint::Run
// Checkpoint callback. Renders `thread`'s dump into a private buffer, appends that buffer to
// the shared output stream while holding the logging lock, and finally passes the barrier.
void Run(Thread* thread) override {
  // `thread` is not necessarily the thread executing this code: when the target was already
  // suspended at request time, another thread runs the checkpoint on its behalf.
  Thread* const self = Thread::Current();
  CHECK(self != nullptr);

  std::ostringstream thread_dump;
  {
    ScopedObjectAccess soa(self);
    // Produce the per-thread dump (state plus stacks) into the private buffer.
    thread->Dump(thread_dump, dump_native_stack_, backtrace_map_.get());
  }

  {
    // The logging lock serializes writers of the common ostream.
    MutexLock logging_guard(self, *Locks::logging_lock_);
    *os_ << thread_dump.str() << std::endl;
  }

  barrier_.Pass(self);
}
DumpCheckpoint::WaitForThreadsToRunThroughCheckpoint
void WaitForThreadsToRunThroughCheckpoint(size_t threads_running_checkpoint) {
Thread* self = Thread::Current();
ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
// 等待所有线程完成dump,每个线程执行完 DumpCheckpoint->Run, barrier_会减一
bool timed_out = barrier_.Increment(self, threads_running_checkpoint, kDumpWaitTimeout);
if (timed_out) {
// Avoid a recursive abort.
LOG((kIsDebugBuild && (gAborting == 0)) ? ::android::base::FATAL : ::android::base::ERROR)
<< "Unexpected time out during dump checkpoint.";
}
}
简单看一下DumpCheckpoint的实现,是专门为dump设计的
// A closure used by Thread::Dump.
// NOTE: abbreviated excerpt; the bodies of Run and WaitForThreadsToRunThroughCheckpoint are
// elided with "..." below.
class DumpCheckpoint final : public Closure {
public:
DumpCheckpoint(std::ostream* os, bool dump_native_stack) // constructor; also initializes the Barrier
: os_(os),
// Avoid verifying count in case a thread doesn't end up passing through the barrier.
// This avoids a SIGABRT that would otherwise happen in the destructor.
barrier_(0, /*verify_count_on_shutdown=*/false),
// Create the BacktraceMap only when native stacks are requested; it is used later when
// dumping stacks.
backtrace_map_(dump_native_stack ? BacktraceMap::Create(getpid()) : nullptr),
dump_native_stack_(dump_native_stack) {
if (backtrace_map_ != nullptr) {
backtrace_map_->SetSuffixesToIgnore(std::vector<std::string> { "oat", "odex" });
}
}
// Called when a thread executes the checkpoint callback. Body elided in this excerpt.
void Run(Thread* thread) override { ... }
// Waits until all threads have run the checkpoint, or until the wait times out. Body elided
// in this excerpt.
void WaitForThreadsToRunThroughCheckpoint(size_t threads_running_checkpoint) {... }
private:
// The common stream that will accumulate all the dumps.
std::ostream* const os_;
// The barrier to be passed through and for the requestor to wait upon.
Barrier barrier_;
// A backtrace map, so that all threads use a shared info and don't reacquire/parse separately.
std::unique_ptr<BacktraceMap> backtrace_map_;
// Whether we should dump the native stack.
const bool dump_native_stack_;
};
总结一下上面ThreadList::Dump的大致流程
- SignalCatcher线程对其他所有runnable线程发起Checkpoint请求,设置checkpoint_function,当相关线程执行Checkpoint代码时,回调checkpoint_function->Run函数,而实际上调用的是 DumpCheckpoint::Run函数,在此函数中执行dump操作
- SignalCatcher主动调用DumpCheckpoint::Run函数dump自身
- SignalCatcher对所有已处于suspend的线程主动调用DumpCheckpoint::Run函数,去dump相关线程的trace
- 真正实现dump操作是在Thread::Dump
- SignalCatcher等待所有线程dump完成,或超时
下面是我引用的一段对 CheckPoint 的解释,可以见参考的链接
CheckPoint:
提到CheckPoint必须要提到safe point;
safepoint:对于ART编译的代码,可以定期轮询当前Runtime来确认是否需要执行某些特定代码;可以认为这些轮询时的点,就是safepoint;
safepoint可以用来实现暂停一个java线程,也可以用来实现Checkpoint机制;
比如:当正在执行java代码的线程A执行到safepoint时,会执行CheckSuspend函数,在发现当前线程有 checkpoint request时,会在这个点执行线程的CheckPoint函数;如果发现当前线程有suspend request时,会进行SuspendCheck,使得线程进入Suspend状态(暂停);
所以说,ART CheckPoint应该是safepoint的一个功能实现;
接下来看真正去dump线程调用栈的相关信息
Thread::Dump
/// @art/runtime/thread.cc
// Writes this thread's dump to `os`: first the state header (DumpState), then the call stacks
// (DumpStack). `dump_native_stack`, `backtrace_map` and `force_dump_stack` are forwarded to
// DumpStack unchanged.
void Thread::Dump(std::ostream& os, bool dump_native_stack, BacktraceMap* backtrace_map,
bool force_dump_stack) const {
DumpState(os); // dump the thread state
DumpStack(os, dump_native_stack, backtrace_map, force_dump_stack);// dump the thread call stacks
}
看一下main线程的示例
// DumpState 输出
"main" prio=5 tid=1 Native
| group="main" sCount=1 dsCount=0 flags=1 obj=0x7130ac28 self=0xe19c0010
| sysTid=516 nice=-2 cgrp=foreground sched=0/0 handle=0xeffb1478
| state=S schedstat=( 2130008115 1535572157 4795 ) utm=150 stm=62 core=2 HZ=100
| stack=0xff564000-0xff566000 stackSize=8192KB
| held mutexes=
// DumpStack 输出
// 首先是 native stack 输出
native: #00 pc 00000b99 [vdso] (__kernel_vsyscall+9)
native: #01 pc 000cf2cb /apex/com.android.runtime/lib/bionic/libc.so (__epoll_pwait+43)
native: #02 pc 00088f9d /apex/com.android.runtime/lib/bionic/libc.so (epoll_wait+45)
native: #03 pc 0001a003 /system/lib/libutils.so (android::Looper::pollInner(int)+259)
native: #04 pc 00019e96 /system/lib/libutils.so (android::Looper::pollOnce(int, int*, int*, void**)+118)
native: #05 pc 0010ef8b /system/lib/libandroid_runtime.so (android::android_os_MessageQueue_nativePollOnce(_JNIEnv*, _jobject*, long long, int)+59)
// 接着是 Java stack 输出
at android.os.MessageQueue.nativePollOnce(Native method)
at android.os.MessageQueue.next(MessageQueue.java:335)
at android.os.Looper.loop(Looper.java:183)
at com.android.server.SystemServer.run(SystemServer.java:622)
at com.android.server.SystemServer.main(SystemServer.java:408)
at java.lang.reflect.Method.invoke(Native method)
at com.android.internal.os.RuntimeInit$MethodAndArgsCaller.run(RuntimeInit.java:592)
at com.android.internal.os.ZygoteInit.main(ZygoteInit.java:925)
如何看呢? 执行流的调用顺序是从下往上看的,越下面的执行的越早。
####### Thread::DumpState
// Convenience overload: dumps the state of this thread by forwarding to the static
// DumpState(os, thread, tid) with `this` and GetTid().
void Thread::DumpState(std::ostream& os) const {
Thread::DumpState(os, this, GetTid());
}
// Writes the state header of a thread to `os`: the `"name" prio=... tid=... <state>` line
// followed by several `|`-prefixed detail lines.
// `thread` may be null, in which case the thread is reported as "(not attached)" and only the
// information obtainable from `tid` is printed.
void Thread::DumpState(std::ostream& os, const Thread* thread, pid_t tid) {
std::string group_name;
int priority;
bool is_daemon = false;
Thread* self = Thread::Current();
// If flip_function is not null, it means we have run a checkpoint
// before the thread wakes up to execute the flip function and the
// thread roots haven't been forwarded. So the following access to
// the roots (opeer or methods in the frames) would be bad. Run it
// here. TODO: clean up.
if (thread != nullptr) {
ScopedObjectAccessUnchecked soa(self);
Thread* this_thread = const_cast<Thread*>(thread);
Closure* flip_func = this_thread->GetFlipFunction();
if (flip_func != nullptr) {
flip_func->Run(this_thread);
}
}
// Don't do this if we are aborting since the GC may have all the threads suspended. This will
// cause ScopedObjectAccessUnchecked to deadlock.
if (gAborting == 0 && self != nullptr && thread != nullptr && thread->tlsPtr_.opeer != nullptr) {
// Read priority, daemon flag and thread-group name from the managed java.lang.Thread peer.
ScopedObjectAccessUnchecked soa(self);
priority = jni::DecodeArtField(WellKnownClasses::java_lang_Thread_priority)
->GetInt(thread->tlsPtr_.opeer);
is_daemon = jni::DecodeArtField(WellKnownClasses::java_lang_Thread_daemon)
->GetBoolean(thread->tlsPtr_.opeer);
ObjPtr<mirror::Object> thread_group =
jni::DecodeArtField(WellKnownClasses::java_lang_Thread_group)
->GetObject(thread->tlsPtr_.opeer);
if (thread_group != nullptr) {
ArtField* group_name_field =
jni::DecodeArtField(WellKnownClasses::java_lang_ThreadGroup_name);
ObjPtr<mirror::String> group_name_string =
group_name_field->GetObject(thread_group)->AsString();
group_name = (group_name_string != nullptr) ? group_name_string->ToModifiedUtf8() : "<null>";
}
} else if (thread != nullptr) {
// No usable managed peer: fall back to the native priority.
priority = thread->GetNativePriority();
} else {
// No Thread object at all: query the priority by tid.
palette_status_t status = PaletteSchedGetPriority(tid, &priority);
CHECK(status == PALETTE_STATUS_OK || status == PALETTE_STATUS_CHECK_ERRNO);
}
std::string scheduler_group_name(GetSchedulerGroupName(tid));
if (scheduler_group_name.empty()) {
scheduler_group_name = "default";
}
if (thread != nullptr) {
os << '"' << *thread->tlsPtr_.name << '"'; // thread name
if (is_daemon) { // whether this is a daemon thread
os << " daemon";
}
os << " prio=" << priority // priority
<< " tid=" << thread->GetThreadId() // runtime-internal thread id
<< " " << thread->GetState(); // thread state
if (thread->IsStillStarting()) { // still starting up
os << " (still starting up)";
}
os << "\n";
} else { // thread that is not attached to the VM
os << '"' << ::art::GetThreadName(tid) << '"'
<< " prio=" << priority
<< " (not attached)\n";
}
if (thread != nullptr) {
auto suspend_log_fn = [&]() REQUIRES(Locks::thread_suspend_count_lock_) {
os << " | group=\"" << group_name << "\""
<< " sCount=" << thread->tls32_.suspend_count // suspend count of the thread
<< " ucsCount=" << thread->tls32_.user_code_suspend_count // How much of 'suspend_count_' is by request of user code
<< " flags=" << thread->tls32_.state_and_flags.as_struct.flags // ThreadFlag bits
<< " obj=" << reinterpret_cast<void*>(thread->tlsPtr_.opeer) // the corresponding Thread object: our managed peer (an instance of java.lang.Thread).
<< " self=" << reinterpret_cast<const void*>(thread) << "\n"; // address of the native Thread
};
// Run the logging lambda under thread_suspend_count_lock_, taking the lock only if the
// current thread does not already hold it.
if (Locks::thread_suspend_count_lock_->IsExclusiveHeld(self)) {
Locks::thread_suspend_count_lock_->AssertExclusiveHeld(self); // For annotalysis.
suspend_log_fn();
} else {
MutexLock mu(self, *Locks::thread_suspend_count_lock_);
suspend_log_fn();
}
}
os << " | sysTid=" << tid // system thread id
<< " nice=" << getpriority(PRIO_PROCESS, static_cast<id_t>(tid)) // scheduling priority (nice value)
<< " cgrp=" << scheduler_group_name; // scheduler group, e.g. foreground; read from /proc/self/task/$tid/cgroup
if (thread != nullptr) {
int policy;
sched_param sp;
#if !defined(__APPLE__)
// b/36445592 Don't use pthread_getschedparam since pthread may have exited.
policy = sched_getscheduler(tid);
if (policy == -1) {
PLOG(WARNING) << "sched_getscheduler(" << tid << ")";
}
int sched_getparam_result = sched_getparam(tid, &sp);
if (sched_getparam_result == -1) {
PLOG(WARNING) << "sched_getparam(" << tid << ", &sp)";
sp.sched_priority = -1;
}
#else
CHECK_PTHREAD_CALL(pthread_getschedparam, (thread->tlsPtr_.pthread_self, &policy, &sp),
__FUNCTION__);
#endif
os << " sched=" << policy << "/" << sp.sched_priority // scheduling policy / scheduling priority
<< " handle=" << reinterpret_cast<void*>(thread->tlsPtr_.pthread_self);
}
os << "\n";
// Grab the scheduler stats for this thread.
std::string scheduler_stats;
if (android::base::ReadFileToString(StringPrintf("/proc/self/task/%d/schedstat", tid),
&scheduler_stats)
&& !scheduler_stats.empty()) { // read the scheduler statistics
scheduler_stats = android::base::Trim(scheduler_stats); // Lose the trailing '\n'.
} else {
// File unreadable or empty: print placeholder values.
scheduler_stats = "0 0 0";
}
char native_thread_state = '?';
int utime = 0;
int stime = 0;
int task_cpu = 0;
GetTaskStats(tid, &native_thread_state, &utime, &stime, &task_cpu);
os << " | state=" << native_thread_state
<< " schedstat=( " << scheduler_stats << " )" // running time, runnable (waiting) time, switch count
<< " utm=" << utime // time spent executing in user mode
<< " stm=" << stime // time spent executing in kernel mode
<< " core=" << task_cpu // core the thread is running on
<< " HZ=" << sysconf(_SC_CLK_TCK) << "\n"; // clock ticks per second
if (thread != nullptr) {
os << " | stack=" << reinterpret_cast<void*>(thread->tlsPtr_.stack_begin) << "-"
<< reinterpret_cast<void*>(thread->tlsPtr_.stack_end) << " stackSize=" // stack address range
<< PrettySize(thread->tlsPtr_.stack_size) << "\n"; // stack size
// Dump the held mutexes.
os << " | held mutexes=";
for (size_t i = 0; i < kLockLevelCount; ++i) {
if (i != kMonitorLock) {
BaseMutex* mutex = thread->GetHeldMutex(static_cast<LockLevel>(i));
if (mutex != nullptr) {
os << " \"" << mutex->GetName() << "\"";
if (mutex->IsReaderWriterMutex()) {
ReaderWriterMutex* rw_mutex = down_cast<ReaderWriterMutex*>(mutex);
if (rw_mutex->GetExclusiveOwnerTid() == tid) {
os << "(exclusive held)"; // held exclusively
} else {
os << "(shared held)"; // held shared
}
}
}
}
}
os << "\n";
}
}
ThreadFlag 和ThreadPriority 相关定义如下
// Request flags kept in a thread's state_and_flags word. Each enumerator is a distinct bit.
enum ThreadFlag {
  // If set implies that suspend_count_ > 0 and the Thread should enter the safepoint handler.
  kSuspendRequest = 1 << 0,
  // Request that the thread do some checkpoint work and then continue.
  kCheckpointRequest = 1 << 1,
  // Request that the thread do empty checkpoint and then continue.
  kEmptyCheckpointRequest = 1 << 2,
  // Register that at least 1 suspend barrier needs to be passed.
  kActiveSuspendBarrier = 1 << 3,
};
// Thread priorities. The values mirror the constants declared on the Java Thread class and
// must stay in sync with them.
enum ThreadPriority {
  kMinThreadPriority = 1,    // Thread.MIN_PRIORITY
  kNormThreadPriority = 5,   // Thread.NORM_PRIORITY
  kMaxThreadPriority = 10,   // Thread.MAX_PRIORITY
};
GetSchedulerGroupName 获取线程 cgroup
/// @art/runtime/thread.cc
// Returns the name of the "cpu" cgroup that thread `tid` of the current process belongs to
// (e.g. "foreground"), without the leading slash.
// Returns an empty string when /proc/self/task/<tid>/cgroup cannot be read, when no line lists
// the "cpu" controller, or when the matching line has no usable path field.
static std::string GetSchedulerGroupName(pid_t tid) {
  // /proc/<pid>/cgroup looks like this:
  // 2:devices:/
  // 1:cpuacct,cpu:/
  // We want the third field from the line whose second field contains the "cpu" token.
  std::string cgroup_file;
  if (!android::base::ReadFileToString(StringPrintf("/proc/self/task/%d/cgroup", tid),
                                       &cgroup_file)) {
    return "";
  }
  std::vector<std::string> cgroup_lines;  // the file has one entry per line
  Split(cgroup_file, '\n', &cgroup_lines);
  for (const std::string& line : cgroup_lines) {
    std::vector<std::string> cgroup_fields;
    Split(line, ':', &cgroup_fields);  // id : controller list : path
    if (cgroup_fields.size() < 3) {
      // Short or malformed line (for example an entry with an empty controller list).
      // Indexing fields [1] and [2] below would be out of bounds, so skip it.
      continue;
    }
    std::vector<std::string> cgroups;
    Split(cgroup_fields[1], ',', &cgroups);  // the controller list is comma separated
    for (const std::string& cgroup : cgroups) {
      if (cgroup == "cpu") {
        const std::string& path = cgroup_fields[2];
        // Skip the leading slash. substr(1) on an empty string is out of range, so guard it.
        return path.empty() ? "" : path.substr(1);
      }
    }
  }
  return "";
}
看下面一个示例:
$ cat task/569/cgroup
6:memory:/
5:freezer:/
4:cpuset:/foreground
3:cpuacct:/uid_0/pid_290
2:cpu:/foreground ---> 取出的是这一行的 foreground
1:blkio:/
0::/
GetTaskStats 获取task 状态,从 /proc/self/task/$tid/stat 读取
/// @art/libartbase/base/utils.cc
// Reads per-thread statistics for `tid` from /proc/self/task/<tid>/stat.
//
// Outputs:
//   state    - first character of the state field (e.g. 'S', 'R'); left untouched when the
//              file cannot be read or parsed.
//   utime    - user-mode time field; 0 when unavailable.
//   stime    - kernel-mode time field; 0 when unavailable.
//   task_cpu - CPU number field; 0 when unavailable.
//
// Parsing is best effort: a missing file or an unexpectedly short record leaves the outputs at
// the values described above instead of indexing past the end of the parsed fields.
void GetTaskStats(pid_t tid, char* state, int* utime, int* stime, int* task_cpu) {
  *utime = *stime = *task_cpu = 0;
#ifdef _WIN32
  // TODO: implement this.
  UNUSED(tid);
  *state = 'S';
#else
  std::string stats;
  // TODO: make this less Linux-specific.
  if (!ReadFileToString(StringPrintf("/proc/self/task/%d/stat", tid), &stats)) {
    return;
  }
  // Skip the "pid (command) " prefix. The command may contain spaces and even ')' characters,
  // so locate the LAST ')' rather than the first one.
  const size_t command_end = stats.rfind(')');
  if (command_end == std::string::npos || command_end + 2 > stats.size()) {
    return;  // Not the expected record layout.
  }
  // Remaining text looks like: S 290 290 0 0 -1 1077936448 77524 0 908 0 2489 1282 ...
  stats = stats.substr(command_end + 2);
  // Extract the fields we care about (indices are relative to the state field).
  constexpr size_t kStateIndex = 0;
  constexpr size_t kUtimeIndex = 11;
  constexpr size_t kStimeIndex = 12;
  constexpr size_t kTaskCpuIndex = 36;
  std::vector<std::string> fields;
  Split(stats, ' ', &fields);  // space-separated fields
  if (fields.empty() || fields[kStateIndex].empty()) {
    return;
  }
  *state = fields[kStateIndex][0];  // first letter of the state field, e.g. S / R
  if (fields.size() <= kTaskCpuIndex) {
    return;  // Truncated record: keep the zeroed counters.
  }
  *utime = strtoull(fields[kUtimeIndex].c_str(), nullptr, 10);
  *stime = strtoull(fields[kStimeIndex].c_str(), nullptr, 10);
  *task_cpu = strtoull(fields[kTaskCpuIndex].c_str(), nullptr, 10);
#endif
}
说完DumpState,下面看 DumpStack 实现
####### Thread::DumpStack
void Thread::DumpStack(std::ostream& os, bool dump_native_stack,
BacktraceMap* backtrace_map, bool force_dump_stack) const {
// TODO: we call this code when dying but may not have suspended the thread ourself. The
// IsSuspended check is therefore racy with the use for dumping (normally we inhibit
// the race with the thread_suspend_count_lock_).
bool dump_for_abort = (gAborting > 0);
bool safe_to_dump = (this == Thread::Current() || IsSuspended());// 是当前线程或处于suspend状态
if (!kIsDebugBuild) {
// We always want to dump the stack for an abort, however, there is no point dumping another
// thread's stack in debug builds where we'll hit the not suspended check in the stack walk.
safe_to_dump = (safe_to_dump || dump_for_abort); // 针对 abort 情况处理
}
if (safe_to_dump || force_dump_stack) {
// If we're currently in native code, dump that stack before dumping the managed stack.
if (dump_native_stack && (dump_for_abort || force_dump_stack || ShouldShowNativeStack(this))) {
ArtMethod* method =
GetCurrentMethod(nullptr,
/*check_suspended=*/ !force_dump_stack,
/*abort_on_error=*/ !(dump_for_abort || force_dump_stack));
DumpNativeStack(os, GetTid(), backtrace_map, " native: ", method); // dump native stacks
}
DumpJavaStack(os,
/*check_suspended=*/ !force_dump_stack,
/*dump_locks=*/ !force_dump_stack); // dump java stack
} else {
os << "Not able to dump stack of thread that isn't suspended";
}
}
####### DumpNativeStack
/// @art/runtime/native_stack_dump.cc
// Unwinds the native stack of thread `tid` in the current process and writes one line per
// frame to `os`, each line starting with `prefix`.
//   existing_map   - backtrace map to reuse; when null a temporary one is created for this call.
//   current_method - used to name a frame that has no symbol but whose pc lies within this
//                    method's quick code.
//   ucontext_ptr   - passed to Backtrace::Unwind.
//   skip_frames    - passed to Backtrace::SetSkipFrames.
// When unwinding fails or yields no frames, a single explanatory line is written instead.
void DumpNativeStack(std::ostream& os,pid_t tid, BacktraceMap* existing_map, const char* prefix,
ArtMethod* current_method, void* ucontext_ptr, bool skip_frames) {
// Historical note: This was disabled when running under Valgrind (b/18119146).
BacktraceMap* map = existing_map;
std::unique_ptr<BacktraceMap> tmp_map;
if (map == nullptr) {
tmp_map.reset(BacktraceMap::Create(getpid()));
map = tmp_map.get();
}
// Unwind the thread to obtain its backtrace.
std::unique_ptr<Backtrace> backtrace(Backtrace::Create(BACKTRACE_CURRENT_PROCESS, tid, map));
backtrace->SetSkipFrames(skip_frames);
if (!backtrace->Unwind(0, reinterpret_cast<ucontext*>(ucontext_ptr))) {
os << prefix << "(backtrace::Unwind failed for thread " << tid
<< ": " << backtrace->GetErrorString(backtrace->GetError()) << ")" << std::endl;
return;
} else if (backtrace->NumFrames() == 0) {
os << prefix << "(no native stack frames for thread " << tid << ")" << std::endl;
return;
}
// Check whether we have and should use addr2line.
bool use_addr2line;
if (kUseAddr2line) { // decide whether addr2line is used to turn addresses into file name / line number pairs
// Try to run it to see whether we have it. Push an argument so that it doesn't assume a.out
// and print to stderr.
use_addr2line = (gAborting > 0) && RunCommand(FindAddr2line() + " -h");
} else {
use_addr2line = false;
}
std::unique_ptr<Addr2linePipe> addr2line_state;
// Emit one output line per frame.
for (Backtrace::const_iterator it = backtrace->begin();
it != backtrace->end(); ++it) {
// We produce output like this:
// ] #00 pc 000075bb8 /system/lib/libc.so (unwind_backtrace_thread+536)
// In order for parsing tools to continue to function, the stack dump
// format must at least adhere to this format:
// #XX pc <RELATIVE_ADDR> <FULL_PATH_TO_SHARED_LIBRARY> ...
// The parsers require a single space before and after pc, and two spaces
// after the <RELATIVE_ADDR>. There can be any prefix data before the
// #XX. <RELATIVE_ADDR> has to be a hex number but with no 0x prefix.
os << prefix << StringPrintf("#%02zu pc ", it->num);
bool try_addr2line = false;
if (!BacktraceMap::IsValid(it->map)) {
// No valid map for this frame: print the absolute pc followed by "???".
os << StringPrintf(Is64BitInstructionSet(kRuntimeISA) ? "%016" PRIx64 " \?\?\?"
: "%08" PRIx64 " \?\?\?",
it->pc);
} else {
os << StringPrintf(Is64BitInstructionSet(kRuntimeISA) ? "%016" PRIx64 " "
: "%08" PRIx64 " ",
it->rel_pc); // print the (relative) pc value
if (it->map.name.empty()) {
os << StringPrintf("<anonymous:%" PRIx64 ">", it->map.start);
} else {
os << it->map.name;
}
if (it->map.offset != 0) { // map offset
os << StringPrintf(" (offset %" PRIx64 ")", it->map.offset);
}
os << " (";
if (!it->func_name.empty()) { // function name and offset
os << it->func_name;
if (it->func_offset != 0) {
os << "+" << it->func_offset;
}
// Functions found using the gdb jit interface will be in an empty
// map that cannot be found using addr2line.
if (!it->map.name.empty()) {
try_addr2line = true;
}
} else if (current_method != nullptr &&
Locks::mutator_lock_->IsSharedHeld(Thread::Current()) &&
PcIsWithinQuickCode(current_method, it->pc)) {
const void* start_of_code = current_method->GetEntryPointFromQuickCompiledCode();
os << current_method->JniLongName() << "+" // JNI function name
<< (it->pc - reinterpret_cast<uint64_t>(start_of_code));
} else {
os << "???";
}
os << ")";
}
os << std::endl;
if (try_addr2line && use_addr2line) { // resolve the frame with addr2line
Addr2line(it->map.name, it->rel_pc, os, prefix, &addr2line_state);
}
}
if (addr2line_state != nullptr) {
Drain(0, prefix, &addr2line_state, os); // read the remaining addr2line output and write it to os
}
}
####### Thread::DumpJavaStack
void Thread::DumpJavaStack(std::ostream& os, bool check_suspended, bool dump_locks) const {
// If flip_function is not null, it means we have run a checkpoint
// before the thread wakes up to execute the flip function and the
// thread roots haven't been forwarded. So the following access to
// the roots (locks or methods in the frames) would be bad. Run it
// here. TODO: clean up.
{
Thread* this_thread = const_cast<Thread*>(this);
Closure* flip_func = this_thread->GetFlipFunction();
if (flip_func != nullptr) {
flip_func->Run(this_thread);
}
}
// Dumping the Java stack involves the verifier for locks. The verifier operates under the
// assumption that there is no exception pending on entry. Thus, stash any pending exception.
// Thread::Current() instead of this in case a thread is dumping the stack of another suspended
// thread.
ScopedExceptionStorage ses(Thread::Current());
std::unique_ptr<Context> context(Context::Create());
// 通过StackDumpVisitor输出,继承体系 StackDumpVisitor -> MonitorObjectsStackVisitor -> StackVisitor
StackDumpVisitor dumper(os, const_cast<Thread*>(this), context.get(),
!tls32_.throwing_OutOfMemoryError, check_suspended, dump_locks);
dumper.WalkStack(); // 调用 StackVisitor的WalkStack
}
####### StackVisitor::WalkStack
这块涉及art相关知识,较为复杂后续再深入研究。从代码逻辑看,是遍历管理的stack,然后调用VisitFrame()来输出每一帧的数据。
/// @art/runtime/stack.cc
// Walks the managed stack of thread_, calling VisitFrame() for every frame.
// The stack is a linked list of ManagedStack fragments; each fragment holds either quick
// (compiled-code) frames or shadow frames, which are handled by the two branches below.
// The walk stops early as soon as VisitFrame() returns false.
// When `include_transitions` is true, VisitFrame() is additionally called once at the end of
// each fragment.
void StackVisitor::WalkStack(bool include_transitions) {
if (check_suspended_) {
DCHECK(thread_ == Thread::Current() || thread_->IsSuspended());
}
CHECK_EQ(cur_depth_, 0U);
size_t inlined_frames_count = 0;
for (const ManagedStack* current_fragment = thread_->GetManagedStack();
current_fragment != nullptr; current_fragment = current_fragment->GetLink()) {// iterate over the managed stack fragments
cur_shadow_frame_ = current_fragment->GetTopShadowFrame();
cur_quick_frame_ = current_fragment->GetTopQuickFrame();
cur_quick_frame_pc_ = 0;
DCHECK(cur_oat_quick_method_header_ == nullptr);
if (cur_quick_frame_ != nullptr) { // Handle quick stack frames.
// Can't be both a shadow and a quick fragment.
DCHECK(current_fragment->GetTopShadowFrame() == nullptr);
ArtMethod* method = *cur_quick_frame_;
DCHECK(method != nullptr);
bool header_retrieved = false;
if (method->IsNative()) {
// We do not have a PC for the first frame, so we cannot simply use
// ArtMethod::GetOatQuickMethodHeader() as we're unable to distinguish there
// between GenericJNI frame and JIT-compiled JNI stub; the entrypoint may have
// changed since the frame was entered. The top quick frame tag indicates
// GenericJNI here, otherwise it's either AOT-compiled or JNI-compiled JNI stub.
if (UNLIKELY(current_fragment->GetTopQuickFrameTag())) {
// The generic JNI does not have any method header.
cur_oat_quick_method_header_ = nullptr;
} else {
const void* existing_entry_point = method->GetEntryPointFromQuickCompiledCode();
CHECK(existing_entry_point != nullptr);
Runtime* runtime = Runtime::Current();
ClassLinker* class_linker = runtime->GetClassLinker();
// Check whether we can quickly get the header from the current entrypoint.
if (!class_linker->IsQuickGenericJniStub(existing_entry_point) &&
!class_linker->IsQuickResolutionStub(existing_entry_point) &&
existing_entry_point != GetQuickInstrumentationEntryPoint()) {
cur_oat_quick_method_header_ =
OatQuickMethodHeader::FromEntryPoint(existing_entry_point);
} else {
const void* code = method->GetOatMethodQuickCode(class_linker->GetImagePointerSize());
if (code != nullptr) {
cur_oat_quick_method_header_ = OatQuickMethodHeader::FromEntryPoint(code);
} else {
// This must be a JITted JNI stub frame.
CHECK(runtime->GetJit() != nullptr);
code = runtime->GetJit()->GetCodeCache()->GetJniStubCode(method);
CHECK(code != nullptr) << method->PrettyMethod();
cur_oat_quick_method_header_ = OatQuickMethodHeader::FromCodePointer(code);
}
}
}
header_retrieved = true;
}
// Visit every quick frame of this fragment; a null method slot ends the fragment.
while (method != nullptr) {
if (!header_retrieved) {
cur_oat_quick_method_header_ = method->GetOatQuickMethodHeader(cur_quick_frame_pc_);
}
header_retrieved = false; // Force header retrieval in next iteration.
ValidateFrame();
if ((walk_kind_ == StackWalkKind::kIncludeInlinedFrames)
&& (cur_oat_quick_method_header_ != nullptr)
&& cur_oat_quick_method_header_->IsOptimized()
&& !method->IsNative() // JNI methods cannot have any inlined frames.
&& CodeInfo::HasInlineInfo(cur_oat_quick_method_header_->GetOptimizedCodeInfoPtr())) {
DCHECK_NE(cur_quick_frame_pc_, 0u);
CodeInfo* code_info = GetCurrentInlineInfo();
StackMap* stack_map = GetCurrentStackMap();
if (stack_map->IsValid() && stack_map->HasInlineInfo()) {
DCHECK_EQ(current_inline_frames_.size(), 0u);
// Visit each inlined frame before the physical frame that contains it.
for (current_inline_frames_ = code_info->GetInlineInfosOf(*stack_map);
!current_inline_frames_.empty();
current_inline_frames_.pop_back()) {
bool should_continue = VisitFrame();
if (UNLIKELY(!should_continue)) {
return;
}
cur_depth_++;
inlined_frames_count++;
}
}
}
bool should_continue = VisitFrame();
if (UNLIKELY(!should_continue)) {
return;
}
QuickMethodFrameInfo frame_info = GetCurrentQuickFrameInfo();
if (context_ != nullptr) {
context_->FillCalleeSaves(reinterpret_cast<uint8_t*>(cur_quick_frame_), frame_info);
}
// Compute PC for next stack frame from return PC.
size_t frame_size = frame_info.FrameSizeInBytes();
uintptr_t return_pc_addr = GetReturnPcAddr();
uintptr_t return_pc = *reinterpret_cast<uintptr_t*>(return_pc_addr);
if (UNLIKELY(reinterpret_cast<uintptr_t>(GetQuickInstrumentationExitPc()) == return_pc)) {
// While profiling, the return pc is restored from the side stack, except when walking
// the stack for an exception where the side stack will be unwound in VisitFrame.
const std::map<uintptr_t, instrumentation::InstrumentationStackFrame>&
instrumentation_stack = *thread_->GetInstrumentationStack();
auto it = instrumentation_stack.find(return_pc_addr);
CHECK(it != instrumentation_stack.end());
const instrumentation::InstrumentationStackFrame& instrumentation_frame = it->second;
if (GetMethod() ==
Runtime::Current()->GetCalleeSaveMethod(CalleeSaveType::kSaveAllCalleeSaves)) {
// Skip runtime save all callee frames which are used to deliver exceptions.
} else if (instrumentation_frame.interpreter_entry_) {
ArtMethod* callee =
Runtime::Current()->GetCalleeSaveMethod(CalleeSaveType::kSaveRefsAndArgs);
CHECK_EQ(GetMethod(), callee) << "Expected: " << ArtMethod::PrettyMethod(callee)
<< " Found: " << ArtMethod::PrettyMethod(GetMethod());
} else if (!instrumentation_frame.method_->IsRuntimeMethod()) {
// Trampolines get replaced with their actual method in the stack,
// so don't do the check below for runtime methods.
// Instrumentation generally doesn't distinguish between a method's obsolete and
// non-obsolete version.
CHECK_EQ(instrumentation_frame.method_->GetNonObsoleteMethod(),
GetMethod()->GetNonObsoleteMethod())
<< "Expected: "
<< ArtMethod::PrettyMethod(instrumentation_frame.method_->GetNonObsoleteMethod())
<< " Found: " << ArtMethod::PrettyMethod(GetMethod()->GetNonObsoleteMethod());
}
return_pc = instrumentation_frame.return_pc_;
}
// Advance to the caller's frame: it starts frame_size bytes above the current one.
cur_quick_frame_pc_ = return_pc;
uint8_t* next_frame = reinterpret_cast<uint8_t*>(cur_quick_frame_) + frame_size;
cur_quick_frame_ = reinterpret_cast<ArtMethod**>(next_frame);
if (kDebugStackWalk) {
LOG(INFO) << ArtMethod::PrettyMethod(method) << "@" << method << " size=" << frame_size
<< std::boolalpha
<< " optimized=" << (cur_oat_quick_method_header_ != nullptr &&
cur_oat_quick_method_header_->IsOptimized())
<< " native=" << method->IsNative()
<< std::noboolalpha
<< " entrypoints=" << method->GetEntryPointFromQuickCompiledCode()
<< "," << (method->IsNative() ? method->GetEntryPointFromJni() : nullptr)
<< " next=" << *cur_quick_frame_;
}
if (kCount == CountTransitions::kYes || !method->IsRuntimeMethod()) {
cur_depth_++;
}
method = *cur_quick_frame_;
}
// We reached a transition frame, it doesn't have a method header.
cur_oat_quick_method_header_ = nullptr;
} else if (cur_shadow_frame_ != nullptr) {
// Shadow-frame fragment: follow the frame links until the end of the chain.
do {
ValidateFrame();
bool should_continue = VisitFrame();
if (UNLIKELY(!should_continue)) {
return;
}
cur_depth_++;
cur_shadow_frame_ = cur_shadow_frame_->GetLink();
} while (cur_shadow_frame_ != nullptr);
}
if (include_transitions) {
bool should_continue = VisitFrame();
if (!should_continue) {
return;
}
}
if (kCount == CountTransitions::kYes) {
cur_depth_++;
}
}
// When the expected frame count is known (non-zero), the walk must have matched it.
if (num_frames_ != 0) {
CHECK_EQ(cur_depth_, num_frames_);
}
}
从之前的分析,继承体系 StackDumpVisitor -> MonitorObjectsStackVisitor -> StackVisitor
StackVisitor的VisitFrame函数是纯虚函数,我们看下MonitorObjectsStackVisitor::VisitFrame
####### MonitorObjectsStackVisitor::VisitFrame
执行逻辑 StartMethod -> 打印ThreadState、locks -> EndMethod
/// @art/runtime/monitor_objects_stack_visitor.cc
bool MonitorObjectsStackVisitor::VisitFrame() {
ArtMethod* m = GetMethod();// 获取当前frame的方法
if (m->IsRuntimeMethod()) {
return true;
}
VisitMethodResult vmrEntry = StartMethod(m, frame_count); // 开始访问方法
switch (vmrEntry) {
case VisitMethodResult::kContinueMethod:
break;
case VisitMethodResult::kSkipMethod:
return true;
case VisitMethodResult::kEndStackWalk:
return false;
}
if (frame_count == 0) {
// Top frame, check for blocked state.
ObjPtr<mirror::Object> monitor_object;
uint32_t lock_owner_tid;
ThreadState state = Monitor::FetchState(GetThread(),
&monitor_object,
&lock_owner_tid);
switch (state) {
case kWaiting:
case kTimedWaiting: // 输出wait状态 - waiting on
VisitWaitingObject(monitor_object, state);
break;
case kSleeping: // 输出sleep状态, - sleeping on
VisitSleepingObject(monitor_object);
break;
case kBlocked:
case kWaitingForLockInflation: // 输出block状态,如 - waiting to lock
VisitBlockedOnObject(monitor_object, state, lock_owner_tid);
break;
default:
break;
}
}
if (dump_locks) { // 输出lock
// Visit locks, but do not abort on errors. This could trigger a nested abort.
// Skip visiting locks if dump_locks is false as it would cause a bad_mutexes_held in
// RegTypeCache::RegTypeCache due to thread_list_lock.
Monitor::VisitLocks(this, VisitLockedObject, this, false);
}
++frame_count;
VisitMethodResult vmrExit = EndMethod(m); // 结束访问方法
switch (vmrExit) {
case VisitMethodResult::kContinueMethod:
case VisitMethodResult::kSkipMethod:
return true;
case VisitMethodResult::kEndStackWalk:
return false;
}
LOG(FATAL) << "Unreachable";
UNREACHABLE();
}
MonitorObjectsStackVisitor类的很多行为都是抽象的,它的实现是 StackDumpVisitor,下面直接把它的实现贴出
/// @art/runtime/thread.cc
struct StackDumpVisitor : public MonitorObjectsStackVisitor {
StackDumpVisitor(std::ostream& os_in, Thread* thread_in, Context* context, bool can_allocate,
bool check_suspended = true, bool dump_locks = true)
REQUIRES_SHARED(Locks::mutator_lock_) : MonitorObjectsStackVisitor(thread_in,
context, check_suspended, can_allocate && dump_locks),
os(os_in), last_method(nullptr), last_line_number(0), repetition_count(0) {}
virtual ~StackDumpVisitor() {
if (frame_count == 0) { // 没有frame时打印
os << " (no managed stack frames)\n";
}
}
static constexpr size_t kMaxRepetition = 3u;
// 访问方法,打印出在哪个方法行号 如 at com.android.server.SystemServer.run(SystemServer.java:966)
VisitMethodResult StartMethod(ArtMethod* m, size_t frame_nr ATTRIBUTE_UNUSED)
override REQUIRES_SHARED(Locks::mutator_lock_) {
m = m->GetInterfaceMethodIfProxy(kRuntimePointerSize);
ObjPtr<mirror::DexCache> dex_cache = m->GetDexCache();
int line_number = -1;
if (dex_cache != nullptr) { // be tolerant of bad input
const DexFile* dex_file = dex_cache->GetDexFile();
line_number = annotations::GetLineNumFromPC(dex_file, m, GetDexPc(false)); /// 获取行号,
}
if (line_number == last_line_number && last_method == m) { // 处理重复行
++repetition_count;
} else {
if (repetition_count >= kMaxRepetition) {
os << " ... repeated " << (repetition_count - kMaxRepetition) << " times\n";
}
repetition_count = 0;
last_line_number = line_number;
last_method = m;
}
if (repetition_count >= kMaxRepetition) {
// Skip visiting=printing anything.
return VisitMethodResult::kSkipMethod;
}
os << " at " << m->PrettyMethod(false); // 输出在哪个方法
if (m->IsNative()) {
os << "(Native method)"; // native 方法后面附加
} else {
const char* source_file(m->GetDeclaringClassSourceFile());
os << "(" << (source_file != nullptr ? source_file : "unavailable")
<< ":" << line_number << ")"; // 非native还会打印文件名 行号
}
os << "\n";
// Go and visit locks.
return VisitMethodResult::kContinueMethod;
}
VisitMethodResult EndMethod(ArtMethod* m ATTRIBUTE_UNUSED) override {
return VisitMethodResult::kContinueMethod; // 返回继续进行
}
// Prints the object a thread is waiting on, e.g.
//   - waiting on <0x038f04bd> (a java.lang.Class<java.lang.ref.ReferenceQueue>)
// The state argument is unused; kInvalidThreadId is passed so no owner is printed.
void VisitWaitingObject(ObjPtr<mirror::Object> obj, ThreadState state ATTRIBUTE_UNUSED)
override REQUIRES_SHARED(Locks::mutator_lock_) {
PrintObject(obj, " - waiting on ", ThreadList::kInvalidThreadId);
}
// Prints the object a thread is sleeping on, e.g.
//   - sleeping on <0x08ac2fb9> (a java.lang.Object)
// kInvalidThreadId is passed so no owner is printed.
void VisitSleepingObject(ObjPtr<mirror::Object> obj)
override REQUIRES_SHARED(Locks::mutator_lock_) {
PrintObject(obj, " - sleeping on ", ThreadList::kInvalidThreadId);
}
// Prints the object a thread is blocked on, followed by the owner tid that was
// passed in. Only kBlocked and kWaitingForLockInflation are accepted; any other
// state aborts through LOG(FATAL).
void VisitBlockedOnObject(ObjPtr<mirror::Object> obj, ThreadState state, uint32_t owner_tid)
    override REQUIRES_SHARED(Locks::mutator_lock_) {
  const char* prefix = nullptr;
  if (state == kBlocked) {
    // Waiting to acquire a lock.
    prefix = " - waiting to lock ";
  } else if (state == kWaitingForLockInflation) {
    prefix = " - waiting for lock inflation of ";
  } else {
    LOG(FATAL) << "Unreachable";
    UNREACHABLE();
  }
  PrintObject(obj, prefix, owner_tid);
}
// Prints an object locked in the current frame, e.g.
//   - locked <0x02e64d80> (a com.android.server.am.AppProfiler$ProcessCpuThread)
// kInvalidThreadId is passed so no owner is printed.
void VisitLockedObject(ObjPtr<mirror::Object> obj)
override REQUIRES_SHARED(Locks::mutator_lock_) {
PrintObject(obj, " - locked ", ThreadList::kInvalidThreadId);
}
// Writes one monitor line to os: msg, a description of obj, optionally
// " held by thread <owner_tid>", then a newline.
//   obj       - object to describe; null is printed as "an unknown object".
//   msg       - prefix such as " - locked ".
//   owner_tid - owning thread id, or ThreadList::kInvalidThreadId to omit it.
void PrintObject(ObjPtr<mirror::Object> obj,
                 const char* msg,
                 uint32_t owner_tid) REQUIRES_SHARED(Locks::mutator_lock_) {
  if (obj == nullptr) {
    os << msg << "an unknown object";
  } else if ((obj->GetLockWord(true).GetState() == LockWord::kThinLocked) &&
             Locks::mutator_lock_->IsExclusiveHeld(Thread::Current())) {
    // Getting the identity hashcode here would result in lock inflation and suspension of the
    // current thread, which isn't safe if this is the only runnable thread.
    // Print the raw address instead.
    os << msg << StringPrintf("<@addr=0x%" PRIxPTR "> (a %s)",
                              reinterpret_cast<intptr_t>(obj.Ptr()),
                              obj->PrettyTypeOf().c_str());
  } else {
    // Typical output:
    //   - waiting on <0x6008c468> (a java.lang.Class<java.lang.ref.ReferenceQueue>)
    // Call PrettyTypeOf before IdentityHashCode since IdentityHashCode can cause thread
    // suspension and move pretty_object.
    const std::string type_name(obj->PrettyTypeOf());
    os << msg << StringPrintf("<0x%08x> (a %s)", obj->IdentityHashCode(), type_name.c_str());
  }

  // Append the lock holder when one was supplied.
  const bool has_owner = (owner_tid != ThreadList::kInvalidThreadId);
  if (has_owner) {
    os << " held by thread " << owner_tid;
  }
  os << "\n";
}
// Stream that every line of the dump is written to.
std::ostream& os;
// Method of the previously visited frame; StartMethod compares against it to detect repeats.
ArtMethod* last_method;
// Line number of the previously visited frame; compared together with last_method.
int last_line_number;
// How many times in a row the previous (method, line) pair has repeated.
size_t repetition_count;
};
BaseMutex::DumpAll
/// @art/runtime/base/mutex.cc
// Dumps every registered mutex to os, contended ones first and then the rest.
// Does nothing unless kLogLockContentions is enabled.
void BaseMutex::DumpAll(std::ostream& os) {
  if (!kLogLockContentions) {
    return;
  }

  os << "Mutex logging:\n";
  // Held until the function returns.
  ScopedAllMutexesLock all_mutexes_guard(reinterpret_cast<const BaseMutex*>(-1));
  std::set<BaseMutex*>* registry = gAllMutexData->all_mutexes;
  if (registry == nullptr) {
    // No mutexes have been created yet during startup.
    return;
  }

  // Dumps each mutex whose HasEverContended() equals `contended`.
  auto dump_matching = [&os, registry](bool contended) {
    for (const BaseMutex* entry : *registry) {
      if (entry->HasEverContended() == contended) {
        entry->Dump(os);
        os << "\n";
      }
    }
  };

  os << "(Contended)\n";
  dump_matching(true);
  os << "(Never contented)\n";
  dump_matching(false);
}
dump完成后,需要将收集到的trace内容写出。
SignalCatcher::Output
/// @art/runtime/signal_catcher.cc
void SignalCatcher::Output(const std::string& s) {
ScopedThreadStateChange tsc(Thread::Current(), kWaitingForSignalCatcherOutput);
palette_status_t status = PaletteWriteCrashThreadStacks(s.data(), s.size());// 写Stacks数据
if (status == PALETTE_STATUS_OK) {
LOG(INFO) << "Wrote stack traces to tombstoned";
} else {
CHECK(status == PALETTE_STATUS_FAILED_CHECK_LOG);
LOG(ERROR) << "Failed to write stack traces to tombstoned";
}
}
/// @art/libartpalette/apex/palette.cc
// Loader shim: fetches the PaletteWriteCrashThreadStacks entry point from
// PaletteLoader and forwards both arguments to it, returning its status unchanged.
palette_status_t PaletteWriteCrashThreadStacks(/*in*/const char* stack, size_t stack_len) {
PaletteWriteCrashThreadStacksMethod m =
PaletteLoader::Instance().GetPaletteWriteCrashThreadStacksMethod();
return m(stack, stack_len);
}
PaletteWriteCrashThreadStacksMethod 最终对应的实现,是 palette_android.cc 中的 PaletteWriteCrashThreadStacks,这中间经过了一些比较复杂的宏转换。
PaletteWriteCrashThreadStacks
该函数的功能就是将stacks输出
- 连接tombstoned,获取一个可输出的fd
- 将 stacks 内容写入fd,然后执行同步操作并关闭fd
- 通知tombstoned完成dump
/// @system/libartpalette/palette_android.cc
// Writes a stack dump of stacks_len bytes to tombstoned.
// Steps: connect to tombstoned to obtain an output fd, write the dump into it,
// flush and close the fd, then notify tombstoned that the dump is complete.
// Returns PALETTE_STATUS_OK on success, or PALETTE_STATUS_FAILED_CHECK_LOG if any
// step failed. Later steps still run after an earlier failure.
palette_status_t PaletteWriteCrashThreadStacks(/*in*/ const char* stacks, size_t stacks_len) {
android::base::unique_fd tombstone_fd;
android::base::unique_fd output_fd;
// Connect to tombstoned and obtain output_fd for the dump; the dump type is kDebuggerdJavaBacktrace.
if (!tombstoned_connect(getpid(), &tombstone_fd, &output_fd, kDebuggerdJavaBacktrace)) {
// Failure here could be due to file descriptor resource exhaustion
// so write the stack trace message to the log in case it helps
// debug that.
LOG(INFO) << std::string_view(stacks, stacks_len);
// tombstoned_connect() logs failure reason.
return PALETTE_STATUS_FAILED_CHECK_LOG;
}
palette_status_t status = PALETTE_STATUS_OK;
if (!android::base::WriteFully(output_fd, stacks, stacks_len)) {// write the dump into output_fd
PLOG(ERROR) << "Failed to write tombstoned output";
// Truncate to zero length so no partial dump is left behind.
TEMP_FAILURE_RETRY(ftruncate(output_fd, 0));
status = PALETTE_STATUS_FAILED_CHECK_LOG;
}
if (TEMP_FAILURE_RETRY(fdatasync(output_fd)) == -1 && errno != EINVAL) {// flush to storage
// Ignore EINVAL so we don't report failure if we just tried to flush a pipe
// or socket.
if (status == PALETTE_STATUS_OK) {
PLOG(ERROR) << "Failed to fsync tombstoned output";
status = PALETTE_STATUS_FAILED_CHECK_LOG;
}
TEMP_FAILURE_RETRY(ftruncate(output_fd, 0));
TEMP_FAILURE_RETRY(fdatasync(output_fd));
}
// release() gives up ownership, so the fd is closed exactly once, here.
if (close(output_fd.release()) == -1 && errno != EINTR) {// close output_fd
if (status == PALETTE_STATUS_OK) {
PLOG(ERROR) << "Failed to close tombstoned output";
status = PALETTE_STATUS_FAILED_CHECK_LOG;
}
}
if (!tombstoned_notify_completion(tombstone_fd)) { // tell tombstoned the dump is done
// tombstoned_notify_completion() logs failure.
status = PALETTE_STATUS_FAILED_CHECK_LOG;
}
return status;
}
连接tombstoned
tombstoned_connect 通过socket连接 tombstoned
/// @system/core/debuggerd/tombstoned/tombstoned_client.cpp
// Convenience overload for callers that only need the text output fd: forwards
// to the five-argument overload with a null proto_output_fd.
bool tombstoned_connect(pid_t pid, unique_fd* tombstoned_socket, unique_fd* text_output_fd,
DebuggerdDumpType dump_type) {
return tombstoned_connect(pid, tombstoned_socket, text_output_fd, nullptr, dump_type);
}
// Connects to tombstoned over a local SOCK_SEQPACKET socket, sends a kDumpRequest
// packet for (pid, dump_type), and receives the fd(s) the dump should be written to.
//   tombstoned_socket - out: the connected socket, moved in on success.
//   text_output_fd    - out: fd for the text output, moved in on success.
//   proto_output_fd   - out, may be null: receives the second fd, which is only
//                       requested when dump_type is kDebuggerdTombstoneProto.
// Returns true on success. On failure the reason is logged and false is
// returned; the out-parameters are only written on the success path.
bool tombstoned_connect(pid_t pid, unique_fd* tombstoned_socket, unique_fd* text_output_fd,
unique_fd* proto_output_fd, DebuggerdDumpType dump_type) {
// Java backtraces use the dedicated java-trace socket; every other dump type
// uses the crash socket.
unique_fd sockfd(
socket_local_client((dump_type != kDebuggerdJavaBacktrace ? kTombstonedCrashSocketName
: kTombstonedJavaTraceSocketName),
ANDROID_SOCKET_NAMESPACE_RESERVED, SOCK_SEQPACKET));
if (sockfd == -1) {
async_safe_format_log(ANDROID_LOG_ERROR, "libc", "failed to connect to tombstoned: %s",
strerror(errno));
return false;
}
TombstonedCrashPacket packet = {};
packet.packet_type = CrashPacketType::kDumpRequest;
packet.packet.dump_request.pid = pid;
packet.packet.dump_request.dump_type = dump_type; // fill in and send the dump request
if (TEMP_FAILURE_RETRY(write(sockfd, &packet, sizeof(packet))) != sizeof(packet)) {
async_safe_format_log(ANDROID_LOG_ERROR, "libc", "failed to write DumpRequest packet: %s",
strerror(errno));
return false;
}
unique_fd tmp_output_fd, tmp_proto_fd;
ssize_t rc = -1;
// Receive the response packet together with the output fd(s).
if (dump_type == kDebuggerdTombstoneProto) {
rc = ReceiveFileDescriptors(sockfd, &packet, sizeof(packet), &tmp_output_fd, &tmp_proto_fd);
} else {
rc = ReceiveFileDescriptors(sockfd, &packet, sizeof(packet), &tmp_output_fd);
}
if (rc == -1) {
async_safe_format_log(ANDROID_LOG_ERROR, "libc",
"failed to read response to DumpRequest packet: %s", strerror(errno));
return false;
} else if (rc != sizeof(packet)) {
async_safe_format_log(
ANDROID_LOG_ERROR, "libc",
"received DumpRequest response packet of incorrect length (expected %zu, got %zd)",
sizeof(packet), rc);
return false;
}
// Make the fd O_APPEND so that our output is guaranteed to be at the end of a file.
// (This also makes selinux rules consistent, because selinux distinguishes between writing to
// a regular fd, and writing to an fd with O_APPEND).
// NOTE(review): the F_GETFL result is not checked for -1 before being OR-ed with
// O_APPEND; a failure is only noticed (and merely warned about) if F_SETFL fails too.
int flags = fcntl(tmp_output_fd.get(), F_GETFL);
if (fcntl(tmp_output_fd.get(), F_SETFL, flags | O_APPEND) != 0) {
async_safe_format_log(ANDROID_LOG_WARN, "libc", "failed to set output fd flags: %s",
strerror(errno));
}
// Success: hand ownership of the socket and fd(s) to the caller.
*tombstoned_socket = std::move(sockfd);
*text_output_fd = std::move(tmp_output_fd);
if (proto_output_fd) {
*proto_output_fd = std::move(tmp_proto_fd);
}
return true;
}
tombstoned处理请求 - perform_request
由《Android 12 进程native crash流程分析》一文可知,当tombstoned收到dump请求后,会执行如下操作:
- 如果该pid已经注册到Intercept列表(《Watchdog(4) Trace生成过程》中分析过一种注册过程),则返回注册时添加的fd,trace会输出到注册时指定的路径或文件。
- 否则,返回一个临时fd,最终会在 /data/anr/下生成一个 trace_xx 文件
/// @system/core/debuggerd/tombstoned/tombstoned.cpp
// Serves one dump request. It picks the output fd (an intercept registered for
// the pid, otherwise an output obtained from the crash queue) and sends it to
// the requester over the crash socket. It then arms an event that calls
// crash_completed_cb when the socket becomes readable or the timeout expires.
// On the success path, ownership of crash passes to the event loop; on any early
// return the unique_ptr destroys it.
static void perform_request(std::unique_ptr<Crash> crash) {
unique_fd output_fd;
// An intercept is present when someone registered for this pid's dump beforehand.
// Per the surrounding article, the AMS#dumpStackTraces flow is one such
// registration path.
bool intercepted =
intercept_manager->GetIntercept(crash->crash_pid, crash->crash_type, &output_fd);
if (intercepted) {
if (crash->crash_type == kDebuggerdTombstoneProto) {
crash->output.proto = CrashArtifact::devnull();
}
} else {
// No intercept: get an output from the crash queue and duplicate its text fd.
// Per the surrounding article, a plain `kill -3` ends up as a trace_xx file
// under /data/anr/ this way.
if (auto o = CrashQueue::for_crash(crash.get())->get_output(crash->crash_type); o) {
crash->output = std::move(*o);
output_fd.reset(dup(crash->output.text.fd));
} else {
LOG(ERROR) << "failed to get crash output for type " << crash->crash_type;
return;
}
}
// Send the kPerformDump response, carrying the output fd(s), to the requesting client.
TombstonedCrashPacket response = {.packet_type = CrashPacketType::kPerformDump};
ssize_t rc = -1;
if (crash->output.proto) {
rc = SendFileDescriptors(crash->crash_socket_fd, &response, sizeof(response), output_fd.get(),
crash->output.proto->fd.get());
} else { // no proto output: send only the text output fd
rc = SendFileDescriptors(crash->crash_socket_fd, &response, sizeof(response), output_fd.get());
}
// Drop the local copy of the fd once the send has been attempted.
output_fd.reset();
if (rc == -1) {
PLOG(WARNING) << "failed to send response to CrashRequest";
return;
} else if (rc != sizeof(response)) {
PLOG(WARNING) << "crash socket write returned short";
return;
}
// TODO: Make this configurable by the interceptor?
struct timeval timeout = {10 * android::base::HwTimeoutMultiplier(), 0};
event_base* base = event_get_base(crash->crash_event);
// Wait for the dump to finish: crash_completed_cb is invoked when the crash socket
// becomes readable or the timeout above expires.
event_assign(crash->crash_event, base, crash->crash_socket_fd, EV_TIMEOUT | EV_READ,
crash_completed_cb, crash.get());
event_add(crash->crash_event, &timeout);
CrashQueue::for_crash(crash)->on_crash_started();
// The crash is now owned by the event loop.
crash.release();
}
参考
https://blog.csdn.net/hl09083253cy/article/details/78418742
http://gityuan.com/2016/11/26/art-trace/