概述
Android 系统以 linux 内核为基础,其进程的管理也依赖于 linux 的进程管理机制,对于 android 系统来说进程管理主要分为以下几部分:
- 进程的创建;
- 进程的优先级管理;
- 进程的内存管理;
- 进程的调度;
- 进程回收。
一、进程创建
1.1 进程与线程
在开始之前可以先看下 Google 官方对于进程及线程的定义:
1.2 进程创建
Linux 进程创建:通过 fork() 系统调用创建进程,本文仅分析此处;
Linux 用户级线程创建:通过 pthread 库中的 pthread_create() 创建线程;
Linux 内核线程创建:通过 kthread_create() 创建内核线程。
1.2.1 fork 流程
- 用户空间调用 fork() 方法;
- 经过 syscall 陷入内核空间, 内核根据系统调用号找到相应的 sys_fork 系统调用;
- sys_fork() 过程会调用 do_fork()(新版内核中为 _do_fork()),该方法参数有一个 flags,代表的是父子进程之间需要共享的资源,对于进程创建 flags=SIGCHLD,即当子进程退出时向父进程发送 SIGCHLD 信号;
- do_fork(),会进行一些 check 过程,之后便是进入核心方法 copy_process。
kernel/fork.c
/*
 * fork() system call entry point.
 *
 * Without CONFIG_MMU the call is rejected with -EINVAL; otherwise it forwards
 * to _do_fork() with SIGCHLD as the only flag and all other arguments 0/NULL.
 */
SYSCALL_DEFINE0(fork)
{
#ifndef CONFIG_MMU
	/* can not support in nommu mode */
	return -EINVAL;
#else
	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#endif
}
/*
 * vfork() system call entry point: forwards to _do_fork() with
 * CLONE_VFORK | CLONE_VM | SIGCHLD and no stack, tid pointers or TLS value.
 */
SYSCALL_DEFINE0(vfork)
{
	const unsigned long vfork_flags = CLONE_VFORK | CLONE_VM | SIGCHLD;

	return _do_fork(vfork_flags, 0, 0, NULL, NULL, 0);
}
……
#ifdef __ARCH_WANT_SYS_CLONE
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
unsigned long, tls,
int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
int __user *, parent_tidptr,
int __user *, child_tidptr,
unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
int, stack_size,
int __user *, parent_tidptr,
int __user *, child_tidptr,
unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
int __user *, child_tidptr,
unsigned long, tls)
#endif
……
/*
 * Wrapper around _do_fork(): forwards its five arguments unchanged and passes
 * 0 as the trailing tls argument.
 */
long do_fork(unsigned long clone_flags,// flags handed down from clone(); they mark which resources the child takes over from the parent
unsigned long stack_start,// user-mode stack address of the child; 0 for fork(), a real value for clone()
unsigned long stack_size,// unneeded argument, set to 0 by default
int __user *parent_tidptr,// user-space address for the parent's tid
int __user *child_tidptr)// user-space address for the child's tid
{
return _do_fork(clone_flags, stack_start, stack_size,
parent_tidptr, child_tidptr, 0);
}
……
/*
 * Common implementation behind fork(), vfork() and do_fork().
 *
 * Picks the ptrace event to report (if any), creates the child through
 * copy_process(), optionally stores the child's pid number through
 * parent_tidptr, wakes the child, and for CLONE_VFORK waits on a completion
 * before returning.
 *
 * Returns the child's pid number as computed by pid_vnr(), or the error code
 * carried by copy_process()'s ERR_PTR result.
 */
long _do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
unsigned long tls)
{
struct task_struct *p;
int trace = 0;
long nr;
/*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if ((clone_flags & CSIGNAL) != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
if (likely(!ptrace_event_enabled(current, trace)))
trace = 0;
}
p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);// copies the process descriptor; the pid is also allocated inside this call
add_latent_entropy();
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
*/
if (!IS_ERR(p)) {
struct completion vfork;
struct pid *pid;
cpufreq_task_times_alloc(p);
trace_sched_process_fork(current, p);
pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr);
if (clone_flags & CLONE_VFORK) {// vfork: set up the completion the parent waits on below
p->vfork_done = &vfork;
init_completion(&vfork);
get_task_struct(p);
}
wake_up_new_task(p);// wake up the newly created child task
/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
ptrace_event_pid(trace, pid);
if (clone_flags & CLONE_VFORK) {// vfork: the parent waits here until the child has replaced its address space via exec
if (!wait_for_vfork_done(p, &vfork))
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
put_pid(pid);
} else {
nr = PTR_ERR(p);
}
return nr;
}
……
static __latent_entropy struct task_struct *copy_process(
unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
struct pid *pid,
int trace,
unsigned long tls,
int node)
{
int pidfd = -1, retval;
struct task_struct *p;
retval = -ENOMEM;
p = dup_task_struct(current, node);// 拷贝当前进程task_struct
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);// 设置调度器相关信息
if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
}
}
……
}
/*
 * Allocate a new task_struct plus thread stack on @node and copy @orig into it.
 *
 * When @node is NUMA_NO_NODE the node is chosen by tsk_fork_get_node(orig).
 * After arch_dup_task_struct() the stack-related fields are re-initialised,
 * then a number of per-task fields are reset for the new task.
 *
 * Returns the new task, or NULL if an allocation, arch_dup_task_struct() or
 * scs_prepare() fails (the stack and task_struct are freed on those paths).
 */
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
unsigned long *stack;
struct vm_struct *stack_vm_area;
int err;
if (node == NUMA_NO_NODE)
node = tsk_fork_get_node(orig);
tsk = alloc_task_struct_node(node);// allocate the task_struct
if (!tsk)
return NULL;
stack = alloc_thread_stack_node(tsk, node);// allocate the thread stack
if (!stack)
goto free_tsk;
stack_vm_area = task_stack_vm_area(tsk);
err = arch_dup_task_struct(tsk, orig);
/*
* arch_dup_task_struct() clobbers the stack-related fields. Make
* sure they're properly initialized before using any stack-related
* functions again.
*/
tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
atomic_set(&tsk->stack_refcount, 1);
#endif
/* err is checked only after the stack fields are restored, so free_stack sees valid fields */
if (err)
goto free_stack;
err = scs_prepare(tsk, node);
if (err)
goto free_stack;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
* the sighand lock in case orig has changed between now and
* then. Until then, filter must be NULL to avoid messing up
* the usage counts on the error path calling free_task.
*/
tsk->seccomp.filter = NULL;
#endif
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
set_task_stack_end_magic(tsk);
#ifdef CONFIG_CC_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
#endif
/*
* One for us, one for whoever does the "release_task()" (usually
* parent)
*/
atomic_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
account_kernel_stack(tsk, 1);
kcov_task_init(tsk);
#ifdef CONFIG_FAULT_INJECTION
tsk->fail_nth = 0;
#endif
return tsk;
free_stack:
free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
return NULL;
}
kernel/sched/core.c
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
__sched_fork(clone_flags, p);
/*
* We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
p->state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
*/
p->prio = current->normal_prio;//被 PI 提升的进程优先级不能传递给子进程,子进程使用父进程的 normal_prio
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);//为进程分配cpu
}
kernel/pid.c
/*
 * Allocate a struct pid in namespace @ns.
 *
 * A pid number is allocated at every level from ns->level down to 0, walking
 * from @ns through its parents, and each upid is then added to pid_hash while
 * holding pidmap_lock.
 *
 * Returns the new pid, or an ERR_PTR: the alloc_pidmap() error when number
 * allocation fails, -ENOMEM on the other failure paths.
 */
struct pid *alloc_pid(struct pid_namespace *ns)
{
struct pid *pid;
enum pid_type type;
int i, nr;
struct pid_namespace *tmp;
struct upid *upid;
int retval = -ENOMEM;
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
return ERR_PTR(retval);
tmp = ns;
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
nr = alloc_pidmap(tmp);// the pid number itself is allocated by alloc_pidmap()
if (nr < 0) {
retval = nr;
goto out_free;
}
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
tmp = tmp->parent;
}
if (unlikely(is_child_reaper(pid))) {
if (pid_ns_prepare_proc(ns)) {
disable_pid_allocation(ns);
goto out_free;
}
}
get_pid_ns(ns);
atomic_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
init_waitqueue_head(&pid->wait_pidfd);
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
goto out_unlock;
for ( ; upid >= pid->numbers; --upid) {
hlist_add_head_rcu(&upid->pid_chain,
&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
upid->ns->nr_hashed++;
}
spin_unlock_irq(&pidmap_lock);
return pid;
out_unlock:
spin_unlock_irq(&pidmap_lock);
put_pid_ns(ns);
out_free:
/* i is the last level that was NOT successfully allocated; free every level above it */
while (++i <= ns->level)
free_pidmap(pid->numbers + i);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
}
1.3 系统进程
本节简述 init ,zygote,system_server 三个系统进程。
1.3.1 app_process
framework/base/cmds/app_process
app_process 是一个可执行程序,该程序的主要作用是启动 zygote 和 systemserver 进程。
1.3.2 init 进程
init 进程是唯一固定进程号的进程,进程号为 1,是 Android 系统起来的第一个进程,init 进程的初始化是通过 init.rc 的配置文件管理。
1.3.3 zygote 进程
zygote 进程由 init 进程启动,所有应用进程都由 zygote 进程创建,以 init.zygote64.rc 文件为例解析 zygote 进程创建。
system/core/rootdir/init.zygote64.rc
service zygote /system/bin/app_process64 -Xzygote /system/bin --zygote --start-system-server
class main
priority -20
user root
group root readproc reserved_disk
socket zygote stream 660 root system
socket usap_pool_primary stream 660 root system
onrestart exec_background - system system -- /system/bin/vdc volume abort_fuse
onrestart write /sys/power/state on
onrestart restart audioserver
onrestart restart cameraserver
onrestart restart media
onrestart restart netd
onrestart restart wificond
writepid /dev/cpuset/foreground/tasks
zygote 进程是所有应用的父进程,主要逻辑在:
frameworks/base/core/java/com/android/internal/os/ZygoteInit.java
1.3.4 AMS 进程
zygote 进程起来后会启动一个 socket 等待连接,等待的就是 AMS 进程,所有的应用程序进程都是由 AMS 通过 socket 发送请求给 zygote 进程,然后由 zygote 进程 fork 创建。同时 AMS 掌握进程的优先级管理。
framework/base/services/core/java/com/android/server/am/
1.3.5 system_server 进程
zygote 进程起来后会根据需要启动 system_server 进程。
system_server进程中包含了大量的系统服务。例如:
负责网络管理的NetworkManagementService
负责窗口管理的WindowManagerService
负责震动管理的VibratorService
负责输入管理的InputManagerService等等
1.4 四大组件与进程的创建
1.4.1 Activity 与进程的创建
在 AMS 中,对每一个运行中的 Activity 都有一个 ActivityRecord 对象,这个对象记录 Activity 的详细状态。
AMS 中的 startActivity 方法接受 Context.startActivity 的请求,方法如下:
framework/base/services/core/java/com/android/server/am/ActivityManagerService.java
/**
 * Entry point for start-activity requests: forwards every argument to
 * mActivityTaskManager.startActivity(), passing null as the third argument,
 * and returns its result unchanged.
 */
@Override
public int startActivity(IApplicationThread caller, String callingPackage,
Intent intent, String resolvedType, IBinder resultTo, String resultWho, int requestCode,
int startFlags, ProfilerInfo profilerInfo, Bundle bOptions) {
return mActivityTaskManager.startActivity(caller, callingPackage, null, intent,
resolvedType, resultTo, resultWho, requestCode, startFlags, profilerInfo, bOptions);
}
AMS 通过 Stack 和 Task 来管理 Activity,每个 Activity 都属于一个 Task,一个 Task 可能包含多个 Activity,一个 Stack 包含多个 Task。ActivityStackSupervisor 类负责管理所有的 Stack。
Activity 的启动过程会涉及 Intent 的解析、Stack 和 Task 的查询及创建、Activity 进程的创建、Activity 窗口的创建、Activity生命周期的调度等。在 Activity 启动的最后,会将前一个 Activity pause,将新的 Activity resume。
1.4.2 Service 与进程的创建
在 AMS 中,对每一个运行中的 Service 都有一个 ServiceRecord 对象,这个对象记录 Service 的详细状态,在 AMS 中 startService 方法处理 Context.startService API 的请求。
framework/base/services/core/java/com/android/server/am/ActivityManagerService.java
/**
 * Handles a start-service request.
 *
 * Rejects isolated callers, Intents that carry file descriptors and a null
 * calling package, gives the vendor Atlas hook a chance to consume the
 * request, then delegates to mServices.startServiceLocked() with the binder
 * calling identity cleared.
 *
 * @return the result of startServiceLocked(), or null when the Atlas hook
 *         consumed the request
 * @throws IllegalArgumentException if the Intent has file descriptors or
 *         callingPackage is null
 */
@Override
public ComponentName startService(IApplicationThread caller, Intent service,
        String resolvedType, boolean requireForeground, String callingPackage,
        String callingFeatureId, int userId)
        throws TransactionTooLargeException {
    enforceNotIsolatedCaller("startService");

    // Refuse possible leaked file descriptors
    if (service != null && service.hasFileDescriptors()) {
        throw new IllegalArgumentException("File descriptors passed in Intent");
    }
    if (callingPackage == null) {
        throw new IllegalArgumentException("callingPackage cannot be null");
    }

    // freeme.chenming, 20180712. Atlas: the hook may consume the request.
    final boolean consumedByAtlas = mFreemeSystemAtlas != null
            && mFreemeSystemAtlas.onStartService(caller, service,
                    resolvedType, callingPackage, userId);
    if (consumedByAtlas) {
        return null;
    }

    if (DEBUG_SERVICE) {
        Slog.v(TAG_SERVICE,
                "*** startService: " + service + " type=" + resolvedType + " fg=" + requireForeground);
    }

    synchronized (this) {
        final int pid = Binder.getCallingPid();
        final int uid = Binder.getCallingUid();
        // The caller's pid/uid are captured above, before the identity is cleared.
        final long identityToken = Binder.clearCallingIdentity();
        try {
            return mServices.startServiceLocked(caller, service,
                    resolvedType, pid, uid,
                    requireForeground, callingPackage, callingFeatureId, userId);
        } finally {
            Binder.restoreCallingIdentity(identityToken);
        }
    }
}
mServices 对象是 ActiveServices 类型,该类负责管理 Service 相关的活动。
启动 Service 的流程:ActivityManagerService.startService -> ActiveServices.startServiceLocked -> ActiveServices.startServiceInnerLocked -> ActiveServices.bringUpServiceLocked -> ActivityManagerService.startProcessLocked
在 bringUpServiceLocked 方法会判断 Service 所在进程是否启动,未启动则通过 startProcessLocked 启动,如下所示:
framework/base/services/core/java/com/android/server/am/ActiveServices.java
// Not running -- get it started, and enqueue this service record
// to be executed when the app comes up.
if (app == null && !permissionsReviewRequired) {
// TODO (chriswailes): Change the Zygote policy flags based on if the launch-for-service
// was initiated from a notification tap or not.
if ((app=mAm.startProcessLocked(procName, r.appInfo, true, intentFlags,
hostingRecord, ZYGOTE_POLICY_FLAG_EMPTY, false, isolated, false)) == null) {
String msg = "Unable to launch app "
+ r.appInfo.packageName + "/"
+ r.appInfo.uid + " for service "
+ r.intent.getIntent() + ": process is bad";
Slog.w(TAG, msg);
bringDownServiceLocked(r);
return msg;
}
if (isolated) {
r.isolatedProc = app;
}
}
1.4.3 Provider 与进程的创建
在 AMS 中对于每个运行中的 ContentProvider 都有一个 ContentProviderRecord 对象记录其详细状态。
framework/base/services/core/java/com/android/server/am/ActivityManagerService.java
/**
 * Resolves a content provider for a caller.
 *
 * Rejects isolated callers and a null caller, lets the vendor Atlas hook
 * consume the request, verifies through mAppOpsService that callingPackage
 * (when given) is allowed for the calling uid, then delegates to
 * getContentProviderImpl().
 *
 * @return the holder from getContentProviderImpl(), or null when the Atlas
 *         hook consumed the request
 * @throws SecurityException if caller is null or the package check fails
 */
@Override
public final ContentProviderHolder getContentProvider(
        IApplicationThread caller, String callingPackage, String name, int userId,
        boolean stable) {
    enforceNotIsolatedCaller("getContentProvider");

    if (caller == null) {
        final String error = "null IApplicationThread when getting content provider "
                + name;
        Slog.w(TAG, error);
        throw new SecurityException(error);
    }

    // freeme.chenming, 20180712. Atlas: the hook may consume the request.
    final boolean consumedByAtlas = mFreemeSystemAtlas != null
            && mFreemeSystemAtlas.onGetContentProvider(caller, name, userId, stable);
    if (consumedByAtlas) {
        return null;
    }

    // The incoming user check is now handled in checkContentProviderPermissionLocked() to deal
    // with cross-user grant.
    final int callingUid = Binder.getCallingUid();
    if (callingPackage != null) {
        final int packageMode = mAppOpsService.checkPackage(callingUid, callingPackage);
        if (packageMode != AppOpsManager.MODE_ALLOWED) {
            throw new SecurityException("Given calling package " + callingPackage
                    + " does not match caller's uid " + callingUid);
        }
    }

    return getContentProviderImpl(caller, name, null, callingUid, callingPackage,
            null, stable, userId);
}
1.4.4 Receiver 与进程的创建
开发者通过 Context.sendBroadcast 接口来发送广播,AMS 里的 broadcastIntent() 方法对应广播发送的处理。
在 AMS 里通过队列来管理广播,BroadcastQueue 描述一个广播队列,BroadcastRecord 描述一个广播事件。
在 AMS 中如果收到一个广播请求,会创建一个 BroadcastRecord 放入 BroadcastQueue 中,然后通知队列去处理这个广播,然后 AMS 可以去继续处理其他请求。
在 BroadcastQueue.processNextBroadcast(boolean fromMsg) 方法实现广播事件的逻辑,在这个方法,如果发现 BroadcastReceiver 还没启动,便会通过 AMS 的 startProcessLocked 方法启动 BroadcastReceiver。
frameworks/base/services/core/java/com/android/server/am/BroadcastQueue.java
/**
 * Runs processNextBroadcastLocked(fromMsg, false) while holding the mService lock.
 *
 * @param fromMsg passed through unchanged to processNextBroadcastLocked()
 */
final void processNextBroadcast(boolean fromMsg) {
synchronized (mService) {
processNextBroadcastLocked(fromMsg, false);
}
}
final void processNextBroadcastLocked(boolean fromMsg, boolean skipOomAdj) {
BroadcastRecord r;
……
ResolveInfo info =
(ResolveInfo)nextReceiver;
ComponentName component = new ComponentName(
info.activityInfo.applicationInfo.packageName,
info.activityInfo.name);
……
// Not running -- get it started, to be executed when the app comes up.
if (DEBUG_BROADCAST) Slog.v(TAG_BROADCAST,
"Need to start app ["
+ mQueueName + "] " + targetProcess + " for broadcast " + r);
if ((r.curApp=mService.startProcessLocked(targetProcess,
info.activityInfo.applicationInfo, true,
r.intent.getFlags() | Intent.FLAG_FROM_BACKGROUND,
new HostingRecord("broadcast", r.curComponent),
isActivityCapable ? ZYGOTE_POLICY_FLAG_LATENCY_SENSITIVE : ZYGOTE_POLICY_FLAG_EMPTY,
(r.intent.getFlags()&Intent.FLAG_RECEIVER_BOOT_UPGRADE) != 0, false, false))
== null) {
// Ah, this recipient is unavailable. Finish it if necessary,
// and mark the broadcast record as ready for the next.
Slog.w(TAG, "Unable to launch app "
+ info.activityInfo.applicationInfo.packageName + "/"
+ receiverUid + " for broadcast "
+ r.intent + ": process is bad");
logBroadcastReceiverDiscardLocked(r);
finishReceiverLocked(r, r.resultCode, r.resultData,
r.resultExtras, r.resultAbort, false);
scheduleBroadcastsLocked();
r.state = BroadcastRecord.IDLE;
return;
}
maybeAddAllowBackgroundActivityStartsToken(r.curApp, r);
mPendingBroadcast = r;
mPendingBroadcastRecvIndex = recIdx;
}
二、进程优先级
系统对于进程的优先级有五个分类:前台进程,可见进程,服务进程,后台进程,空进程。
2.1 ProcessRecord
每个 Android 的应用进程中都有可能包含四大组件中的一个或多个,应用进程由 ActivityManagerService 发送请求给 zygote 创建,并且在 ActivityManagerService 中每一个运行的进程都有一个 ProcessRecord 对象与之对应。
frameworks/base/services/core/java/com/android/server/am/ProcessRecord.java
// all ServiceRecord running in this process
private final ArraySet<ServiceRecord> mServices = new ArraySet<>();
// services that are currently executing code (need to remain foreground).
final ArraySet<ServiceRecord> executingServices = new ArraySet<>();
// All ConnectionRecord this process holds
final ArraySet<ConnectionRecord> connections = new ArraySet<>();
// all IIntentReceivers that are registered from this process.
final ArraySet<ReceiverList> receivers = new ArraySet<>();
// class (String) -> ContentProviderRecord
final ArrayMap<String, ContentProviderRecord> pubProviders = new ArrayMap<>();
// All ContentProviderRecord process is using
final ArrayList<ContentProviderConnection> conProviders = new ArrayList<>();
mServices 和 executingServices 记录进程中运行的 Service;
receivers 记录进程中运行的 BroadcastReceiver;
pubProviders 记录进程中运行的 ContentProvider;
connections 记录 Service 对于客户端使用状态的记录;
conProviders 记录 ContentProvider 对于客户端使用状态的记录。
connections 和 conProviders 连接的客户端的进程优先级会影响被使用的 Services 和 ContentProvider 所在进程的优先级,组件的状态是其所在进程优先级的决定因素,四大组件状态如下:
Activity 是否在前台,是否被用户可见;
Services 正在被哪些客户端使用;
ContentProvider 正在被哪些客户端使用;
BroadcastReceiver 是否正在接收广播。
2.2 oom_score_adj
对于每一个运行中的进程,内核通过 proc 文件系统允许其他程序修改某个进程的优先级,其文件节点为:/proc/[pid]/oom_score_adj,
在 ProcessRecord.java 中对于 oom_score_adj 有如下限定:
int maxAdj; // Maximum OOM adjustment for this process
private int mCurRawAdj; // Current OOM unlimited adjustment for this process
int setRawAdj; // Last set OOM unlimited adjustment for this process
int curAdj; // Current OOM adjustment for this process
int setAdj; // Last set OOM adjustment for this process
maxAdj 指定了该进程允许的 oom_score_adj 最大值,这个属性主要给系统应用和常驻内存的进程使用,通过设定 maxAdj 保证这些进程一直拥有较高的优先级;
curAdj 记录这一次优先级计算的结果,在计算完成后会将 curAdj 赋值给对应的 setAdj 进行备份。
在 ProcessList 类中预定义了 oom_score_adj 的可能值,范围是 -1000 ~ 1000,值越小表示进程越重要。
frameworks/base/services/core/java/com/android/server/am/ProcessList.java
// Adjustment used in certain places where we don't know it yet.
// (Generally this is something that is going to be cached, but we
// don't know the exact value in the cached range to assign yet.)
static final int UNKNOWN_ADJ = 1001;
// This is a process only hosting activities that are not visible,
// so it can be killed without any disruption.
static final int CACHED_APP_MAX_ADJ = 999;
static final int CACHED_APP_MIN_ADJ = 900;
// This is the oom_adj level that we allow to die first. This cannot be equal to
// CACHED_APP_MAX_ADJ unless processes are actively being assigned an oom_score_adj of
// CACHED_APP_MAX_ADJ.
static final int CACHED_APP_LMK_FIRST_ADJ = 950;
// The B list of SERVICE_ADJ -- these are the old and decrepit
// services that aren't as shiny and interesting as the ones in the A list.
static final int SERVICE_B_ADJ = 800;
// This is the process of the previous application that the user was in.
// This process is kept above other things, because it is very common to
// switch back to the previous app. This is important both for recent
// task switch (toggling between the two top recent apps) as well as normal
// UI flow such as clicking on a URI in the e-mail app to view in the browser,
// and then pressing back to return to e-mail.
static final int PREVIOUS_APP_ADJ = 700;
// This is a process holding the home application -- we want to try
// avoiding killing it, even if it would normally be in the background,
// because the user interacts with it so much.
static final int HOME_APP_ADJ = 600;
// This is a process holding an application service -- killing it will not
// have much of an impact as far as the user is concerned.
static final int SERVICE_ADJ = 500;
// This is a process with a heavy-weight application. It is in the
// background, but we want to try to avoid killing it. Value set in
// system/rootdir/init.rc on startup.
static final int HEAVY_WEIGHT_APP_ADJ = 400;
// This is a process currently hosting a backup operation. Killing it
// is not entirely fatal but is generally a bad idea.
static final int BACKUP_APP_ADJ = 300;
// This is a process bound by the system (or other app) that's more important than services but
// not so perceptible that it affects the user immediately if killed.
static final int PERCEPTIBLE_LOW_APP_ADJ = 250;
// This is a process only hosting components that are perceptible to the
// user, and we really want to avoid killing them, but they are not
// immediately visible. An example is background music playback.
static final int PERCEPTIBLE_APP_ADJ = 200;
// This is a process only hosting activities that are visible to the
// user, so we'd prefer they don't disappear.
static final int VISIBLE_APP_ADJ = 100;
static final int VISIBLE_APP_LAYER_MAX = PERCEPTIBLE_APP_ADJ - VISIBLE_APP_ADJ - 1;
// This is a process that was recently TOP and moved to FGS. Continue to treat it almost
// like a foreground app for a while.
// @see TOP_TO_FGS_GRACE_PERIOD
static final int PERCEPTIBLE_RECENT_FOREGROUND_APP_ADJ = 50;
// This is the process running the current foreground app. We'd really
// rather not kill it!
static final int FOREGROUND_APP_ADJ = 0;
// This is a process that the system or a persistent process has bound to,
// and indicated it is important.
static final int PERSISTENT_SERVICE_ADJ = -700;
// This is a system persistent process, such as telephony. Definitely
// don't want to kill it, but doing so is not completely fatal.
static final int PERSISTENT_PROC_ADJ = -800;
// The system process runs at the default adjustment.
static final int SYSTEM_ADJ = -900;
// Special code for native processes that are not being managed by the system (so
// don't have an oom adj assigned by the system).
static final int NATIVE_ADJ = -1000;
FOREGROUND_APP_ADJ 是前台应用进程的优先级,也是普通应用程序能获得的最高优先级,其他优先级可阅读注释了解。
2.3 进程优先级
系统会对不同进程设置不同的优先级。
在 include/linux/sched/prio.h 文件中,定义了以下的宏来描述进程优先级:
#define MAX_NICE 19
#define MIN_NICE -20
#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
* values are inverted: lower p->prio value means higher priority.
*
* The MAX_USER_RT_PRIO value allows the actual maximum
* RT priority to be separate from the value exported to
* user-space. This allows kernel threads to set their
* priority to a value higher than any user task. Note:
* MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO
#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO)
#define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO)
/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters,
* it's a [ 0 ... 39 ] range.
*/
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
三、进程调度
进程调度是操作系统最核心的功能之一,在目前的 linux 内核中,一共有 6 种调度策略,可以分为以下三类:
普通调度策略:SCHED_NORMAL,SCHED_BATCH,SCHED_IDLE;
实时调度策略:SCHED_FIFO,SCHED_RR;
Deadline 调度策略:SCHED_DEADLINE。
/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
#define SCHED_DEADLINE 6
SCHED_NORMAL 也就是常说的 SCHED_OTHER,这是进程的默认调度策略,也就是时间共享策略。绝大部分进程都使用这个调度策略。
SCHED_BATCH 与 SCHED_NORMAL 类似,不同的是,内核会认为该进程是CPU密集型,因此在调度会有小的惩罚。这种策略适用于那些非交互的后台进程。
SCHED_IDLE 是最低优先级的调度策略,nice 值不会被考虑。
SCHED_FIFO 全称是 First in-first out。这种策略不会使用时间片算法,在同优先级的情况下,会按照先进先出的方法按顺序执行。作为一种实时调度策略,属于该调度策略的进程会一直执行直到被IO阻塞或者被更高优先级的进程抢占。
SCHED_RR 全称是Round-robin。这是对于SCHED_FIFO增强的实时策略,它使用了时间片共享的方式来调度进程。
SCHED_DEADLINE 指定了预计完成时间的调度策略,它拥有超过所有其他策略的最高优先级。
内核负责进程的 CPU 调度,并非所有运行中的进程都能平等地获取相等的时间片。在 ProcessRecord 中,通过 Schedule Group 来记录进程的调度组,取值定义在 ProcessList 类中。
// Activity manager's version of Process.THREAD_GROUP_BACKGROUND
static final int SCHED_GROUP_BACKGROUND = 0;
// Activity manager's version of Process.THREAD_GROUP_RESTRICTED
static final int SCHED_GROUP_RESTRICTED = 1;
// Activity manager's version of Process.THREAD_GROUP_DEFAULT
static final int SCHED_GROUP_DEFAULT = 2;
// Activity manager's version of Process.THREAD_GROUP_TOP_APP
public static final int SCHED_GROUP_TOP_APP = 3;
// Activity manager's version of Process.THREAD_GROUP_TOP_APP
// Disambiguate between actual top app and processes bound to the top app
static final int SCHED_GROUP_TOP_APP_BOUND = 4;
四、内存回收
内存回收主要分为进程内的内存回收和进程级的内存回收。
4.1 进程内的内存回收
4.1.1 虚拟机自身的垃圾回收机制
垃圾回收是指:虚拟机会监测应用程序的对象创建和使用,并在一些特定的时候销毁无用的对象以回收内存。
垃圾回收的基本想法是要找出虚拟机中哪些对象已经不会再被使用然后将其释放。其最常用的算法有下面两种:
1.引用计数算法
引用计数算法是为每个对象维护一个被引用的次数:对象刚创建时的初始引用计数为 0,每次被一个对象引用时,引用计数加 1,反之减 1。当一个对象的引用计数重新回到0时便可以认为是不会被使用的,这些对象便可以被垃圾回收。
2.对象跟踪算法
对象追踪算法是通过 GC root 类型的对象为起点,追踪所有被这些对象所引用的对象,并顺着这些被引用的对象继续往下追踪,在追踪的过程中,对所有被追踪到的对象打上标记,而剩下的那些没有被打过标记的对象便可以认为是没有被使用的,因此这些对象可以将其释放。
4.1.2 开发者进行内存回收
当确定某些对象不会再被使用时,要主动释放对其引用,这样虚拟机才能将其回收。对于不再被用到对象,仍然保持对其引用导致其无法释放,将导致内存泄漏的发生。
在某些场景下系统会通知应用进行内存释放,ComponentCallbacks2 接口中的 onTrimMemory() 回调用来接收这个事件。
4.2 进程级的内存回收
进程级的内存回收主要分为:
4.2.1 LowMemoryKiller
lowmemorykiller.c 位于 drivers/staging/android/,属于 Android 专有。
register_shrinker 用于初始化,LMK 驱动通过注册 shrinker 来实现的,shrinker 是 linux kernel 标准的回收内存 page 的机制,由内核线程 kswapd 负责监控。
当内存不足时 kswapd 线程会遍历一张 shrinker 链表,并回调已注册的 shrinker 函数来回收内存 page,kswapd 还会周期性唤醒来执行内存操作。每个 zone 维护 active_list 和 inactive_list 链表,内核根据页面活动状态将 page 在这两个链表之间移动,最终通过 shrink_slab 和 shrink_zone 来回收内存页,此处不进行过多关注。
在 lowmem_count 里 ANON 代表匿名映射,没有后备存储器;FILE 代表文件映射; 内存计算公式 = 活动匿名内存 + 活动文件内存 + 不活动匿名内存 + 不活动文件内存。
当 lmk 触发时,优先杀 oom_score_adj 最大的进程;若 oom_score_adj 相等,则优先杀其中 rss 内存占用最大的进程。lowmem_scan 通过 send_sig 发送 SIGKILL 杀死选中的目标。
drivers/staging/android/lowmemorykiller.c
/*
 * Shrinker descriptor for the low memory killer: lowmem_count is the
 * count_objects callback and lowmem_scan is the scan_objects callback.
 */
static struct shrinker lowmem_shrinker = {
	.count_objects = lowmem_count,
	.scan_objects = lowmem_scan,
	.seeks = DEFAULT_SEEKS * 16,
	.flags = SHRINKER_LMK,
};
/*
 * Module init for the low memory killer.
 *
 * Optionally creates the "lowmemkiller" kobject under mm_kobj, registers
 * lowmem_shrinker, the optional OOM / e_show_mem notifiers and the vmpressure
 * notifier, then creates the netlink socket nl_sk.
 *
 * Returns 0, or a negative error if the kobject cannot be allocated or added.
 * A failure to create the netlink socket is only logged.
 */
static int __init lowmem_init(void) {
#ifdef CONFIG_LOWMEM_NOTIFY_KOBJ
int rc;
lowmem_notify_kobj = kzalloc(sizeof(*lowmem_notify_kobj), GFP_KERNEL);
if (!lowmem_notify_kobj)
return -ENOMEM;
rc = kobject_init_and_add(lowmem_notify_kobj, &lowmem_notify_kobj_type,
mm_kobj, "lowmemkiller");
if (rc) {
kfree(lowmem_notify_kobj);
return rc;
}
#endif
/* NOTE(review): the return value of register_shrinker() is not checked here */
register_shrinker(&lowmem_shrinker);
#ifdef CONFIG_OOM_NOTIFIER
register_oom_notifier(&android_oom_notifier);
#endif
#ifdef CONFIG_E_SHOW_MEM
register_e_show_mem_notifier(&tasks_e_show_mem_notifier);
#endif
vmpressure_notifier_register(&lmk_vmpr_nb);
lowmem_print(1, "entering:%s\n", __func__);
nl_sk = netlink_kernel_create(&init_net, LMK_NETLINK_PROTO, &cfg);
/* a missing netlink socket is not treated as fatal: init still returns 0 */
if (!nl_sk)
lowmem_print(1, "error createing nl socket.\n");
return 0;
}
/*
 * Shrinker count callback.
 *
 * Returns the sum of the global active/inactive anonymous and file page
 * counters, or 0 while pm_freezing is set (CONFIG_FREEZER builds only).
 * Neither @s nor @sc is used.
 */
static unsigned long lowmem_count(struct shrinker *s,
				  struct shrink_control *sc)
{
	unsigned long lru_pages;

#ifdef CONFIG_FREEZER
	/* Do not allow LMK to work when system is freezing */
	if (pm_freezing)
		return 0;
#endif
	lru_pages = global_page_state(NR_ACTIVE_ANON);
	lru_pages += global_page_state(NR_ACTIVE_FILE);
	lru_pages += global_page_state(NR_INACTIVE_ANON);
	lru_pages += global_page_state(NR_INACTIVE_FILE);

	return lru_pages;
}
/*
 * Check whether any thread of @p has one of the bits in @state set in its
 * ->state field. Each thread is inspected under its own task_lock().
 *
 * Returns 1 on the first matching thread, 0 if none matches.
 */
static int test_task_state(struct task_struct *p, int state)
{
	struct task_struct *thread;
	int matched;

	for_each_thread(p, thread) {
		task_lock(thread);
		matched = (thread->state & state) != 0;
		task_unlock(thread);
		if (matched)
			return 1;
	}

	return 0;
}
static DEFINE_MUTEX(scan_mutex);
/*
 * Shrinker scan callback: choose at most one user process and SIGKILL it.
 *
 * Steps:
 *  1. Take scan_mutex with trylock; if another scan is running, return 0.
 *  2. Read the free / file page figures and the swap info, then walk the
 *     lowmem_minfree[] / lowmem_adj[] tables to find the minimum
 *     oom_score_adj a process must have to be eligible (min_score_adj).
 *  3. Under rcu_read_lock(), walk all processes and keep the one with the
 *     highest oom_score_adj; ties are broken by the larger get_mm_rss().
 *  4. Send SIGKILL to the selected task, mark it as an OOM victim, log the
 *     decision and report it to user space.
 *
 * Returns the get_mm_rss() value of the killed task, or 0 when nothing was
 * killed.
 */
static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
{
	struct task_struct *tsk;
	struct task_struct *selected = NULL;
	unsigned long rem = 0;
	int selected_process_uid = 0;
	int selected_process_pid = 0;
	int selected_process_adj = 0;
	int tasksize;
	int i;
	int ret = 0;
	int pressure = 0;
	short min_score_adj = OOM_SCORE_ADJ_MAX + 1;
	int minfree = 0;
	int selected_tasksize = 0;
	short selected_oom_score_adj;
	int array_size = ARRAY_SIZE(lowmem_adj);
	int other_free;
	int other_file;
	struct sysinfo si;
	int other_file_orig;
	/* work around for antutu */
	struct task_struct *selected_antutu = NULL;
	int selected_antutu_tasksize = 0;
	short selected_antutu_adj = -1000;
	bool has_antutu_3D = false;
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_MEMINFO
	static DEFINE_RATELIMIT_STATE(lmk_rs, DEFAULT_RATELIMIT_INTERVAL, 1);
#endif
#ifdef CONFIG_E_SHOW_MEM
	/* 600s */
	static DEFINE_RATELIMIT_STATE(lmk_mem_rs,
				      DEFAULT_RATELIMIT_INTERVAL * 12 * 10, 1);
	static DEFINE_RATELIMIT_STATE(lmk_meminfo_rs,
				      DEFAULT_RATELIMIT_INTERVAL * 12, 1);
#endif

	/* Only one scan at a time; a concurrent caller simply reports no progress. */
	if (!mutex_trylock(&scan_mutex))
		return 0;
#ifdef CONFIG_LOWMEM_NOTIFY_KOBJ
	lowmem_notif_sc.gfp_mask = sc->gfp_mask;
	if (get_free_ram(&other_free, &other_file_orig, &other_file, sc)) {
		if (mutex_is_locked(&kernfs_mutex))
			msleep(1);
		if (!mutex_is_locked(&kernfs_mutex))
			lowmem_notify_killzone_approach();
		else
			lowmem_print(1, "skip as kernfs_mutex is locked.");
	}
#else
	get_current_ram(&other_free, &other_file_orig, &other_file, sc);
#endif
	si_swapinfo(&si);

	/* Use only as many table entries as both tables actually provide. */
	if (lowmem_adj_size < array_size)
		array_size = lowmem_adj_size;
	if (lowmem_minfree_size < array_size)
		array_size = lowmem_minfree_size;
	for (i = 0; i < array_size; i++) {
		minfree = lowmem_minfree[i];
		if (other_free < minfree && other_file < minfree) {
			min_score_adj = lowmem_adj[i];
			/*
			 * When free swap is below 20% of total swap, step
			 * down to the previous table entry. Entry 0 has no
			 * previous entry, so it must keep its own values
			 * (indexing i - 1 there would read before the array).
			 */
			if (i > 0 && min_score_adj != 0 &&
			    si.freeswap < si.totalswap * 20 / 100) {
				minfree = lowmem_minfree[i - 1];
				min_score_adj = lowmem_adj[i - 1];
			}
			break;
		}
	}
	ret = adjust_minadj(&min_score_adj, &pressure);
	lowmem_print(3, "lowmem_scan %lu, %x, ofree %d %d, ma %hd\n",
		     sc->nr_to_scan, sc->gfp_mask, other_free,
		     other_file, min_score_adj);

	/* No threshold was crossed: nothing is eligible to be killed. */
	if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
		trace_almk_shrink(0, ret, other_free, other_file, 0);
		lowmem_print(5, "lowmem_scan %lu, %x, return 0\n",
			     sc->nr_to_scan, sc->gfp_mask);
		mutex_unlock(&scan_mutex);
		return 0;
	}

	selected_oom_score_adj = min_score_adj;
	rcu_read_lock();
	for_each_process(tsk) {
		struct task_struct *p;
		short oom_score_adj;

		if (tsk->flags & PF_KTHREAD)
			continue;
		/*
		 * Within the death-pending window, a task that already has
		 * TIF_MEMDIE means a previous kill is still in progress.
		 */
		if (time_before_eq(jiffies, lowmem_deathpending_timeout)) {
			if (test_task_flag(tsk, TIF_MEMDIE)) {
				rcu_read_unlock();
				mutex_unlock(&scan_mutex);
				return 0;
			}
		}
		/* workaround for cts case:CtsMediaTestCases
		 * vmpressure is disable in GMS version when run cts.
		 * so this is for gsi version.
		 */
		if (pressure > 0 && strstr(tsk->comm, "decTestProcess"))
			continue;
		p = find_lock_task_mm(tsk);
		if (!p)
			continue;
		if (p->signal->flags & SIGNAL_GROUP_EXIT) {
			lowmem_print(2, "'%s' (%d:%d) group exit, skip.\n",
				     p->comm, p->pid, p->tgid);
			task_unlock(p);
			continue;
		}
		/* workaround for antutu */
		if (strstr("com.antutu.benchmark.full", p->comm))
			has_antutu_3D = true;
		oom_score_adj = p->signal->oom_score_adj;
		if (oom_score_adj < min_score_adj) {
			task_unlock(p);
			continue;
		}
		tasksize = get_mm_rss(p->mm);
		task_unlock(p);
		if (tasksize <= 0)
			continue;
		/* Prefer the higher oom_score_adj; on a tie, the larger task. */
		if (selected) {
			if (oom_score_adj < selected_oom_score_adj)
				continue;
			if (oom_score_adj == selected_oom_score_adj &&
			    tasksize <= selected_tasksize)
				continue;
		}
		/* workaround for antutu */
		if (!selected_antutu &&
		    strstr("com.antutu.ABenchMark", p->comm)) {
			selected_antutu = p;
			selected_antutu_tasksize = tasksize;
			selected_antutu_adj = oom_score_adj;
			continue;
		}
		selected = p;
		selected_tasksize = tasksize;
		selected_oom_score_adj = oom_score_adj;
		lowmem_print(2, "select '%s' (%d), adj %hd, size %d, to kill\n",
			     p->comm, p->pid, oom_score_adj, tasksize);
	}
	/* workaround for antutu:
	 * if 3D task is not exist, check if the antutu task is more suited
	 * to be killed
	 */
	if (selected && selected_antutu && !has_antutu_3D) {
		if (selected_antutu_adj > selected_oom_score_adj ||
		    (selected_antutu_adj == selected_oom_score_adj &&
		     selected_antutu_tasksize > selected_tasksize)) {
			selected = selected_antutu;
			selected_tasksize = selected_antutu_tasksize;
			selected_oom_score_adj = selected_antutu_adj;
		}
	}
	if (selected) {
		long cache_size = other_file * (long)(PAGE_SIZE / 1024);
		long cache_size_orig = other_file_orig * (long)(PAGE_SIZE / 1024);
		long cache_limit = minfree * (long)(PAGE_SIZE / 1024);
		long free = other_free * (long)(PAGE_SIZE / 1024);

		if (test_task_flag(selected, TIF_MEMDIE) &&
		    (test_task_state(selected, TASK_UNINTERRUPTIBLE))) {
			lowmem_print(2, "'%s' (%d) is already killed\n",
				     selected->comm,
				     selected->pid);
			rcu_read_unlock();
			mutex_unlock(&scan_mutex);
			return 0;
		}
		task_lock(selected);
		/* add for lmfs */
		selected_process_uid = from_kuid(&init_user_ns,
						 selected->cred->uid);
		selected_process_pid = selected->pid;
		selected_process_adj = selected_oom_score_adj;
		send_sig(SIGKILL, selected, 0);
		/*
		 * FIXME: lowmemorykiller shouldn't abuse global OOM killer
		 * infrastructure. There is no real reason why the selected
		 * task should have access to the memory reserves.
		 */
		if (selected->mm)
			mark_oom_victim(selected);
		task_unlock(selected);
		trace_lowmemory_kill(selected, cache_size, cache_limit, free);
		lowmem_print(1, "Killing '%s' (%d:%d), adj %hd,\n"
			     " to free %ldkB on behalf of '%s' (%d) because\n"
			     " cache is %ldkB , limit is %ldkB for oom_score_adj %hd\n"
			     " Free memory is %ldkB above reserved\n"
			     " swaptotal is %ldkB, swapfree is %ldkB, pressure is %d\n"
			     " cache_orig is %ldkB\n",
			     selected->comm, selected->pid, selected->tgid,
			     selected_oom_score_adj,
			     selected_tasksize * (long)(PAGE_SIZE / 1024),
			     current->comm, current->pid,
			     cache_size, cache_limit,
			     min_score_adj, free,
			     si.totalswap * (long)(PAGE_SIZE / 1024),
			     si.freeswap * (long)(PAGE_SIZE / 1024),
			     pressure, cache_size_orig);
		lowmem_deathpending_timeout = jiffies + HZ;
		rem += selected_tasksize;
		trace_almk_shrink(selected_tasksize, ret,
				  other_free, other_file, selected_oom_score_adj);
	} else {
		trace_almk_shrink(1, ret, other_free, other_file, 0);
	}
	rcu_read_unlock();
	mutex_unlock(&scan_mutex);
	if (selected) {
		/* uid/pid/adj were copied while the task was locked above. */
		send_killing_app_info_to_user(selected_process_uid,
					      selected_process_pid,
					      selected_process_adj);
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_MEMINFO
		if (__ratelimit(&lmk_rs))
			dump_tasks_info();
#endif
#ifdef CONFIG_E_SHOW_MEM
		if ((0 == min_score_adj)
		    && (__ratelimit(&lmk_meminfo_rs))) {
			enhanced_show_mem(E_SHOW_MEM_ALL);
		} else if (__ratelimit(&lmk_mem_rs)) {
			if ((!si.freeswap)
			    || ((si.totalswap / (si.freeswap + 1)) >= 10))
				enhanced_show_mem(E_SHOW_MEM_CLASSIC);
			else
				enhanced_show_mem(E_SHOW_MEM_BASIC);
		} else if (process_need_show_memory(selected->comm)) {
			/* NOTE(review): selected is dereferenced after rcu_read_unlock() - confirm it cannot be freed here */
			enhanced_show_mem(E_SHOW_MEM_ALL);
		}
#endif
	}
	lowmem_print(4, "lowmem_scan %lu, %x, return %lu\n",
		     sc->nr_to_scan, sc->gfp_mask, rem);
	return rem;
}
4.2.2 Linux OOM Killer
Linux OOM Killer 是 Linux 内核的一部分,源码位于内核源码树的 mm/oom_kill.c。
在系统无法再进行内存分配时,内核会遍历所有进程,对每个进程计算 badness,得分最高的会被杀。
流程如下:__alloc_pages() -> out_of_memory() -> select_bad_process() -> oom_badness()
其中 __alloc_pages() 是内核分配内存页时调用的函数。
/*
 * oom_badness - compute the OOM "badness" score of a task.
 * @p:          task to evaluate
 * @memcg:      memory cgroup passed through to oom_unkillable_task()
 * @nodemask:   node mask passed through to oom_unkillable_task()
 * @totalpages: page count used to scale oom_score_adj into page units
 *
 * Returns 0 when the task must not be considered (unkillable, no lockable
 * task with an mm, oom_score_adj at OOM_SCORE_ADJ_MIN, MMF_OOM_SKIP set,
 * or in the middle of vfork). Every other task gets a score of at least 1.
 */
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
			  const nodemask_t *nodemask, unsigned long totalpages)
{
	long score;
	long oom_adj;

	if (oom_unkillable_task(p, memcg, nodemask))
		return 0;

	/* From here on p is the task handed back locked; unlock on every exit. */
	p = find_lock_task_mm(p);
	if (!p)
		return 0;

	/*
	 * Skip tasks that are explicitly marked OOM-unkillable, that carry
	 * MMF_OOM_SKIP in their mm flags, or that are in the middle of vfork.
	 */
	oom_adj = (long)p->signal->oom_score_adj;
	if (oom_adj == OOM_SCORE_ADJ_MIN ||
	    test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
	    in_vfork(p)) {
		task_unlock(p);
		return 0;
	}

	/*
	 * Baseline score: the memory the task uses, counted as
	 * rss + swap entries + page-table pages (ptes and pmds).
	 */
	score = get_mm_rss(p->mm);
	score += get_mm_counter(p->mm, MM_SWAPENTS);
	score += atomic_long_read(&p->mm->nr_ptes);
	score += mm_nr_pmds(p->mm);
	task_unlock(p);

	/* Tasks holding CAP_SYS_ADMIN get a 3% discount on their score. */
	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
		score -= (score * 3) / 100;

	/* Scale oom_score_adj by totalpages / 1000, then apply it. */
	oom_adj *= totalpages / 1000;
	score += oom_adj;

	/*
	 * An eligible task never scores 0, whatever the discount and
	 * oom_score_adj did to the total.
	 */
	if (score <= 0)
		return 1;
	return score;
}