This post was supposed to be another logd installment, but no sooner had I started writing than I ran into cgroup, and a bit of searching dragged lmk into it as well. There was nothing for it but to clear this stone off the road first and then get back to logd.
In early versions of Android, lmk was implemented as a driver inside the kernel, an approach the mainline kernel never accepted. Later, memory pressure events were added to the kernel, which made a userspace lmk possible: by listening for memory pressure events, a process can be notified when memory pressure reaches the low, medium, or critical level, and reclaim some lower-priority applications in response.
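To make the rest concrete, here is roughly what that kernel interface looks like from userspace. This is a minimal sketch, assuming a cgroup-v1 kernel with the memory controller mounted at /dev/memcg (as on Android) and enough privilege to write the memcg control files; subscribe_vmpressure is a made-up helper name, not lmkd code:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

/* Subscribe to memcg vmpressure notifications for one level ("low",
 * "medium" or "critical"). Returns an eventfd that becomes readable
 * whenever the kernel raises that pressure level. */
static int subscribe_vmpressure(const char *level) {
    int evfd = eventfd(0, EFD_CLOEXEC);
    int mpfd = open("/dev/memcg/memory.pressure_level", O_RDONLY | O_CLOEXEC);
    int ctlfd = open("/dev/memcg/cgroup.event_control", O_WRONLY | O_CLOEXEC);
    if (evfd < 0 || mpfd < 0 || ctlfd < 0)
        return -1;

    /* Registration format: "<eventfd> <pressure_level fd> <level>" */
    char buf[64];
    snprintf(buf, sizeof(buf), "%d %d %s", evfd, mpfd, level);
    if (write(ctlfd, buf, strlen(buf) + 1) < 0)
        return -1;

    /* The fds are deliberately kept open for the life of the subscription;
     * cleanup is omitted to keep the sketch short. */
    return evfd;
}

int main(void) {
    int fd = subscribe_vmpressure("medium");
    if (fd < 0) {
        perror("subscribe_vmpressure");
        return 1;
    }
    uint64_t count;
    /* A real daemon would hand this fd to epoll; a blocking read works too. */
    while (read(fd, &count, sizeof(count)) == sizeof(count))
        printf("medium memory pressure signalled (count=%llu)\n",
               (unsigned long long)count);
    return 0;
}

lmkd does essentially this for each of the three levels and then hands the eventfds to epoll, which is what the rest of this post walks through.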
With that background, let's walk through lmkd's implementation.
Initialization
Like most daemons, lmkd is started by the init process:
// system/core/lmkd/lmkd.rc
service lmkd /system/bin/lmkd
    class core
    group root readproc
    critical
    socket lmkd seqpacket 0660 system system
    writepid /dev/cpuset/system-background/tasks
The lmkd socket created here has both user and group set to system, and its mode is 0660, so only system processes can read and write it (typically the activity manager).
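As an aside on how a client reaches this socket: init exposes sockets declared in .rc files under /dev/socket/<name>, so a system-privileged process can connect to /dev/socket/lmkd as a SOCK_SEQPACKET Unix domain socket. A hypothetical sketch (connect_to_lmkd is made up; the real client lives in the framework, and the command format is a separate topic):

#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

/* Connect to lmkd's control socket. This fails with EACCES unless the
 * caller runs as system (or root), matching the 0660 system:system mode. */
static int connect_to_lmkd(void) {
    int fd = socket(AF_UNIX, SOCK_SEQPACKET | SOCK_CLOEXEC, 0);
    if (fd < 0)
        return -1;

    struct sockaddr_un addr;
    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_UNIX;
    strncpy(addr.sun_path, "/dev/socket/lmkd", sizeof(addr.sun_path) - 1);

    if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        close(fd);
        return -1;
    }
    return fd;   /* commands are then sent as packets over this fd */
}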
The writepid directive that follows relates to Linux cgroups, which I don't know well yet (cue the tears of the technically challenged); I'll fill in that background and cover it in a separate post.
After starting up, lmkd runs its main function, which does four things:
- read the configuration properties
- lock its memory pages and set the process scheduler
- initialize the epoll event listeners
- loop and handle events
To give you the overall picture first, the entire main function is shown below; after that we look at each part separately.
// system/core/lmkd/lmkd.c
int main(int argc __unused, char **argv __unused) {
    struct sched_param param = {
        .sched_priority = 1,
    };

    /* By default disable low level vmpressure events */
    level_oomadj[VMPRESS_LEVEL_LOW] =
        property_get_int32("ro.lmk.low", OOM_SCORE_ADJ_MAX + 1);
    level_oomadj[VMPRESS_LEVEL_MEDIUM] =
        property_get_int32("ro.lmk.medium", 800);
    level_oomadj[VMPRESS_LEVEL_CRITICAL] =
        property_get_int32("ro.lmk.critical", 0);
    debug_process_killing = property_get_bool("ro.lmk.debug", false);

    /* By default disable upgrade/downgrade logic */
    enable_pressure_upgrade =
        property_get_bool("ro.lmk.critical_upgrade", false);
    upgrade_pressure =
        (int64_t)property_get_int32("ro.lmk.upgrade_pressure", 100);
    downgrade_pressure =
        (int64_t)property_get_int32("ro.lmk.downgrade_pressure", 100);
    kill_heaviest_task =
        property_get_bool("ro.lmk.kill_heaviest_task", false);
    low_ram_device = property_get_bool("ro.config.low_ram", false);
    kill_timeout_ms =
        (unsigned long)property_get_int32("ro.lmk.kill_timeout_ms", 0);
    use_minfree_levels =
        property_get_bool("ro.lmk.use_minfree_levels", false);

#ifdef LMKD_LOG_STATS
    statslog_init(&log_ctx, &enable_stats_log);
#endif

    // MCL_ONFAULT pins pages as they fault instead of loading
    // everything immediately all at once. (Which would be bad,
    // because as of this writing, we have a lot of mapped pages we
    // never use.) Old kernels will see MCL_ONFAULT and fail with
    // EINVAL; we ignore this failure.
    //
    // N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
    // pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
    // in pages.
    if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && errno != EINVAL)
        ALOGW("mlockall failed: errno=%d", errno);

    sched_setscheduler(0, SCHED_FIFO, &param);

    if (!init())
        mainloop();

#ifdef LMKD_LOG_STATS
    statslog_destroy(&log_ctx);
#endif

    ALOGI("exiting");
    return 0;
}
Reading the configuration properties
// system/core/lmkd/lmkd.c
/* memory pressure levels */
enum vmpressure_level {
    VMPRESS_LEVEL_LOW = 0,
    VMPRESS_LEVEL_MEDIUM,
    VMPRESS_LEVEL_CRITICAL,
    VMPRESS_LEVEL_COUNT
};
static int level_oomadj[VMPRESS_LEVEL_COUNT];
static int mpevfd[VMPRESS_LEVEL_COUNT] = { -1, -1, -1 };
static bool debug_process_killing;
static bool enable_pressure_upgrade;
static int64_t upgrade_pressure;
static int64_t downgrade_pressure;
static bool low_ram_device;
static bool kill_heaviest_task;
static unsigned long kill_timeout_ms;
static bool use_minfree_levels;
int main(int argc __unused, char **argv __unused) {
    /* By default disable low level vmpressure events */
    level_oomadj[VMPRESS_LEVEL_LOW] =
        property_get_int32("ro.lmk.low", OOM_SCORE_ADJ_MAX + 1);
    level_oomadj[VMPRESS_LEVEL_MEDIUM] =
        property_get_int32("ro.lmk.medium", 800);
    level_oomadj[VMPRESS_LEVEL_CRITICAL] =
        property_get_int32("ro.lmk.critical", 0);
    debug_process_killing = property_get_bool("ro.lmk.debug", false);

    /* By default disable upgrade/downgrade logic */
    enable_pressure_upgrade =
        property_get_bool("ro.lmk.critical_upgrade", false);
    upgrade_pressure =
        (int64_t)property_get_int32("ro.lmk.upgrade_pressure", 100);
    downgrade_pressure =
        (int64_t)property_get_int32("ro.lmk.downgrade_pressure", 100);
    kill_heaviest_task =
        property_get_bool("ro.lmk.kill_heaviest_task", false);
    low_ram_device = property_get_bool("ro.config.low_ram", false);
    kill_timeout_ms =
        (unsigned long)property_get_int32("ro.lmk.kill_timeout_ms", 0);
    use_minfree_levels =
        property_get_bool("ro.lmk.use_minfree_levels", false);

    ...
}
This code is straightforward: it reads the configuration directly from system properties and stores it in static variables. For what each property means, see system/core/lmkd/README.md.
enum vmpressure_level represents the memory pressure levels mentioned earlier: low, medium, and critical.
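Per the README, each of these per-level values is an oom_score_adj threshold: at a given pressure level, only processes whose oom_score_adj is at or above the threshold are candidates for killing, and a value above OOM_SCORE_ADJ_MAX (1000) effectively disables that level, which is why low defaults to OOM_SCORE_ADJ_MAX + 1. A tiny illustrative helper, assuming the enum and array above are in scope (this exact function does not exist in lmkd):

#include <stdbool.h>

/* Illustration only: with the defaults, "low" can never match (threshold
 * 1001 exceeds any valid adj), "medium" matches cached/background apps
 * (adj >= 800), and "critical" matches everything (adj >= 0). */
static bool is_kill_candidate(int oom_score_adj, enum vmpressure_level level) {
    return oom_score_adj >= level_oomadj[level];
}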
Locking memory pages and setting the process scheduler
int main(int argc __unused, char **argv __unused) {
    // read the configuration properties (shown in the previous section)

    // MCL_ONFAULT pins pages as they fault instead of loading
    // everything immediately all at once. (Which would be bad,
    // because as of this writing, we have a lot of mapped pages we
    // never use.) Old kernels will see MCL_ONFAULT and fail with
    // EINVAL; we ignore this failure.
    //
    // N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
    // pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
    // in pages.
    if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && errno != EINVAL)
        ALOGW("mlockall failed: errno=%d", errno);

    struct sched_param param = {
        .sched_priority = 1,
    };
    sched_setscheduler(0, SCHED_FIFO, &param);

    // initialize the epoll event listeners
    // loop and handle events
}
MCL_CURRENT locks the pages that are already resident in memory; MCL_FUTURE locks memory regions allocated in the future; MCL_ONFAULT means pages that are not yet resident are not all pulled in up front, but are locked as they are faulted in. For more on mlockall, see its man page.
The sched_setscheduler call that follows makes lmkd a real-time process under the FIFO scheduling policy. Real-time processes take priority over all normal processes, and a SCHED_FIFO task keeps running as long as it is runnable; the kernel will not preempt it except for a higher-priority real-time task.
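The same two calls, as a standalone runnable demo rather than lmkd code (it needs CAP_IPC_LOCK and CAP_SYS_NICE, i.e. effectively root, to succeed):

#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void) {
    /* Lock current and future pages; pages touched later are locked as they
     * fault in. Kernels that predate MCL_ONFAULT (pre-4.4) reject it with
     * EINVAL, which lmkd (and this demo) simply tolerates. */
    if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && errno != EINVAL)
        perror("mlockall");

    /* Become a real-time FIFO task at priority 1; normal tasks are
     * priority 0, so this outranks every non-real-time process. */
    struct sched_param param = { .sched_priority = 1 };
    if (sched_setscheduler(0 /* calling process */, SCHED_FIFO, &param) == -1)
        perror("sched_setscheduler");

    return 0;
}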
Initializing the epoll event listeners
int main(int argc __unused, char **argv __unused) {
    // read the configuration properties
    // lock memory pages and set the process scheduler

    if (!init())
        mainloop();

    return 0;
}
The epoll setup is done by the init function; mainloop then handles events in the main loop, and we cover it in the next section.
static int init(void) {
    struct epoll_event epev;
    int i;
    int ret;

    page_k = sysconf(_SC_PAGESIZE);
    if (page_k == -1)
        page_k = PAGE_SIZE;
    page_k /= 1024;

    epollfd = epoll_create(MAX_EPOLL_EVENTS);
    if (epollfd == -1) {
        ALOGE("epoll_create failed (errno=%d)", errno);
        return -1;
    }

    // mark data connections as not connected
    for (int i = 0; i < MAX_DATA_CONN; i++) {
        data_sock[i].sock = -1;
    }

    ctrl_sock.sock = android_get_control_socket("lmkd");
    if (ctrl_sock.sock < 0) {
        ALOGE("get lmkd control socket failed");
        return -1;
    }

    ret = listen(ctrl_sock.sock, MAX_DATA_CONN);
    if (ret < 0) {
        ALOGE("lmkd control socket listen failed (errno=%d)", errno);
        return -1;
    }

    epev.events = EPOLLIN;
    ctrl_sock.handler_info.handler = ctrl_connect_handler;
    epev.data.ptr = (void *)&(ctrl_sock.handler_info);
    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev) == -1) {
        ALOGE("epoll_ctl for lmkd control socket failed (errno=%d)", errno);
        return -1;
    }
    maxevents++;

    has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
    use_inkernel_interface = has_inkernel_module;

    if (use_inkernel_interface) {
        ALOGI("Using in-kernel low memory killer interface");
    } else {
        if (!init_mp_common(VMPRESS_LEVEL_LOW) ||
            !init_mp_common(VMPRESS_LEVEL_MEDIUM) ||
            !init_mp_common(VMPRESS_LEVEL_CRITICAL)) {
            ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
            return -1;
        }
    }

    for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) {
        procadjslot_list[i].next = &procadjslot_list[i];
        procadjslot_list[i].prev = &procadjslot_list[i];
    }

    return 0;
}
This first initializes the static variable page_k, the size of a memory page in KB, normally obtained from sysconf; if that fails, the fallback PAGE_SIZE of 4096 bytes is used, giving 4096 / 1024 = 4 KB.
static long page_k;
/* data required to handle events */
struct event_handler_info {
    int data;
    void (*handler)(int data, uint32_t events);
};

/* data required to handle socket events */
struct sock_event_handler_info {
    int sock;
    struct event_handler_info handler_info;
};
/* max supported number of data connections */
#define MAX_DATA_CONN 2
/* socket event handler data */
static struct sock_event_handler_info ctrl_sock;
static struct sock_event_handler_info data_sock[MAX_DATA_CONN];
ctrl_sock is the lmkd socket that the init process created for us; lmkd listens on it and waits to accept client connections. lmkd supports at most two clients (MAX_DATA_CONN), and their socket fds are kept in data_sock.
struct event_handler_info defines a callback interface: when epoll returns, the main loop calls the handler registered for that event. At call time, the first argument data is the data field that was filled in when the struct event_handler_info object was created, and the second argument events is the event mask reported by epoll.
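In other words, the dispatch in the main loop boils down to something like this simplified sketch (the real loop is mainloop, which we get to in the next section):

struct epoll_event events[MAX_EPOLL_EVENTS];
int nevents = epoll_wait(epollfd, events, MAX_EPOLL_EVENTS, -1);

for (int i = 0; i < nevents; i++) {
    /* data.ptr is whatever we registered with epoll_ctl: a pointer to an
     * event_handler_info, whose handler gets the stored data plus the
     * event mask epoll reported. */
    struct event_handler_info *hi = events[i].data.ptr;
    if (hi && hi->handler)
        hi->handler(hi->data, events[i].events);
}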
As the initialization code above shows, when a client connects to the lmkd socket, the callback invoked is ctrl_connect_handler. ctrl_sock.handler_info.data is left uninitialized here because ctrl_connect_handler does not use that extra argument.
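Conceptually, ctrl_connect_handler has to find a free data_sock slot, accept the connection, and register the new fd with epoll. The following is only a sketch of that idea, assuming the declarations above are in scope (accept_client is a made-up name, not the real function):

#include <sys/epoll.h>
#include <sys/socket.h>

/* Sketch: park a new client in a free data_sock slot and watch it with
 * epoll. The real code would also point handler_info.handler at a data
 * handler that parses incoming commands (omitted here). */
static int accept_client(int epollfd) {
    int slot = -1;
    for (int i = 0; i < MAX_DATA_CONN; i++) {
        if (data_sock[i].sock < 0) {   /* -1 marks an unused slot */
            slot = i;
            break;
        }
    }
    if (slot < 0)
        return -1;                     /* both client slots are taken */

    int fd = accept(ctrl_sock.sock, NULL, NULL);
    if (fd < 0)
        return -1;

    data_sock[slot].sock = fd;
    struct epoll_event epev = {
        .events = EPOLLIN,
        .data.ptr = &data_sock[slot].handler_info,
    };
    return epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &epev);
}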
INKERN