Linux kernel suspend resume学习：2.6.35与3.0.35比较

本文链接：https://blog.csdn.net/njuitjf/article/details/18317149

Linux kernel suspend resume学习：2.6.35与3.0.35比较

最近在学习linux kernel中suspend和resume的处理。
只是一味的看代码，有点枯燥，刚好有两个项目使用了不同的内核版本，就以比较这两个间的处理差别为线索进行学习。
由于列举了很多代码，为了保持连续性，整篇文章还是放到了一个blog中。

首先看看公开出去的接口，都是文件/sys/power/state。
读该文件可以获取可能取值。
写该文件可以实现状态改变。

/sys/power/state定义的地方：
power_attr(state);

两个内核版本中，power_attr的定义相同。
power_attr的定义：
/*
* power_attr(_name) - define a static struct kobj_attribute named <_name>_attr.
* The attribute's file name is the stringified _name, its mode is 0644, and
* its show/store callbacks are bound to <_name>_show and <_name>_store, which
* must therefore exist wherever the macro is expanded.
*/
#define power_attr(_name) \
static struct kobj_attribute _name##_attr = { \
.attr = {    \
  .name = __stringify(_name), \
  .mode = 0644,   \
},     \
.show = _name##_show,   \
.store = _name##_store,  \
}

power_attr就是往sysfs中添加一个文件，操作函数：
.show = _name##_show,
.store = _name##_store,

对应到state：
.show = state_show,
.store = state_store,

内核中这两个函数的注释：
/**
* state - control system power state.
*
* show() returns what states are supported, which is hard-coded to
* 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
* 'disk' (Suspend-to-Disk).
*
* store() accepts one of those strings, translates it into the
* proper enumerated value, and initiates a suspend transition.
*/

state_show其实就是将数组pm_states的内容show出来。
数组pm_states的定义，在两个项目中是不同的。

kernel 2.6.35项目中：
const char *const pm_states[PM_SUSPEND_MAX] = {
#ifdef CONFIG_EARLYSUSPEND
[PM_SUSPEND_ON] = "on",
#endif
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
};

kernel 3.0.35项目中：
const char *const pm_states[PM_SUSPEND_MAX] = {
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
};

关于这些 state 的介绍，请参考 kernel 中的文档：
Documentation/power/states.txt

函数state_store定义，两个项目中的差别也就是kernel 2.6.35项目中多了对CONFIG_EARLYSUSPEND的处理。
将kernel 2.6.35项目中state_store的定义列出来：
/*
* state_store - store handler of the "state" attribute.
* @buf: text written by the caller; only the part before the first '\n'
*       (or all @n bytes when there is no '\n') is compared.
* @n: number of bytes in @buf.
*
* "disk" calls hibernate().  With CONFIG_SUSPEND, any other string is looked
* up in pm_states[]; a match is passed to request_suspend_state() when
* CONFIG_EARLYSUSPEND is set, or to enter_state() otherwise.
*
* Returns @n when the resulting error code is 0, otherwise the error code;
* -EINVAL is the default when nothing matched.
*/
static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
/*
A look at PM_SUSPEND_ON and the related definitions:
typedef int __bitwise suspend_state_t;

#define PM_SUSPEND_ON  ((__force suspend_state_t) 0)
#define PM_SUSPEND_STANDBY ((__force suspend_state_t) 1)
#define PM_SUSPEND_MEM  ((__force suspend_state_t) 3)
#define PM_SUSPEND_MAX  ((__force suspend_state_t) 4)
Clearly, when "on" exists the scan below has to start from "on".
*/
#ifdef CONFIG_EARLYSUSPEND
suspend_state_t state = PM_SUSPEND_ON;
#else
suspend_state_t state = PM_SUSPEND_STANDBY;
#endif
const char * const *s;
#endif
char *p;
int len;
int error = -EINVAL;

/* Compare only up to the first newline, if there is one. */
p = memchr(buf, '\n', n);
len = p ? p - buf : n;

/* First, check if we are requested to hibernate */
if (len == 4 && !strncmp(buf, "disk", len)) {
error = hibernate();
goto Exit;
}

#ifdef CONFIG_SUSPEND
/* Walk pm_states[] from the starting state; NULL slots are skipped. */
for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
  if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
   break;
}
/* state == PM_SUSPEND_MAX here means no name matched; error stays -EINVAL. */
if (state < PM_SUSPEND_MAX && *s)
#ifdef CONFIG_EARLYSUSPEND
  /* "on" is always accepted; other states must pass valid_state(). */
  if (state == PM_SUSPEND_ON || valid_state(state)) {
   error = 0;
   request_suspend_state(state);
  }
#else
  error = enter_state(state);
#endif
#endif

Exit:
return error ? error : n;
}

函数valid_state是判断是否对指定的state进行了支持：
/*
* valid_state - report whether @state is accepted by the registered suspend_ops.
*
* Returns false when suspend_ops or its ->valid callback is NULL; otherwise
* the result of suspend_ops->valid(state).
*/
bool valid_state(suspend_state_t state)
{
/*
* All states need lowlevel support and need to be valid to the lowlevel
* implementation, no valid callback implies that none are valid.
*/
return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
}

suspend_ops在函数suspend_set_ops中被赋值：
/**
* suspend_set_ops - Set the global suspend method table.
* @ops: Pointer to ops structure.
*
* The global suspend_ops pointer is replaced while holding pm_mutex.
* No check is made on @ops here; valid_state() treats a NULL suspend_ops
* as "no state is valid".
*/
void suspend_set_ops(struct platform_suspend_ops *ops)
{
mutex_lock(&pm_mutex);
suspend_ops = ops;
mutex_unlock(&pm_mutex);
}
一般在 architecture 相关的 pm 模块的 init 或者 probe 函数中调用 suspend_set_ops 。
valid 一般是判断该 architecture 中是否支持该 state 。

接下来看看差异中的 request_suspend_state 函数：
void request_suspend_state(suspend_state_t new_state)
{
unsigned long irqflags;
int old_sleep;

spin_lock_irqsave(&state_lock, irqflags);
old_sleep = state & SUSPEND_REQUESTED;
...

if (new_state == PM_SUSPEND_STANDBY) {
  mode = EARLY_SUSPEND_MODE_EINK;
  new_state = PM_SUSPEND_MEM;
} else
  mode = EARLY_SUSPEND_MODE_NORMAL;

if (!old_sleep && new_state != PM_SUSPEND_ON) {
  if ((state & SUSPENDED) && (last_mode != mode)) {
   /* flush the workqueue */
   spin_unlock_irqrestore(&state_lock, irqflags);
   flush_workqueue(suspend_work_queue);
   spin_lock_irqsave(&state_lock, irqflags);
  }
  state |= SUSPEND_REQUESTED;
  queue_work(suspend_work_queue, &early_suspend_work); // 将 early_suspend_work 添加到 suspend_work_queue 中
} else if (old_sleep && new_state == PM_SUSPEND_ON) {    // 这一次走的是这个分支
  state &= ~SUSPEND_REQUESTED;
  wake_lock(&main_wake_lock);
  queue_work(suspend_work_queue, &late_resume_work);    // 将 late_resume_work 添加到 suspend_work_queue
}

if (new_state != PM_SUSPEND_ON)
last_mode = mode;

requested_suspend_state = new_state;
spin_unlock_irqrestore(&state_lock, irqflags);
}

suspend_work_queue 在函数 wakelocks_init 中被创建：
suspend_work_queue = create_singlethread_workqueue("suspend");
函数 wakelocks_init 为 core_initcall ：
core_initcall(wakelocks_init);

early_suspend_work 和 late_resume_work 的定义：
static DECLARE_WORK(early_suspend_work, early_suspend);
static DECLARE_WORK(late_resume_work, late_resume);

#define DECLARE_WORK(n, f)     \
struct work_struct n = __WORK_INITIALIZER(n, f)

#define __WORK_INITIALIZER(n, f) {    \
.data = WORK_DATA_STATIC_INIT(),   \
.entry = { &(n).entry, &(n).entry },   \
.func = (f),      \
__WORK_INIT_LOCKDEP_MAP(#n, &(n))   \
}

early_suspend 和 late_resume 是处理函数。
两个函数中都对 early_suspend_handlers 进行了处理。
early_suspend 函数中依次调用了 early_suspend_handlers 中的 suspend 函数：
list_for_each_entry(pos, &early_suspend_handlers, link) {
  if (pos->suspend != NULL) {
   pos->pm_mode = pwr_mode;
   pos->suspend(pos);
  }
}
late_resume 函数中逆序依次调用了 early_suspend_handlers 中的 resume 函数：
list_for_each_entry_reverse(pos, &early_suspend_handlers, link)
  if (pos->resume != NULL)
   pos->resume(pos);

early_suspend_handlers 的定义：
static LIST_HEAD(early_suspend_handlers);

函数 register_early_suspend 将 handler 注册到 early_suspend_handlers ：
/*
* register_early_suspend - add @handler to early_suspend_handlers.
* @handler: early-suspend descriptor; its ->level decides the position.
*
* The handler is inserted in front of the first entry whose level is
* greater than handler->level, or at the tail when there is no such entry.
* The whole operation runs under early_suspend_lock.
*/
void register_early_suspend(struct early_suspend *handler)
{
struct list_head *pos;

mutex_lock(&early_suspend_lock);
/* Find the first entry with a higher level than the new handler. */
list_for_each(pos, &early_suspend_handlers) {
  struct early_suspend *e;
  e = list_entry(pos, struct early_suspend, link);
  if (e->level > handler->level)
   break;
}
/*
* Insert before pos.  If the loop ran to completion, pos is the list head
* itself, so the handler ends up at the tail.
*/
list_add_tail(&handler->link, pos);
/* If the SUSPENDED bit is already set, run the new handler's suspend now. */
if ((state & SUSPENDED) && handler->suspend)
  handler->suspend(handler);
mutex_unlock(&early_suspend_lock);
}
需要进行early suspend处理的模块调用函数 register_early_suspend 注册 handler .
static struct early_suspend mxc_epdc_earlysuspend = {
.level = EARLY_SUSPEND_LEVEL_DISABLE_FB,
.suspend = mxc_epdc_early_suspend,
.resume = mxc_epdc_late_resume,
};

register_early_suspend(&mxc_epdc_earlysuspend);

看看 queue_work 的实现：
/**
* queue_work - queue work on a workqueue
* @wq: workqueue to use
* @work: work to queue
*
* Returns 0 if @work was already on a queue, non-zero otherwise.
*
* We queue the work to the CPU on which it was submitted, but if the CPU dies
* it can be processed by another CPU.
*/
int queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
int ret;

/*
* Delegate to queue_work_on() with the CPU returned by get_cpu(); the
* matching put_cpu() follows right after the call.
* NOTE(review): presumably this pair keeps the task on that CPU for the
* duration of the call - confirm against get_cpu() documentation.
*/
ret = queue_work_on(get_cpu(), wq, work);
put_cpu();

return ret;
}
/**
* queue_work_on - queue work on specific cpu
* @cpu: CPU number to execute work on
* @wq: workqueue to use
* @work: work to queue
*
* Returns 0 if @work was already on a queue, non-zero otherwise.
*
* We queue the work to a specific CPU, the caller must ensure it
* can't go away.
*/
int
queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
{
int ret = 0;

/*
* Only the caller that flips WORK_STRUCT_PENDING from clear to set queues
* the work; if the bit was already set, nothing is done and 0 is returned.
*/
if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
  BUG_ON(!list_empty(&work->entry));
  // __queue_work eventually adds the work to the worklist of the given cpu_workqueue_struct
  __queue_work(wq_per_cpu(wq, cpu), work);
  ret = 1;
}
return ret;
}

flush_workqueue 的实现：
/**
* flush_workqueue - ensure that any scheduled work has run to completion.
* @wq: workqueue to flush
*
* Forces execution of the workqueue and blocks until its completion.
* This is typically used in driver shutdown handlers.
*
* We sleep until all works which were queued on entry have been handled,
* but we are not livelocked by new incoming ones.
*
* This function used to run the workqueues itself. Now we just wait for the
* helper threads to do it.
*/
void flush_workqueue(struct workqueue_struct *wq)
{
const struct cpumask *cpu_map = wq_cpu_map(wq);
int cpu;

/* This function may block, so it must not be called from atomic context. */
might_sleep();
/*
* Acquire and immediately release the workqueue's lockdep map.
* NOTE(review): presumably a lockdep-only annotation with no runtime
* locking effect - confirm.
*/
lock_map_acquire(&wq->lockdep_map);
lock_map_release(&wq->lockdep_map);
/* Flush the per-CPU queue of every CPU in the workqueue's CPU map. */
for_each_cpu(cpu, cpu_map)
flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
}

回到 state_store 函数。
我们发现函数中最初有个判断，如果设置的状态为 "disk" ，则调用函数 hibernate 。
根据 states.txt 中的说明，disk 即进入深度睡眠，更省电。
因为其将 snapshot 写入到了 disk 。之后可以 power down 。

来看看 hibernate 的实现。
两个内核版本中稍有差别，kernel 3.0.35中多了一些处理。下面把 kernel 3.0.35的实现列了出来：
/**
* hibernate - Carry out system hibernation, including saving the image.
*
* Runs with pm_mutex held from start to finish.  Returns the value left in
* the local error variable: -EBUSY when snapshot_device_available is
* already 0, otherwise the result of the last step that assigned it.
* The labels at the end undo the setup steps in reverse order.
*/
int hibernate(void)
{
int error;

mutex_lock(&pm_mutex);
/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
error = -EBUSY;
goto Unlock;
}

pm_prepare_console();
error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
if (error)
goto Exit;

error = usermodehelper_disable();
if (error)
goto Exit;

/* Allocate memory management structures */
error = create_basic_memory_bitmaps();
if (error)
goto Exit;

printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");

error = prepare_processes();
if (error)
goto Finish;

/* Test hooks: when either returns non-zero, skip the snapshot and thaw. */
if (hibernation_test(TEST_FREEZER))
goto Thaw;

if (hibernation_testmode(HIBERNATION_TESTPROC))
goto Thaw;

error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
if (error)
goto Thaw;

/*
* Non-zero in_suspend selects the "write the image" path; the else branch
* only reports that an image was restored.
*/
if (in_suspend) {
unsigned int flags = 0;

  if (hibernation_mode == HIBERNATION_PLATFORM)
   flags |= SF_PLATFORM_MODE;
  if (nocompress)                      // not present in kernel 2.6.35
   flags |= SF_NOCOMPRESS_MODE;     // not present in kernel 2.6.35
  pr_debug("PM: writing image.\n");
  error = swsusp_write(flags);
  swsusp_free();
  /* Power down only when the image was written without error. */
  if (!error)
   power_down();
  in_suspend = 0;                      // not present in kernel 2.6.35
  pm_restore_gfp_mask();               // not present in kernel 2.6.35
} else {
  pr_debug("PM: Image restored successfully.\n");
}

Thaw:
thaw_processes();
Finish:
free_basic_memory_bitmaps();
usermodehelper_enable();
Exit:
pm_notifier_call_chain(PM_POST_HIBERNATION);
pm_restore_console();
/* Give back the count taken by atomic_add_unless() at the top. */
atomic_inc(&snapshot_device_available);
Unlock:
mutex_unlock(&pm_mutex);
return error;
}

先看第一处差别， NOCOMPRESS 相关。
kernel 2.6.35中没有定义 SF_NOCOMPRESS_MODE 。
搜索代码发现，kernel 3.0.35中有3个地方使用了 SF_NOCOMPRESS_MODE 。
分别是判断 swap 是否有足够空间、写入 snapshot 、读取 snapshot 时。

第一处：
在函数 enough_swap 中。功能在注释中已经体现。该函数的实现：
/**
* enough_swap - Make sure we have enough swap to save the image.
* @nr_pages: number of image pages to be saved.
* @flags: image flags; only SF_NOCOMPRESS_MODE is examined here.
*
* Returns TRUE or FALSE after checking the total amount of swap
* space available from the resume partition.
*/

static int enough_swap(unsigned int nr_pages, unsigned int flags)
{
unsigned int free_swap = count_swap_pages(root_swap, 1);
unsigned int required;

pr_debug("PM: Free swap pages: %u\n", free_swap);
// In non-compressed mode the requirement equals the requested page count;
// in compressed mode it is scaled by the worst-case compression ratio.
// PAGES_FOR_IO is added in both cases.
required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
/* Strictly greater: exactly "required" free pages is not enough. */
return free_swap > required;
}
看了下 kernel 2.6.35 中的实现，关键代码如下：
return free_swap > nr_pages + PAGES_FOR_IO;
比较两个版本的 kernel 可知，2.6.35中只支持非压缩模式；3.0.35中增加了压缩模式的支持。

第二处：
在函数 swsusp_write 中。关键代码：
/**
* swsusp_write - Write entire image and metadata.
* @flags: flags to pass to the "boot" kernel in the image header
*
* It is important _NOT_ to umount filesystems at this point. We want
* them synced (in case something goes wrong) but we DO not want to mark
* filesystem clean: it is not. (And it does not matter, if we resume
* correctly, we'll mark system clean, anyway.)
*/

int swsusp_write(unsigned int flags)
{
...
pages = snapshot_get_image_size();
error = get_swap_writer(&handle);
...
// 判断 swap 中是否有足够空间
if (!enough_swap(pages, flags)) {
...
}
memset(&snapshot, 0, sizeof(struct snapshot_handle));
error = snapshot_read_next(&snapshot);
...
header = (struct swsusp_info *)data_of(snapshot);
// 写入元数据
error = swap_write_page(&handle, header, NULL);
if (!error) {
  // 写入 snapshot 数据。根据是否设置了 SF_NOCOMPRESS_MODE 分别调用 save_image 和 save_image_lzo
  error = (flags & SF_NOCOMPRESS_MODE) ?
   save_image(&handle, &snapshot, pages - 1) :
   save_image_lzo(&handle, &snapshot, pages - 1);
}
out_finish:
error = swap_writer_finish(&handle, flags, error);
return error;
}
函数 save_image 的注释：
/**
* save_image - save the suspend image data
*/
函数 save_image_lzo 的注释：
/**
* save_image_lzo - Save the suspend image data compressed with LZO.
* @handle: Swap map handle to use for saving the image.
* @snapshot: Image to read data from.
* @nr_to_write: Number of pages to save.
*/
lzo 压缩算法就不介绍了。具体写入的实现这儿也不深入了。
2.6.35 中函数 swsusp_write 的实现类似，只是在调用 enough_swap 时不会传入 flags 参数；
另外，没有函数 save_image_lzo ，只会调用 save_image 。
函数 swsusp_write 中调用的另外一个重要函数 snapshot_read_next 。其注释：
/**
* snapshot_read_next - used for reading the system memory snapshot.
*
* On the first call to it @handle should point to a zeroed
* snapshot_handle structure. The structure gets updated and a pointer
* to it should be passed to this function every next time.
*
* On success the function returns a positive number. Then, the caller
* is allowed to read up to the returned number of bytes from the memory
* location computed by the data_of() macro.
*
* The function returns 0 to indicate the end of data stream condition,
* and a negative number is returned on error. In such cases the
* structure pointed to by @handle is not updated and should not be used
* any more.
*/
swsusp_write 中第一次调用了函数 snapshot_read_next ，函数 save_image/save_image_lzo 中循环调用 snapshot_read_next 函数，直到读取完 snapshot 。

第三处：
在函数 swsusp_read 中，关键代码：
/**
* swsusp_read - read the hibernation image.
* @flags_p: flags passed by the "frozen" kernel in the image header should
* be written into this memory location
*/

int swsusp_read(unsigned int *flags_p)
{
...
memset(&snapshot, 0, sizeof(struct snapshot_handle));
error = snapshot_write_next(&snapshot);
...
header = (struct swsusp_info *)data_of(snapshot);
error = get_swap_reader(&handle, flags_p);
...
if (!error)
  error = swap_read_page(&handle, header, NULL);
if (!error) {
  error = (*flags_p & SF_NOCOMPRESS_MODE) ?
   load_image(&handle, &snapshot, header->pages - 1) :
   load_image_lzo(&handle, &snapshot, header->pages - 1);
}
swap_reader_finish(&handle);
end:
if (!error)
  pr_debug("PM: Image successfully loaded\n");
else
  pr_debug("PM: Error %d resuming\n", error);
return error;
}
load_image 的注释：
/**
* load_image - load the image using the swap map handle
* @handle and the snapshot handle @snapshot
* (assume there are @nr_pages pages to load)
*/

load_image_lzo 的注释：
/**
* load_image_lzo - Load compressed image data and decompress them with LZO.
* @handle: Swap map handle to use for loading data.
* @snapshot: Image to copy uncompressed data into.
* @nr_to_read: Number of pages to load.
*/

2.6.35 中函数 swsusp_read 的实现类似，只是少了对 SF_NOCOMPRESS_MODE 的处理，没有实现函数 load_image_lzo 。
swsusp_read 函数中调用了另外一个重要函数 snapshot_write_next ，其注释：
/**
* snapshot_write_next - used for writing the system memory snapshot.
*
* On the first call to it @handle should point to a zeroed
* snapshot_handle structure. The structure gets updated and a pointer
* to it should be passed to this function every next time.
*
* On success the function returns a positive number. Then, the caller
* is allowed to write up to the returned number of bytes to the memory
* location computed by the data_of() macro.
*
* The function returns 0 to indicate the "end of file" condition,
* and a negative number is returned on error. In such cases the
* structure pointed to by @handle is not updated and should not be used
* any more.
*/
swsusp_read 函数中第一次调用了 snapshot_write_next ， load_image/load_image_lzo 中循环调用 snapshot_write_next ，直到处理完所有的 snapshot 。

第二处差别是在3.0.35中多了：
in_suspend = 0;
搜索代码，发现函数 create_image 中将 in_suspend 设置为了1.
函数 hibernate 调用了函数 hibernation_snapshot 。
函数 hibernation_snapshot 的注释：
/**
* hibernation_snapshot - Quiesce devices and create a hibernation image.
* @platform_mode: If set, use platform driver to prepare for the transition.
*
* This routine must be called with pm_mutex held.
*/
函数 hibernation_snapshot 调用了函数 create_image 。
函数 create_image 的注释：
/**
* create_image - Create a hibernation image.
* @platform_mode: Whether or not to use the platform driver.
*
* Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
* and execute the drivers' .thaw_noirq() callbacks.
*
* Control reappears in this routine after the subsequent restore.
*/
3.0.35中多了这么一句，难道只是为了防止 hibernate 函数中重复进入 if (in_suspend) 分支？

第三处差别是在3.0.35中多了：
pm_restore_gfp_mask();
pm_restore_gfp_mask 的实现及相关定义：
#ifdef CONFIG_PM_SLEEP
/*
* The following functions are used by the suspend/hibernate code to temporarily
* change gfp_allowed_mask in order to avoid using I/O during memory allocations
* while devices are suspended. To avoid races with the suspend/hibernate code,
* they should always be called with pm_mutex held (gfp_allowed_mask also should
* only be modified with pm_mutex held, unless the suspend/hibernate code is
* guaranteed not to run in parallel with that modification).
*/

/* Mask saved by pm_restrict_gfp_mask(); 0 means nothing is saved. */
static gfp_t saved_gfp_mask;

/*
* pm_restore_gfp_mask - put back the mask saved by pm_restrict_gfp_mask().
* Does nothing to gfp_allowed_mask when no mask is saved, so calling it
* without a prior restrict is harmless. Warns if pm_mutex is not locked.
*/
void pm_restore_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&pm_mutex));
if (saved_gfp_mask) {
gfp_allowed_mask = saved_gfp_mask;
saved_gfp_mask = 0;
}
}

/*
* pm_restrict_gfp_mask - save gfp_allowed_mask and clear its GFP_IOFS bits.
* Warns if pm_mutex is not locked, or if a mask is already saved (i.e. a
* second restrict without a restore in between).
*/
void pm_restrict_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&pm_mutex));
WARN_ON(saved_gfp_mask);
saved_gfp_mask = gfp_allowed_mask;
gfp_allowed_mask &= ~GFP_IOFS;
}
#endif /* CONFIG_PM_SLEEP */

回头看看 hibernate 函数。

/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
  error = -EBUSY;
  goto Unlock;
}
注释的意思是，我们在执行 hibernate 的时候，不允许别人再打开 snapshot 设备。
static struct miscdevice snapshot_device = {
.minor = SNAPSHOT_MINOR,
.name = "snapshot",
.fops = &snapshot_fops,
};
snapshot_fops 的定义：
static const struct file_operations snapshot_fops = {
.open = snapshot_open,
.release = snapshot_release,
.read = snapshot_read,
.write = snapshot_write,
.llseek = no_llseek,
.unlocked_ioctl = snapshot_ioctl,
};
打开 snapshot 设备用的就是函数 snapshot_open 了。
实现在我们 running 的时候不让 snapshot 设备被打开的方法是通过变量 snapshot_device_available ，其定义：
atomic_t snapshot_device_available = ATOMIC_INIT(1);
snapshot_open 函数中有以下语句：
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
  error = -EBUSY;
  goto Unlock;
}
add 个 -1 ，也就相当于减1操作。

继续 hibernate 函数。
// console 相关处理
pm_prepare_console();

error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
pm_notifier_call_chain 函数经过多层调用，调用到了函数 __blocking_notifier_call_chain ，其注释：
/**
* __blocking_notifier_call_chain - Call functions in a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
* @nr_to_call: See comment for notifier_call_chain.
* @nr_calls: See comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
*
* If the return value of the notifier can be and'ed
* with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
* will return immediately, with the return value of
* the notifier function which halted execution.
* Otherwise the return value is the return value
* of the last notifier function called.
*/

继续 hibernate 函数。
/**
* usermodehelper_disable - prevent new helpers from being started
*/
error = usermodehelper_disable();

/* Allocate memory management structures */
error = create_basic_memory_bitmaps();
函数 create_basic_memory_bitmaps 的注释：
/**
* create_basic_memory_bitmaps - create bitmaps needed for marking page
* frames that should not be saved and free page frames. The pointers
* forbidden_pages_map and free_pages_map are only modified if everything
* goes well, because we don't want the bits to be used before both bitmaps
* are set up.
*/

继续 hibernate 函数。
error = prepare_processes();
prepare_processes 函数的实现：
/*
* prepare_processes - try to freeze processes.
*
* Returns 0 when freeze_processes() returns 0.  On a non-zero result the
* processes are thawed again and -EBUSY is returned, regardless of the
* value freeze_processes() reported.
*/
static int prepare_processes(void)
{
int error = 0;

if (freeze_processes()) {
error = -EBUSY;
/* Roll back the partial freeze before reporting failure. */
thaw_processes();
}
return error;
}
可见函数 prepare_processes 的功能为：
尝试冷冻进程，如果失败，则解冻进程，并返回 -EBUSY 。
如何冷冻进程的先不看了。

继续 hibernate 函数。
// 如果只是 debug ，那就只简单 delay 一会
if (hibernation_test(TEST_FREEZER))
  goto Thaw;
// 与上面类似
if (hibernation_testmode(HIBERNATION_TESTPROC))
  goto Thaw;
// 此函数前面见到过，功能是让 devices 都静止，并创建 hibernation image 。它还将 in_suspend 设置为了1
error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);

// 创建 image 的这条路径上， hibernation_snapshot 成功返回时 in_suspend 为1；从 image 恢复回来再次走到这里时 in_suspend 为0，走下面的 else 分支
if (in_suspend) {
  unsigned int flags = 0;

  if (hibernation_mode == HIBERNATION_PLATFORM)
   flags |= SF_PLATFORM_MODE;
  if (nocompress) // 是否是非压缩的
   flags |= SF_NOCOMPRESS_MODE;
  pr_debug("PM: writing image.\n");
  // 函数 swsusp_write 前面见过，将 snapshot 写入到 swap
  error = swsusp_write(flags);
  /**
   * swsusp_free - free pages allocated for the suspend.
   *
   * Suspend pages are alocated before the atomic copy is made, so we
   * need to release them after the resume.
   */
  swsusp_free();
  if (!error)
   power_down();
  in_suspend = 0;
  pm_restore_gfp_mask();
} else {
  pr_debug("PM: Image restored successfully.\n");
}

函数 power_down 的实现：
/**
* power_down - Shut the machine down for hibernation.
*
* Use the platform driver, if configured, to put the system into the sleep
* state corresponding to hibernation, or try to power it off or reboot,
* depending on the value of hibernation_mode.
*
* As written, this function has no path that returns to the caller: whatever
* the switch does, execution continues to kernel_halt() and then to an
* endless loop.
*/
static void power_down(void)
{
switch (hibernation_mode) {
case HIBERNATION_TEST:
case HIBERNATION_TESTPROC:
  break;
case HIBERNATION_REBOOT:
  /**
   * kernel_restart - reboot the system
   * @cmd: pointer to buffer containing command to execute for restart
   *  or %NULL
   *
   * Shutdown everything and perform a clean reboot.
   * This is not safe to call in interrupt context.
   */
  kernel_restart(NULL);
  break;
case HIBERNATION_PLATFORM:
  /**
   * hibernation_platform_enter - Power off the system using the platform driver.
   */
  hibernation_platform_enter();
  /*
   * No break here: if hibernation_platform_enter() returns, execution
   * falls through to the HIBERNATION_SHUTDOWN case below.
   */
case HIBERNATION_SHUTDOWN:
  /**
   * kernel_power_off - power_off the system
   *
   * Shutdown everything and perform a clean system power_off.
   */
  kernel_power_off();
  break;
}
/* Reached whenever the selected action above came back to us. */
/**
* kernel_halt - halt the system
*
* Shutdown everything and perform a clean system halt.
*/
kernel_halt();
/*
* Valid image is on the disk, if we continue we risk serious data
* corruption after resume.
*/
printk(KERN_CRIT "PM: Please power down manually\n");
/* Spin forever rather than return to the caller. */
while(1);
}

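到 power_down 函数，已经 power down 了，后面的代码怎么跑？
从上面的实现可以看到， power_down 最后是 while(1) ，并不会返回；紧跟在 power_down 之后的下面两句，只有在 swsusp_write 出错、没有调用 power_down 时才会被执行：
  in_suspend = 0;
  pm_restore_gfp_mask();
重新 power up 并从 image 恢复之后，按照 create_image 注释中的说法（ Control reappears in this routine after the subsequent restore ），执行流是回到 create_image 中，再经 hibernation_snapshot 返回到 hibernate 函数；从 else 分支的打印 "Image restored successfully" 可以看出，这时 in_suspend 为0，走的是 else 分支。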

然后是：
Thaw:
// 春回大地，冰雪消融
thaw_processes();
Finish:
/**
* free_basic_memory_bitmaps - free memory bitmaps allocated by
* create_basic_memory_bitmaps(). The auxiliary pointers are necessary
* so that the bitmaps themselves are not referred to while they are being
* freed.
*/
free_basic_memory_bitmaps();
// 前面 disable 了，现在要 enable 回来
usermodehelper_enable();
Exit:
pm_notifier_call_chain(PM_POST_HIBERNATION);
// 对应于前面 pm_prepare_console 的处理
pm_restore_console();
// 我们已经不 running 了，别人可以再使用 snapshot device 了
atomic_inc(&snapshot_device_available);
Unlock:
mutex_unlock(&pm_mutex);
return error;
}

看完了 hibernate 函数，即 Suspend-to-disk 的处理。
回到 state_store 函数继续。
根据写入的字符串，找到对应的 state ，并以该 state 为参数调用函数 enter_state 。
看看 enter_state 函数的实现：
/**
* enter_state - Do common work of entering low-power state.
* @state: pm_state structure for state we're entering.
*
* Make sure we're the only ones trying to enter a sleep state. Fail
* if someone has beat us to it, since we don't want anything weird to
* happen when we wake up.
* Then, do the setup for suspend, enter the state, and cleanup (after
* we've woken up).
*
* Returns -ENODEV when valid_state() rejects @state, -EBUSY when pm_mutex
* is already taken, otherwise the result of suspend_prepare() or
* suspend_devices_and_enter().
*/
int enter_state(suspend_state_t state)
{
int error;

// Seen earlier in the article: checks whether the current architecture supports this state
if (!valid_state(state))
return -ENODEV;

/* trylock, not lock: a concurrent transition makes us fail, not wait. */
if (!mutex_trylock(&pm_mutex))
return -EBUSY;

printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");

pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
// Implementation is shown later in the article
error = suspend_prepare();
if (error)
goto Unlock;

/* Test hook: a non-zero result skips the actual suspend. */
if (suspend_test(TEST_FREEZER))
goto Finish;

pr_debug("PM: Entering %s sleep\n", pm_states[state]);
/* The gfp mask is restricted only around the device suspend/enter step. */
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
pm_restore_gfp_mask();

Finish:
pr_debug("PM: Finishing wakeup.\n");
suspend_finish();
Unlock:
mutex_unlock(&pm_mutex);
return error;
}

函数 suspend_prepare 的实现：
/**
* suspend_prepare - Do prep work before entering low-power state.
*
* This is common code that is called for each state that we're entering.
* Run suspend notifiers, allocate a console and stop all processes.
*
* Returns 0 on success, -EPERM when suspend_ops or its ->enter callback is
* missing, otherwise the error of the step that failed.  On failure the
* steps already taken are undone before returning.
*/
static int suspend_prepare(void)
{
int error;

if (!suspend_ops || !suspend_ops->enter)
return -EPERM;

// Seen earlier in the article: console-related handling
pm_prepare_console();

// Also seen earlier, only the argument differs
error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
if (error)
goto Finish;

// Also seen earlier
error = usermodehelper_disable();
if (error)
goto Finish;

// If the suspend freezer is supported, this function calls freeze_processes() directly.
// It is similar to prepare_processes() used by hibernate(), covered earlier
error = suspend_freeze_processes();
// Everything is OK, return 0
if (!error)
return 0;

/* Freezing failed: thaw and re-enable helpers, then fall into Finish. */
suspend_thaw_processes();
usermodehelper_enable();
Finish:
pm_notifier_call_chain(PM_POST_SUSPEND);
pm_restore_console();
return error;
}

回到函数 enter_state ，
// 只有定义了 CONFIG_PM_DEBUG ，并且是在测试的时候，suspend_test 才返回1，否则返回0，继续 suspend
if (suspend_test(TEST_FREEZER))
  goto Finish;

// 让 gfp mask 变得更严格，这是 suspend 前最后调用的一个函数
// 后面的 pm_restore_gfp_mask 函数是 resume 回来调用的第一个函数，功能是恢复 gfp mask
// 这个在前文中也有看到，是 3.0.35 kernel 的 hibernate 函数中新加的处理
pm_restrict_gfp_mask();
// 函数的实现在后面
error = suspend_devices_and_enter(state);

suspend_devices_and_enter 函数的实现：
/**
* suspend_devices_and_enter - suspend devices and enter the desired system
*        sleep state.
* @state:    state to enter
*
* Returns -ENOSYS when no suspend_ops is registered, otherwise the error of
* the first failing step or the result of suspend_enter().  Devices and the
* console are resumed on the way out in every case that got past
* suspend_ops->begin().
*/
int suspend_devices_and_enter(suspend_state_t state)
{
int error;

if (!suspend_ops)
return -ENOSYS;

trace_machine_suspend(state);
// suspend_ops appeared earlier in the article; it is set from the init or probe function of the architecture's pm module
if (suspend_ops->begin) {
  error = suspend_ops->begin(state);
  if (error)
   goto Close;
}
// suspend_console() is shown later in the article
suspend_console();
// suspend_test_start() and suspend_test_finish() time the step and print the elapsed time
suspend_test_start();
// dpm_suspend_start() is shown later in the article
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
  printk(KERN_ERR "PM: Some devices failed to suspend\n");
  goto Recover_platform;
}
suspend_test_finish("suspend devices");
// This function was discussed earlier
if (suspend_test(TEST_DEVICES))
  goto Recover_platform;

// suspend_enter() is shown later in the article
error = suspend_enter(state);

Resume_devices:
suspend_test_start();
/**
* dpm_resume_end - Execute "resume" callbacks and complete system transition.
* @state: PM transition of the system being carried out.
*
* Execute "resume" callbacks for all devices and complete the PM transition of
* the system.
*/
// dpm_resume_end() calls dpm_resume() and dpm_complete()
/**
* dpm_resume - Execute "resume" callbacks for non-sysdev devices.
* @state: PM transition of the system being carried out.
*
* Execute the appropriate "resume" callback for all devices whose status
* indicates that they are suspended.
*/
/**
* dpm_complete - Complete a PM transition for all non-sysdev devices.
* @state: PM transition of the system being carried out.
*
* Execute the ->complete() callbacks for all devices whose PM status is not
* DPM_ON (this allows new devices to be registered).
*/
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
// Console handling
resume_console();
Close:
if (suspend_ops->end)
suspend_ops->end();
trace_machine_suspend(PWR_EVENT_EXIT);
return error;

/*
* Placed after the return so it is reached only by goto: run the optional
* platform recover hook, then rejoin the normal resume path.
*/
Recover_platform:
if (suspend_ops->recover)
suspend_ops->recover();
goto Resume_devices;
}

函数 suspend_console 的实现：
/**
* suspend_console - suspend the console subsystem
*
* This disables printk() while we go into suspend states
*
* Does nothing when console_suspend_enabled is zero.
*/
void suspend_console(void)
{
if (!console_suspend_enabled)
return;
/* Last message before console_suspended is set. */
printk("Suspending console(s) (use no_console_suspend to debug)\n");
console_lock();
console_suspended = 1;
/*
* NOTE(review): the lock taken by console_lock() is released with a raw
* up(&console_sem) rather than an unlock helper - presumably to avoid the
* work the unlock helper would do; confirm.
*/
up(&console_sem);
}

函数 dpm_suspend_start 的实现：
/**
* dpm_suspend_start - Prepare devices for PM transition and suspend them.
* @state: PM transition of the system being carried out.
*
* Prepare all non-sysdev devices for system PM transition and execute "suspend"
* callbacks for them.
*
* Returns the error from dpm_prepare() if it fails (dpm_suspend() is then
* not called), otherwise the result of dpm_suspend().
*/
int dpm_suspend_start(pm_message_t state)
{
int error;

/**
* dpm_prepare - Prepare all non-sysdev devices for a system PM transition.
* @state: PM transition of the system being carried out.
*
* Execute the ->prepare() callback(s) for all devices.
*/
error = dpm_prepare(state);
if (!error)
  /**
   * dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices.
   * @state: PM transition of the system being carried out.
   */
  error = dpm_suspend(state);
return error;
}
函数 dpm_prepare 和函数 dpm_suspend 中的处理涉及到那些 devices 的 callback 被调用。
在函数 dpm_prepare 中，会遍历队列 dpm_list ，依次调用其中设备的 ->prepare() callback(s) ，
如果成功，则将其添加到 dpm_prepared_list 队列。
函数 dpm_suspend 遍历 dpm_prepared_list 队列，依次调用其中设备的 ->suspend() callback(s) 。
如果成功，则将其添加到 dpm_suspended_list 队列，后面调用到的 dpm_resume 函数会使用该队列。
dpm_resume 处理之后又将成员 move 到 dpm_prepared_list 队列。
函数 dpm_complete 中会处理 dpm_prepared_list 队列。
接下来的问题是， dpm_list 里的成员是谁添加的？
函数 device_pm_add 中会往 dpm_list 中添加成员：
/**
* device_pm_add - Add a device to the PM core's list of active devices.
* @dev: Device to add to the list.
*/
void device_pm_add(struct device *dev)
{
...
list_add_tail(&dev->power.entry, &dpm_list);
...
}
函数 device_add 中调用了函数 device_pm_add ：
/**
* device_add - add device to device hierarchy.
* @dev: device.
*
* This is part 2 of device_register(), though may be called
* separately _iff_ device_initialize() has been called separately.
*
* This adds @dev to the kobject hierarchy via kobject_add(), adds it
* to the global and sibling lists for the device, then
* adds it to the other relevant subsystems of the driver model.
*
* NOTE: _Never_ directly free @dev after calling this function, even
* if it returned an error! Always use put_device() to give up your
* reference instead.
*/
int device_add(struct device *dev)
{
...
device_pm_add(dev);
...
}
举一个 audio device driver 的例子。 driver 的 init 函数中调用了函数 platform_device_add 。
函数 platform_device_add 中调用了函数 device_add ：
/**
* platform_device_add - add a platform device to device hierarchy
* @pdev: platform device we're adding
*
* This is part 2 of platform_device_register(), though may be called
* separately _iff_ pdev was allocated by platform_device_alloc().
*/
int platform_device_add(struct platform_device *pdev)
{
...

ret = device_add(&pdev->dev);
...
}

函数 suspend_enter 的实现：
/**
* suspend_enter - enter the desired system sleep state.
* @state: state to enter
*
* This function should be called after devices have been suspended.
*/
static int suspend_enter(suspend_state_t state)
{
int error;

// suspend_ops 已经见过多次
if (suspend_ops->prepare) {
  error = suspend_ops->prepare();
  if (error)
   goto Platform_finish;
}

/**
* dpm_suspend_noirq - Execute "late suspend" callbacks for non-sysdev devices.
* @state: PM transition of the system being carried out.
*
* Prevent device drivers from receiving interrupts and call the "noirq" suspend
* handlers for all non-sysdev devices.
*/
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down\n");
goto Platform_finish;
}

if (suspend_ops->prepare_late) {
  error = suspend_ops->prepare_late();
  if (error)
   goto Platform_wake;
}

if (suspend_test(TEST_PLATFORM))
goto Platform_wake;

// 把 first_cpu 之外的 cpu 都通过函数 _cpu_down down 掉
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;

arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());

/**
* syscore_suspend - Execute all the registered system core suspend callbacks.
*
* This function is executed with one CPU on-line and disabled interrupts.
*/
// 遍历 syscore_ops_list 队列，依次调用成员的 suspend 函数
// 函数 register_syscore_ops 会往队列 syscore_ops_list 上添加成员
/**
* register_syscore_ops - Register a set of system core operations.
* @ops: System core operations to register.
*/
error = syscore_suspend();
// 成功则进入 if 语句
if (!error) {
  if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
   // enter 函数的实现一般在pm.c中。fsl的imx6的pm.c中，enter suspend的时候调用了iRam中的suspend处理函数，最后等待中断的到来。
   // iRam中的函数是pm模块初始化的时候，copy过去的。
   // 在举例中，imx6上用的是kernel 3.0.35。
   // kernel 2.6.35 对应的 imx5上的处理类似，只是细节上稍有不同。
   error = suspend_ops->enter(state);
   events_check_enabled = false;
  }
  /**
   * syscore_resume - Execute all the registered system core resume callbacks.
   *
   * This function is executed with one CPU on-line and disabled interrupts.
   */
  // 重新上电了，处理的队列也是 syscore_ops_list
  syscore_resume();
}