/*
* FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
* so all functions starting at paging_init should be marked __init
* in those cases. SPARSEMEM, however, allows for memory hotplug,
* and alloc_bootmem_node is not used.
*/
#ifdef CONFIG_SPARSEMEM
#define __paginginit __meminit
#else
#define __paginginit __init
#endif
/* Memory initialisation debug and verification */
enum mminit_level {
MMINIT_WARNING,
MMINIT_VERIFY,
MMINIT_TRACE
};
#ifdef CONFIG_DEBUG_MEMORY_INIT
extern int mminit_loglevel;
#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
if (level < mminit_loglevel) { \
if (level <= MMINIT_WARNING) \
printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \
else \
printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
} \
} while (0)
extern void mminit_verify_pageflags_layout(void);
extern void mminit_verify_zonelist(void);
#else
static inline void mminit_dprintk(enum mminit_level level,
const char *prefix, const char *fmt, ...)
{
}
static inline void mminit_verify_pageflags_layout(void)
{
}
static inline void mminit_verify_zonelist(void)
{
}
#endif /* CONFIG_DEBUG_MEMORY_INIT */
/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
#if defined(CONFIG_SPARSEMEM)
extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn);
#else
static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn)
{
}
#endif /* CONFIG_SPARSEMEM */
#define ZONE_RECLAIM_NOSCAN -2
#define ZONE_RECLAIM_FULL -1
#define ZONE_RECLAIM_SOME 0
#define ZONE_RECLAIM_SUCCESS 1
extern int hwpoison_filter(struct page *p);
extern u32 hwpoison_filter_dev_major;
extern u32 hwpoison_filter_dev_minor;
extern u64 hwpoison_filter_flags_mask;
extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;
extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
extern void set_pageblock_order(void);
unsigned long reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *page_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN WMARK_MIN
#define ALLOC_WMARK_LOW WMARK_LOW
#define ALLOC_WMARK_HIGH WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
#define ALLOC_FAIR 0x100 /* fair zone allocation */
#endif /* __MM_INTERNAL_H */
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/interval_tree.c
/*
* mm/interval_tree.c - interval tree for mapping->i_mmap
*
* Copyright (C) 2012, Michel Lespinasse <walken@google.com>
*
* This file is released under the GPL v2.
*/
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rmap.h>
#include <linux/interval_tree_generic.h>
static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
{
return v->vm_pgoff;
}
static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
{
return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}
INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
unsigned long, shared.rb_subtree_last,
vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
/* Insert node immediately after prev in the interval tree */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
struct vm_area_struct *prev,
struct rb_root *root)
{
struct rb_node **link;
struct vm_area_struct *parent;
unsigned long last = vma_last_pgoff(node);
VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);
if (!prev->shared.rb.rb_right) {
parent = prev;
link = &prev->shared.rb.rb_right;
} else {
parent = rb_entry(prev->shared.rb.rb_right,
struct vm_area_struct, shared.rb);
if (parent->shared.rb_subtree_last < last)
parent->shared.rb_subtree_last = last;
while (parent->shared.rb.rb_left) {
parent = rb_entry(parent->shared.rb.rb_left,
struct vm_area_struct, shared.rb);
if (parent->shared.rb_subtree_last < last)
parent->shared.rb_subtree_last = last;
}
link = &parent->shared.rb.rb_left;
}
node->shared.rb_subtree_last = last;
rb_link_node(&node->shared.rb, &parent->shared.rb, link);
rb_insert_augmented(&node->shared.rb, root,
&vma_interval_tree_augment);
}
static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
{
return vma_start_pgoff(avc->vma);
}
static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
{
return vma_last_pgoff(avc->vma);
}
INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
avc_start_pgoff, avc_last_pgoff,
static inline, __anon_vma_interval_tree)
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
struct rb_root *root)
{
#ifdef CONFIG_DEBUG_VM_RB
node->cached_vma_start = avc_start_pgoff(node);
node->cached_vma_last = avc_last_pgoff(node);
#endif
__anon_vma_interval_tree_insert(node, root);
}
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
struct rb_root *root)
{
__anon_vma_interval_tree_remove(node, root);
}
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root *root,
unsigned long first, unsigned long last)
{
return __anon_vma_interval_tree_iter_first(root, first, last);
}
struct anon_vma_chain *
anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
unsigned long first, unsigned long last)
{
return __anon_vma_interval_tree_iter_next(node, first, last);
}
#ifdef CONFIG_DEBUG_VM_RB
void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
{
WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
}
#endif
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/kasan/kasan.c
/*
* This file contains shadow memory manipulation code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some of code borrowed from https://github.com/xairy/linux by
* Andrey Konovalov <adech.fo@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#define DISABLE_BRANCH_PROFILING
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/vmalloc.h>
#include <linux/kasan.h>
#include "kasan.h"
#include "../slab.h"
/*
* Poisons the shadow memory for 'size' bytes starting from 'addr'.
* Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
*/
static void kasan_poison_shadow(const void *address, size_t size, u8 value)
{
void *shadow_start, *shadow_end;
shadow_start = kasan_mem_to_shadow(address);
shadow_end = kasan_mem_to_shadow(address + size);
memset(shadow_start, value, shadow_end - shadow_start);
}
void kasan_unpoison_shadow(const void *address, size_t size)
{
kasan_poison_shadow(address, size, 0);
if (size & KASAN_SHADOW_MASK) {
u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
*shadow = size & KASAN_SHADOW_MASK;
}
}
/*
* All functions below always inlined so compiler could
* perform better optimizations in each of __asan_loadX/__assn_storeX
* depending on memory access size X.
*/
static __always_inline bool memory_is_poisoned_1(unsigned long addr)
{
s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
if (unlikely(shadow_value)) {
s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
return unlikely(last_accessible_byte >= shadow_value);
}
return false;
}
static __always_inline bool memory_is_poisoned_2(unsigned long addr)
{
u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
if (unlikely(*shadow_addr)) {
if (memory_is_poisoned_1(addr + 1))
return true;
if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
return false;
return unlikely(*(u8 *)shadow_addr);
}
return false;
}
static __always_inline bool memory_is_poisoned_4(unsigned long addr)
{
u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
if (unlikely(*shadow_addr)) {
if (memory_is_poisoned_1(addr + 3))
return true;
if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
return false;
return unlikely(*(u8 *)shadow_addr);
}
return false;
}
static __always_inline bool memory_is_poisoned_8(unsigned long addr)
{
u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
if (unlikely(*shadow_addr)) {
if (memory_is_poisoned_1(addr + 7))
return true;
if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7))
return false;
return unlikely(*(u8 *)shadow_addr);
}
return false;
}
static __always_inline bool memory_is_poisoned_16(unsigned long addr)
{
u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr);
if (unlikely(*shadow_addr)) {
u16 shadow_first_bytes = *(u16 *)shadow_addr;
s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK;
if (unlikely(shadow_first_bytes))
return true;
if (likely(!last_byte))
return false;
return memory_is_poisoned_1(addr + 15);
}
return false;
}
static __always_inline unsigned long bytes_is_zero(const u8 *start,
size_t size)
{
while (size) {
if (unlikely(*start))
return (unsigned long)start;
start++;
size--;
}
return 0;
}
static __always_inline unsigned long memory_is_zero(const void *start,
const void *end)
{
unsigned int words;
unsigned long ret;
unsigned int prefix = (unsigned long)start % 8;
if (end - start <= 16)
return bytes_is_zero(start, end - start);
if (prefix) {
prefix = 8 - prefix;
ret = bytes_is_zero(start, prefix);
if (unlikely(ret))
return ret;
start += prefix;
}
words = (end - start) / 8;
while (words) {
if (unlikely(*(u64 *)start))
return bytes_is_zero(start, 8);
start += 8;
words--;
}
return bytes_is_zero(start, (end - start) % 8);
}
static __always_inline bool memory_is_poisoned_n(unsigned long addr,
size_t size)
{
unsigned long ret;
ret = memory_is_zero(kasan_mem_to_shadow((void *)addr),
kasan_mem_to_shadow((void *)addr + size - 1) + 1);
if (unlikely(ret)) {
unsigned long last_byte = addr + size - 1;
s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
if (unlikely(ret != (unsigned long)last_shadow ||
((last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
return true;
}
return false;
}
static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
{
if (__builtin_constant_p(size)) {
switch (size) {
case 1:
return memory_is_poisoned_1(addr);
case 2:
return memory_is_poisoned_2(addr);
case 4:
return memory_is_poisoned_4(addr);
case 8:
return memory_is_poisoned_8(addr);
case 16:
return memory_is_poisoned_16(addr);
default:
BUILD_BUG();
}
}
return memory_is_poisoned_n(addr, size);
}
static __always_inline void check_memory_region(unsigned long addr,
size_t size, bool write)
{
struct kasan_access_info info;
if (unlikely(size == 0))
return;
if (unlikely((void *)addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
info.access_addr = (void *)addr;
info.access_size = size;
info.is_write = write;
info.ip = _RET_IP_;
kasan_report_user_access(&info);
return;
}
if (likely(!memory_is_poisoned(addr, size)))
return;
kasan_report(addr, size, write, _RET_IP_);
}
void __asan_loadN(unsigned long addr, size_t size);
void __asan_storeN(unsigned long addr, size_t size);
#undef memset
void *memset(void *addr, int c, size_t len)
{
__asan_storeN((unsigned long)addr, len);
return __memset(addr, c, len);
}
#undef memmove
void *memmove(void *dest, const void *src, size_t len)
{
__asan_loadN((unsigned long)src, len);
__asan_storeN((unsigned long)dest, len);
return __memmove(dest, src, len);
}
#undef memcpy
void *memcpy(void *dest, const void *src, size_t len)
{
__asan_loadN((unsigned long)src, len);
__asan_storeN((unsigned long)dest, len);
return __memcpy(dest, src, len);
}
void kasan_alloc_pages(struct page *page, unsigned int order)
{
if (likely(!PageHighMem(page)))
kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
}
void kasan_free_pages(struct page *page, unsigned int order)
{
if (likely(!PageHighMem(page)))
kasan_poison_shadow(page_address(page),
PAGE_SIZE << order,
KASAN_FREE_PAGE);
}
void kasan_poison_slab(struct page *page)
{
kasan_poison_shadow(page_address(page),
PAGE_SIZE << compound_order(page),
KASAN_KMALLOC_REDZONE);
}
void kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
{
kasan_unpoison_shadow(object, cache->object_size);
}
void kasan_poison_object_data(struct kmem_cache *cache, void *object)
{
kasan_poison_shadow(object,
round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
KASAN_KMALLOC_REDZONE);
}
void kasan_slab_alloc(struct kmem_cache *cache, void *object)
{
kasan_kmalloc(cache, object, cache->object_size);
}
void kasan_slab_free(struct kmem_cache *cache, void *object)
{
unsigned long size = cache->object_size;
unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
/* RCU slabs could be legally used after free within the RCU period */
if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
return;
kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
}
void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
{
unsigned long redzone_start;
unsigned long redzone_end;
if (unlikely(object == NULL))
return;
redzone_start = round_up((unsigned long)(object + size),
KASAN_SHADOW_SCALE_SIZE);
redzone_end = round_up((unsigned long)object + cache->object_size,
KASAN_SHADOW_SCALE_SIZE);
kasan_unpoison_shadow(object, size);
kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
KASAN_KMALLOC_REDZONE);
}
EXPORT_SYMBOL(kasan_kmalloc);
void kasan_kmalloc_large(const void *ptr, size_t size)
{
struct page *page;
unsigned long redzone_start;
unsigned long redzone_end;
if (unlikely(ptr == NULL))
return;
page = virt_to_page(ptr);
redzone_start = round_up((unsigned long)(ptr + size),
KASAN_SHADOW_SCALE_SIZE);
redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page));
kasan_unpoison_shadow(ptr, size);
kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
KASAN_PAGE_REDZONE);
}
void kasan_krealloc(const void *object, size_t size)
{
struct page *page;
if (unlikely(object == ZERO_SIZE_PTR))
return;
page = virt_to_head_page(object);
if (unlikely(!PageSlab(page)))
kasan_kmalloc_large(object, size);
else
kasan_kmalloc(page->slab_cache, object, size);
}
void kasan_kfree(void *ptr)
{
struct page *page;
page = virt_to_head_page(ptr);
if (unlikely(!PageSlab(page)))
kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
KASAN_FREE_PAGE);
else
kasan_slab_free(page->slab_cache, ptr);
}
void kasan_kfree_large(const void *ptr)
{
struct page *page = virt_to_page(ptr);
kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
KASAN_FREE_PAGE);
}
int kasan_module_alloc(void *addr, size_t size)
{
void *ret;
size_t shadow_size;
unsigned long shadow_start;
shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT,
PAGE_SIZE);
if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
return -EINVAL;
ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
shadow_start + shadow_size,
GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
__builtin_return_address(0));
if (ret) {
find_vm_area(addr)->flags |= VM_KASAN;
return 0;
}
return -ENOMEM;
}
void kasan_free_shadow(const struct vm_struct *vm)
{
if (vm->flags & VM_KASAN)
vfree(kasan_mem_to_shadow(vm->addr));
}
static void register_global(struct kasan_global *global)
{
size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
kasan_unpoison_shadow(global->beg, global->size);
kasan_poison_shadow(global->beg + aligned_size,
global->size_with_redzone - aligned_size,
KASAN_GLOBAL_REDZONE);
}
void __asan_register_globals(struct kasan_global *globals, size_t size)
{
int i;
for (i = 0; i < size; i++)
register_global(&globals[i]);
}
EXPORT_SYMBOL(__asan_register_globals);
void __asan_unregister_globals(struct kasan_global *globals, size_t size)
{
}
EXPORT_SYMBOL(__asan_unregister_globals);
#define DEFINE_ASAN_LOAD_STORE(size) \
void __asan_load##size(unsigned long addr) \
{ \
check_memory_region(addr, size, false); \
} \
EXPORT_SYMBOL(__asan_load##size); \
__alias(__asan_load##size) \
void __asan_load##size##_noabort(unsigned long); \
EXPORT_SYMBOL(__asan_load##size##_noabort); \
void __asan_store##size(unsigned long addr) \
{ \
check_memory_region(addr, size, true); \
} \
EXPORT_SYMBOL(__asan_store##size); \
__alias(__asan_store##size) \
void __asan_store##size##_noabort(unsigned long); \
EXPORT_SYMBOL(__asan_store##size##_noabort)
DEFINE_ASAN_LOAD_STORE(1);
DEFINE_ASAN_LOAD_STORE(2);
DEFINE_ASAN_LOAD_STORE(4);
DEFINE_ASAN_LOAD_STORE(8);
DEFINE_ASAN_LOAD_STORE(16);
void __asan_loadN(unsigned long addr, size_t size)
{
check_memory_region(addr, size, false);
}
EXPORT_SYMBOL(__asan_loadN);
__alias(__asan_loadN)
void __asan_loadN_noabort(unsigned long, size_t);
EXPORT_SYMBOL(__asan_loadN_noabort);
void __asan_storeN(unsigned long addr, size_t size)
{
check_memory_region(addr, size, true);
}
EXPORT_SYMBOL(__asan_storeN);
__alias(__asan_storeN)
void __asan_storeN_noabort(unsigned long, size_t);
EXPORT_SYMBOL(__asan_storeN_noabort);
/* to shut up compiler complaints */
void __asan_handle_no_return(void) {}
EXPORT_SYMBOL(__asan_handle_no_return);
#ifdef CONFIG_MEMORY_HOTPLUG
static int kasan_mem_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK;
}
static int __init kasan_memhotplug_init(void)
{
pr_err("WARNING: KASan doesn't support memory hot-add\n");
pr_err("Memory hot-add will be disabled\n");
hotplug_memory_notifier(kasan_mem_notifier, 0);
return 0;
}
module_init(kasan_memhotplug_init);
#endif
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/kasan/kasan.h
#ifndef __MM_KASAN_KASAN_H
#define __MM_KASAN_KASAN_H
#include <linux/kasan.h>
#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
#define KASAN_FREE_PAGE 0xFF /* page was freed */
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */
/*
* Stack redzone shadow values
* (Those are compiler's ABI, don't change them)
*/
#define KASAN_STACK_LEFT 0xF1
#define KASAN_STACK_MID 0xF2
#define KASAN_STACK_RIGHT 0xF3
#define KASAN_STACK_PARTIAL 0xF4
/* Don't break randconfig/all*config builds */
#ifndef KASAN_ABI_VERSION
#define KASAN_ABI_VERSION 1
#endif
struct kasan_access_info {
const void *access_addr;
const void *first_bad_addr;
size_t access_size;
bool is_write;
unsigned long ip;
};
/* The layout of struct dictated by compiler */
struct kasan_source_location {
const char *filename;
int line_no;
int column_no;
};
/* The layout of struct dictated by compiler */
struct kasan_global {
const void *beg; /* Address of the beginning of the global variable. */
size_t size; /* Size of the global variable. */
size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */
const void *name;
const void *module_name; /* Name of the module where the global variable is declared. */
unsigned long has_dynamic_init; /* This needed for C++ */
#if KASAN_ABI_VERSION >= 4
struct kasan_source_location *location;
#endif
};
void kasan_report_error(struct kasan_access_info *info);
void kasan_report_user_access(struct kasan_access_info *info);
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
{
return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
<< KASAN_SHADOW_SCALE_SHIFT);
}
static inline bool kasan_enabled(void)
{
return !current->kasan_depth;
}
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
#endif
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/kasan/Makefile
KASAN_SANITIZE := n
CFLAGS_REMOVE_kasan.o = -pg
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
obj-y := kasan.o report.o
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/kasan/report.c
/*
* This file contains error reporting code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some of code borrowed from https://github.com/xairy/linux by
* Andrey Konovalov <adech.fo@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/kasan.h>
#include <asm/sections.h>
#include "kasan.h"
#include "../slab.h"
/* Shadow layout customization. */
#define SHADOW_BYTES_PER_BLOCK 1
#define SHADOW_BLOCKS_PER_ROW 16
#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
#define SHADOW_ROWS_AROUND_ADDR 2
static const void *find_first_bad_addr(const void *addr, size_t size)
{
u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr);
const void *first_bad_addr = addr;
while (!shadow_val && first_bad_addr < addr + size) {
first_bad_addr += KASAN_SHADOW_SCALE_SIZE;
shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr);
}
return first_bad_addr;
}
static void print_error_description(struct kasan_access_info *info)
{
const char *bug_type = "unknown crash";
u8 shadow_val;
info->first_bad_addr = find_first_bad_addr(info->access_addr,
info->access_size);
shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr);
switch (shadow_val) {
case KASAN_FREE_PAGE:
case KASAN_KMALLOC_FREE:
bug_type = "use after free";
break;
case KASAN_PAGE_REDZONE:
case KASAN_KMALLOC_REDZONE:
case KASAN_GLOBAL_REDZONE:
case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
bug_type = "out of bounds access";
break;
case KASAN_STACK_LEFT:
case KASAN_STACK_MID:
case KASAN_STACK_RIGHT:
case KASAN_STACK_PARTIAL:
bug_type = "out of bounds on stack";
break;
}
pr_err("BUG: KASan: %s in %pS at addr %p\n",
bug_type, (void *)info->ip,
info->access_addr);
pr_err("%s of size %zu by task %s/%d\n",
info->is_write ? "Write" : "Read",
info->access_size, current->comm, task_pid_nr(current));
}
static inline bool kernel_or_module_addr(const void *addr)
{
return (addr >= (void *)_stext && addr < (void *)_end)
|| (addr >= (void *)MODULES_VADDR
&& addr < (void *)MODULES_END);
}
static inline bool init_task_stack_addr(const void *addr)
{
return addr >= (void *)&init_thread_union.stack &&
(addr <= (void *)&init_thread_union.stack +
sizeof(init_thread_union.stack));
}
static void print_address_description(struct kasan_access_info *info)
{
const void *addr = info->access_addr;
if ((addr >= (void *)PAGE_OFFSET) &&
(addr < high_memory)) {
struct page *page = virt_to_head_page(addr);
if (PageSlab(page)) {
void *object;
struct kmem_cache *cache = page->slab_cache;
void *last_object;
object = virt_to_obj(cache, page_address(page), addr);
last_object = page_address(page) +
page->objects * cache->size;
if (unlikely(object > last_object))
object = last_object; /* we hit into padding */
object_err(cache, page, object,
"kasan: bad access detected");
return;
}
dump_page(page, "kasan: bad access detected");
}
if (kernel_or_module_addr(addr)) {
if (!init_task_stack_addr(addr))
pr_err("Address belongs to variable %pS\n", addr);
}
dump_stack();
}
static bool row_is_guilty(const void *row, const void *guilty)
{
return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW);
}
static int shadow_pointer_offset(const void *row, const void *shadow)
{
/* The length of ">ff00ff00ff00ff00: " is
* 3 + (BITS_PER_LONG/8)*2 chars.
*/
return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 +
(shadow - row) / SHADOW_BYTES_PER_BLOCK + 1;
}
static void print_shadow_for_address(const void *addr)
{
int i;
const void *shadow = kasan_mem_to_shadow(addr);
const void *shadow_row;
shadow_row = (void *)round_down((unsigned long)shadow,
SHADOW_BYTES_PER_ROW)
- SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW;
pr_err("Memory state around the buggy address:\n");
for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
const void *kaddr = kasan_shadow_to_mem(shadow_row);
char buffer[4 + (BITS_PER_LONG/8)*2];
snprintf(buffer, sizeof(buffer),
(i == 0) ? ">%p: " : " %p: ", kaddr);
kasan_disable_current();
print_hex_dump(KERN_ERR, buffer,
DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
shadow_row, SHADOW_BYTES_PER_ROW, 0);
kasan_enable_current();
if (row_is_guilty(shadow_row, shadow))
pr_err("%*c\n",
shadow_pointer_offset(shadow_row, shadow),
'^');
shadow_row += SHADOW_BYTES_PER_ROW;
}
}
static DEFINE_SPINLOCK(report_lock);
void kasan_report_error(struct kasan_access_info *info)
{
unsigned long flags;
spin_lock_irqsave(&report_lock, flags);
pr_err("================================="
"=================================\n");
print_error_description(info);
print_address_description(info);
print_shadow_for_address(info->first_bad_addr);
pr_err("================================="
"=================================\n");
spin_unlock_irqrestore(&report_lock, flags);
}
void kasan_report_user_access(struct kasan_access_info *info)
{
unsigned long flags;
spin_lock_irqsave(&report_lock, flags);
pr_err("================================="
"=================================\n");
pr_err("BUG: KASan: user-memory-access on address %p\n",
info->access_addr);
pr_err("%s of size %zu by task %s/%d\n",
info->is_write ? "Write" : "Read",
info->access_size, current->comm, task_pid_nr(current));
dump_stack();
pr_err("================================="
"=================================\n");
spin_unlock_irqrestore(&report_lock, flags);
}
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip)
{
struct kasan_access_info info;
if (likely(!kasan_enabled()))
return;
info.access_addr = (void *)addr;
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
kasan_report_error(&info);
}
#define DEFINE_ASAN_REPORT_LOAD(size) \
void __asan_report_load##size##_noabort(unsigned long addr) \
{ \
kasan_report(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_report_load##size##_noabort)
#define DEFINE_ASAN_REPORT_STORE(size) \
void __asan_report_store##size##_noabort(unsigned long addr) \
{ \
kasan_report(addr, size, true, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_report_store##size##_noabort)
DEFINE_ASAN_REPORT_LOAD(1);
DEFINE_ASAN_REPORT_LOAD(2);
DEFINE_ASAN_REPORT_LOAD(4);
DEFINE_ASAN_REPORT_LOAD(8);
DEFINE_ASAN_REPORT_LOAD(16);
DEFINE_ASAN_REPORT_STORE(1);
DEFINE_ASAN_REPORT_STORE(2);
DEFINE_ASAN_REPORT_STORE(4);
DEFINE_ASAN_REPORT_STORE(8);
DEFINE_ASAN_REPORT_STORE(16);
void __asan_report_load_n_noabort(unsigned long addr, size_t size)
{
kasan_report(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__asan_report_load_n_noabort);
void __asan_report_store_n_noabort(unsigned long addr, size_t size)
{
kasan_report(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__asan_report_store_n_noabort);
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/Kconfig
config SELECT_MEMORY_MODEL
def_bool y
depends on ARCH_SELECT_MEMORY_MODEL
choice
prompt "Memory model"
depends on SELECT_MEMORY_MODEL
default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
default FLATMEM_MANUAL
config FLATMEM_MANUAL
bool "Flat Memory"
depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
help
This option allows you to change some of the ways that
Linux manages its memory internally. Most users will
only have one option here: FLATMEM. This is normal
and a correct option.
Some users of more advanced features like NUMA and
memory hotplug may have different options here.
DISCONTIGMEM is a more mature, better tested system,
but is incompatible with memory hotplug and may suffer
decreased performance over SPARSEMEM. If unsure between
"Sparse Memory" and "Discontiguous Memory", choose
"Discontiguous Memory".
If unsure, choose this option (Flat Memory) over any other.
config DISCONTIGMEM_MANUAL
bool "Discontiguous Memory"
depends on ARCH_DISCONTIGMEM_ENABLE
help
This option provides enhanced support for discontiguous
memory systems, over FLATMEM. These systems have holes
in their physical address spaces, and this option provides
more efficient handling of these holes. However, the vast
majority of hardware has quite flat address spaces, and
can have degraded performance from the extra overhead that
this option imposes.
Many NUMA configurations will have this as the only option.
If unsure, choose "Flat Memory" over this option.
config SPARSEMEM_MANUAL
bool "Sparse Memory"
depends on ARCH_SPARSEMEM_ENABLE
help
This will be the only option for some systems, including
memory hotplug systems. This is normal.
For many other systems, this will be an alternative to
"Discontiguous Memory". This option provides some potential
performance benefits, along with decreased code complexity,
but it is newer, and more experimental.
If unsure, choose "Discontiguous Memory" or "Flat Memory"
over this option.
endchoice
config DISCONTIGMEM
def_bool y
depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
config SPARSEMEM
def_bool y
depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
config FLATMEM
def_bool y
depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
config FLAT_NODE_MEM_MAP
def_bool y
depends on !SPARSEMEM
#
# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
# to represent different areas of memory. This variable allows
# those dependencies to exist individually.
#
config NEED_MULTIPLE_NODES
def_bool y
depends on DISCONTIGMEM || NUMA
config HAVE_MEMORY_PRESENT
def_bool y
depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
#
# SPARSEMEM_EXTREME (which is the default) does some bootmem
# allocations when memory_present() is called. If this cannot
# be done on your architecture, select this option. However,
# statically allocating the mem_section[] array can potentially
# consume vast quantities of .bss, so be careful.
#
# This option will also potentially produce smaller runtime code
# with gcc 3.4 and later.
#
config SPARSEMEM_STATIC
bool
#
# Architecture platforms which require a two level mem_section in SPARSEMEM
# must select this option. This is usually for architecture platforms with
# an extremely sparse physical address space.
#
config SPARSEMEM_EXTREME
def_bool y
depends on SPARSEMEM && !SPARSEMEM_STATIC
config SPARSEMEM_VMEMMAP_ENABLE
bool
config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
def_bool y
depends on SPARSEMEM && X86_64
config SPARSEMEM_VMEMMAP
bool "Sparse Memory virtual memmap"
depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
default y
help
SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
pfn_to_page and page_to_pfn operations. This is the most
efficient option when sufficient kernel resources are available.
config HAVE_MEMBLOCK
bool
config HAVE_MEMBLOCK_NODE_MAP
bool
config HAVE_MEMBLOCK_PHYS_MAP
bool
config HAVE_GENERIC_RCU_GUP
bool
config ARCH_DISCARD_MEMBLOCK
bool
config NO_BOOTMEM
bool
config MEMORY_ISOLATION
bool
config MOVABLE_NODE
bool "Enable to assign a node which has only movable memory"
depends on HAVE_MEMBLOCK
depends on NO_BOOTMEM
depends on X86_64
depends on NUMA
default n
help
Allow a node to have only movable memory. Pages used by the kernel,
such as direct mapping pages cannot be migrated. So the corresponding
memory device cannot be hotplugged. This option allows the following
two things:
- When the system is booting, node full of hotpluggable memory can
be arranged to have only movable memory so that the whole node can
be hot-removed. (need movable_node boot option specified).
- After the system is up, the option allows users to online all the
memory of a node as movable memory so that the whole node can be
hot-removed.
Users who don't use the memory hotplug feature are fine with this
option on since they don't specify movable_node boot option or they
don't online memory as movable.
Say Y here if you want to hotplug a whole node.
Say N here if you want kernel to use memory on all nodes evenly.
#
# Only be set on architectures that have completely implemented memory hotplug
# feature. If you are not sure, don't touch it.
#
config HAVE_BOOTMEM_INFO_NODE
def_bool n
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
config MEMORY_HOTPLUG_SPARSE
def_bool y
depends on SPARSEMEM && MEMORY_HOTPLUG
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
select MEMORY_ISOLATION
select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
#
# If we have space for more page flags then we can enable additional
# optimizations and functionality.
#
# Regular Sparsemem takes page flag bits for the sectionid if it does not
# use a virtual memmap. Disable extended page flags for 32 bit platforms
# that require the use of a sectionid in the page flags.
#
config PAGEFLAGS_EXTENDED
def_bool y
depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
# Default to 4 for wider testing, though 8 might be more appropriate.
# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
#
config SPLIT_PTLOCK_CPUS
int
default "999999" if !MMU
default "999999" if ARM && !CPU_CACHE_VIPT
default "999999" if PARISC && !PA20
default "4"
config ARCH_ENABLE_SPLIT_PMD_PTLOCK
bool
#
# support for memory balloon
config MEMORY_BALLOON
bool
#
# support for memory balloon compaction
config BALLOON_COMPACTION
bool "Allow for balloon memory compaction/migration"
def_bool y
depends on COMPACTION && MEMORY_BALLOON
help
Memory fragmentation introduced by ballooning might reduce
significantly the number of 2MB contiguous memory blocks that can be
used within a guest, thus imposing performance penalties associated
with the reduced number of transparent huge pages that could be used
by the guest workload. Allowing the compaction & migration for memory
pages enlisted as being part of memory balloon devices avoids the
scenario aforementioned and helps improving memory defragmentation.
#
# support for memory compaction
config COMPACTION
bool "Allow for memory compaction"
def_bool y
select MIGRATION
depends on MMU
help
Allows the compaction of memory for the allocation of huge pages.
#
# support for page migration
#
config MIGRATION
bool "Page migration"
def_bool y
depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU
help
Allows the migration of the physical location of pages of processes
while the virtual addresses are not changed. This is useful in
two situations. The first is on NUMA systems to put pages nearer
to the processors accessing. The second is when allocating huge
pages as migration can relocate pages to satisfy a huge page
allocation instead of reclaiming.
config ARCH_ENABLE_HUGEPAGE_MIGRATION
bool
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
config ZONE_DMA_FLAG
int
default "0" if !ZONE_DMA
default "1"
config BOUNCE
bool "Enable bounce buffers"
default y
depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
help
Enable bounce buffers for devices that cannot access
the full range of memory available to the CPU. Enabled
by default when ZONE_DMA or HIGHMEM is selected, but you
may say n to override this.
# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
# have more than 4GB of memory, but we don't currently use the IOTLB to present
# a 32-bit address to OHCI. So we need to use a bounce pool instead.
#
# We also use the bounce pool to provide stable page writes for jbd. jbd
# initiates buffer writeback without locking the page or setting PG_writeback,
# and fixing that behavior (a second time; jbd2 doesn't have this problem) is
# a major rework effort. Instead, use the bounce buffer to snapshot pages
# (until jbd goes away). The only jbd user is ext3.
config NEED_BOUNCE_POOL
bool
default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD)
config NR_QUICK
int
depends on QUICKLIST
default "2" if AVR32
default "1"
config VIRT_TO_BUS
bool
help
An architecture should select this if it implements the
deprecated interface virt_to_bus(). All new architectures
should probably not select this.
config MMU_NOTIFIER
bool
select SRCU
config KSM
bool "Enable KSM for page merging"
depends on MMU
help
Enable Kernel Samepage Merging: KSM periodically scans those areas
of an application's address space that an app has advised may be
mergeable. When it finds pages of identical content, it replaces
the many instances by a single page with that content, so
saving memory until one or another app needs to modify the content.
Recommended for use with KVM, or with other duplicative applications.
See Documentation/vm/ksm.txt for more information: KSM is inactive
until a program has madvised that an area is MADV_MERGEABLE, and
root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
config DEFAULT_MMAP_MIN_ADDR
int "Low address space to protect from user allocation"
depends on MMU
default 4096
help
This is the portion of low virtual memory which should be protected
from userspace allocation. Keeping a user from writing to low pages
can help reduce the impact of kernel NULL pointer bugs.
For most ia64, ppc64 and x86 users with lots of address space
a value of 65536 is reasonable and should cause no problems.
On arm and other archs it should not be higher than 32768.
Programs which use vm86 functionality or have some need to map
this low address space will need CAP_SYS_RAWIO or disable this
protection by setting the value to 0.
This value can be changed after boot using the
/proc/sys/vm/mmap_min_addr tunable.
config ARCH_SUPPORTS_MEMORY_FAILURE
bool
config MEMORY_FAILURE
depends on MMU
depends on ARCH_SUPPORTS_MEMORY_FAILURE
bool "Enable recovery from hardware memory errors"
select MEMORY_ISOLATION
select RAS
help
Enables code to recover from some memory failures on systems
with MCA recovery. This allows a system to continue running
even when some of its memory has uncorrected errors. This requires
special hardware support and typically ECC memory.
config HWPOISON_INJECT
tristate "HWPoison pages injector"
depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
select PROC_PAGE_MONITOR
config NOMMU_INITIAL_TRIM_EXCESS
int "Turn on mmap() excess space trimming before booting"
depends on !MMU
default 1
help
The NOMMU mmap() frequently needs to allocate large contiguous chunks
of memory on which to store mappings, but it can only ask the system
allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently
more than it requires. To deal with this, mmap() is able to trim off
the excess and return it to the allocator.
If trimming is enabled, the excess is trimmed off and returned to the
system allocator, which can cause extra fragmentation, particularly
if there are a lot of transient processes.
If trimming is disabled, the excess is kept, but not used, which for
long-term mappings means that the space is wasted.
Trimming can be dynamically controlled through a sysctl option
(/proc/sys/vm/nr_trim_pages) which specifies the minimum number of
excess pages there must be before trimming should occur, or zero if
no trimming is to occur.
This option specifies the initial value of this option. The default
of 1 says that all excess pages should be trimmed.
See Documentation/nommu-mmap.txt for more information.
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
select COMPACTION
help
Transparent Hugepages allows the kernel to use huge pages and
huge tlb transparently to the applications whenever possible.
This feature can improve computing performance to certain
applications by speeding up page faults during memory
allocation, by reducing the number of tlb misses and by speeding
up the pagetable walking.
If memory constrained on embedded, you may want to say N.
choice
prompt "Transparent Hugepage Support sysfs defaults"
depends on TRANSPARENT_HUGEPAGE
default TRANSPARENT_HUGEPAGE_ALWAYS
help
Selects the sysfs defaults for Transparent Hugepage Support.
config TRANSPARENT_HUGEPAGE_ALWAYS
bool "always"
help
Enabling Transparent Hugepage always, can increase the
memory footprint of applications without a guaranteed
benefit but it will work automatically for all applications.
config TRANSPARENT_HUGEPAGE_MADVISE
bool "madvise"
help
Enabling Transparent Hugepage madvise, will only provide a
performance improvement benefit to the applications using
madvise(MADV_HUGEPAGE) but it won't risk to increase the
memory footprint of applications without a guaranteed
benefit.
endchoice
#
# UP and nommu archs use km based percpu allocator
#
config NEED_PER_CPU_KM
depends on !SMP
bool
default y
config CLEANCACHE
bool "Enable cleancache driver to cache clean pages if tmem is present"
default n
help
Cleancache can be thought of as a page-granularity victim cache
for clean pages that the kernel's pageframe replacement algorithm
(PFRA) would like to keep around, but can't since there isn't enough
memory. So when the PFRA "evicts" a page, it first attempts to use
cleancache code to put the data contained in that page into
"transcendent memory", memory that is not directly accessible or
addressable by the kernel and is of unknown and possibly
time-varying size. And when a cleancache-enabled
filesystem wishes to access a page in a file on disk, it first
checks cleancache to see if it already contains it; if it does,
the page is copied into the kernel and a disk access is avoided.
When a transcendent memory driver is available (such as zcache or
Xen transcendent memory), a significant I/O reduction
may be achieved. When none is available, all cleancache calls
are reduced to a single pointer-compare-against-NULL resulting
in a negligible performance hit.
If unsure, say Y to enable cleancache
config FRONTSWAP
bool "Enable frontswap to cache swap pages if tmem is present"
depends on SWAP
default n
help
Frontswap is so named because it can be thought of as the opposite
of a "backing" store for a swap device. The data is stored into
"transcendent memory", memory that is not directly accessible or
addressable by the kernel and is of unknown and possibly
time-varying size. When space in transcendent memory is available,
a significant swap I/O reduction may be achieved. When none is
available, all frontswap calls are reduced to a single pointer-
compare-against-NULL resulting in a negligible performance hit
and swap data is stored as normal on the matching swap device.
If unsure, say Y to enable frontswap.
config CMA
bool "Contiguous Memory Allocator"
depends on HAVE_MEMBLOCK && MMU
select MIGRATION
select MEMORY_ISOLATION
help
This enables the Contiguous Memory Allocator which allows other
subsystems to allocate big physically-contiguous blocks of memory.
CMA reserves a region of memory and allows only movable pages to
be allocated from it. This way, the kernel can use the memory for
pagecache and when a subsystem requests for contiguous area, the
allocated pages are migrated away to serve the contiguous request.
If unsure, say "n".
config CMA_DEBUG
bool "CMA debug messages (DEVELOPMENT)"
depends on DEBUG_KERNEL && CMA
help
Turns on debug messages in CMA. This produces KERN_DEBUG
messages for every CMA call as well as various messages while
processing calls such as dma_alloc_from_contiguous().
This option does not affect warning and error messages.
config CMA_DEBUGFS
bool "CMA debugfs interface"
depends on CMA && DEBUG_FS
help
Turns on the DebugFS interface for CMA.
config CMA_AREAS
int "Maximum count of the CMA areas"
depends on CMA
default 7
help
CMA allows to create CMA areas for particular purpose, mainly,
used as device private area. This parameter sets the maximum
number of CMA area in the system.
If unsure, leave the default value "7".
config MEM_SOFT_DIRTY
bool "Track memory changes"
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
select PROC_PAGE_MONITOR
help
This option enables memory changes tracking by introducing a
soft-dirty bit on pte-s. This bit it set when someone writes
into a page just as regular dirty bit, but unlike the latter
it can be cleared by hands.
See Documentation/vm/soft-dirty.txt for more details.
config ZSWAP
bool "Compressed cache for swap pages (EXPERIMENTAL)"
depends on FRONTSWAP && CRYPTO=y
select CRYPTO_LZO
select ZPOOL
default n
help
A lightweight compressed cache for swap pages. It takes
pages that are in the process of being swapped out and attempts to
compress them into a dynamically allocated RAM-based memory pool.
This can result in a significant I/O reduction on swap device and,
in the case where decompressing from RAM is faster that swap device
reads, can also improve workload performance.
This is marked experimental because it is a new feature (as of
v3.11) that interacts heavily with memory reclaim. While these
interactions don't cause any known issues on simple memory setups,
they have not be fully explored on the large set of potential
configurations and workloads that exist.
config ZPOOL
tristate "Common API for compressed memory storage"
default n
help
Compressed memory storage API. This allows using either zbud or
zsmalloc.
config ZBUD
tristate "Low density storage for compressed pages"
default n
help
A special purpose allocator for storing compressed pages.
It is designed to store up to two compressed pages per physical
page. While this design limits storage density, it has simple and
deterministic reclaim properties that make it preferable to a higher
density approach when reclaim will be used.
config ZSMALLOC
tristate "Memory allocator for compressed pages"
depends on MMU
default n
help
zsmalloc is a slab-based memory allocator designed to store
compressed RAM pages. zsmalloc uses virtual memory mapping
in order to reduce fragmentation. However, this results in a
non-standard allocator interface where a handle, not a pointer, is
returned by an alloc(). This handle must be mapped in order to
access the allocated space.
config PGTABLE_MAPPING
bool "Use page table mapping to access object in zsmalloc"
depends on ZSMALLOC
help
By default, zsmalloc uses a copy-based object mapping method to
access allocations that span two pages. However, if a particular
architecture (ex, ARM) performs VM mapping faster than copying,
then you should select this. This causes zsmalloc to use page table
mapping rather than copying for object mapping.
You can check speed with zsmalloc benchmark:
https://github.com/spartacus06/zsmapbench
config ZSMALLOC_STAT
bool "Export zsmalloc statistics"
depends on ZSMALLOC
select DEBUG_FS
help
This option enables code in the zsmalloc to collect various
statistics about whats happening in zsmalloc and exports that
information to userspace via debugfs.
If unsure, say N.
config GENERIC_EARLY_IOREMAP
bool
config MAX_STACK_SIZE_MB
int "Maximum user stack size for 32-bit processes (MB)"
default 80
range 8 256 if METAG
range 8 2048
depends on STACK_GROWSUP && (!64BIT || COMPAT)
help
This is the maximum stack size in Megabytes in the VM layout of 32-bit
user processes when the stack grows upwards (currently only on parisc
and metag arch). The stack will be located at the highest memory
address minus the given value, unless the RLIMIT_STACK hard limit is
changed to a smaller value in which case that is used.
A sane initial value is 80 MB.
# For architectures that support deferred memory initialisation
config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
bool
config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kswapd"
default n
depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
depends on MEMORY_HOTPLUG
help
Ordinarily all struct pages are initialised during early boot in a
single thread. On very large machines this can take a considerable
amount of time. If this option is set, large machines will bring up
a subset of memmap at boot and then initialise the rest in parallel
when kswapd starts. This has a potential performance impact on
processes running early in the lifetime of the systemm until kswapd
finishes the initialisation.
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/Kconfig.debug
config PAGE_EXTENSION
bool "Extend memmap on extra space for more information on page"
---help---
Extend memmap on extra space for more information on page. This
could be used for debugging features that need to insert extra
field for every page. This extension enables us to save memory
by not allocating this extra memory according to boottime
configuration.
config DEBUG_PAGEALLOC
bool "Debug page memory allocations"
depends on DEBUG_KERNEL
depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
depends on !KMEMCHECK
select PAGE_EXTENSION
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
This results in a large slowdown, but helps to find certain types
of memory corruption.
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
fill the pages with poison patterns after free_pages() and verify
the patterns before alloc_pages(). Additionally,
this option cannot be enabled in combination with hibernation as
that would result in incorrect warnings of memory corruption after
a resume because free pages are not saved to the suspend image.
config PAGE_POISONING
bool
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/kmemcheck.c
#include <linux/gfp.h>
#include <linux/mm_types.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include "slab.h"
#include <linux/kmemcheck.h>
void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
{
struct page *shadow;
int pages;
int i;
pages = 1 << order;
/*
* With kmemcheck enabled, we need to allocate a memory area for the
* shadow bits as well.
*/
shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
if (!shadow) {
if (printk_ratelimit())
printk(KERN_ERR "kmemcheck: failed to allocate "
"shadow bitmap\n");
return;
}
for(i = 0; i < pages; ++i)
page[i].shadow = page_address(&shadow[i]);
/*
* Mark it as non-present for the MMU so that our accesses to
* this memory will trigger a page fault and let us analyze
* the memory accesses.
*/
kmemcheck_hide_pages(page, pages);
}
void kmemcheck_free_shadow(struct page *page, int order)
{
struct page *shadow;
int pages;
int i;
if (!kmemcheck_page_is_tracked(page))
return;
pages = 1 << order;
kmemcheck_show_pages(page, pages);
shadow = virt_to_page(page[0].shadow);
for(i = 0; i < pages; ++i)
page[i].shadow = NULL;
__free_pages(shadow, order);
}
void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
size_t size)
{
/*
* Has already been memset(), which initializes the shadow for us
* as well.
*/
if (gfpflags & __GFP_ZERO)
return;
/* No need to initialize the shadow of a non-tracked slab. */
if (s->flags & SLAB_NOTRACK)
return;
if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
/*
* Allow notracked objects to be allocated from
* tracked caches. Note however that these objects
* will still get page faults on access, they just
* won't ever be flagged as uninitialized. If page
* faults are not acceptable, the slab cache itself
* should be marked NOTRACK.
*/
kmemcheck_mark_initialized(object, size);
} else if (!s->ctor) {
/*
* New objects should be marked uninitialized before
* they're returned to the called.
*/
kmemcheck_mark_uninitialized(object, size);
}
}
void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
{
/* TODO: RCU freeing is unsupported for now; hide false positives. */
if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU))
kmemcheck_mark_freed(object, size);
}
void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
gfp_t gfpflags)
{
int pages;
if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
return;
pages = 1 << order;
/*
* NOTE: We choose to track GFP_ZERO pages too; in fact, they
* can become uninitialized by copying uninitialized memory
* into them.
*/
/* XXX: Can use zone->node for node? */
kmemcheck_alloc_shadow(page, order, gfpflags, -1);
if (gfpflags & __GFP_ZERO)
kmemcheck_mark_initialized_pages(page, pages);
else
kmemcheck_mark_uninitialized_pages(page, pages);
}
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/kmemleak-test.c
/*
* mm/kmemleak-test.c
*
* Copyright (C) 2008 ARM Limited
* Written by Catalin Marinas <catalin.marinas@arm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define pr_fmt(fmt) "kmemleak: " fmt
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/fdtable.h>
#include <linux/kmemleak.h>
struct test_node {
long header[25];
struct list_head list;
long footer[25];
};
static LIST_HEAD(test_list);
static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
/*
* Some very simple testing. This function needs to be extended for
* proper testing.
*/
static int __init kmemleak_test_init(void)
{
struct test_node *elem;
int i;
printk(KERN_INFO "Kmemleak testing\n");
/* make some orphan objects */
pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
#ifndef CONFIG_MODULES
pr_info("kmem_cache_alloc(files_cachep) = %p\n",
kmem_cache_alloc(files_cachep, GFP_KERNEL));
pr_info("kmem_cache_alloc(files_cachep) = %p\n",
kmem_cache_alloc(files_cachep, GFP_KERNEL));
#endif
pr_info("vmalloc(64) = %p\n", vmalloc(64));
pr_info("vmalloc(64) = %p\n", vmalloc(64));
pr_info("vmalloc(64) = %p\n", vmalloc(64));
pr_info("vmalloc(64) = %p\n", vmalloc(64));
pr_info("vmalloc(64) = %p\n", vmalloc(64));
/*
* Add elements to a list. They should only appear as orphan
* after the module is removed.
*/
for (i = 0; i < 10; i++) {
elem = kzalloc(sizeof(*elem), GFP_KERNEL);
pr_info("kzalloc(sizeof(*elem)) = %p\n", elem);
if (!elem)
return -ENOMEM;
INIT_LIST_HEAD(&elem->list);
list_add_tail(&elem->list, &test_list);
}
for_each_possible_cpu(i) {
per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
pr_info("kmalloc(129) = %p\n",
per_cpu(kmemleak_test_pointer, i));
}
return 0;
}
module_init(kmemleak_test_init);
static void __exit kmemleak_test_exit(void)
{
struct test_node *elem, *tmp;
/*
* Remove the list elements without actually freeing the
* memory.
*/
list_for_each_entry_safe(elem, tmp, &test_list, list)
list_del(&elem->list);
}
module_exit(kmemleak_test_exit);
MODULE_LICENSE("GPL");
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/kmemleak.c
/*
* mm/kmemleak.c
*
* Copyright (C) 2008 ARM Limited
* Written by Catalin Marinas <catalin.marinas@arm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
* For more information on the algorithm and kmemleak usage, please see
* Documentation/kmemleak.txt.
*
* Notes on locking
* ----------------
*
* The following locks and mutexes are used by kmemleak:
*
* - kmemleak_lock (rwlock): protects the object_list modifications and
* accesses to the object_tree_root. The object_list is the main list
* holding the metadata (struct kmemleak_object) for the allocated memory
* blocks. The object_tree_root is a red black tree used to look-up
* metadata based on a pointer to the corresponding memory block. The
* kmemleak_object structures are added to the object_list and
* object_tree_root in the create_object() function called from the
* kmemleak_alloc() callback and removed in delete_object() called from the
* kmemleak_free() callback
* - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
* the metadata (e.g. count) are protected by this lock. Note that some
* members of this structure may be protected by other means (atomic or
* kmemleak_lock). This lock is also held when scanning the corresponding
* memory block to avoid the kernel freeing it via the kmemleak_free()
* callback. This is less heavyweight than holding a global lock like
* kmemleak_lock during scanning
* - scan_mutex (mutex): ensures that only one thread may scan the memory for
* unreferenced objects at a time. The gray_list contains the objects which
* are already referenced or marked as false positives and need to be
* scanned. This list is only modified during a scanning episode when the
* scan_mutex is held. At the end of a scan, the gray_list is always empty.
* Note that the kmemleak_object.use_count is incremented when an object is
* added to the gray_list and therefore cannot be freed. This mutex also
* prevents multiple users of the "kmemleak" debugfs file together with
* modifications to the memory scanning parameters including the scan_thread
* pointer
*
* Locks and mutexes are acquired/nested in the following order:
*
* scan_mutex [-> object->lock] -> kmemleak_lock -> other_object->lock (SINGLE_DEPTH_NESTING)
*
* No kmemleak_lock and object->lock nesting is allowed outside scan_mutex
* regions.
*
* The kmemleak_object structures have a use_count incremented or decremented
* using the get_object()/put_object() functions. When the use_count becomes
* 0, this count can no longer be incremented and put_object() schedules the
* kmemleak_object freeing via an RCU callback. All calls to the get_object()
* function must be protected by rcu_read_lock() to avoid accessing a freed
* structure.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/jiffies.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/kthread.h>
#include <linux/rbtree.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/stacktrace.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mmzone.h>
#include <linux/slab.h>
#include <linux/thread_info.h>
#include <linux/err.h>
#include <linux/uaccess.h>
#include <linux/string.h>
#include <linux/nodemask.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/crc32.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <linux/atomic.h>
#include <linux/kasan.h>
#include <linux/kmemcheck.h>
#include <linux/kmemleak.h>
#include <linux/memory_hotplug.h>
/*
* Kmemleak configuration and common defines.
*/
#define MAX_TRACE 16 /* stack trace length */
#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
#define SECS_FIRST_SCAN 60 /* delay before the first scan */
#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
#define BYTES_PER_POINTER sizeof(void *)
/* GFP bitmask for kmemleak internal allocations */
#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \
__GFP_NOACCOUNT)) | \
__GFP_NORETRY | __GFP_NOMEMALLOC | \
__GFP_NOWARN)
/* scanning area inside a memory block */
struct kmemleak_scan_area {
struct hlist_node node;
unsigned long start;
size_t size;
};
#define KMEMLEAK_GREY 0
#define KMEMLEAK_BLACK -1
/*
* Structure holding the metadata for each allocated memory block.
* Modifications to such objects should be made while holding the
* object->lock. Insertions or deletions from object_list, gray_list or
* rb_node are already protected by the corresponding locks or mutex (see
* the notes on locking above). These objects are reference-counted
* (use_count) and freed using the RCU mechanism.
*/
struct kmemleak_object {
spinlock_t lock;
unsigned long flags; /* object status flags */
struct list_head object_list;
struct list_head gray_list;
struct rb_node rb_node;
struct rcu_head rcu; /* object_list lockless traversal */
/* object usage count; object freed when use_count == 0 */
atomic_t use_count;
unsigned long pointer;
size_t size;
/* minimum number of a pointers found before it is considered leak */
int min_count;
/* the total number of pointers found pointing to this object */
int count;
/* checksum for detecting modified objects */
u32 checksum;
/* memory ranges to be scanned inside an object (empty for all) */
struct hlist_head area_list;
unsigned long trace[MAX_TRACE];
unsigned int trace_len;
unsigned long jiffies; /* creation timestamp */
pid_t pid; /* pid of the current task */
char comm[TASK_COMM_LEN]; /* executable name */
};
/* flag representing the memory block allocation status */
#define OBJECT_ALLOCATED (1 << 0)
/* flag set after the first reporting of an unreference object */
#define OBJECT_REPORTED (1 << 1)
/* flag set to not scan the object */
#define OBJECT_NO_SCAN (1 << 2)
/* number of bytes to print per line; must be 16 or 32 */
#define HEX_ROW_SIZE 16
/* number of bytes to print at a time (1, 2, 4, 8) */
#define HEX_GROUP_SIZE 1
/* include ASCII after the hex output */
#define HEX_ASCII 1
/* max number of lines to be printed */
#define HEX_MAX_LINES 2
/* the list of all allocated objects */
static LIST_HEAD(object_list);
/* the list of gray-colored objects (see color_gray comment below) */
static LIST_HEAD(gray_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
/* rw_lock protecting the access to object_list and object_tree_root */
static DEFINE_RWLOCK(kmemleak_lock);
/* allocation caches for kmemleak internal data */
static struct kmem_cache *object_cache;
static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
static int kmemleak_enabled;
/* same as above but only for the kmemleak_free() callback */
static int kmemleak_free_enabled;
/* set in the late_initcall if there were no errors */
static int kmemleak_initialized;
/* enables or disables early logging of the memory operations */
static int kmemleak_early_log = 1;
/* set if a kmemleak warning was issued */
static int kmemleak_warning;
/* set if a fatal kmemleak error has occurred */
static int kmemleak_error;
/* minimum and maximum address that may be valid pointers */
static unsigned long min_addr = ULONG_MAX;
static unsigned long max_addr;
static struct task_struct *scan_thread;
/* used to avoid reporting of recently allocated objects */
static unsigned long jiffies_min_age;
static unsigned long jiffies_last_scan;
/* delay between automatic memory scannings */
static signed long jiffies_scan_wait;
/* enables or disables the task stacks scanning */
static int kmemleak_stack_scan = 1;
/* protects the memory scanning, parameters and debug/kmemleak file access */
static DEFINE_MUTEX(scan_mutex);
/* setting kmemleak=on, will set this var, skipping the disable */
static int kmemleak_skip_disable;
/* If there are leaks that can be reported */
static bool kmemleak_found_leaks;
/*
* Early object allocation/freeing logging. Kmemleak is initialized after the
* kernel allocator. However, both the kernel allocator and kmemleak may
* allocate memory blocks which need to be tracked. Kmemleak defines an
* arbitrary buffer to hold the allocation/freeing information before it is
* fully initialized.
*/
/* kmemleak operation type for early logging */
enum {
KMEMLEAK_ALLOC,
KMEMLEAK_ALLOC_PERCPU,
KMEMLEAK_FREE,
KMEMLEAK_FREE_PART,
KMEMLEAK_FREE_PERCPU,
KMEMLEAK_NOT_LEAK,
KMEMLEAK_IGNORE,
KMEMLEAK_SCAN_AREA,
KMEMLEAK_NO_SCAN
};
/*
* Structure holding the information passed to kmemleak callbacks during the
* early logging.
*/
struct early_log {
int op_type; /* kmemleak operation type */
const void *ptr; /* allocated/freed memory block */
size_t size; /* memory block size */
int min_count; /* minimum reference count */
unsigned long trace[MAX_TRACE]; /* stack trace */
unsigned int trace_len; /* stack trace length */
};
/* early logging buffer and current position */
static struct early_log
early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
static int crt_early_log __initdata;
static void kmemleak_disable(void);
/*
* Print a warning and dump the stack trace.
*/
#define kmemleak_warn(x...) do { \
pr_warning(x); \
dump_stack(); \
kmemleak_warning = 1; \
} while (0)
/*
* Macro invoked when a serious kmemleak condition occurred and cannot be
* recovered from. Kmemleak will be disabled and further allocation/freeing
* tracing no longer available.
*/
#define kmemleak_stop(x...) do { \
kmemleak_warn(x); \
kmemleak_disable(); \
} while (0)
/*
* Printing of the objects hex dump to the seq file. The number of lines to be
* printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
* actual number of printed bytes depends on HEX_ROW_SIZE. It must be called
* with the object->lock held.
*/
static void hex_dump_object(struct seq_file *seq,
struct kmemleak_object *object)
{
const u8 *ptr = (const u8 *)object->pointer;
int i, len, remaining;
unsigned char linebuf[HEX_ROW_SIZE * 5];
/* limit the number of lines to HEX_MAX_LINES */
remaining = len =
min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
seq_printf(seq, " hex dump (first %d bytes):\n", len);
for (i = 0; i < len; i += HEX_ROW_SIZE) {
int linelen = min(remaining, HEX_ROW_SIZE);
remaining -= HEX_ROW_SIZE;
hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
HEX_ASCII);
seq_printf(seq, " %s\n", linebuf);
}
}
/*
* Object colors, encoded with count and min_count:
* - white - orphan object, not enough references to it (count < min_count)
* - gray - not orphan, not marked as false positive (min_count == 0) or
* sufficient references to it (count >= min_count)
* - black - ignore, it doesn't contain references (e.g. text section)
* (min_count == -1). No function defined for this color.
* Newly created objects don't have any color assigned (object->count == -1)
* before the next memory scan when they become white.
*/
static bool color_white(const struct kmemleak_object *object)
{
return object->count != KMEMLEAK_BLACK &&
object->count < object->min_count;
}
static bool color_gray(const struct kmemleak_object *object)
{
return object->min_count != KMEMLEAK_BLACK &&
object->count >= object->min_count;
}
/*
* Objects are considered unreferenced only if their color is white, they have
* not be deleted and have a minimum age to avoid false positives caused by
* pointers temporarily stored in CPU registers.
*/
static bool unreferenced_object(struct kmemleak_object *object)
{
return (color_white(object) && object->flags & OBJECT_ALLOCATED) &&
time_before_eq(object->jiffies + jiffies_min_age,
jiffies_last_scan);
}
/*
* Printing of the unreferenced objects information to the seq file. The
* print_unreferenced function must be called with the object->lock held.
*/
static void print_unreferenced(struct seq_file *seq,
struct kmemleak_object *object)
{
int i;
unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
object->pointer, object->size);
seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
object->comm, object->pid, object->jiffies,
msecs_age / 1000, msecs_age % 1000);
hex_dump_object(seq, object);
seq_printf(seq, " backtrace:\n");
for (i = 0; i < object->trace_len; i++) {
void *ptr = (void *)object->trace[i];
seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
}
}
/*
* Print the kmemleak_object information. This function is used mainly for
* debugging special cases when kmemleak operations. It must be called with
* the object->lock held.
*/
static void dump_object_info(struct kmemleak_object *object)
{
struct stack_trace trace;
trace.nr_entries = object->trace_len;
trace.entries = object->trace;
pr_notice("Object 0x%08lx (size %zu):\n",
object->pointer, object->size);
pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
object->comm, object->pid, object->jiffies);
pr_notice(" min_count = %d\n", object->min_count);
pr_notice(" count = %d\n", object->count);
pr_notice(" flags = 0x%lx\n", object->flags);
pr_notice(" checksum = %u\n", object->checksum);
pr_notice(" backtrace:\n");
print_stack_trace(&trace, 4);
}
/*
* Look-up a memory block metadata (kmemleak_object) in the object search
* tree based on a pointer value. If alias is 0, only values pointing to the
* beginning of the memory block are allowed. The kmemleak_lock must be held
* when calling this function.
*/
static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
{
struct rb_node *rb = object_tree_root.rb_node;
while (rb) {
struct kmemleak_object *object =
rb_entry(rb, struct kmemleak_object, rb_node);
if (ptr < object->pointer)
rb = object->rb_node.rb_left;
else if (object->pointer + object->size <= ptr)
rb = object->rb_node.rb_right;
else if (object->pointer == ptr || alias)
return object;
else {
kmemleak_warn("Found object by alias at 0x%08lx\n",
ptr);
dump_object_info(object);
break;
}
}
return NULL;
}
/*
* Increment the object use_count. Return 1 if successful or 0 otherwise. Note
* that once an object's use_count reached 0, the RCU freeing was already
* registered and the object should no longer be used. This function must be
* called under the protection of rcu_read_lock().
*/
static int get_object(struct kmemleak_object *object)
{
return atomic_inc_not_zero(&object->use_count);
}
/*
* RCU callback to free a kmemleak_object.
*/
static void free_object_rcu(struct rcu_head *rcu)
{
struct hlist_node *tmp;
struct kmemleak_scan_area *area;
struct kmemleak_object *object =
container_of(rcu, struct kmemleak_object, rcu);
/*
* Once use_count is 0 (guaranteed by put_object), there is no other
* code accessing this object, hence no need for locking.
*/
hlist_for_each_entry_safe(area, tmp, &object->area_list, node) {
hlist_del(&area->node);
kmem_cache_free(scan_area_cache, area);
}
kmem_cache_free(object_cache, object);
}
/*
* Decrement the object use_count. Once the count is 0, free the object using
* an RCU callback. Since put_object() may be called via the kmemleak_free() ->
* delete_object() path, the delayed RCU freeing ensures that there is no
* recursive call to the kernel allocator. Lock-less RCU object_list traversal
* is also possible.
*/
static void put_object(struct kmemleak_object *object)
{
if (!atomic_dec_and_test(&object->use_count))
return;
/* should only get here after delete_object was called */
WARN_ON(object->flags & OBJECT_ALLOCATED);
call_rcu(&object->rcu, free_object_rcu);
}
/*
* Look up an object in the object search tree and increase its use_count.
*/
static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
{
unsigned long flags;
struct kmemleak_object *object = NULL;
rcu_read_lock();
read_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
read_unlock_irqrestore(&kmemleak_lock, flags);
/* check whether the object is still available */
if (object && !get_object(object))
object = NULL;
rcu_read_unlock();
return object;
}
/*
* Look up an object in the object search tree and remove it from both
* object_tree_root and object_list. The returned object's use_count should be
* at least 1, as initially set by create_object().
*/
static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias)
{
unsigned long flags;
struct kmemleak_object *object;
write_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
if (object) {
rb_erase(&object->rb_node, &object_tree_root);
list_del_rcu(&object->object_list);
}
write_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
/*
* Save stack trace to the given array of MAX_TRACE size.
*/
static int __save_stack_trace(unsigned long *trace)
{
struct stack_trace stack_trace;
stack_trace.max_entries = MAX_TRACE;
stack_trace.nr_entries = 0;
stack_trace.entries = trace;
stack_trace.skip = 2;
save_stack_trace(&stack_trace);
return stack_trace.nr_entries;
}
/*
* Create the metadata (struct kmemleak_object) corresponding to an allocated
* memory block and add it to the object_list and object_tree_root.
*/
static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
int min_count, gfp_t gfp)
{
unsigned long flags;
struct kmemleak_object *object, *parent;
struct rb_node **link, *rb_parent;
object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
if (!object) {
pr_warning("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
return NULL;
}
INIT_LIST_HEAD(&object->object_list);
INIT_LIST_HEAD(&object->gray_list);
INIT_HLIST_HEAD(&object->area_list);
spin_lock_init(&object->lock);
atomic_set(&object->use_count, 1);
object->flags = OBJECT_ALLOCATED;
object->pointer = ptr;
object->size = size;
object->min_count = min_count;
object->count = 0; /* white color initially */
object->jiffies = jiffies;
object->checksum = 0;
/* task information */
if (in_irq()) {
object->pid = 0;
strncpy(object->comm, "hardirq", sizeof(object->comm));
} else if (in_softirq()) {
object->pid = 0;
strncpy(object->comm, "softirq", sizeof(object->comm));
} else {
object->pid = current->pid;
/*
* There is a small chance of a race with set_task_comm(),
* however using get_task_comm() here may cause locking
* dependency issues with current->alloc_lock. In the worst
* case, the command line is not correct.
*/
strncpy(object->comm, current->comm, sizeof(object->comm));
}
/* kernel backtrace */
object->trace_len = __save_stack_trace(object->trace);
write_lock_irqsave(&kmemleak_lock, flags);
min_addr = min(min_addr, ptr);
max_addr = max(max_addr, ptr + size);
link = &object_tree_root.rb_node;
rb_parent = NULL;
while (*link) {
rb_parent = *link;
parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
if (ptr + size <= parent->pointer)
link = &parent->rb_node.rb_left;
else if (parent->pointer + parent->size <= ptr)
link = &parent->rb_node.rb_right;
else {
kmemleak_stop("Cannot insert 0x%lx into the object "
"search tree (overlaps existing)\n",
ptr);
/*
* No need for parent->lock here since "parent" cannot
* be freed while the kmemleak_lock is held.
*/
dump_object_info(parent);
kmem_cache_free(object_cache, object);
object = NULL;
goto out;
}
}
rb_link_node(&object->rb_node, rb_parent, link);
rb_insert_color(&object->rb_node, &object_tree_root);
list_add_tail_rcu(&object->object_list, &object_list);
out:
write_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
/*
* Mark the object as not allocated and schedule RCU freeing via put_object().
*/
static void __delete_object(struct kmemleak_object *object)
{
unsigned long flags;
WARN_ON(!(object->flags & OBJECT_ALLOCATED));
WARN_ON(atomic_read(&object->use_count) < 1);
/*
* Locking here also ensures that the corresponding memory block
* cannot be freed when it is being scanned.
*/
spin_lock_irqsave(&object->lock, flags);
object->flags &= ~OBJECT_ALLOCATED;
spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
/*
* Look up the metadata (struct kmemleak_object) corresponding to ptr and
* delete it.
*/
static void delete_object_full(unsigned long ptr)
{
struct kmemleak_object *object;
object = find_and_remove_object(ptr, 0);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Freeing unknown object at 0x%08lx\n",
ptr);
#endif
return;
}
__delete_object(object);
}
/*
* Look up the metadata (struct kmemleak_object) corresponding to ptr and
* delete it. If the memory block is partially freed, the function may create
* additional metadata for the remaining parts of the block.
*/
static void delete_object_part(unsigned long ptr, size_t size)
{
struct kmemleak_object *object;
unsigned long start, end;
object = find_and_remove_object(ptr, 1);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Partially freeing unknown object at 0x%08lx "
"(size %zu)\n", ptr, size);
#endif
return;
}
/*
* Create one or two objects that may result from the memory block
* split. Note that partial freeing is only done by free_bootmem() and
* this happens before kmemleak_init() is called. The path below is
* only executed during early log recording in kmemleak_init(), so
* GFP_KERNEL is enough.
*/
start = object->pointer;
end = object->pointer + object->size;
if (ptr > start)
create_object(start, ptr - start, object->min_count,
GFP_KERNEL);
if (ptr + size < end)
create_object(ptr + size, end - ptr - size, object->min_count,
GFP_KERNEL);
__delete_object(object);
}
static void __paint_it(struct kmemleak_object *object, int color)
{
object->min_count = color;
if (color == KMEMLEAK_BLACK)
object->flags |= OBJECT_NO_SCAN;
}
static void paint_it(struct kmemleak_object *object, int color)
{
unsigned long flags;
spin_lock_irqsave(&object->lock, flags);
__paint_it(object, color);
spin_unlock_irqrestore(&object->lock, flags);
}
static void paint_ptr(unsigned long ptr, int color)
{
struct kmemleak_object *object;
object = find_and_get_object(ptr, 0);
if (!object) {
kmemleak_warn("Trying to color unknown object "
"at 0x%08lx as %s\n", ptr,
(color == KMEMLEAK_GREY) ? "Grey" :
(color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
return;
}
paint_it(object, color);
put_object(object);
}
/*
* Mark an object permanently as gray-colored so that it can no longer be
* reported as a leak. This is used in general to mark a false positive.
*/
static void make_gray_object(unsigned long ptr)
{
paint_ptr(ptr, KMEMLEAK_GREY);
}
/*
* Mark the object as black-colored so that it is ignored from scans and
* reporting.
*/
static void make_black_object(unsigned long ptr)
{
paint_ptr(ptr, KMEMLEAK_BLACK);
}
/*
* Add a scanning area to the object. If at least one such area is added,
* kmemleak will only scan these ranges rather than the whole memory block.
*/
static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
{
unsigned long flags;
struct kmemleak_object *object;
struct kmemleak_scan_area *area;
object = find_and_get_object(ptr, 1);
if (!object) {
kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
ptr);
return;
}
area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
if (!area) {
pr_warning("Cannot allocate a scan area\n");
goto out;
}
spin_lock_irqsave(&object->lock, flags);
if (size == SIZE_MAX) {
size = object->pointer + object->size - ptr;
} else if (ptr + size > object->pointer + object->size) {
kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
dump_object_info(object);
kmem_cache_free(scan_area_cache, area);
goto out_unlock;
}
INIT_HLIST_NODE(&area->node);
area->start = ptr;
area->size = size;
hlist_add_head(&area->node, &object->area_list);
out_unlock:
spin_unlock_irqrestore(&object->lock, flags);
out:
put_object(object);
}
/*
* Set the OBJECT_NO_SCAN flag for the object corresponding to the give
* pointer. Such object will not be scanned by kmemleak but references to it
* are searched.
*/
static void object_no_scan(unsigned long ptr)
{
unsigned long flags;
struct kmemleak_object *object;
object = find_and_get_object(ptr, 0);
if (!object) {
kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr);
return;
}
spin_lock_irqsave(&object->lock, flags);
object->flags |= OBJECT_NO_SCAN;
spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
/*
* Log an early kmemleak_* call to the early_log buffer. These calls will be
* processed later once kmemleak is fully initialized.
*/
static void __init log_early(int op_type, const void *ptr, size_t size,
int min_count)
{
unsigned long flags;
struct early_log *log;
if (kmemleak_error) {
/* kmemleak stopped recording, just count the requests */
crt_early_log++;
return;
}
if (crt_early_log >= ARRAY_SIZE(early_log)) {
kmemleak_disable();
return;
}
/*
* There is no need for locking since the kernel is still in UP mode
* at this stage. Disabling the IRQs is enough.
*/
local_irq_save(flags);
log = &early_log[crt_early_log];
log->op_type = op_type;
log->ptr = ptr;
log->size = size;
log->min_count = min_count;
log->trace_len = __save_stack_trace(log->trace);
crt_early_log++;
local_irq_restore(flags);
}
/*
* Log an early allocated block and populate the stack trace.
*/
static void early_alloc(struct early_log *log)
{
struct kmemleak_object *object;
unsigned long flags;
int i;
if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr))
return;
/*
* RCU locking needed to ensure object is not freed via put_object().
*/
rcu_read_lock();
object = create_object((unsigned long)log->ptr, log->size,
log->min_count, GFP_ATOMIC);
if (!object)
goto out;
spin_lock_irqsave(&object->lock, flags);
for (i = 0; i < log->trace_len; i++)
object->trace[i] = log->trace[i];
object->trace_len = log->trace_len;
spin_unlock_irqrestore(&object->lock, flags);
out:
rcu_read_unlock();
}
/*
* Log an early allocated block and populate the stack trace.
*/
static void early_alloc_percpu(struct early_log *log)
{
unsigned int cpu;
const void __percpu *ptr = log->ptr;
for_each_possible_cpu(cpu) {
log->ptr = per_cpu_ptr(ptr, cpu);
early_alloc(log);
}
}
/**
* kmemleak_alloc - register a newly allocated object
* @ptr: pointer to beginning of the object
* @size: size of the object
* @min_count: minimum number of references to this object. If during memory
* scanning a number of references less than @min_count is found,
* the object is reported as a memory leak. If @min_count is 0,
* the object is never reported as a leak. If @min_count is -1,
* the object is ignored (not scanned and not reported as a leak)
* @gfp: kmalloc() flags used for kmemleak internal memory allocations
*
* This function is called from the kernel allocators when a new object
* (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.).
*/
void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
gfp_t gfp)
{
pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
create_object((unsigned long)ptr, size, min_count, gfp);
else if (kmemleak_early_log)
log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc);
/**
* kmemleak_alloc_percpu - register a newly allocated __percpu object
* @ptr: __percpu pointer to beginning of the object
* @size: size of the object
* @gfp: flags used for kmemleak internal memory allocations
*
* This function is called from the kernel percpu allocator when a new object
* (memory block) is allocated (alloc_percpu).
*/
void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
gfp_t gfp)
{
unsigned int cpu;
pr_debug("%s(0x%p, %zu)\n", __func__, ptr, size);
/*
* Percpu allocations are only scanned and not reported as leaks
* (min_count is set to 0).
*/
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
for_each_possible_cpu(cpu)
create_object((unsigned long)per_cpu_ptr(ptr, cpu),
size, 0, gfp);
else if (kmemleak_early_log)
log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
/**
* kmemleak_free - unregister a previously registered object
* @ptr: pointer to beginning of the object
*
* This function is called from the kernel allocators when an object (memory
* block) is freed (kmem_cache_free, kfree, vfree etc.).
*/
void __ref kmemleak_free(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
delete_object_full((unsigned long)ptr);
else if (kmemleak_early_log)
log_early(KMEMLEAK_FREE, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free);
/**
* kmemleak_free_part - partially unregister a previously registered object
* @ptr: pointer to the beginning or inside the object. This also
* represents the start of the range to be freed
* @size: size to be unregistered
*
* This function is called when only a part of a memory block is freed
* (usually from the bootmem allocator).
*/
void __ref kmemleak_free_part(const void *ptr, size_t size)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
delete_object_part((unsigned long)ptr, size);
else if (kmemleak_early_log)
log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
/**
* kmemleak_free_percpu - unregister a previously registered __percpu object
* @ptr: __percpu pointer to beginning of the object
*
* This function is called from the kernel percpu allocator when an object
* (memory block) is freed (free_percpu).
*/
void __ref kmemleak_free_percpu(const void __percpu *ptr)
{
unsigned int cpu;
pr_debug("%s(0x%p)\n", __func__, ptr);
if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
for_each_possible_cpu(cpu)
delete_object_full((unsigned long)per_cpu_ptr(ptr,
cpu));
else if (kmemleak_early_log)
log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
/**
* kmemleak_update_trace - update object allocation stack trace
* @ptr: pointer to beginning of the object
*
* Override the object allocation stack trace for cases where the actual
* allocation place is not always useful.
*/
void __ref kmemleak_update_trace(const void *ptr)
{
struct kmemleak_object *object;
unsigned long flags;
pr_debug("%s(0x%p)\n", __func__, ptr);
if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr))
return;
object = find_and_get_object((unsigned long)ptr, 1);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Updating stack trace for unknown object at %p\n",
ptr);
#endif
return;
}
spin_lock_irqsave(&object->lock, flags);
object->trace_len = __save_stack_trace(object->trace);
spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
EXPORT_SYMBOL(kmemleak_update_trace);
/**
* kmemleak_not_leak - mark an allocated object as false positive
* @ptr: pointer to beginning of the object
*
* Calling this function on an object will cause the memory block to no longer
* be reported as leak and always be scanned.
*/
void __ref kmemleak_not_leak(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_gray_object((unsigned long)ptr);
else if (kmemleak_early_log)
log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_not_leak);
/**
* kmemleak_ignore - ignore an allocated object
* @ptr: pointer to beginning of the object
*
* Calling this function on an object will cause the memory block to be
* ignored (not scanned and not reported as a leak). This is usually done when
* it is known that the corresponding block is not a leak and does not contain
* any references to other allocated memory blocks.
*/
void __ref kmemleak_ignore(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_black_object((unsigned long)ptr);
else if (kmemleak_early_log)
log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_ignore);
/**
* kmemleak_scan_area - limit the range to be scanned in an allocated object
* @ptr: pointer to beginning or inside the object. This also
* represents the start of the scan area
* @size: size of the scan area
* @gfp: kmalloc() flags used for kmemleak internal memory allocations
*
* This function is used when it is known that only certain parts of an object
* contain references to other objects. Kmemleak will only scan these areas
* reducing the number false negatives.
*/
void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
if (kmemleak_enabled && ptr && size && !IS_ERR(ptr))
add_scan_area((unsigned long)ptr, size, gfp);
else if (kmemleak_early_log)
log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
}
EXPORT_SYMBOL(kmemleak_scan_area);
/**
* kmemleak_no_scan - do not scan an allocated object
* @ptr: pointer to beginning of the object
*
* This function notifies kmemleak not to scan the given memory block. Useful
* in situations where it is known that the given object does not contain any
* references to other objects. Kmemleak will not scan such objects reducing
* the number of false negatives.
*/
void __ref kmemleak_no_scan(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
object_no_scan((unsigned long)ptr);
else if (kmemleak_early_log)
log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_no_scan);
/*
* Update an object's checksum and return true if it was modified.
*/
static bool update_checksum(struct kmemleak_object *object)
{
u32 old_csum = object->checksum;
if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
return false;
kasan_disable_current();
object->checksum = crc32(0, (void *)object->pointer, object->size);
kasan_enable_current();
return object->checksum != old_csum;
}
/*
* Memory scanning is a long process and it needs to be interruptable. This
* function checks whether such interrupt condition occurred.
*/
static int scan_should_stop(void)
{
if (!kmemleak_enabled)
return 1;
/*
* This function may be called from either process or kthread context,
* hence the need to check for both stop conditions.
*/
if (current->mm)
return signal_pending(current);
else
return kthread_should_stop();
return 0;
}
/*
* Scan a memory block (exclusive range) for valid pointers and add those
* found to the gray list.
*/
static void scan_block(void *_start, void *_end,
struct kmemleak_object *scanned)
{
unsigned long *ptr;
unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
unsigned long *end = _end - (BYTES_PER_POINTER - 1);
unsigned long flags;
read_lock_irqsave(&kmemleak_lock, flags);
for (ptr = start; ptr < end; ptr++) {
struct kmemleak_object *object;
unsigned long pointer;
if (scan_should_stop())
break;
/* don't scan uninitialized memory */
if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
BYTES_PER_POINTER))
continue;
kasan_disable_current();
pointer = *ptr;
kasan_enable_current();
if (pointer < min_addr || pointer >= max_addr)
continue;
/*
* No need for get_object() here since we hold kmemleak_lock.
* object->use_count cannot be dropped to 0 while the object
* is still present in object_tree_root and object_list
* (with updates protected by kmemleak_lock).
*/
object = lookup_object(pointer, 1);
if (!object)
continue;
if (object == scanned)
/* self referenced, ignore */
continue;
/*
* Avoid the lockdep recursive warning on object->lock being
* previously acquired in scan_object(). These locks are
* enclosed by scan_mutex.
*/
spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
if (!color_white(object)) {
/* non-orphan, ignored or new */
spin_unlock(&object->lock);
continue;
}
/*
* Increase the object's reference count (number of pointers
* to the memory block). If this count reaches the required
* minimum, the object's color will become gray and it will be
* added to the gray_list.
*/
object->count++;
if (color_gray(object)) {
/* put_object() called when removing from gray_list */
WARN_ON(!get_object(object));
list_add_tail(&object->gray_list, &gray_list);
}
spin_unlock(&object->lock);
}
read_unlock_irqrestore(&kmemleak_lock, flags);
}
/*
* Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency.
*/
static void scan_large_block(void *start, void *end)
{
void *next;
while (start < end) {
next = min(start + MAX_SCAN_SIZE, end);
scan_block(start, next, NULL);
start = next;
cond_resched();
}
}
/*
* Scan a memory block corresponding to a kmemleak_object. A condition is
* that object->use_count >= 1.
*/
static void scan_object(struct kmemleak_object *object)
{
struct kmemleak_scan_area *area;
unsigned long flags;
/*
* Once the object->lock is acquired, the corresponding memory block
* cannot be freed (the same lock is acquired in delete_object).
*/
spin_lock_irqsave(&object->lock, flags);
if (object->flags & OBJECT_NO_SCAN)
goto out;
if (!(object->flags & OBJECT_ALLOCATED))
/* already freed object */
goto out;
if (hlist_empty(&object->area_list)) {
void *start = (void *)object->pointer;
void *end = (void *)(object->pointer + object->size);
void *next;
do {
next = min(start + MAX_SCAN_SIZE, end);
scan_block(start, next, object);
start = next;
if (start >= end)
break;
spin_unlock_irqrestore(&object->lock, flags);
cond_resched();
spin_lock_irqsave(&object->lock, flags);
} while (object->flags & OBJECT_ALLOCATED);
} else
hlist_for_each_entry(area, &object->area_list, node)
scan_block((void *)area->start,
(void *)(area->start + area->size),
object);
out:
spin_unlock_irqrestore(&object->lock, flags);
}
/*
* Scan the objects already referenced (gray objects). More objects will be
* referenced and, if there are no memory leaks, all the objects are scanned.
*/
static void scan_gray_list(void)
{
struct kmemleak_object *object, *tmp;
/*
* The list traversal is safe for both tail additions and removals
* from inside the loop. The kmemleak objects cannot be freed from
* outside the loop because their use_count was incremented.
*/
object = list_entry(gray_list.next, typeof(*object), gray_list);
while (&object->gray_list != &gray_list) {
cond_resched();
/* may add new objects to the list */
if (!scan_should_stop())
scan_object(object);
tmp = list_entry(object->gray_list.next, typeof(*object),
gray_list);
/* remove the object from the list and release it */
list_del(&object->gray_list);
put_object(object);
object = tmp;
}
WARN_ON(!list_empty(&gray_list));
}
/*
* Scan data sections and all the referenced memory blocks allocated via the
* kernel's standard allocators. This function must be called with the
* scan_mutex held.
*/
static void kmemleak_scan(void)
{
unsigned long flags;
struct kmemleak_object *object;
int i;
int new_leaks = 0;
jiffies_last_scan = jiffies;
/* prepare the kmemleak_object's */
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
spin_lock_irqsave(&object->lock, flags);
#ifdef DEBUG
/*
* With a few exceptions there should be a maximum of
* 1 reference to any object at this point.
*/
if (atomic_read(&object->use_count) > 1) {
pr_debug("object->use_count = %d\n",
atomic_read(&object->use_count));
dump_object_info(object);
}
#endif
/* reset the reference count (whiten the object) */
object->count = 0;
if (color_gray(object) && get_object(object))
list_add_tail(&object->gray_list, &gray_list);
spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
/* data/bss scanning */
scan_large_block(_sdata, _edata);
scan_large_block(__bss_start, __bss_stop);
#ifdef CONFIG_SMP
/* per-cpu sections scanning */
for_each_possible_cpu(i)
scan_large_block(__per_cpu_start + per_cpu_offset(i),
__per_cpu_end + per_cpu_offset(i));
#endif
/*
* Struct page scanning for each node.
*/
get_online_mems();
for_each_online_node(i) {
unsigned long start_pfn = node_start_pfn(i);
unsigned long end_pfn = node_end_pfn(i);
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
struct page *page;
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
/* only scan if page is in use */
if (page_count(page) == 0)
continue;
scan_block(page, page + 1, NULL);
}
}
put_online_mems();
/*
* Scanning the task stacks (may introduce false negatives).
*/
if (kmemleak_stack_scan) {
struct task_struct *p, *g;
read_lock(&tasklist_lock);
do_each_thread(g, p) {
scan_block(task_stack_page(p), task_stack_page(p) +
THREAD_SIZE, NULL);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
}
/*
* Scan the objects already referenced from the sections scanned
* above.
*/
scan_gray_list();
/*
* Check for new or unreferenced objects modified since the previous
* scan and color them gray until the next scan.
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
spin_lock_irqsave(&object->lock, flags);
if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
&& update_checksum(object) && get_object(object)) {
/* color it gray temporarily */
object->count = object->min_count;
list_add_tail(&object->gray_list, &gray_list);
}
spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
/*
* Re-scan the gray list for modified unreferenced objects.
*/
scan_gray_list();
/*
* If scanning was stopped do not report any new unreferenced objects.
*/
if (scan_should_stop())
return;
/*
* Scanning result reporting.
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
spin_lock_irqsave(&object->lock, flags);
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
new_leaks++;
}
spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
if (new_leaks) {
kmemleak_found_leaks = true;
pr_info("%d new suspected memory leaks (see "
"/sys/kernel/debug/kmemleak)\n", new_leaks);
}
}
/*
* Thread function performing automatic memory scanning. Unreferenced objects
* at the end of a memory scan are reported but only the first time.
*/
static int kmemleak_scan_thread(void *arg)
{
static int first_run = 1;
pr_info("Automatic memory scanning thread started\n");
set_user_nice(current, 10);
/*
* Wait before the first scan to allow the system to fully initialize.
*/
if (first_run) {
first_run = 0;
ssleep(SECS_FIRST_SCAN);
}
while (!kthread_should_stop()) {
signed long timeout = jiffies_scan_wait;
mutex_lock(&scan_mutex);
kmemleak_scan();
mutex_unlock(&scan_mutex);
/* wait before the next scan */
while (timeout && !kthread_should_stop())
timeout = schedule_timeout_interruptible(timeout);
}
pr_info("Automatic memory scanning thread ended\n");
return 0;
}
/*
* Start the automatic memory scanning thread. This function must be called
* with the scan_mutex held.
*/
static void start_scan_thread(void)
{
if (scan_thread)
return;
scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
if (IS_ERR(scan_thread)) {
pr_warning("Failed to create the scan thread\n");
scan_thread = NULL;
}
}
/*
* Stop the automatic memory scanning thread. This function must be called
* with the scan_mutex held.
*/
static void stop_scan_thread(void)
{
if (scan_thread) {
kthread_stop(scan_thread);
scan_thread = NULL;
}
}
/*
* Iterate over the object_list and return the first valid object at or after
* the required position with its use_count incremented. The function triggers
* a memory scanning when the pos argument points to the first position.
*/
static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
{
struct kmemleak_object *object;
loff_t n = *pos;
int err;
err = mutex_lock_interruptible(&scan_mutex);
if (err < 0)
return ERR_PTR(err);
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
if (n-- > 0)
continue;
if (get_object(object))
goto out;
}
object = NULL;
out:
return object;
}
/*
* Return the next object in the object_list. The function decrements the
* use_count of the previous object and increases that of the next one.
*/
static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct kmemleak_object *prev_obj = v;
struct kmemleak_object *next_obj = NULL;
struct kmemleak_object *obj = prev_obj;
++(*pos);
list_for_each_entry_continue_rcu(obj, &object_list, object_list) {
if (get_object(obj)) {
next_obj = obj;
break;
}
}
put_object(prev_obj);
return next_obj;
}
/*
* Decrement the use_count of the last object required, if any.
*/
static void kmemleak_seq_stop(struct seq_file *seq, void *v)
{
if (!IS_ERR(v)) {
/*
* kmemleak_seq_start may return ERR_PTR if the scan_mutex
* waiting was interrupted, so only release it if !IS_ERR.
*/
rcu_read_unlock();
mutex_unlock(&scan_mutex);
if (v)
put_object(v);
}
}
/*
* Print the information for an unreferenced object to the seq file.
*/
static int kmemleak_seq_show(struct seq_file *seq, void *v)
{
struct kmemleak_object *object = v;
unsigned long flags;
spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
print_unreferenced(seq, object);
spin_unlock_irqrestore(&object->lock, flags);
return 0;
}
static const struct seq_operations kmemleak_seq_ops = {
.start = kmemleak_seq_start,
.next = kmemleak_seq_next,
.stop = kmemleak_seq_stop,
.show = kmemleak_seq_show,
};
static int kmemleak_open(struct inode *inode, struct file *file)
{
return seq_open(file, &kmemleak_seq_ops);
}
static int dump_str_object_info(const char *str)
{
unsigned long flags;
struct kmemleak_object *object;
unsigned long addr;
if (kstrtoul(str, 0, &addr))
return -EINVAL;
object = find_and_get_object(addr, 0);
if (!object) {
pr_info("Unknown object at 0x%08lx\n", addr);
return -EINVAL;
}
spin_lock_irqsave(&object->lock, flags);
dump_object_info(object);
spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
return 0;
}
/*
* We use grey instead of black to ensure we can do future scans on the same
* objects. If we did not do future scans these black objects could
* potentially contain references to newly allocated objects in the future and
* we'd end up with false positives.
*/
static void kmemleak_clear(void)
{
struct kmemleak_object *object;
unsigned long flags;
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) &&
unreferenced_object(object))
__paint_it(object, KMEMLEAK_GREY);
spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
kmemleak_found_leaks = false;
}
static void __kmemleak_do_cleanup(void);
/*
* File write operation to configure kmemleak at run-time. The following
* commands can be written to the /sys/kernel/debug/kmemleak file:
* off - disable kmemleak (irreversible)
* stack=on - enable the task stacks scanning
* stack=off - disable the tasks stacks scanning
* scan=on - start the automatic memory scanning thread
* scan=off - stop the automatic memory scanning thread
* scan=... - set the automatic memory scanning period in seconds (0 to
* disable it)
* scan - trigger a memory scan
* clear - mark all current reported unreferenced kmemleak objects as
* grey to ignore printing them, or free all kmemleak objects
* if kmemleak has been disabled.
* dump=... - dump information about the object found at the given address
*/
static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
size_t size, loff_t *ppos)
{
char buf[64];
int buf_size;
int ret;
buf_size = min(size, (sizeof(buf) - 1));
if (strncpy_from_user(buf, user_buf, buf_size) < 0)
return -EFAULT;
buf[buf_size] = 0;
ret = mutex_lock_interruptible(&scan_mutex);
if (ret < 0)
return ret;
if (strncmp(buf, "clear", 5) == 0) {
if (kmemleak_enabled)
kmemleak_clear();
else
__kmemleak_do_cleanup();
goto out;
}
if (!kmemleak_enabled) {
ret = -EBUSY;
goto out;
}
if (strncmp(buf, "off", 3) == 0)
kmemleak_disable();
else if (strncmp(buf, "stack=on", 8) == 0)
kmemleak_stack_scan = 1;
else if (strncmp(buf, "stack=off", 9) == 0)
kmemleak_stack_scan = 0;
else if (strncmp(buf, "scan=on", 7) == 0)
start_scan_thread();
else if (strncmp(buf, "scan=off", 8) == 0)
stop_scan_thread();
else if (strncmp(buf, "scan=", 5) == 0) {
unsigned long secs;
ret = kstrtoul(buf + 5, 0, &secs);
if (ret < 0)
goto out;
stop_scan_thread();
if (secs) {
jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
start_scan_thread();
}
} else if (strncmp(buf, "scan", 4) == 0)
kmemleak_scan();
else if (strncmp(buf, "dump=", 5) == 0)
ret = dump_str_object_info(buf + 5);
else
ret = -EINVAL;
out:
mutex_unlock(&scan_mutex);
if (ret < 0)
return ret;
/* ignore the rest of the buffer, only one command at a time */
*ppos += size;
return size;
}
static const struct file_operations kmemleak_fops = {
.owner = THIS_MODULE,
.open = kmemleak_open,
.read = seq_read,
.write = kmemleak_write,
.llseek = seq_lseek,
.release = seq_release,
};
static void __kmemleak_do_cleanup(void)
{
struct kmemleak_object *object;
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list)
delete_object_full(object->pointer);
rcu_read_unlock();
}
/*
* Stop the memory scanning thread and free the kmemleak internal objects if
* no previous scan thread (otherwise, kmemleak may still have some useful
* information on memory leaks).
*/
static void kmemleak_do_cleanup(struct work_struct *work)
{
stop_scan_thread();
/*
* Once the scan thread has stopped, it is safe to no longer track
* object freeing. Ordering of the scan thread stopping and the memory
* accesses below is guaranteed by the kthread_stop() function.
*/
kmemleak_free_enabled = 0;
if (!kmemleak_found_leaks)
__kmemleak_do_cleanup();
else
pr_info("Kmemleak disabled without freeing internal data. "
"Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n");
}
static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
/*
* Disable kmemleak. No memory allocation/freeing will be traced once this
* function is called. Disabling kmemleak is an irreversible operation.
*/
static void kmemleak_disable(void)
{
/* atomically check whether it was already invoked */
if (cmpxchg(&kmemleak_error, 0, 1))
return;
/* stop any memory operation tracing */
kmemleak_enabled = 0;
/* check whether it is too early for a kernel thread */
if (kmemleak_initialized)
schedule_work(&cleanup_work);
else
kmemleak_free_enabled = 0;
pr_info("Kernel memory leak detector disabled\n");
}
/*
* Allow boot-time kmemleak disabling (enabled by default).
*/
static int kmemleak_boot_config(char *str)
{
if (!str)
return -EINVAL;
if (strcmp(str, "off") == 0)
kmemleak_disable();
else if (strcmp(str, "on") == 0)
kmemleak_skip_disable = 1;
else
return -EINVAL;
return 0;
}
early_param("kmemleak", kmemleak_boot_config);
static void __init print_log_trace(struct early_log *log)
{
struct stack_trace trace;
trace.nr_entries = log->trace_len;
trace.entries = log->trace;
pr_notice("Early log backtrace:\n");
print_stack_trace(&trace, 2);
}
/*
* Kmemleak initialization.
*/
void __init kmemleak_init(void)
{
int i;
unsigned long flags;
#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
if (!kmemleak_skip_disable) {
kmemleak_early_log = 0;
kmemleak_disable();
return;
}
#endif
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
if (crt_early_log >= ARRAY_SIZE(early_log))
pr_warning("Early log buffer exceeded (%d), please increase "
"DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
/* the kernel is still in UP mode, so disabling the IRQs is enough */
local_irq_save(flags);
kmemleak_early_log = 0;
if (kmemleak_error) {
local_irq_restore(flags);
return;
} else {
kmemleak_enabled = 1;
kmemleak_free_enabled = 1;
}
local_irq_restore(flags);
/*
* This is the point where tracking allocations is safe. Automatic
* scanning is started during the late initcall. Add the early logged
* callbacks to the kmemleak infrastructure.
*/
for (i = 0; i < crt_early_log; i++) {
struct early_log *log = &early_log[i];
switch (log->op_type) {
case KMEMLEAK_ALLOC:
early_alloc(log);
break;
case KMEMLEAK_ALLOC_PERCPU:
early_alloc_percpu(log);
break;
case KMEMLEAK_FREE:
kmemleak_free(log->ptr);
break;
case KMEMLEAK_FREE_PART:
kmemleak_free_part(log->ptr, log->size);
break;
case KMEMLEAK_FREE_PERCPU:
kmemleak_free_percpu(log->ptr);
break;
case KMEMLEAK_NOT_LEAK:
kmemleak_not_leak(log->ptr);
break;
case KMEMLEAK_IGNORE:
kmemleak_ignore(log->ptr);
break;
case KMEMLEAK_SCAN_AREA:
kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
break;
case KMEMLEAK_NO_SCAN:
kmemleak_no_scan(log->ptr);
break;
default:
kmemleak_warn("Unknown early log operation: %d\n",
log->op_type);
}
if (kmemleak_warning) {
print_log_trace(log);
kmemleak_warning = 0;
}
}
}
/*
* Late initialization function.
*/
static int __init kmemleak_late_init(void)
{
struct dentry *dentry;
kmemleak_initialized = 1;
if (kmemleak_error) {
/*
* Some error occurred and kmemleak was disabled. There is a
* small chance that kmemleak_disable() was called immediately
* after setting kmemleak_initialized and we may end up with
* two clean-up threads but serialized by scan_mutex.
*/
schedule_work(&cleanup_work);
return -ENOMEM;
}
dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
&kmemleak_fops);
if (!dentry)
pr_warning("Failed to create the debugfs kmemleak file\n");
mutex_lock(&scan_mutex);
start_scan_thread();
mutex_unlock(&scan_mutex);
pr_info("Kernel memory leak detector initialized\n");
return 0;
}
late_initcall(kmemleak_late_init);
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/ksm.c
/*
* Memory merging support.
*
* This code enables dynamic sharing of identical pages found in different
* memory areas, even if they are not shared by fork()
*
* Copyright (C) 2008-2009 Red Hat, Inc.
* Authors:
* Izik Eidus
* Andrea Arcangeli
* Chris Wright
* Hugh Dickins
*
* This work is licensed under the terms of the GNU GPL, version 2.
*/
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <asm/tlbflush.h>
#include "internal.h"
#ifdef CONFIG_NUMA
#define NUMA(x) (x)
#define DO_NUMA(x) do { (x); } while (0)
#else
#define NUMA(x) (0)
#define DO_NUMA(x) do { } while (0)
#endif
/*
* A few notes about the KSM scanning process,
* to make it easier to understand the data structures below:
*
* In order to reduce excessive scanning, KSM sorts the memory pages by their
* contents into a data structure that holds pointers to the pages' locations.
*
* Since the contents of the pages may change at any moment, KSM cannot just
* insert the pages into a normal sorted tree and expect it to find anything.
* Therefore KSM uses two data structures - the stable and the unstable tree.
*
* The stable tree holds pointers to all the merged pages (ksm pages), sorted
* by their contents. Because each such page is write-protected, searching on
* this tree is fully assured to be working (except when pages are unmapped),
* and therefore this tree is called the stable tree.
*
* In addition to the stable tree, KSM uses a second data structure called the
* unstable tree: this tree holds pointers to pages which have been found to
* be "unchanged for a period of time". The unstable tree sorts these pages
* by their contents, but since they are not write-protected, KSM cannot rely
* upon the unstable tree to work correctly - the unstable tree is liable to
* be corrupted as its contents are modified, and so it is called unstable.
*
* KSM solves this problem by several techniques:
*
* 1) The unstable tree is flushed every time KSM completes scanning all
* memory areas, and then the tree is rebuilt again from the beginning.
* 2) KSM will only insert into the unstable tree, pages whose hash value
* has not changed since the previous scan of all memory areas.
* 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
* colors of the nodes and not on their contents, assuring that even when
* the tree gets "corrupted" it won't get out of balance, so scanning time
* remains the same (also, searching and inserting nodes in an rbtree uses
* the same algorithm, so we have no overhead when we flush and rebuild).
* 4) KSM never flushes the stable tree, which means that even if it were to
* take 10 attempts to find a page in the unstable tree, once it is found,
* it is secured in the stable tree. (When we scan a new page, we first
* compare it against the stable tree, and then against the unstable tree.)
*
* If the merge_across_nodes tunable is unset, then KSM maintains multiple
* stable trees and multiple unstable trees: one of each for each NUMA node.
*/
/**
* struct mm_slot - ksm information per mm that is being scanned
* @link: link to the mm_slots hash list
* @mm_list: link into the mm_slots list, rooted in ksm_mm_head
* @rmap_list: head for this mm_slot's singly-linked list of rmap_items
* @mm: the mm that this information is valid for
*/
struct mm_slot {
struct hlist_node link;
struct list_head mm_list;
struct rmap_item *rmap_list;
struct mm_struct *mm;
};
/**
* struct ksm_scan - cursor for scanning
* @mm_slot: the current mm_slot we are scanning
* @address: the next address inside that to be scanned
* @rmap_list: link to the next rmap to be scanned in the rmap_list
* @seqnr: count of completed full scans (needed when removing unstable node)
*
* There is only the one ksm_scan instance of this cursor structure.
*/
struct ksm_scan {
struct mm_slot *mm_slot;
unsigned long address;
struct rmap_item **rmap_list;
unsigned long seqnr;
};
/**
* struct stable_node - node of the stable rbtree
* @node: rb node of this ksm page in the stable tree
* @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
* @list: linked into migrate_nodes, pending placement in the proper node tree
* @hlist: hlist head of rmap_items using this ksm page
* @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
* @nid: NUMA node id of stable tree in which linked (may not match kpfn)
*/
struct stable_node {
union {
struct rb_node node; /* when node of stable tree */
struct { /* when listed for migration */
struct list_head *head;
struct list_head list;
};
};
struct hlist_head hlist;
unsigned long kpfn;
#ifdef CONFIG_NUMA
int nid;
#endif
};
/**
* struct rmap_item - reverse mapping item for virtual addresses
* @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
* @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
* @nid: NUMA node id of unstable tree in which linked (may not match page)
* @mm: the memory structure this rmap_item is pointing into
* @address: the virtual address this rmap_item tracks (+ flags in low bits)
* @oldchecksum: previous checksum of the page at that virtual address
* @node: rb node of this rmap_item in the unstable tree
* @head: pointer to stable_node heading this list in the stable tree
* @hlist: link into hlist of rmap_items hanging off that stable_node
*/
struct rmap_item {
struct rmap_item *rmap_list;
union {
struct anon_vma *anon_vma; /* when stable */
#ifdef CONFIG_NUMA
int nid; /* when node of unstable tree */
#endif
};
struct mm_struct *mm;
unsigned long address; /* + low bits used for flags below */
unsigned int oldchecksum; /* when unstable */
union {
struct rb_node node; /* when node of unstable tree */
struct { /* when listed from stable tree */
struct stable_node *head;
struct hlist_node hlist;
};
};
};
#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
#define STABLE_FLAG 0x200 /* is listed from the stable tree */
/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;
/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct mm_slot ksm_mm_head = {
.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
.mm_slot = &ksm_mm_head,
};
static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;
/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;
/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;
/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;
/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;
/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;
/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;
#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes 1U
#define ksm_nr_node_ids 1
#endif
#define KSM_RUN_STOP 0
#define KSM_RUN_MERGE 1
#define KSM_RUN_UNMERGE 2
#define KSM_RUN_OFFLINE 4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);
static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);
#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
sizeof(struct __struct), __alignof__(struct __struct),\
(__flags), NULL)
static int __init ksm_slab_init(void)
{
rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
if (!rmap_item_cache)
goto out;
stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
if (!stable_node_cache)
goto out_free1;
mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
if (!mm_slot_cache)
goto out_free2;
return 0;
out_free2:
kmem_cache_destroy(stable_node_cache);
out_free1:
kmem_cache_destroy(rmap_item_cache);
out:
return -ENOMEM;
}
static void __init ksm_slab_free(void)
{
kmem_cache_destroy(mm_slot_cache);
kmem_cache_destroy(stable_node_cache);
kmem_cache_destroy(rmap_item_cache);
mm_slot_cache = NULL;
}
static inline struct rmap_item *alloc_rmap_item(void)
{
struct rmap_item *rmap_item;
rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
if (rmap_item)
ksm_rmap_items++;
return rmap_item;
}
static inline void free_rmap_item(struct rmap_item *rmap_item)
{
ksm_rmap_items--;
rmap_item->mm = NULL; /* debug safety */
kmem_cache_free(rmap_item_cache, rmap_item);
}
static inline struct stable_node *alloc_stable_node(void)
{
return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
}
static inline void free_stable_node(struct stable_node *stable_node)
{
kmem_cache_free(stable_node_cache, stable_node);
}
static inline struct mm_slot *alloc_mm_slot(void)
{
if (!mm_slot_cache) /* initialization failed */
return NULL;
return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}
static inline void free_mm_slot(struct mm_slot *mm_slot)
{
kmem_cache_free(mm_slot_cache, mm_slot);
}
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
struct mm_slot *slot;
hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
if (slot->mm == mm)
return slot;
return NULL;
}
static void insert_to_mm_slots_hash(struct mm_struct *mm,
struct mm_slot *mm_slot)
{
mm_slot->mm = mm;
hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}
/*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary,
* takes mmap_sem briefly to serialize against them. ksm_exit() does not set
* a special flag: they can just back out as soon as mm_users goes to zero.
* ksm_test_exit() is used throughout to make this test for exit: in some
* places for correctness, in some places just to avoid unnecessary work.
*/
static inline bool ksm_test_exit(struct mm_struct *mm)
{
return atomic_read(&mm->mm_users) == 0;
}
/*
* We use break_ksm to break COW on a ksm page: it's a stripped down
*
* if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
* put_page(page);
*
* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
* in case the application has unmapped and remapped mm,addr meanwhile.
* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
* mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
*/
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
int ret = 0;
do {
cond_resched();
page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
ret = handle_mm_fault(vma->vm_mm, vma, addr,
FAULT_FLAG_WRITE);
else
ret = VM_FAULT_WRITE;
put_page(page);
} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
/*
* We must loop because handle_mm_fault() may back out if there's
* any difficulty e.g. if pte accessed bit gets updated concurrently.
*
* VM_FAULT_WRITE is what we have been hoping for: it indicates that
* COW has been broken, even if the vma does not permit VM_WRITE;
* but note that a concurrent fault might break PageKsm for us.
*
* VM_FAULT_SIGBUS could occur if we race with truncation of the
* backing file, which also invalidates anonymous pages: that's
* okay, that truncation will have unmapped the PageKsm for us.
*
* VM_FAULT_OOM: at the time of writing (late July 2009), setting
* aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
* current task has TIF_MEMDIE set, and will be OOM killed on return
* to user; and ksmd, having no mm, would never be chosen for that.
*
* But if the mm is in a limited mem_cgroup, then the fault may fail
* with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
* even ksmd can fail in this way - though it's usually breaking ksm
* just to undo a merge it made a moment before, so unlikely to oom.
*
* That's a pity: we might therefore have more kernel pages allocated
* than we're counting as nodes in the stable tree; but ksm_do_scan
* will retry to break_cow on each pass, so should recover the page
* in due course. The important thing is to not let VM_MERGEABLE
* be cleared while any such pages might remain in the area.
*/
return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
unsigned long addr)
{
struct vm_area_struct *vma;
if (ksm_test_exit(mm))
return NULL;
vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
return NULL;
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
return NULL;
return vma;
}
static void break_cow(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
unsigned long addr = rmap_item->address;
struct vm_area_struct *vma;
/*
* It is not an accident that whenever we want to break COW
* to undo, we also need to drop a reference to the anon_vma.
*/
put_anon_vma(rmap_item->anon_vma);
down_read(&mm->mmap_sem);
vma = find_mergeable_vma(mm, addr);
if (vma)
break_ksm(vma, addr);
up_read(&mm->mmap_sem);
}
static struct page *page_trans_compound_anon(struct page *page)
{
if (PageTransCompound(page)) {
struct page *head = compound_head(page);
/*
* head may actually be splitted and freed from under
* us but it's ok here.
*/
if (PageAnon(head))
return head;
}
return NULL;
}
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
unsigned long addr = rmap_item->address;
struct vm_area_struct *vma;
struct page *page;
down_read(&mm->mmap_sem);
vma = find_mergeable_vma(mm, addr);
if (!vma)
goto out;
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
if (PageAnon(page) || page_trans_compound_anon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
put_page(page);
out: page = NULL;
}
up_read(&mm->mmap_sem);
return page;
}
/*
* This helper is used for getting right index into array of tree roots.
* When merge_across_nodes knob is set to 1, there are only two rb-trees for
* stable and unstable pages from all nodes with roots in index 0. Otherwise,
* every node has its own stable and unstable tree.
*/
static inline int get_kpfn_nid(unsigned long kpfn)
{
return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}
static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
struct rmap_item *rmap_item;
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
if (rmap_item->hlist.next)
ksm_pages_sharing--;
else
ksm_pages_shared--;
put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
cond_resched();
}
if (stable_node->head == &migrate_nodes)
list_del(&stable_node->list);
else
rb_erase(&stable_node->node,
root_stable_tree + NUMA(stable_node->nid));
free_stable_node(stable_node);
}
/*
* get_ksm_page: checks if the page indicated by the stable node
* is still its ksm page, despite having held no reference to it.
* In which case we can trust the content of the page, and it
* returns the gotten page; but if the page has now been zapped,
* remove the stale node from the stable tree and return NULL.
* But beware, the stable node's page might be being migrated.
*
* You would expect the stable_node to hold a reference to the ksm page.
* But if it increments the page's count, swapping out has to wait for
* ksmd to come around again before it can free the page, which may take
* seconds or even minutes: much too unresponsive. So instead we use a
* "keyhole reference": access to the ksm page from the stable node peeps
* out through its keyhole to see if that page still holds the right key,
* pointing back to this stable node. This relies on freeing a PageAnon
* page to reset its page->mapping to NULL, and relies on no other use of
* a page to put something that might look like our key in page->mapping.
* is on its way to being freed; but it is an anomaly to bear in mind.
*/
static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
{
struct page *page;
void *expected_mapping;
unsigned long kpfn;
expected_mapping = (void *)stable_node +
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
again:
kpfn = READ_ONCE(stable_node->kpfn);
page = pfn_to_page(kpfn);
/*
* page is computed from kpfn, so on most architectures reading
* page->mapping is naturally ordered after reading node->kpfn,
* but on Alpha we need to be more careful.
*/
smp_read_barrier_depends();
if (READ_ONCE(page->mapping) != expected_mapping)
goto stale;
/*
* We cannot do anything with the page while its refcount is 0.
* Usually 0 means free, or tail of a higher-order page: in which
* case this node is no longer referenced, and should be freed;
* however, it might mean that the page is under page_freeze_refs().
* The __remove_mapping() case is easy, again the node is now stale;
* but if page is swapcache in migrate_page_move_mapping(), it might
* still be our page, in which case it's essential to keep the node.
*/
while (!get_page_unless_zero(page)) {
/*
* Another check for page->mapping != expected_mapping would
* work here too. We have chosen the !PageSwapCache test to
* optimize the common case, when the page is or is about to
* be freed: PageSwapCache is cleared (under spin_lock_irq)
* in the freeze_refs section of __remove_mapping(); but Anon
* page->mapping reset to NULL later, in free_pages_prepare().
*/
if (!PageSwapCache(page))
goto stale;
cpu_relax();
}
if (READ_ONCE(page->mapping) != expected_mapping) {
put_page(page);
goto stale;
}
if (lock_it) {
lock_page(page);
if (READ_ONCE(page->mapping) != expected_mapping) {
unlock_page(page);
put_page(page);
goto stale;
}
}
return page;
stale:
/*
* We come here from above when page->mapping or !PageSwapCache
* suggests that the node is stale; but it might be under migration.
* We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
* before checking whether node->kpfn has been changed.
*/
smp_rmb();
if (READ_ONCE(stable_node->kpfn) != kpfn)
goto again;
remove_node_from_stable_tree(stable_node);
return NULL;
}
/*
* Removing rmap_item from stable or unstable tree.
* This function will clean the information from the stable/unstable tree.
*/
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
if (rmap_item->address & STABLE_FLAG) {
struct stable_node *stable_node;
struct page *page;
stable_node = rmap_item->head;
page = get_ksm_page(stable_node, true);
if (!page)
goto out;
hlist_del(&rmap_item->hlist);
unlock_page(page);
put_page(page);
if (stable_node->hlist.first)
ksm_pages_sharing--;
else
ksm_pages_shared--;
put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
} else if (rmap_item->address & UNSTABLE_FLAG) {
unsigned char age;
/*
* Usually ksmd can and must skip the rb_erase, because
* root_unstable_tree was already reset to RB_ROOT.
* But be careful when an mm is exiting: do the rb_erase
* if this rmap_item was inserted by this scan, rather
* than left over from before.
*/
age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
BUG_ON(age > 1);
if (!age)
rb_erase(&rmap_item->node,
root_unstable_tree + NUMA(rmap_item->nid));
ksm_pages_unshared--;
rmap_item->address &= PAGE_MASK;
}
out:
cond_resched(); /* we're called from many long loops */
}
static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
struct rmap_item **rmap_list)
{
while (*rmap_list) {
struct rmap_item *rmap_item = *rmap_list;
*rmap_list = rmap_item->rmap_list;
remove_rmap_item_from_tree(rmap_item);
free_rmap_item(rmap_item);
}
}
/*
* Though it's very tempting to unmerge rmap_items from stable tree rather
* than check every pte of a given vma, the locking doesn't quite work for
* that - an rmap_item is assigned to the stable tree after inserting ksm
* page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
* rmap_items from parent to child at fork time (so as not to waste time
* if exit comes before the next scan reaches it).
*
* Similarly, although we'd like to remove rmap_items (so updating counts
* and freeing memory) when unmerging an area, it's easier to leave that
* to the next pass of ksmd - consider, for example, how ksmd might be
* in cmp_and_merge_page on one of the rmap_items we would be removing.
*/
static int unmerge_ksm_pages(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
unsigned long addr;
int err = 0;
for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
if (ksm_test_exit(vma->vm_mm))
break;
if (signal_pending(current))
err = -ERESTARTSYS;
else
err = break_ksm(vma, addr);
}
return err;
}
#ifdef CONFIG_SYSFS
/*
* Only called through the sysfs control interface:
*/
static int remove_stable_node(struct stable_node *stable_node)
{
struct page *page;
int err;
page = get_ksm_page(stable_node, true);
if (!page) {
/*
* get_ksm_page did remove_node_from_stable_tree itself.
*/
return 0;
}
if (WARN_ON_ONCE(page_mapped(page))) {
/*
* This should not happen: but if it does, just refuse to let
* merge_across_nodes be switched - there is no need to panic.
*/
err = -EBUSY;
} else {
/*
* The stable node did not yet appear stale to get_ksm_page(),
* since that allows for an unmapped ksm page to be recognized
* right up until it is freed; but the node is safe to remove.
* This page might be in a pagevec waiting to be freed,
* or it might be PageSwapCache (perhaps under writeback),
* or it might have been removed from swapcache a moment ago.
*/
set_page_stable_node(page, NULL);
remove_node_from_stable_tree(stable_node);
err = 0;
}
unlock_page(page);
put_page(page);
return err;
}
static int remove_all_stable_nodes(void)
{
struct stable_node *stable_node;
struct list_head *this, *next;
int nid;
int err = 0;
for (nid = 0; nid < ksm_nr_node_ids; nid++) {
while (root_stable_tree[nid].rb_node) {
stable_node = rb_entry(root_stable_tree[nid].rb_node,
struct stable_node, node);
if (remove_stable_node(stable_node)) {
err = -EBUSY;
break; /* proceed to next nid */
}
cond_resched();
}
}
list_for_each_safe(this, next, &migrate_nodes) {
stable_node = list_entry(this, struct stable_node, list);
if (remove_stable_node(stable_node))
err = -EBUSY;
cond_resched();
}
return err;
}
static int unmerge_and_remove_all_rmap_items(void)
{
struct mm_slot *mm_slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
int err = 0;
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
struct mm_slot, mm_list);
spin_unlock(&ksm_mmlist_lock);
for (mm_slot = ksm_scan.mm_slot;
mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
mm = mm_slot->mm;
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (ksm_test_exit(mm))
break;
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
continue;
err = unmerge_ksm_pages(vma,
vma->vm_start, vma->vm_end);
if (err)
goto error;
}
remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
struct mm_slot, mm_list);
if (ksm_test_exit(mm)) {
hash_del(&mm_slot->link);
list_del(&mm_slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
free_mm_slot(mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
up_read(&mm->mmap_sem);
mmdrop(mm);
} else {
spin_unlock(&ksm_mmlist_lock);
up_read(&mm->mmap_sem);
}
}
/* Clean up stable nodes, but don't worry if some are still busy */
remove_all_stable_nodes();
ksm_scan.seqnr = 0;
return 0;
error:
up_read(&mm->mmap_sem);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = &ksm_mm_head;
spin_unlock(&ksm_mmlist_lock);
return err;
}
#endif /* CONFIG_SYSFS */
static u32 calc_checksum(struct page *page)
{
u32 checksum;
void *addr = kmap_atomic(page);
checksum = jhash2(addr, PAGE_SIZE / 4, 17);
kunmap_atomic(addr);
return checksum;
}
static int memcmp_pages(struct page *page1, struct page *page2)
{
char *addr1, *addr2;
int ret;
addr1 = kmap_atomic(page1);
addr2 = kmap_atomic(page2);
ret = memcmp(addr1, addr2, PAGE_SIZE);
kunmap_atomic(addr2);
kunmap_atomic(addr1);
return ret;
}
static inline int pages_identical(struct page *page1, struct page *page2)
{
return !memcmp_pages(page1, page2);
}
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
pte_t *orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long addr;
pte_t *ptep;
spinlock_t *ptl;
int swapped;
int err = -EFAULT;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
goto out;
BUG_ON(PageTransCompound(page));
mmun_start = addr;
mmun_end = addr + PAGE_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
goto out_mn;
if (pte_write(*ptep) || pte_dirty(*ptep)) {
pte_t entry;
swapped = PageSwapCache(page);
flush_cache_page(vma, addr, page_to_pfn(page));
/*
* Ok this is tricky, when get_user_pages_fast() run it doesn't
* take any lock, therefore the check that we are going to make
* with the pagecount against the mapcount is racey and
* O_DIRECT can happen right after the check.
* So we clear the pte and flush the tlb before the check
* this assure us that no O_DIRECT can happen after the check
* or in the middle of the check.
*/
entry = ptep_clear_flush_notify(vma, addr, ptep);
/*
* Check that no O_DIRECT or similar I/O is in progress on the
* page
*/
if (page_mapcount(page) + 1 + swapped != page_count(page)) {
set_pte_at(mm, addr, ptep, entry);
goto out_unlock;
}
if (pte_dirty(entry))
set_page_dirty(page);
entry = pte_mkclean(pte_wrprotect(entry));
set_pte_at_notify(mm, addr, ptep, entry);
}
*orig_pte = *ptep;
err = 0;
out_unlock:
pte_unmap_unlock(ptep, ptl);
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return err;
}
/**
* replace_page - replace page in vma by new ksm page
* @vma: vma that holds the pte pointing to page
* @page: the page we are replacing by kpage
* @kpage: the ksm page we replace page by
* @orig_pte: the original value of the pte
*
* Returns 0 on success, -EFAULT on failure.
*/
static int replace_page(struct vm_area_struct *vma, struct page *page,
struct page *kpage, pte_t orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t *pmd;
pte_t *ptep;
spinlock_t *ptl;
unsigned long addr;
int err = -EFAULT;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
goto out;
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
mmun_start = addr;
mmun_end = addr + PAGE_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!pte_same(*ptep, orig_pte)) {
pte_unmap_unlock(ptep, ptl);
goto out_mn;
}
get_page(kpage);
page_add_anon_rmap(kpage, vma, addr);
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
if (!page_mapped(page))
try_to_free_swap(page);
put_page(page);
pte_unmap_unlock(ptep, ptl);
err = 0;
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return err;
}
static int page_trans_compound_anon_split(struct page *page)
{
int ret = 0;
struct page *transhuge_head = page_trans_compound_anon(page);
if (transhuge_head) {
/* Get the reference on the head to split it. */
if (get_page_unless_zero(transhuge_head)) {
/*
* Recheck we got the reference while the head
* was still anonymous.
*/
if (PageAnon(transhuge_head))
ret = split_huge_page(transhuge_head);
else
/*
* Retry later if split_huge_page run
* from under us.
*/
ret = 1;
put_page(transhuge_head);
} else
/* Retry later if split_huge_page run from under us. */
ret = 1;
}
return ret;
}
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
* @page: the PageAnon page that we want to replace with kpage
* @kpage: the PageKsm page that we want to map instead of page,
* or NULL the first time when we want to use page as kpage.
*
* This function returns 0 if the pages were merged, -EFAULT otherwise.
*/
static int try_to_merge_one_page(struct vm_area_struct *vma,
struct page *page, struct page *kpage)
{
pte_t orig_pte = __pte(0);
int err = -EFAULT;
if (page == kpage) /* ksm page forked */
return 0;
if (!(vma->vm_flags & VM_MERGEABLE))
goto out;
if (PageTransCompound(page) && page_trans_compound_anon_split(page))
goto out;
BUG_ON(PageTransCompound(page));
if (!PageAnon(page))
goto out;
/*
* We need the page lock to read a stable PageSwapCache in
* write_protect_page(). We use trylock_page() instead of
* lock_page() because we don't want to wait here - we
* prefer to continue scanning and merging different pages,
* then come back to this page when it is unlocked.
*/
if (!trylock_page(page))
goto out;
/*
* If this anonymous page is mapped only here, its pte may need
* to be write-protected. If it's mapped elsewhere, all of its
* ptes are necessarily already write-protected. But in either
* case, we need to lock and check page_count is not raised.
*/
if (write_protect_page(vma, page, &orig_pte) == 0) {
if (!kpage) {
/*
* While we hold page lock, upgrade page from
* PageAnon+anon_vma to PageKsm+NULL stable_node:
* stable_tree_insert() will update stable_node.
*/
set_page_stable_node(page, NULL);
mark_page_accessed(page);
err = 0;
} else if (pages_identical(page, kpage))
err = replace_page(vma, page, kpage, orig_pte);
}
if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
munlock_vma_page(page);
if (!PageMlocked(kpage)) {
unlock_page(page);
lock_page(kpage);
mlock_vma_page(kpage);
page = kpage; /* for final unlock */
}
}
unlock_page(page);
out:
return err;
}
/*
* try_to_merge_with_ksm_page - like try_to_merge_two_pages,
* but no new kernel page is allocated: kpage must already be a ksm page.
*
* This function returns 0 if the pages were merged, -EFAULT otherwise.
*/
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
struct page *page, struct page *kpage)
{
struct mm_struct *mm = rmap_item->mm;
struct vm_area_struct *vma;
int err = -EFAULT;
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
goto out;
vma = find_vma(mm, rmap_item->address);
if (!vma || vma->vm_start > rmap_item->address)
goto out;
err = try_to_merge_one_page(vma, page, kpage);
if (err)
goto out;
/* Unstable nid is in union with stable anon_vma: remove first */
remove_rmap_item_from_tree(rmap_item);
/* Must get reference to anon_vma while still holding mmap_sem */
rmap_item->anon_vma = vma->anon_vma;
get_anon_vma(vma->anon_vma);
out:
up_read(&mm->mmap_sem);
return err;
}
/*
* try_to_merge_two_pages - take two identical pages and prepare them
* to be merged into one page.
*
* This function returns the kpage if we successfully merged two identical
* pages into one ksm page, NULL otherwise.
*
* Note that this function upgrades page to ksm page: if one of the pages
* is already a ksm page, try_to_merge_with_ksm_page should be used.
*/
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
struct page *page,
struct rmap_item *tree_rmap_item,
struct page *tree_page)
{
int err;
err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
if (!err) {
err = try_to_merge_with_ksm_page(tree_rmap_item,
tree_page, page);
/*
* If that fails, we have a ksm page with only one pte
* pointing to it: so break it.
*/
if (err)
break_cow(rmap_item);
}
return err ? NULL : page;
}
/*
* stable_tree_search - search for page inside the stable tree
*
* This function checks if there is a page inside the stable tree
* with identical content to the page that we are scanning right now.
*
* This function returns the stable tree node of identical content if found,
* NULL otherwise.
*/
static struct page *stable_tree_search(struct page *page)
{
int nid;
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
struct stable_node *stable_node;
struct stable_node *page_node;
page_node = page_stable_node(page);
if (page_node && page_node->head != &migrate_nodes) {
/* ksm page forked */
get_page(page);
return page;
}
nid = get_kpfn_nid(page_to_pfn(page));
root = root_stable_tree + nid;
again:
new = &root->rb_node;
parent = NULL;
while (*new) {
struct page *tree_page;
int ret;
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
tree_page = get_ksm_page(stable_node, false);
if (!tree_page)
return NULL;
ret = memcmp_pages(page, tree_page);
put_page(tree_page);
parent = *new;
if (ret < 0)
new = &parent->rb_left;
else if (ret > 0)
new = &parent->rb_right;
else {
/*
* Lock and unlock the stable_node's page (which
* might already have been migrated) so that page
* migration is sure to notice its raised count.
* It would be more elegant to return stable_node
* than kpage, but that involves more changes.
*/
tree_page = get_ksm_page(stable_node, true);
if (tree_page) {
unlock_page(tree_page);
if (get_kpfn_nid(stable_node->kpfn) !=
NUMA(stable_node->nid)) {
put_page(tree_page);
goto replace;
}
return tree_page;
}
/*
* There is now a place for page_node, but the tree may
* have been rebalanced, so re-evaluate parent and new.
*/
if (page_node)
goto again;
return NULL;
}
}
if (!page_node)
return NULL;
list_del(&page_node->list);
DO_NUMA(page_node->nid = nid);
rb_link_node(&page_node->node, parent, new);
rb_insert_color(&page_node->node, root);
get_page(page);
return page;
replace:
if (page_node) {
list_del(&page_node->list);
DO_NUMA(page_node->nid = nid);
rb_replace_node(&stable_node->node, &page_node->node, root);
get_page(page);
} else {
rb_erase(&stable_node->node, root);
page = NULL;
}
stable_node->head = &migrate_nodes;
list_add(&stable_node->list, stable_node->head);
return page;
}
/*
* stable_tree_insert - insert stable tree node pointing to new ksm page
* into the stable tree.
*
* This function returns the stable tree node just allocated on success,
* NULL otherwise.
*/
static struct stable_node *stable_tree_insert(struct page *kpage)
{
int nid;
unsigned long kpfn;
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent = NULL;
struct stable_node *stable_node;
kpfn = page_to_pfn(kpage);
nid = get_kpfn_nid(kpfn);
root = root_stable_tree + nid;
new = &root->rb_node;
while (*new) {
struct page *tree_page;
int ret;
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
tree_page = get_ksm_page(stable_node, false);
if (!tree_page)
return NULL;
ret = memcmp_pages(kpage, tree_page);
put_page(tree_page);
parent = *new;
if (ret < 0)
new = &parent->rb_left;
else if (ret > 0)
new = &parent->rb_right;
else {
/*
* It is not a bug that stable_tree_search() didn't
* find this node: because at that time our page was
* not yet write-protected, so may have changed since.
*/
return NULL;
}
}
stable_node = alloc_stable_node();
if (!stable_node)
return NULL;
INIT_HLIST_HEAD(&stable_node->hlist);
stable_node->kpfn = kpfn;
set_page_stable_node(kpage, stable_node);
DO_NUMA(stable_node->nid = nid);
rb_link_node(&stable_node->node, parent, new);
rb_insert_color(&stable_node->node, root);
return stable_node;
}
/*
* unstable_tree_search_insert - search for identical page,
* else insert rmap_item into the unstable tree.
*
* This function searches for a page in the unstable tree identical to the
* page currently being scanned; and if no identical page is found in the
* tree, we insert rmap_item as a new object into the unstable tree.
*
* This function returns pointer to rmap_item found to be identical
* to the currently scanned page, NULL otherwise.
*
* This function does both searching and inserting, because they share
* the same walking algorithm in an rbtree.
*/
static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
struct page *page,
struct page **tree_pagep)
{
struct rb_node **new;
struct rb_root *root;
struct rb_node *parent = NULL;
int nid;
nid = get_kpfn_nid(page_to_pfn(page));
root = root_unstable_tree + nid;
new = &root->rb_node;
while (*new) {
struct rmap_item *tree_rmap_item;
struct page *tree_page;
int ret;
cond_resched();
tree_rmap_item = rb_entry(*new, struct rmap_item, node);
tree_page = get_mergeable_page(tree_rmap_item);
if (IS_ERR_OR_NULL(tree_page))
return NULL;
/*
* Don't substitute a ksm page for a forked page.
*/
if (page == tree_page) {
put_page(tree_page);
return NULL;
}
ret = memcmp_pages(page, tree_page);
parent = *new;
if (ret < 0) {
put_page(tree_page);
new = &parent->rb_left;
} else if (ret > 0) {
put_page(tree_page);
new = &parent->rb_right;
} else if (!ksm_merge_across_nodes &&
page_to_nid(tree_page) != nid) {
/*
* If tree_page has been migrated to another NUMA node,
* it will be flushed out and put in the right unstable
* tree next time: only merge with it when across_nodes.
*/
put_page(tree_page);
return NULL;
} else {
*tree_pagep = tree_page;
return tree_rmap_item;
}
}
rmap_item->address |= UNSTABLE_FLAG;
rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
DO_NUMA(rmap_item->nid = nid);
rb_link_node(&rmap_item->node, parent, new);
rb_insert_color(&rmap_item->node, root);
ksm_pages_unshared++;
return NULL;
}
/*
* stable_tree_append - add another rmap_item to the linked list of
* rmap_items hanging off a given node of the stable tree, all sharing
* the same ksm page.
*/
static void stable_tree_append(struct rmap_item *rmap_item,
struct stable_node *stable_node)
{
rmap_item->head = stable_node;
rmap_item->address |= STABLE_FLAG;
hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
if (rmap_item->hlist.next)
ksm_pages_sharing++;
else
ksm_pages_shared++;
}
/*
* cmp_and_merge_page - first see if page can be merged into the stable tree;
* if not, compare checksum to previous and if it's the same, see if page can
* be inserted into the unstable tree, or merged with a page already there and
* both transferred to the stable tree.
*
* @page: the page that we are searching identical page to.
* @rmap_item: the reverse mapping into the virtual address of this page
*/
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
struct rmap_item *tree_rmap_item;
struct page *tree_page = NULL;
struct stable_node *stable_node;
struct page *kpage;
unsigned int checksum;
int err;
stable_node = page_stable_node(page);
if (stable_node) {
if (stable_node->head != &migrate_nodes &&
get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
rb_erase(&stable_node->node,
root_stable_tree + NUMA(stable_node->nid));
stable_node->head = &migrate_nodes;
list_add(&stable_node->list, stable_node->head);
}
if (stable_node->head != &migrate_nodes &&
rmap_item->head == stable_node)
return;
}
/* We first start with searching the page inside the stable tree */
kpage = stable_tree_search(page);
if (kpage == page && rmap_item->head == stable_node) {
put_page(kpage);
return;
}
remove_rmap_item_from_tree(rmap_item);
if (kpage) {
err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
if (!err) {
/*
* The page was successfully merged:
* add its rmap_item to the stable tree.
*/
lock_page(kpage);
stable_tree_append(rmap_item, page_stable_node(kpage));
unlock_page(kpage);
}
put_page(kpage);
return;
}
/*
* If the hash value of the page has changed from the last time
* we calculated it, this page is changing frequently: therefore we
* don't want to insert it in the unstable tree, and we don't want
* to waste our time searching for something identical to it there.
*/
checksum = calc_checksum(page);
if (rmap_item->oldchecksum != checksum) {
rmap_item->oldchecksum = checksum;
return;
}
tree_rmap_item =
unstable_tree_search_insert(rmap_item, page, &tree_page);
if (tree_rmap_item) {
kpage = try_to_merge_two_pages(rmap_item, page,
tree_rmap_item, tree_page);
put_page(tree_page);
if (kpage) {
/*
* The pages were successfully merged: insert new
* node in the stable tree and add both rmap_items.
*/
lock_page(kpage);
stable_node = stable_tree_insert(kpage);
if (stable_node) {
stable_tree_append(tree_rmap_item, stable_node);
stable_tree_append(rmap_item, stable_node);
}
unlock_page(kpage);
/*
* If we fail to insert the page into the stable tree,
* we will have 2 virtual addresses that are pointing
* to a ksm page left outside the stable tree,
* in which case we need to break_cow on both.
*/
if (!stable_node) {
break_cow(tree_rmap_item);
break_cow(rmap_item);
}
}
}
}
static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
struct rmap_item **rmap_list,
unsigned long addr)
{
struct rmap_item *rmap_item;
while (*rmap_list) {
rmap_item = *rmap_list;
if ((rmap_item->address & PAGE_MASK) == addr)
return rmap_item;
if (rmap_item->address > addr)
break;
*rmap_list = rmap_item->rmap_list;
remove_rmap_item_from_tree(rmap_item);
free_rmap_item(rmap_item);
}
rmap_item = alloc_rmap_item();
if (rmap_item) {
/* It has already been zeroed */
rmap_item->mm = mm_slot->mm;
rmap_item->address = addr;
rmap_item->rmap_list = *rmap_list;
*rmap_list = rmap_item;
}
return rmap_item;
}
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
struct mm_slot *slot;
struct vm_area_struct *vma;
struct rmap_item *rmap_item;
int nid;
if (list_empty(&ksm_mm_head.mm_list))
return NULL;
slot = ksm_scan.mm_slot;
if (slot == &ksm_mm_head) {
/*
* A number of pages can hang around indefinitely on per-cpu
* pagevecs, raised page count preventing write_protect_page
* from merging them. Though it doesn't really matter much,
* it is puzzling to see some stuck in pages_volatile until
* other activity jostles them out, and they also prevented
* LTP's KSM test from succeeding deterministically; so drain
* them here (here rather than on entry to ksm_do_scan(),
* so we don't IPI too often when pages_to_scan is set low).
*/
lru_add_drain_all();
/*
* Whereas stale stable_nodes on the stable_tree itself
* get pruned in the regular course of stable_tree_search(),
* those moved out to the migrate_nodes list can accumulate:
* so prune them once before each full scan.
*/
if (!ksm_merge_across_nodes) {
struct stable_node *stable_node;
struct list_head *this, *next;
struct page *page;
list_for_each_safe(this, next, &migrate_nodes) {
stable_node = list_entry(this,
struct stable_node, list);
page = get_ksm_page(stable_node, false);
if (page)
put_page(page);
cond_resched();
}
}
for (nid = 0; nid < ksm_nr_node_ids; nid++)
root_unstable_tree[nid] = RB_ROOT;
spin_lock(&ksm_mmlist_lock);
slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
ksm_scan.mm_slot = slot;
spin_unlock(&ksm_mmlist_lock);
/*
* Although we tested list_empty() above, a racing __ksm_exit
* of the last mm on the list may have removed it since then.
*/
if (slot == &ksm_mm_head)
return NULL;
next_mm:
ksm_scan.address = 0;
ksm_scan.rmap_list = &slot->rmap_list;
}
mm = slot->mm;
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
vma = NULL;
else
vma = find_vma(mm, ksm_scan.address);
for (; vma; vma = vma->vm_next) {
if (!(vma->vm_flags & VM_MERGEABLE))
continue;
if (ksm_scan.address < vma->vm_start)
ksm_scan.address = vma->vm_start;
if (!vma->anon_vma)
ksm_scan.address = vma->vm_end;
while (ksm_scan.address < vma->vm_end) {
if (ksm_test_exit(mm))
break;
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
if (IS_ERR_OR_NULL(*page)) {
ksm_scan.address += PAGE_SIZE;
cond_resched();
continue;
}
if (PageAnon(*page) ||
page_trans_compound_anon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
ksm_scan.rmap_list, ksm_scan.address);
if (rmap_item) {
ksm_scan.rmap_list =
&rmap_item->rmap_list;
ksm_scan.address += PAGE_SIZE;
} else
put_page(*page);
up_read(&mm->mmap_sem);
return rmap_item;
}
put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
}
}
if (ksm_test_exit(mm)) {
ksm_scan.address = 0;
ksm_scan.rmap_list = &slot->rmap_list;
}
/*
* Nuke all the rmap_items that are above this current rmap:
* because there were no VM_MERGEABLE vmas with such addresses.
*/
remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(slot->mm_list.next,
struct mm_slot, mm_list);
if (ksm_scan.address == 0) {
/*
* We've completed a full scan of all vmas, holding mmap_sem
* throughout, and found no VM_MERGEABLE: so do the same as
* __ksm_exit does to remove this mm from all our lists now.
* This applies either when cleaning up after __ksm_exit
* (but beware: we can reach here even before __ksm_exit),
* or when all VM_MERGEABLE areas have been unmapped (and
* mmap_sem then protects against race with MADV_MERGEABLE).
*/
hash_del(&slot->link);
list_del(&slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
free_mm_slot(slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
up_read(&mm->mmap_sem);
mmdrop(mm);
} else {
spin_unlock(&ksm_mmlist_lock);
up_read(&mm->mmap_sem);
}
/* Repeat until we've completed scanning the whole list */
slot = ksm_scan.mm_slot;
if (slot != &ksm_mm_head)
goto next_mm;
ksm_scan.seqnr++;
return NULL;
}
/**
* ksm_do_scan - the ksm scanner main worker function.
* @scan_npages - number of pages we want to scan before we return.
*/
static void ksm_do_scan(unsigned int scan_npages)
{
struct rmap_item *rmap_item;
struct page *uninitialized_var(page);
while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item)
return;
cmp_and_merge_page(page, rmap_item);
put_page(page);
}
}
static int ksmd_should_run(void)
{
return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
}
static int ksm_scan_thread(void *nothing)
{
set_freezable();
set_user_nice(current, 5);
while (!kthread_should_stop()) {
mutex_lock(&ksm_thread_mutex);
wait_while_offlining();
if (ksmd_should_run())
ksm_do_scan(ksm_thread_pages_to_scan);
mutex_unlock(&ksm_thread_mutex);
try_to_freeze();
if (ksmd_should_run()) {
schedule_timeout_interruptible(
msecs_to_jiffies(ksm_thread_sleep_millisecs));
} else {
wait_event_freezable(ksm_thread_wait,
ksmd_should_run() || kthread_should_stop());
}
}
return 0;
}
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
int err;
switch (advice) {
case MADV_MERGEABLE:
/*
* Be somewhat over-protective for now!
*/
if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
VM_PFNMAP | VM_IO | VM_DONTEXPAND |
VM_HUGETLB | VM_MIXEDMAP))
return 0; /* just ignore the advice */
#ifdef VM_SAO
if (*vm_flags & VM_SAO)
return 0;
#endif
if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
err = __ksm_enter(mm);
if (err)
return err;
}
*vm_flags |= VM_MERGEABLE;
break;
case MADV_UNMERGEABLE:
if (!(*vm_flags & VM_MERGEABLE))
return 0; /* just ignore the advice */
if (vma->anon_vma) {
err = unmerge_ksm_pages(vma, start, end);
if (err)
return err;
}
*vm_flags &= ~VM_MERGEABLE;
break;
}
return 0;
}
int __ksm_enter(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
int needs_wakeup;
mm_slot = alloc_mm_slot();
if (!mm_slot)
return -ENOMEM;
/* Check ksm_run too? Would need tighter locking */
needs_wakeup = list_empty(&ksm_mm_head.mm_list);
spin_lock(&ksm_mmlist_lock);
insert_to_mm_slots_hash(mm, mm_slot);
/*
* When KSM_RUN_MERGE (or KSM_RUN_STOP),
* insert just behind the scanning cursor, to let the area settle
* down a little; when fork is followed by immediate exec, we don't
* want ksmd to waste time setting up and tearing down an rmap_list.
*
* But when KSM_RUN_UNMERGE, it's important to insert ahead of its
* scanning cursor, otherwise KSM pages in newly forked mms will be
* missed: then we might as well insert at the end of the list.
*/
if (ksm_run & KSM_RUN_UNMERGE)
list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
else
list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
set_bit(MMF_VM_MERGEABLE, &mm->flags);
atomic_inc(&mm->mm_count);
if (needs_wakeup)
wake_up_interruptible(&ksm_thread_wait);
return 0;
}
void __ksm_exit(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
int easy_to_free = 0;
/*
* This process is exiting: if it's straightforward (as is the
* case when ksmd was never running), free mm_slot immediately.
* But if it's at the cursor or has rmap_items linked to it, use
* mmap_sem to synchronize with any break_cows before pagetables
* are freed, and leave the mm_slot on the list for ksmd to free.
* Beware: ksm may already have noticed it exiting and freed the slot.
*/
spin_lock(&ksm_mmlist_lock);
mm_slot = get_mm_slot(mm);
if (mm_slot && ksm_scan.mm_slot != mm_slot) {
if (!mm_slot->rmap_list) {
hash_del(&mm_slot->link);
list_del(&mm_slot->mm_list);
easy_to_free = 1;
} else {
list_move(&mm_slot->mm_list,
&ksm_scan.mm_slot->mm_list);
}
}
spin_unlock(&ksm_mmlist_lock);
if (easy_to_free) {
free_mm_slot(mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
mmdrop(mm);
} else if (mm_slot) {
down_write(&mm->mmap_sem);
up_write(&mm->mmap_sem);
}
}
struct page *ksm_might_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
struct anon_vma *anon_vma = page_anon_vma(page);
struct page *new_page;
if (PageKsm(page)) {
if (page_stable_node(page) &&
!(ksm_run & KSM_RUN_UNMERGE))
return page; /* no need to copy it */
} else if (!anon_vma) {
return page; /* no need to copy it */
} else if (anon_vma->root == vma->anon_vma->root &&
page->index == linear_page_index(vma, address)) {
return page; /* still no need to copy it */
}
if (!PageUptodate(page))
return page; /* let do_swap_page report the error */
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (new_page) {
copy_user_highpage(new_page, page, address, vma);
SetPageDirty(new_page);
__SetPageUptodate(new_page);
__set_page_locked(new_page);
}
return new_page;
}
int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
{
struct stable_node *stable_node;
struct rmap_item *rmap_item;
int ret = SWAP_AGAIN;
int search_new_forks = 0;
VM_BUG_ON_PAGE(!PageKsm(page), page);
/*
* Rely on the page lock to protect against concurrent modifications
* to that page's node of the stable tree.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
stable_node = page_stable_node(page);
if (!stable_node)
return ret;
again:
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
continue;
/*
* Initially we examine only the vma which covers this
* rmap_item; but later, if there is still work to do,
* we examine covering vmas in other mms: in case they
* were forked from the original since ksmd passed.
*/
if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
continue;
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
ret = rwc->rmap_one(page, vma,
rmap_item->address, rwc->arg);
if (ret != SWAP_AGAIN) {
anon_vma_unlock_read(anon_vma);
goto out;
}
if (rwc->done && rwc->done(page)) {
anon_vma_unlock_read(anon_vma);
goto out;
}
}
anon_vma_unlock_read(anon_vma);
}
if (!search_new_forks++)
goto again;
out:
return ret;
}
#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
struct stable_node *stable_node;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
stable_node = page_stable_node(newpage);
if (stable_node) {
VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
stable_node->kpfn = page_to_pfn(newpage);
/*
* newpage->mapping was set in advance; now we need smp_wmb()
* to make sure that the new stable_node->kpfn is visible
* to get_ksm_page() before it can see that oldpage->mapping
* has gone stale (or that PageSwapCache has been cleared).
*/
smp_wmb();
set_page_stable_node(oldpage, NULL);
}
}
#endif /* CONFIG_MIGRATION */
#ifdef CONFIG_MEMORY_HOTREMOVE
static void wait_while_offlining(void)
{
while (ksm_run & KSM_RUN_OFFLINE) {
mutex_unlock(&ksm_thread_mutex);
wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
TASK_UNINTERRUPTIBLE);
mutex_lock(&ksm_thread_mutex);
}
}
static void ksm_check_stable_tree(unsigned long start_pfn,
unsigned long end_pfn)
{
struct stable_node *stable_node;
struct list_head *this, *next;
struct rb_node *node;
int nid;
for (nid = 0; nid < ksm_nr_node_ids; nid++) {
node = rb_first(root_stable_tree + nid);
while (node) {
stable_node = rb_entry(node, struct stable_node, node);
if (stable_node->kpfn >= start_pfn &&
stable_node->kpfn < end_pfn) {
/*
* Don't get_ksm_page, page has already gone:
* which is why we keep kpfn instead of page*
*/
remove_node_from_stable_tree(stable_node);
node = rb_first(root_stable_tree + nid);
} else
node = rb_next(node);
cond_resched();
}
}
list_for_each_safe(this, next, &migrate_nodes) {
stable_node = list_entry(this, struct stable_node, list);
if (stable_node->kpfn >= start_pfn &&
stable_node->kpfn < end_pfn)
remove_node_from_stable_tree(stable_node);
cond_resched();
}
}
static int ksm_memory_callback(struct notifier_block *self,
unsigned long action, void *arg)
{
struct memory_notify *mn = arg;
switch (action) {
case MEM_GOING_OFFLINE:
/*
* Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
* and remove_all_stable_nodes() while memory is going offline:
* it is unsafe for them to touch the stable tree at this time.
* But unmerge_ksm_pages(), rmap lookups and other entry points
* which do not need the ksm_thread_mutex are all safe.
*/
mutex_lock(&ksm_thread_mutex);
ksm_run |= KSM_RUN_OFFLINE;
mutex_unlock(&ksm_thread_mutex);
break;
case MEM_OFFLINE:
/*
* Most of the work is done by page migration; but there might
* be a few stable_nodes left over, still pointing to struct
* pages which have been offlined: prune those from the tree,
* otherwise get_ksm_page() might later try to access a
* non-existent struct page.
*/
ksm_check_stable_tree(mn->start_pfn,
mn->start_pfn + mn->nr_pages);
/* fallthrough */
case MEM_CANCEL_OFFLINE:
mutex_lock(&ksm_thread_mutex);
ksm_run &= ~KSM_RUN_OFFLINE;
mutex_unlock(&ksm_thread_mutex);
smp_mb(); /* wake_up_bit advises this */
wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
break;
}
return NOTIFY_OK;
}
#else
static void wait_while_offlining(void)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
#ifdef CONFIG_SYSFS
/*
* This all compiles without CONFIG_SYSFS, but is a waste of space.
*/
#define KSM_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
static struct kobj_attribute _name##_attr = \
__ATTR(_name, 0644, _name##_show, _name##_store)
static ssize_t sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}
static ssize_t sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
unsigned long msecs;
int err;
err = kstrtoul(buf, 10, &msecs);
if (err || msecs > UINT_MAX)
return -EINVAL;
ksm_thread_sleep_millisecs = msecs;
return count;
}
KSM_ATTR(sleep_millisecs);
static ssize_t pages_to_scan_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int err;
unsigned long nr_pages;
err = kstrtoul(buf, 10, &nr_pages);
if (err || nr_pages > UINT_MAX)
return -EINVAL;
ksm_thread_pages_to_scan = nr_pages;
return count;
}
KSM_ATTR(pages_to_scan);
static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%lu\n", ksm_run);
}
static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
int err;
unsigned long flags;
err = kstrtoul(buf, 10, &flags);
if (err || flags > UINT_MAX)
return -EINVAL;
if (flags > KSM_RUN_UNMERGE)
return -EINVAL;
/*
* KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
* KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
* breaking COW to free the pages_shared (but leaves mm_slots
* on the list for when ksmd may be set running again).
*/
mutex_lock(&ksm_thread_mutex);
wait_while_offlining();
if (ksm_run != flags) {
ksm_run = flags;
if (flags & KSM_RUN_UNMERGE) {
set_current_oom_origin();
err = unmerge_and_remove_all_rmap_items();
clear_current_oom_origin();
if (err) {
ksm_run = KSM_RUN_STOP;
count = err;
}
}
}
mutex_unlock(&ksm_thread_mutex);
if (flags & KSM_RUN_MERGE)
wake_up_interruptible(&ksm_thread_wait);
return count;
}
KSM_ATTR(run);
#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%u\n", ksm_merge_across_nodes);
}
static ssize_t merge_across_nodes_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int err;
unsigned long knob;
err = kstrtoul(buf, 10, &knob);
if (err)
return err;
if (knob > 1)
return -EINVAL;
mutex_lock(&ksm_thread_mutex);
wait_while_offlining();
if (ksm_merge_across_nodes != knob) {
if (ksm_pages_shared || remove_all_stable_nodes())
err = -EBUSY;
else if (root_stable_tree == one_stable_tree) {
struct rb_root *buf;
/*
* This is the first time that we switch away from the
* default of merging across nodes: must now allocate
* a buffer to hold as many roots as may be needed.
* Allocate stable and unstable together:
* MAXSMP NODES_SHIFT 10 will use 16kB.
*/
buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
GFP_KERNEL);
/* Let us assume that RB_ROOT is NULL is zero */
if (!buf)
err = -ENOMEM;
else {
root_stable_tree = buf;
root_unstable_tree = buf + nr_node_ids;
/* Stable tree is empty but not the unstable */
root_unstable_tree[0] = one_unstable_tree[0];
}
}
if (!err) {
ksm_merge_across_nodes = knob;
ksm_nr_node_ids = knob ? 1 : nr_node_ids;
}
}
mutex_unlock(&ksm_thread_mutex);
return err ? err : count;
}
KSM_ATTR(merge_across_nodes);
#endif
static ssize_t pages_shared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);
static ssize_t pages_sharing_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);
static ssize_t pages_unshared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);
static ssize_t pages_volatile_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
long ksm_pages_volatile;
ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
- ksm_pages_sharing - ksm_pages_unshared;
/*
* It was not worth any locking to calculate that statistic,
* but it might therefore sometimes be negative: conceal that.
*/
if (ksm_pages_volatile < 0)
ksm_pages_volatile = 0;
return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);
static ssize_t full_scans_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);
static struct attribute *ksm_attrs[] = {
&sleep_millisecs_attr.attr,
&pages_to_scan_attr.attr,
&run_attr.attr,
&pages_shared_attr.attr,
&pages_sharing_attr.attr,
&pages_unshared_attr.attr,
&pages_volatile_attr.attr,
&full_scans_attr.attr,
#ifdef CONFIG_NUMA
&merge_across_nodes_attr.attr,
#endif
NULL,
};
static struct attribute_group ksm_attr_group = {
.attrs = ksm_attrs,
.name = "ksm",
};
#endif /* CONFIG_SYSFS */
static int __init ksm_init(void)
{
struct task_struct *ksm_thread;
int err;
err = ksm_slab_init();
if (err)
goto out;
ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
if (IS_ERR(ksm_thread)) {
pr_err("ksm: creating kthread failed\n");
err = PTR_ERR(ksm_thread);
goto out_free;
}
#ifdef CONFIG_SYSFS
err = sysfs_create_group(mm_kobj, &ksm_attr_group);
if (err) {
pr_err("ksm: register sysfs failed\n");
kthread_stop(ksm_thread);
goto out_free;
}
#else
ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
#endif /* CONFIG_SYSFS */
#ifdef CONFIG_MEMORY_HOTREMOVE
/* There is no significance to this priority 100 */
hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
return 0;
out_free:
ksm_slab_free();
out:
return err;
}
subsys_initcall(ksm_init);
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/list_lru.c
/*
* Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
* Authors: David Chinner and Glauber Costa
*
* Generic LRU infrastructure
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list_lru.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/memcontrol.h>
#ifdef CONFIG_MEMCG_KMEM
static LIST_HEAD(list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);
static void list_lru_register(struct list_lru *lru)
{
mutex_lock(&list_lrus_mutex);
list_add(&lru->list, &list_lrus);
mutex_unlock(&list_lrus_mutex);
}
static void list_lru_unregister(struct list_lru *lru)
{
mutex_lock(&list_lrus_mutex);
list_del(&lru->list);
mutex_unlock(&list_lrus_mutex);
}
#else
static void list_lru_register(struct list_lru *lru)
{
}
static void list_lru_unregister(struct list_lru *lru)
{
}
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_MEMCG_KMEM
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
return !!lru->node[0].memcg_lrus;
}
static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
{
/*
* The lock protects the array of per cgroup lists from relocation
* (see memcg_update_list_lru_node).
*/
lockdep_assert_held(&nlru->lock);
if (nlru->memcg_lrus && idx >= 0)
return nlru->memcg_lrus->lru[idx];
return &nlru->lru;
}
static inline struct list_lru_one *
list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
{
struct mem_cgroup *memcg;
if (!nlru->memcg_lrus)
return &nlru->lru;
memcg = mem_cgroup_from_kmem(ptr);
if (!memcg)
return &nlru->lru;
return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
}
#else
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
return false;
}
static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
{
return &nlru->lru;
}
static inline struct list_lru_one *
list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
{
return &nlru->lru;
}
#endif /* CONFIG_MEMCG_KMEM */
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
spin_lock(&nlru->lock);
l = list_lru_from_kmem(nlru, item);
if (list_empty(item)) {
list_add_tail(item, &l->list);
l->nr_items++;
spin_unlock(&nlru->lock);
return true;
}
spin_unlock(&nlru->lock);
return false;
}
EXPORT_SYMBOL_GPL(list_lru_add);
bool list_lru_del(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
spin_lock(&nlru->lock);
l = list_lru_from_kmem(nlru, item);
if (!list_empty(item)) {
list_del_init(item);
l->nr_items--;
spin_unlock(&nlru->lock);
return true;
}
spin_unlock(&nlru->lock);
return false;
}
EXPORT_SYMBOL_GPL(list_lru_del);
void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
{
list_del_init(item);
list->nr_items--;
}
EXPORT_SYMBOL_GPL(list_lru_isolate);
void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
struct list_head *head)
{
list_move(item, head);
list->nr_items--;
}
EXPORT_SYMBOL_GPL(list_lru_isolate_move);
static unsigned long __list_lru_count_one(struct list_lru *lru,
int nid, int memcg_idx)
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
unsigned long count;
spin_lock(&nlru->lock);
l = list_lru_from_memcg_idx(nlru, memcg_idx);
count = l->nr_items;
spin_unlock(&nlru->lock);
return count;
}
unsigned long list_lru_count_one(struct list_lru *lru,
int nid, struct mem_cgroup *memcg)
{
return __list_lru_count_one(lru, nid, memcg_cache_id(memcg));
}
EXPORT_SYMBOL_GPL(list_lru_count_one);
unsigned long list_lru_count_node(struct list_lru *lru, int nid)
{
long count = 0;
int memcg_idx;
count += __list_lru_count_one(lru, nid, -1);
if (list_lru_memcg_aware(lru)) {
for_each_memcg_cache_index(memcg_idx)
count += __list_lru_count_one(lru, nid, memcg_idx);
}
return count;
}
EXPORT_SYMBOL_GPL(list_lru_count_node);
static unsigned long
__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
struct list_head *item, *n;
unsigned long isolated = 0;
spin_lock(&nlru->lock);
l = list_lru_from_memcg_idx(nlru, memcg_idx);
restart:
list_for_each_safe(item, n, &l->list) {
enum lru_status ret;
/*
* decrement nr_to_walk first so that we don't livelock if we
* get stuck on large numbesr of LRU_RETRY items
*/
if (!*nr_to_walk)
break;
--*nr_to_walk;
ret = isolate(item, l, &nlru->lock, cb_arg);
switch (ret) {
case LRU_REMOVED_RETRY:
assert_spin_locked(&nlru->lock);
case LRU_REMOVED:
isolated++;
/*
* If the lru lock has been dropped, our list
* traversal is now invalid and so we have to
* restart from scratch.
*/
if (ret == LRU_REMOVED_RETRY)
goto restart;
break;
case LRU_ROTATE:
list_move_tail(item, &l->list);
break;
case LRU_SKIP:
break;
case LRU_RETRY:
/*
* The lru lock has been dropped, our list traversal is
* now invalid and so we have to restart from scratch.
*/
assert_spin_locked(&nlru->lock);
goto restart;
default:
BUG();
}
}
spin_unlock(&nlru->lock);
return isolated;
}
unsigned long
list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg),
isolate, cb_arg, nr_to_walk);
}
EXPORT_SYMBOL_GPL(list_lru_walk_one);
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
long isolated = 0;
int memcg_idx;
isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
nr_to_walk);
if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
for_each_memcg_cache_index(memcg_idx) {
isolated += __list_lru_walk_one(lru, nid, memcg_idx,
isolate, cb_arg, nr_to_walk);
if (*nr_to_walk <= 0)
break;
}
}
return isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
static void init_one_lru(struct list_lru_one *l)
{
INIT_LIST_HEAD(&l->list);
l->nr_items = 0;
}
#ifdef CONFIG_MEMCG_KMEM
static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
int begin, int end)
{
int i;
for (i = begin; i < end; i++)
kfree(memcg_lrus->lru[i]);
}
static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
int begin, int end)
{
int i;
for (i = begin; i < end; i++) {
struct list_lru_one *l;
l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
if (!l)
goto fail;
init_one_lru(l);
memcg_lrus->lru[i] = l;
}
return 0;
fail:
__memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1);
return -ENOMEM;
}
static int memcg_init_list_lru_node(struct list_lru_node *nlru)
{
int size = memcg_nr_cache_ids;
nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
if (!nlru->memcg_lrus)
return -ENOMEM;
if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
kfree(nlru->memcg_lrus);
return -ENOMEM;
}
return 0;
}
static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
{
__memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
kfree(nlru->memcg_lrus);
}
static int memcg_update_list_lru_node(struct list_lru_node *nlru,
int old_size, int new_size)
{
struct list_lru_memcg *old, *new;
BUG_ON(old_size > new_size);
old = nlru->memcg_lrus;
new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
if (!new)
return -ENOMEM;
if (__memcg_init_list_lru_node(new, old_size, new_size)) {
kfree(new);
return -ENOMEM;
}
memcpy(new, old, old_size * sizeof(void *));
/*
* The lock guarantees that we won't race with a reader
* (see list_lru_from_memcg_idx).
*
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
* we have to use IRQ-safe primitives here to avoid deadlock.
*/
spin_lock_irq(&nlru->lock);
nlru->memcg_lrus = new;
spin_unlock_irq(&nlru->lock);
kfree(old);
return 0;
}
static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
int old_size, int new_size)
{
/* do not bother shrinking the array back to the old size, because we
* cannot handle allocation failures here */
__memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size);
}
static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
int i;
for (i = 0; i < nr_node_ids; i++) {
if (!memcg_aware)
lru->node[i].memcg_lrus = NULL;
else if (memcg_init_list_lru_node(&lru->node[i]))
goto fail;
}
return 0;
fail:
for (i = i - 1; i >= 0; i--)
memcg_destroy_list_lru_node(&lru->node[i]);
return -ENOMEM;
}
static void memcg_destroy_list_lru(struct list_lru *lru)
{
int i;
if (!list_lru_memcg_aware(lru))
return;
for (i = 0; i < nr_node_ids; i++)
memcg_destroy_list_lru_node(&lru->node[i]);
}
static int memcg_update_list_lru(struct list_lru *lru,
int old_size, int new_size)
{
int i;
if (!list_lru_memcg_aware(lru))
return 0;
for (i = 0; i < nr_node_ids; i++) {
if (memcg_update_list_lru_node(&lru->node[i],
old_size, new_size))
goto fail;
}
return 0;
fail:
for (i = i - 1; i >= 0; i--)
memcg_cancel_update_list_lru_node(&lru->node[i],
old_size, new_size);
return -ENOMEM;
}
static void memcg_cancel_update_list_lru(struct list_lru *lru,
int old_size, int new_size)
{
int i;
if (!list_lru_memcg_aware(lru))
return;
for (i = 0; i < nr_node_ids; i++)
memcg_cancel_update_list_lru_node(&lru->node[i],
old_size, new_size);
}
int memcg_update_all_list_lrus(int new_size)
{
int ret = 0;
struct list_lru *lru;
int old_size = memcg_nr_cache_ids;
mutex_lock(&list_lrus_mutex);
list_for_each_entry(lru, &list_lrus, list) {
ret = memcg_update_list_lru(lru, old_size, new_size);
if (ret)
goto fail;
}
out:
mutex_unlock(&list_lrus_mutex);
return ret;
fail:
list_for_each_entry_continue_reverse(lru, &list_lrus, list)
memcg_cancel_update_list_lru(lru, old_size, new_size);
goto out;
}
static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
int src_idx, int dst_idx)
{
struct list_lru_one *src, *dst;
/*
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
* we have to use IRQ-safe primitives here to avoid deadlock.
*/
spin_lock_irq(&nlru->lock);
src = list_lru_from_memcg_idx(nlru, src_idx);
dst = list_lru_from_memcg_idx(nlru, dst_idx);
list_splice_init(&src->list, &dst->list);
dst->nr_items += src->nr_items;
src->nr_items = 0;
spin_unlock_irq(&nlru->lock);
}
static void memcg_drain_list_lru(struct list_lru *lru,
int src_idx, int dst_idx)
{
int i;
if (!list_lru_memcg_aware(lru))
return;
for (i = 0; i < nr_node_ids; i++)
memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
}
void memcg_drain_all_list_lrus(int src_idx, int dst_idx)
{
struct list_lru *lru;
mutex_lock(&list_lrus_mutex);
list_for_each_entry(lru, &list_lrus, list)
memcg_drain_list_lru(lru, src_idx, dst_idx);
mutex_unlock(&list_lrus_mutex);
}
#else
static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
return 0;
}
static void memcg_destroy_list_lru(struct list_lru *lru)
{
}
#endif /* CONFIG_MEMCG_KMEM */
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
struct lock_class_key *key)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
int err = -ENOMEM;
memcg_get_cache_ids();
lru->node = kzalloc(size, GFP_KERNEL);
if (!lru->node)
goto out;
for (i = 0; i < nr_node_ids; i++) {
spin_lock_init(&lru->node[i].lock);
if (key)
lockdep_set_class(&lru->node[i].lock, key);
init_one_lru(&lru->node[i].lru);
}
err = memcg_init_list_lru(lru, memcg_aware);
if (err) {
kfree(lru->node);
goto out;
}
list_lru_register(lru);
out:
memcg_put_cache_ids();
return err;
}
EXPORT_SYMBOL_GPL(__list_lru_init);
void list_lru_destroy(struct list_lru *lru)
{
/* Already destroyed or not yet initialized? */
if (!lru->node)
return;
memcg_get_cache_ids();
list_lru_unregister(lru);
memcg_destroy_list_lru(lru);
kfree(lru->node);
lru->node = NULL;
memcg_put_cache_ids();
}
EXPORT_SYMBOL_GPL(list_lru_destroy);
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/maccess.c
/*
* Access kernel memory without faulting.
*/
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
/**
* probe_kernel_read(): safely attempt to read from a location
* @dst: pointer to the buffer that shall take the data
* @src: address to read from
* @size: size of the data chunk
*
* Safely read from address @src to the buffer at @dst. If a kernel fault
* happens, handle that and return -EFAULT.
*/
long __weak probe_kernel_read(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_read")));
long __probe_kernel_read(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
pagefault_disable();
ret = __copy_from_user_inatomic(dst,
(__force const void __user *)src, size);
pagefault_enable();
set_fs(old_fs);
return ret ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(probe_kernel_read);
/**
* probe_kernel_write(): safely attempt to write to a location
* @dst: address to write to
* @src: pointer to the data that shall be written
* @size: size of the data chunk
*
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
long __weak probe_kernel_write(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_write")));
long __probe_kernel_write(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
pagefault_disable();
ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
pagefault_enable();
set_fs(old_fs);
return ret ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(probe_kernel_write);
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/madvise.c
/*
* linux/mm/madvise.c
*
* Copyright (C) 1999 Linus Torvalds
* Copyright (C) 2002 Christoph Hellwig
*/
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_sem for writing. Others, which simply traverse vmas, need
* to only take it for reading.
*/
static int madvise_need_mmap_write(int behavior)
{
switch (behavior) {
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
return 1;
}
}
/*
* We can potentially split a vm area into separate
* areas, each area with its own behavior.
*/
static long madvise_behavior(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
struct mm_struct *mm = vma->vm_mm;
int error = 0;
pgoff_t pgoff;
unsigned long new_flags = vma->vm_flags;
switch (behavior) {
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
case MADV_SEQUENTIAL:
new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
break;
case MADV_RANDOM:
new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
break;
case MADV_DONTFORK:
new_flags |= VM_DONTCOPY;
break;
case MADV_DOFORK:
if (vma->vm_flags & VM_IO) {
error = -EINVAL;
goto out;
}
new_flags &= ~VM_DONTCOPY;
break;
case MADV_DONTDUMP:
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
if (new_flags & VM_SPECIAL) {
error = -EINVAL;
goto out;
}
new_flags &= ~VM_DONTDUMP;
break;
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
error = ksm_madvise(vma, start, end, behavior, &new_flags);
if (error)
goto out;
break;
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
error = hugepage_madvise(vma, &new_flags, behavior);
if (error)
goto out;
break;
}
if (new_flags == vma->vm_flags) {
*prev = vma;
goto out;
}
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma));
if (*prev) {
vma = *prev;
goto success;
}
*prev = vma;
if (start != vma->vm_start) {
error = split_vma(mm, vma, start, 1);
if (error)
goto out;
}
if (end != vma->vm_end) {
error = split_vma(mm, vma, end, 0);
if (error)
goto out;
}
success:
/*
* vm_flags is protected by the mmap_sem held in write mode.
*/
vma->vm_flags = new_flags;
out:
if (error == -ENOMEM)
error = -EAGAIN;
return error;
}
#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
unsigned long end, struct mm_walk *walk)
{
pte_t *orig_pte;
struct vm_area_struct *vma = walk->private;
unsigned long index;
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
return 0;
for (index = start; index != end; index += PAGE_SIZE) {
pte_t pte;
swp_entry_t entry;
struct page *page;
spinlock_t *ptl;
orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
pte = *(orig_pte + ((index - start) / PAGE_SIZE));
pte_unmap_unlock(orig_pte, ptl);
if (pte_present(pte) || pte_none(pte))
continue;
entry = pte_to_swp_entry(pte);
if (unlikely(non_swap_entry(entry)))
continue;
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
vma, index);
if (page)
page_cache_release(page);
}
return 0;
}
static void force_swapin_readahead(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
struct mm_walk walk = {
.mm = vma->vm_mm,
.pmd_entry = swapin_walk_pmd_entry,
.private = vma,
};
walk_page_range(start, end, &walk);
lru_add_drain(); /* Push any new pages onto the LRU now */
}
static void force_shm_swapin_readahead(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct address_space *mapping)
{
pgoff_t index;
struct page *page;
swp_entry_t swap;
for (; start < end; start += PAGE_SIZE) {
index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
page = find_get_entry(mapping, index);
if (!radix_tree_exceptional_entry(page)) {
if (page)
page_cache_release(page);
continue;
}
swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
NULL, 0);
if (page)
page_cache_release(page);
}
lru_add_drain(); /* Push any new pages onto the LRU now */
}
#endif /* CONFIG_SWAP */
/*
* Schedule all required I/O operations. Do not wait for completion.
*/
static long madvise_willneed(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
struct file *file = vma->vm_file;
#ifdef CONFIG_SWAP
if (!file) {
*prev = vma;
force_swapin_readahead(vma, start, end);
return 0;
}
if (shmem_mapping(file->f_mapping)) {
*prev = vma;
force_shm_swapin_readahead(vma, start, end,
file->f_mapping);
return 0;
}
#else
if (!file)
return -EBADF;
#endif
if (IS_DAX(file_inode(file))) {
/* no bad return value, but ignore advice */
return 0;
}
*prev = vma;
start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
if (end > vma->vm_end)
end = vma->vm_end;
end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
force_page_cache_readahead(file->f_mapping, file, start, end - start);
return 0;
}
/*
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
* data it wants to keep. Be sure to free swap resources too. The
* zap_page_range call sets things up for shrink_active_list to actually free
* these pages later if no one else has touched them in the meantime,
* although we could add these pages to a global reuse list for
* shrink_active_list to pick up before reclaiming other pages.
*
* NB: This interface discards data rather than pushes it out to swap,
* as some implementations do. This has performance implications for
* applications like large transactional databases which want to discard
* pages in anonymous maps after committing to backing store the data
* that was kept in them. There is no reason to write this data out to
* the swap area if the application is discarding it.
*
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
static long madvise_dontneed(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
*prev = vma;
if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
return -EINVAL;
zap_page_range(vma, start, end - start, NULL);
return 0;
}
/*
* Application wants to free up the pages and associated backing store.
* This is effectively punching a hole into the middle of a file.
*/
static long madvise_remove(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
loff_t offset;
int error;
struct file *f;
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
return -EINVAL;
f = vma->vm_file;
if (!f || !f->f_mapping || !f->f_mapping->host) {
return -EINVAL;
}
if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
return -EACCES;
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
/*
* Filesystem's fallocate may need to take i_mutex. We need to
* explicitly grab a reference because the vma (and hence the
* vma's reference to the file) can go away as soon as we drop
* mmap_sem.
*/
get_file(f);
up_read(¤t->mm->mmap_sem);
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
fput(f);
down_read(¤t->mm->mmap_sem);
return error;
}
#ifdef CONFIG_MEMORY_FAILURE
/*
* Error injection support for memory error handling.
*/
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
struct page *p;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
for (; start < end; start += PAGE_SIZE <<
compound_order(compound_head(p))) {
int ret;
ret = get_user_pages_fast(start, 1, 0, &p);
if (ret != 1)
return ret;
if (PageHWPoison(p)) {
put_page(p);
continue;
}
if (bhv == MADV_SOFT_OFFLINE) {
pr_info("Soft offlining page %#lx at %#lx\n",
page_to_pfn(p), start);
ret = soft_offline_page(p, MF_COUNT_INCREASED);
if (ret)
return ret;
continue;
}
pr_info("Injecting memory failure for page %#lx at %#lx\n",
page_to_pfn(p), start);
/* Ignore return value for now */
memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
}
return 0;
}
#endif
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
switch (behavior) {
case MADV_REMOVE:
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
case MADV_DONTNEED:
return madvise_dontneed(vma, prev, start, end);
default:
return madvise_behavior(vma, prev, start, end, behavior);
}
}
static int
madvise_behavior_valid(int behavior)
{
switch (behavior) {
case MADV_DOFORK:
case MADV_DONTFORK:
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
return 1;
default:
return 0;
}
}
/*
* The madvise(2) system call.
*
* Applications can use madvise() to advise the kernel how it should
* handle paging I/O in this VM area. The idea is to help the kernel
* use appropriate read-ahead and caching techniques. The information
* provided is advisory only, and can be safely disregarded by the
* kernel without affecting the correct operation of the application.
*
* behavior values:
* MADV_NORMAL - the default behavior is to read clusters. This
* results in some read-ahead and read-behind.
* MADV_RANDOM - the system should read the minimum amount of data
* on any access, since it is unlikely that the appli-
* cation will need more than what it asks for.
* MADV_SEQUENTIAL - pages in the given range will probably be accessed
* once, so they can be aggressively read ahead, and
* can be freed soon after they are accessed.
* MADV_WILLNEED - the application is notifying the system to read
* some pages ahead.
* MADV_DONTNEED - the application is finished with the given range,
* so the kernel can free resources associated with it.
* MADV_REMOVE - the application wants to free up the given range of
* pages and associated backing store.
* MADV_DONTFORK - omit this area from child's address space when forking:
* typically, to avoid COWing pages pinned by get_user_pages().
* MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
* MADV_MERGEABLE - the application recommends that KSM try to merge pages in
* this area with pages of identical content from other such areas.
* MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
*
* return values:
* zero - success
* -EINVAL - start + len < 0, start is not page-aligned,
* "behavior" is not a valid value, or application
* is attempting to release locked or shared pages.
* -ENOMEM - addresses in the specified range are not currently
* mapped, or are outside the AS of the process.
* -EIO - an I/O error occurred while paging in data.
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
int write;
size_t len;
struct blk_plug plug;
#ifdef CONFIG_MEMORY_FAILURE
if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
return madvise_hwpoison(behavior, start, start+len_in);
#endif
if (!madvise_behavior_valid(behavior))
return error;
if (start & ~PAGE_MASK)
return error;
len = (len_in + ~PAGE_MASK) & PAGE_MASK;
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
return error;
end = start + len;
if (end < start)
return error;
error = 0;
if (end == start)
return error;
write = madvise_need_mmap_write(behavior);
if (write)
down_write(¤t->mm->mmap_sem);
else
down_read(¤t->mm->mmap_sem);
/*
* If the interval [start,end) covers some unmapped address
* ranges, just ignore them, but return -ENOMEM at the end.
* - different from the way of handling in mlock etc.
*/
vma = find_vma_prev(current->mm, start, &prev);
if (vma && start > vma->vm_start)
prev = vma;
blk_start_plug(&plug);
for (;;) {
/* Still start < end. */
error = -ENOMEM;
if (!vma)
goto out;
/* Here start < (end|vma->vm_end). */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
start = vma->vm_start;
if (start >= end)
goto out;
}
/* Here vma->vm_start <= start < (end|vma->vm_end) */
tmp = vma->vm_end;
if (end < tmp)
tmp = end;
/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
error = madvise_vma(vma, &prev, start, tmp, behavior);
if (error)
goto out;
start = tmp;
if (prev && start < prev->vm_end)
start = prev->vm_end;
error = unmapped_error;
if (start >= end)
goto out;
if (prev)
vma = prev->vm_next;
else /* madvise_remove dropped mmap_sem */
vma = find_vma(current->mm, start);
}
out:
blk_finish_plug(&plug);
if (write)
up_write(¤t->mm->mmap_sem);
else
up_read(¤t->mm->mmap_sem);
return error;
}
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/Makefile
#
# Makefile for the linux memory manager.
#
KASAN_SANITIZE_slab_common.o := n
KASAN_SANITIZE_slub.o := n
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
vmalloc.o pagewalk.o pgtable-generic.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
mmu-$(CONFIG_MMU) += process_vm_access.o
endif
obj-y := filemap.o mempool.o oom_kill.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
debug.o $(mmu-y)
obj-y += init-mm.o
ifdef CONFIG_NO_BOOTMEM
obj-y += nobootmem.o
else
obj-y += bootmem.o
endif
obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o
ifdef CONFIG_MMU
obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
endif
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_KSM) += ksm.o
obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
obj-$(CONFIG_ZBUD) += zbud.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\mm/memblock.c
/*
* Procedures for maintaining information about logical memory blocks.
*
* Peter Bergner, IBM Corp. June 2001.
* Copyright (C) 2001 Peter Bergner.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/poison.h>
#include <linux/pfn.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/memblock.h>
#include <asm-generic/sections.h>
#include <linux/io.h>
#include "internal.h"
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
#endif
struct memblock memblock __initdata_memblock = {
.memory.regions = memblock_memory_init_regions,
.memory.cnt = 1, /* empty dummy entry */
.memory.max = INIT_MEMBLOCK_REGIONS,
.reserved.regions = memblock_reserved_init_regions,
.reserved.cnt = 1, /* empty dummy entry */
.reserved.max = INIT_MEMBLOCK_REGIONS,
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
.physmem.regions = memblock_physmem_init_regions,
.physmem.cnt = 1, /* empty dummy entry */
.physmem.max = INIT_PHYSMEM_REGIONS,
#endif
.bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
int memblock_debug __initdata_memblock;
#ifdef CONFIG_MOVABLE_NODE
bool movable_node_enabled __initdata_memblock = false;
#endif
static bool system_has_some_mirror __initdata_memblock = false;
static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
ulong __init_memblock choose_memblock_flags(void)
{
return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
}
/* inline so we don't get a warning when pr_debug is compiled out */
static __init_memblock const char *
memblock_type_name(struct memblock_type *type)
{
if (type == &memblock.memory)
return "memory";
else if (type == &memblock.reserved)
return "reserved";
else
return "unknown";
}
/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */
static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
{
return *size = min(*size, (phys_addr_t)ULLONG_MAX - base);
}
/*
* Address comparison utilities
*/
static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
phys_addr_t base2, phys_addr_t size2)
{
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
}
static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
phys_addr_t base, phys_addr_t size)
{
unsigned long i;
for (i = 0; i < type->cnt; i++) {
phys_addr_t rgnbase = type->regions[i].base;
phys_addr_t rgnsize = type->regions[i].size;
if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
break;
}
return (i < type->cnt) ? i : -1;
}
/*
* __memblock_find_range_bottom_up - find free area utility in bottom-up
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
* @flags: pick from blocks based on memory attributes
*
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
*
* RETURNS:
* Found address on success, 0 on failure.
*/
static phys_addr_t __init_memblock
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align, int nid,
ulong flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
cand = round_up(this_start, align);
if (cand < this_end && this_end - cand >= size)
return cand;
}
return 0;
}
/**
* __memblock_find_range_top_down - find free area utility, in top-down
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
* @flags: pick from blocks based on memory attributes
*
* Utility called from memblock_find_in_range_node(), find free area top-down.
*
* RETURNS:
* Found address on success, 0 on failure.
*/
static phys_addr_t __init_memblock
__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align, int nid,
ulong flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
if (this_end < size)
continue;
cand = round_down(this_end - size, align);
if (cand >= this_start)
return cand;
}
return 0;
}
/**
* memblock_find_in_range_node - find free area in given range and node
* @size: size of free area to find
* @align: alignment of free area to find
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
* @flags: pick from blocks based on memory attributes
*
* Find @size free area aligned to @align in the specified range and node.
*
* When allocation direction is bottom-up, the @start should be greater
* than the end of the kernel image. Otherwise, it will be trimmed. The
* reason is that we want the bottom-up allocation just near the kernel
* image so it is highly likely that the allocated memory and the kernel
* will reside in the same node.
*
* If bottom-up allocation failed, will try to allocate memory top-down.
*
* RETURNS:
* Found address on success, 0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
phys_addr_t end, int nid, ulong flags)
{
phys_addr_t kernel_end, ret;
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
end = memblock.current_limit;
/* avoid allocating the first page */
start = max_t(phys_addr_t, start, PAGE_SIZE);
end = max(start, end);
kernel_end = __pa_symbol(_end);
/*
* try bottom-up allocation only when bottom-up mode
* is set and @end is above the kernel image.
*/
if (memblock_bottom_up() && end > kernel_end) {
phys_addr_t bottom_up_start;
/* make sure we will allocate above the kernel */
bottom_up_start = max(start, kernel_end);
/* ok, try bottom-up allocation first */
ret = __memblock_find_range_bottom_up(bottom_up_start, end,
size, align, nid, flags);
if (ret)
return ret;
/*
* we always limit bottom-up allocation above the kernel,
* but top-down allocation doesn't have the limit, so
* retrying top-down allocation may succeed when bottom-up
* allocation failed.
*
* bottom-up allocation is expected to be fail very rarely,
* so we use WARN_ONCE() here to see the stack trace if
* fail happens.
*/
WARN_ONCE(1, "memblock: bottom-up allocation failed, "
"memory hotunplug may be affected\n");
}
return __memblock_find_range_top_down(start, end, size, align, nid,
flags);
}
/**
* memblock_find_in_range - find free area in given range
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
*
* Find @size free area aligned to @align in the specified range.
*
* RETURNS:
* Found address on success, 0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align)
{
phys_addr_t ret;
ulong flags = choose_memblock_flags();
again:
ret = memblock_find_in_range_node(size, align, start, end,
NUMA_NO_NODE, flags);
if (!ret && (flags & MEMBLOCK_MIRROR)) {
pr_warn("Could not allocate %pap bytes of mirrored memory\n",
&size);
flags &= ~MEMBLOCK_MIRROR;
goto again;
}
return ret;
}
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
{
type->total_size -= type->regions[r].size;
memmove(&type->regions[r], &type->regions[r + 1],
(type->cnt - (r + 1)) * sizeof(type->regions[r]));
type->cnt--;
/* Special case for empty arrays */
if (type->cnt == 0) {
WARN_ON(type->total_size != 0);
type->cnt = 1;
type->regions[0].base = 0;
type->regions[0].size = 0;
type->regions[0].flags = 0;
memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
}
}
#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
phys_addr_t *addr)
{
if (memblock.reserved.regions == memblock_reserved_init_regions)
return 0;
*addr = __pa(memblock.reserved.regions);
return PAGE_ALIGN(sizeof(struct memblock_region) *
memblock.reserved.max);
}
phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
phys_addr_t *addr)
{
if (memblock.memory.regions == memblock_memory_init_regions)
return 0;
*addr = __pa(memblock.memory.regions);
return PAGE_ALIGN(sizeof(struct memblock_region) *
memblock.memory.max);
}
#endif
/**
* memblock_double_array - double the size of the memblock regions array
* @type: memblock type of the regions array being doubled
* @new_area_start: starting address of memory range to avoid overlap with
* @new_area_size: size of memory range to avoid overlap with
*
* Double the size of the @type regions array. If memblock is being used to
* allocate memory for a new reserved regions array and there is a previously
* allocated memory range [@new_area_start,@new_area_start+@new_area_size]
* waiting to be reserved, ensure the memory used by the new array does
* not overlap.
*
* RETURNS:
* 0 on success, -1 on failure.
*/
static int __init_memblock memblock_double_array(struct memblock_type *type,
phys_addr_t new_area_start,
phys_addr_t new_area_size)
{
struct memblock_region *new_array, *old_array;
phys_addr_t old_alloc_size, new_alloc_size;
phys_addr_t old_size, new_size, addr;
int use_slab = slab_is_available();
int *in_slab;
/* We don't allow resizing until we know about the reserved regions
* of memory that aren't suitable for allocation
*/
if (!memblock_can_resize)
return -1;
/* Calculate new doubled size */
old_size = type->max * sizeof(struct memblock_region);
new_size = old_size << 1;
/*
* We need to allocated new one align to PAGE_SIZE,
* so we can free them completely later.
*/
old_alloc_size = PAGE_ALIGN(old_size);
new_alloc_size = PAGE_ALIGN(new_size);
/* Retrieve the slab flag */
if (type == &memblock.memory)
in_slab = &memblock_memory_in_slab;
else
in_slab = &memblock_reserved_in_slab;
/* Try to find some space for it.
*
* WARNING: We assume that either slab_is_available() and we use it or
* we use MEMBLOCK for allocations. That means that this is unsafe to
* use when bootmem is currently active (unless bootmem itself is
* implemented on top of MEMBLOCK which isn't the case yet)
*
* This should however not be an issue for now, as we currently only
* call into MEMBLOCK while it's still active, or much later when slab
* is active for memory hotplug operations
*/
if (use_slab) {
new_array = kmalloc(new_size, GFP_KERNEL);
addr = new_array ? __pa(new_array) : 0;
} else {
/* only exclude range when trying to double reserved.regions */
if (type != &memblock.reserved)
new_area_start = new_area_size = 0;
addr = memblock_find_in_range(new_area_start + new_area_size,
memblock.current_limit,
new_alloc_size, PAGE_SIZE);
if (!addr && new_area_size)
addr = memblock_find_in_range(0,
min(new_area_start, memblock.current_limit),
new_alloc_size, PAGE_SIZE);
new_array = addr ? __va(addr) : NULL;
}
if (!addr) {
pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
memblock_type_name(type), type->max, type->max * 2);
return -1;
}
memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
memblock_type_name(type), type->max * 2, (u64)addr,
(u64)addr + new_size - 1);
/*
* Found space, we now need to move the array over before we add the
* reserved region since it may be our reserved array itself that is
* full.
*/
memcpy(new_array, type->regions, old_size);
memset(new_array + type->max, 0, old_size);
old_array = type->regions;
type->regions = new_array;
type->max <<= 1;
/* Free old array. We needn't free it if the array is the static one */
if (*in_slab)
kfree(old_array);
else if (old_array != memblock_memory_init_regions &&
old_array != memblock_reserved_init_regions)
memblock_free(__pa(old_array), old_alloc_size);
/*
* Reserve the new array if that comes from the memblock. Otherwise, we
* needn't do it
*/
if (!use_slab)
BUG_ON(memblock_reserve(addr, new_alloc_size));
/* Update slab flag */
*in_slab = use_slab;
return 0;
}
mm4.txt
最新推荐文章于 2024-11-02 09:55:19 发布