/*
* Carry out RCU priority boosting on the task indicated by ->exp_tasks
* or ->boost_tasks, advancing the pointer to the next task in the
* ->blkd_tasks list.
*
* Note that irqs must be enabled: boosting the task can block.
* Returns 1 if there are more tasks needing to be boosted.
*/
static int rcu_boost(struct rcu_node *rnp)
{
unsigned long flags;
struct task_struct *t;
struct list_head *tb;
if (READ_ONCE(rnp->exp_tasks) == NULL &&
READ_ONCE(rnp->boost_tasks) == NULL)
return 0; /* Nothing left to boost. */
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
/*
* Recheck under the lock: all tasks in need of boosting
* might exit their RCU read-side critical sections on their own.
*/
if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return 0;
}
/*
* Preferentially boost tasks blocking expedited grace periods.
* This cannot starve the normal grace periods because a second
* expedited grace period must boost all blocked tasks, including
* those blocking the pre-existing normal grace period.
*/
if (rnp->exp_tasks != NULL) {
tb = rnp->exp_tasks;
rnp->n_exp_boosts++;
} else {
tb = rnp->boost_tasks;
rnp->n_normal_boosts++;
}
rnp->n_tasks_boosted++;
/*
* We boost task t by manufacturing an rt_mutex that appears to
* be held by task t. We leave a pointer to that rt_mutex where
* task t can find it, and task t will release the mutex when it
* exits its outermost RCU read-side critical section. Then
* simply acquiring this artificial rt_mutex will boost task
* t's priority. (Thanks to tglx for suggesting this approach!)
*
* Note that task t must acquire rnp->lock to remove itself from
* the ->blkd_tasks list, which it will do from exit() if from
* nowhere else. We therefore are guaranteed that task t will
* stay around at least until we drop rnp->lock. Note that
* rnp->lock also resolves races between our priority boosting
* and task t's exiting its outermost RCU read-side critical
* section.
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
return READ_ONCE(rnp->exp_tasks) != NULL ||
READ_ONCE(rnp->boost_tasks) != NULL;
}
/*
* Priority-boosting kthread. One per leaf rcu_node and one for the
* root rcu_node.
*/
static int rcu_boost_kthread(void *arg)
{
struct rcu_node *rnp = (struct rcu_node *)arg;
int spincnt = 0;
int more2boost;
trace_rcu_utilization(TPS("Start boost kthread@init"));
for (;;) {
rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
more2boost = rcu_boost(rnp);
if (more2boost)
spincnt++;
else
spincnt = 0;
if (spincnt > 10) {
rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
schedule_timeout_interruptible(2);
trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
spincnt = 0;
}
}
/* NOTREACHED */
trace_rcu_utilization(TPS("End boost kthread@notreached"));
return 0;
}
/*
* Check to see if it is time to start boosting RCU readers that are
* blocking the current grace period, and, if so, tell the per-rcu_node
* kthread to start boosting them. If there is an expedited grace
* period in progress, it is always time to boost.
*
* The caller must hold rnp->lock, which this function releases.
* The ->boost_kthread_task is immortal, so we don't need to worry
* about it going away.
*/
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
struct task_struct *t;
if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
rnp->n_balk_exp_gp_tasks++;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
if (rnp->exp_tasks != NULL ||
(rnp->gp_tasks != NULL &&
rnp->boost_tasks == NULL &&
rnp->qsmask == 0 &&
ULONG_CMP_GE(jiffies, rnp->boost_time))) {
if (rnp->exp_tasks == NULL)
rnp->boost_tasks = rnp->gp_tasks;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
t = rnp->boost_kthread_task;
if (t)
rcu_wake_cond(t, rnp->boost_kthread_status);
} else {
rcu_initiate_boost_trace(rnp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
}
/*
* Wake up the per-CPU kthread to invoke RCU callbacks.
*/
static void invoke_rcu_callbacks_kthread(void)
{
unsigned long flags;
local_irq_save(flags);
__this_cpu_write(rcu_cpu_has_work, 1);
if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
current != __this_cpu_read(rcu_cpu_kthread_task)) {
rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
__this_cpu_read(rcu_cpu_kthread_status));
}
local_irq_restore(flags);
}
/*
* Is the current CPU running the RCU-callbacks kthread?
* Caller must have preemption disabled.
*/
static bool rcu_is_callbacks_kthread(void)
{
return __this_cpu_read(rcu_cpu_kthread_task) == current;
}
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
/*
* Do priority-boost accounting for the start of a new grace period.
*/
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
}
/*
* Create an RCU-boost kthread for the specified node if one does not
* already exist. We only create this kthread for preemptible RCU.
* Returns zero if all is well, a negated errno otherwise.
*/
static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct rcu_node *rnp)
{
int rnp_index = rnp - &rsp->node[0];
unsigned long flags;
struct sched_param sp;
struct task_struct *t;
if (rcu_state_p != rsp)
return 0;
if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
return 0;
rsp->boost = 1;
if (rnp->boost_kthread_task != NULL)
return 0;
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
if (IS_ERR(t))
return PTR_ERR(t);
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
return 0;
}
static void rcu_kthread_do_work(void)
{
rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
rcu_preempt_do_callbacks();
}
static void rcu_cpu_kthread_setup(unsigned int cpu)
{
struct sched_param sp;
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
}
static void rcu_cpu_kthread_park(unsigned int cpu)
{
per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
}
static int rcu_cpu_kthread_should_run(unsigned int cpu)
{
return __this_cpu_read(rcu_cpu_has_work);
}
/*
* Per-CPU kernel thread that invokes RCU callbacks. This replaces the
* RCU softirq used in flavors and configurations of RCU that do not
* support RCU priority boosting.
*/
static void rcu_cpu_kthread(unsigned int cpu)
{
unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
int spincnt;
for (spincnt = 0; spincnt < 10; spincnt++) {
trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
local_bh_disable();
*statusp = RCU_KTHREAD_RUNNING;
this_cpu_inc(rcu_cpu_kthread_loops);
local_irq_disable();
work = *workp;
*workp = 0;
local_irq_enable();
if (work)
rcu_kthread_do_work();
local_bh_enable();
if (*workp == 0) {
trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
*statusp = RCU_KTHREAD_WAITING;
return;
}
}
*statusp = RCU_KTHREAD_YIELDING;
trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
schedule_timeout_interruptible(2);
trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
*statusp = RCU_KTHREAD_WAITING;
}
/*
* Set the per-rcu_node kthread's affinity to cover all CPUs that are
* served by the rcu_node in question. The CPU hotplug lock is still
* held, so the value of rnp->qsmaskinit will be stable.
*
* We don't include outgoingcpu in the affinity set, use -1 if there is
* no outgoing CPU. If there are no CPUs left in the affinity set,
* this function allows the kthread to execute on any CPU.
*/
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
struct task_struct *t = rnp->boost_kthread_task;
unsigned long mask = rcu_rnp_online_cpus(rnp);
cpumask_var_t cm;
int cpu;
if (!t)
return;
if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
return;
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
if ((mask & 0x1) && cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
if (cpumask_weight(cm) == 0)
cpumask_setall(cm);
set_cpus_allowed_ptr(t, cm);
free_cpumask_var(cm);
}
static struct smp_hotplug_thread rcu_cpu_thread_spec = {
.store = &rcu_cpu_kthread_task,
.thread_should_run = rcu_cpu_kthread_should_run,
.thread_fn = rcu_cpu_kthread,
.thread_comm = "rcuc/%u",
.setup = rcu_cpu_kthread_setup,
.park = rcu_cpu_kthread_park,
};
/*
* Spawn boost kthreads -- called as soon as the scheduler is running.
*/
static void __init rcu_spawn_boost_kthreads(void)
{
struct rcu_node *rnp;
int cpu;
for_each_possible_cpu(cpu)
per_cpu(rcu_cpu_has_work, cpu) = 0;
BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
rcu_for_each_leaf_node(rcu_state_p, rnp)
(void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
}
static void rcu_prepare_kthreads(int cpu)
{
struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
if (rcu_scheduler_fully_active)
(void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
static void invoke_rcu_callbacks_kthread(void)
{
WARN_ON_ONCE(1);
}
static bool rcu_is_callbacks_kthread(void)
{
return false;
}
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
static void __init rcu_spawn_boost_kthreads(void)
{
}
static void rcu_prepare_kthreads(int cpu)
{
}
#endif /* #else #ifdef CONFIG_RCU_BOOST */
#if !defined(CONFIG_RCU_FAST_NO_HZ)
/*
* Check to see if any future RCU-related work will need to be done
* by the current CPU, even if none need be done immediately, returning
* 1 if so. This function is part of the RCU implementation; it is -not-
* an exported member of the RCU API.
*
* Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
* any flavor of RCU.
*/
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
*nextevt = KTIME_MAX;
return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
? 0 : rcu_cpu_has_callbacks(NULL);
}
/*
* Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
* after it.
*/
static void rcu_cleanup_after_idle(void)
{
}
/*
* Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
* is nothing.
*/
static void rcu_prepare_for_idle(void)
{
}
/*
* Don't bother keeping a running count of the number of RCU callbacks
* posted because CONFIG_RCU_FAST_NO_HZ=n.
*/
static void rcu_idle_count_callbacks_posted(void)
{
}
#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
/*
* This code is invoked when a CPU goes idle, at which point we want
* to have the CPU do everything required for RCU so that it can enter
* the energy-efficient dyntick-idle mode. This is handled by a
* state machine implemented by rcu_prepare_for_idle() below.
*
* The following three proprocessor symbols control this state machine:
*
* RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
* to sleep in dyntick-idle mode with RCU callbacks pending. This
* is sized to be roughly one RCU grace period. Those energy-efficiency
* benchmarkers who might otherwise be tempted to set this to a large
* number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
* system. And if you are -that- concerned about energy efficiency,
* just power the system down and be done with it!
* RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
* permitted to sleep in dyntick-idle mode with only lazy RCU
* callbacks pending. Setting this too high can OOM your system.
*
* The values below work well in practice. If future workloads require
* adjustment, they can be converted into kernel config parameters, though
* making the state machine smarter might be a better option.
*/
#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
module_param(rcu_idle_gp_delay, int, 0644);
static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
module_param(rcu_idle_lazy_gp_delay, int, 0644);
/*
* Try to advance callbacks for all flavors of RCU on the current CPU, but
* only if it has been awhile since the last time we did so. Afterwards,
* if there are any callbacks ready for immediate invocation, return true.
*/
static bool __maybe_unused rcu_try_advance_all_cbs(void)
{
bool cbs_ready = false;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
struct rcu_node *rnp;
struct rcu_state *rsp;
/* Exit early if we advanced recently. */
if (jiffies == rdtp->last_advance_all)
return false;
rdtp->last_advance_all = jiffies;
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
rnp = rdp->mynode;
/*
* Don't bother checking unless a grace period has
* completed since we last checked and there are
* callbacks not yet ready to invoke.
*/
if ((rdp->completed != rnp->completed ||
unlikely(READ_ONCE(rdp->gpwrap))) &&
rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
note_gp_changes(rsp, rdp);
if (cpu_has_callbacks_ready_to_invoke(rdp))
cbs_ready = true;
}
return cbs_ready;
}
/*
* Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
* to invoke. If the CPU has callbacks, try to advance them. Tell the
* caller to set the timeout based on whether or not there are non-lazy
* callbacks.
*
* The caller must have disabled interrupts.
*/
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
unsigned long dj;
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
*nextevt = KTIME_MAX;
return 0;
}
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
/* If no callbacks, RCU doesn't need the CPU. */
if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
*nextevt = KTIME_MAX;
return 0;
}
/* Attempt to advance callbacks. */
if (rcu_try_advance_all_cbs()) {
/* Some ready to invoke, so initiate later invocation. */
invoke_rcu_core();
return 1;
}
rdtp->last_accelerate = jiffies;
/* Request timer delay depending on laziness, and round. */
if (!rdtp->all_lazy) {
dj = round_up(rcu_idle_gp_delay + jiffies,
rcu_idle_gp_delay) - jiffies;
} else {
dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
}
*nextevt = basemono + dj * TICK_NSEC;
return 0;
}
/*
* Prepare a CPU for idle from an RCU perspective. The first major task
* is to sense whether nohz mode has been enabled or disabled via sysfs.
* The second major task is to check to see if a non-lazy callback has
* arrived at a CPU that previously had only lazy callbacks. The third
* major task is to accelerate (that is, assign grace-period numbers to)
* any recently arrived callbacks.
*
* The caller must have disabled interrupts.
*/
static void rcu_prepare_for_idle(void)
{
bool needwake;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
struct rcu_node *rnp;
struct rcu_state *rsp;
int tne;
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
return;
/* Handle nohz enablement switches conservatively. */
tne = READ_ONCE(tick_nohz_active);
if (tne != rdtp->tick_nohz_enabled_snap) {
if (rcu_cpu_has_callbacks(NULL))
invoke_rcu_core(); /* force nohz to see update. */
rdtp->tick_nohz_enabled_snap = tne;
return;
}
if (!tne)
return;
/* If this is a no-CBs CPU, no callbacks, just return. */
if (rcu_is_nocb_cpu(smp_processor_id()))
return;
/*
* If a non-lazy callback arrived at a CPU having only lazy
* callbacks, invoke RCU core for the side-effect of recalculating
* idle duration on re-entry to idle.
*/
if (rdtp->all_lazy &&
rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
rdtp->all_lazy = false;
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
invoke_rcu_core();
return;
}
/*
* If we have not yet accelerated this jiffy, accelerate all
* callbacks on this CPU.
*/
if (rdtp->last_accelerate == jiffies)
return;
rdtp->last_accelerate = jiffies;
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
if (!*rdp->nxttail[RCU_DONE_TAIL])
continue;
rnp = rdp->mynode;
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
smp_mb__after_unlock_lock();
needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
if (needwake)
rcu_gp_kthread_wake(rsp);
}
}
/*
* Clean up for exit from idle. Attempt to advance callbacks based on
* any grace periods that elapsed while the CPU was idle, and if any
* callbacks are now ready to invoke, initiate invocation.
*/
static void rcu_cleanup_after_idle(void)
{
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
}
/*
* Keep a running count of the number of non-lazy callbacks posted
* on this CPU. This running counter (which is never decremented) allows
* rcu_prepare_for_idle() to detect when something out of the idle loop
* posts a callback, even if an equal number of callbacks are invoked.
* Of course, callbacks should only be posted from within a trace event
* designed to be called from idle or from within RCU_NONIDLE().
*/
static void rcu_idle_count_callbacks_posted(void)
{
__this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
}
/*
* Data for flushing lazy RCU callbacks at OOM time.
*/
static atomic_t oom_callback_count;
static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
/*
* RCU OOM callback -- decrement the outstanding count and deliver the
* wake-up if we are the last one.
*/
static void rcu_oom_callback(struct rcu_head *rhp)
{
if (atomic_dec_and_test(&oom_callback_count))
wake_up(&oom_callback_wq);
}
/*
* Post an rcu_oom_notify callback on the current CPU if it has at
* least one lazy callback. This will unnecessarily post callbacks
* to CPUs that already have a non-lazy callback at the end of their
* callback list, but this is an infrequent operation, so accept some
* extra overhead to keep things simple.
*/
static void rcu_oom_notify_cpu(void *unused)
{
struct rcu_state *rsp;
struct rcu_data *rdp;
for_each_rcu_flavor(rsp) {
rdp = raw_cpu_ptr(rsp->rda);
if (rdp->qlen_lazy != 0) {
atomic_inc(&oom_callback_count);
rsp->call(&rdp->oom_head, rcu_oom_callback);
}
}
}
/*
* If low on memory, ensure that each CPU has a non-lazy callback.
* This will wake up CPUs that have only lazy callbacks, in turn
* ensuring that they free up the corresponding memory in a timely manner.
* Because an uncertain amount of memory will be freed in some uncertain
* timeframe, we do not claim to have freed anything.
*/
static int rcu_oom_notify(struct notifier_block *self,
unsigned long notused, void *nfreed)
{
int cpu;
/* Wait for callbacks from earlier instance to complete. */
wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
smp_mb(); /* Ensure callback reuse happens after callback invocation. */
/*
* Prevent premature wakeup: ensure that all increments happen
* before there is a chance of the counter reaching zero.
*/
atomic_set(&oom_callback_count, 1);
get_online_cpus();
for_each_online_cpu(cpu) {
smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
cond_resched_rcu_qs();
}
put_online_cpus();
/* Unconditionally decrement: no need to wake ourselves up. */
atomic_dec(&oom_callback_count);
return NOTIFY_OK;
}
static struct notifier_block rcu_oom_nb = {
.notifier_call = rcu_oom_notify
};
static int __init rcu_register_oom_notifier(void)
{
register_oom_notifier(&rcu_oom_nb);
return 0;
}
early_initcall(rcu_register_oom_notifier);
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
#ifdef CONFIG_RCU_CPU_STALL_INFO
#ifdef CONFIG_RCU_FAST_NO_HZ
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
ulong2long(nlpd),
rdtp->all_lazy ? 'L' : '.',
rdtp->tick_nohz_enabled_snap ? '.' : 'D');
}
#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
*cp = '\0';
}
#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
/* Initiate the stall-info list. */
static void print_cpu_stall_info_begin(void)
{
pr_cont("\n");
}
/*
* Print out diagnostic information for the specified stalled CPU.
*
* If the specified CPU is aware of the current RCU grace period
* (flavor specified by rsp), then print the number of scheduling
* clock interrupts the CPU has taken during the time that it has
* been aware. Otherwise, print the number of RCU grace periods
* that this CPU is ignorant of, for example, "1" if the CPU was
* aware of the previous grace period.
*
* Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
*/
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
{
char fast_no_hz[72];
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_dynticks *rdtp = rdp->dynticks;
char *ticks_title;
unsigned long ticks_value;
if (rsp->gpnum == rdp->gpnum) {
ticks_title = "ticks this GP";
ticks_value = rdp->ticks_this_gp;
} else {
ticks_title = "GPs behind";
ticks_value = rsp->gpnum - rdp->gpnum;
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
cpu, ticks_value, ticks_title,
atomic_read(&rdtp->dynticks) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
fast_no_hz);
}
/* Terminate the stall-info list. */
static void print_cpu_stall_info_end(void)
{
pr_err("\t");
}
/* Zero ->ticks_this_gp for all flavors of RCU. */
static void zero_cpu_stall_ticks(struct rcu_data *rdp)
{
rdp->ticks_this_gp = 0;
rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
}
/* Increment ->ticks_this_gp for all flavors of RCU. */
static void increment_cpu_stall_ticks(void)
{
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
raw_cpu_inc(rsp->rda->ticks_this_gp);
}
#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
static void print_cpu_stall_info_begin(void)
{
pr_cont(" {");
}
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
{
pr_cont(" %d", cpu);
}
static void print_cpu_stall_info_end(void)
{
pr_cont("} ");
}
static void zero_cpu_stall_ticks(struct rcu_data *rdp)
{
}
static void increment_cpu_stall_ticks(void)
{
}
#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
#ifdef CONFIG_RCU_NOCB_CPU
/*
* Offload callback processing from the boot-time-specified set of CPUs
* specified by rcu_nocb_mask. For each CPU in the set, there is a
* kthread created that pulls the callbacks from the corresponding CPU,
* waits for a grace period to elapse, and invokes the callbacks.
* The no-CBs CPUs do a wake_up() on their kthread when they insert
* a callback into any empty list, unless the rcu_nocb_poll boot parameter
* has been specified, in which case each kthread actively polls its
* CPU. (Which isn't so great for energy efficiency, but which does
* reduce RCU's overhead on that CPU.)
*
* This is intended to be used in conjunction with Frederic Weisbecker's
* adaptive-idle work, which would seriously reduce OS jitter on CPUs
* running CPU-bound user-mode computations.
*
* Offloading of callback processing could also in theory be used as
* an energy-efficiency measure because CPUs with no RCU callbacks
* queued are more aggressive about entering dyntick-idle mode.
*/
/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
static int __init rcu_nocb_setup(char *str)
{
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
have_rcu_nocb_mask = true;
cpulist_parse(str, rcu_nocb_mask);
return 1;
}
__setup("rcu_nocbs=", rcu_nocb_setup);
static int __init parse_rcu_nocb_poll(char *arg)
{
rcu_nocb_poll = 1;
return 0;
}
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
/*
* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
* grace period.
*/
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
}
/*
* Set the root rcu_node structure's ->need_future_gp field
* based on the sum of those of all rcu_node structures. This does
* double-count the root rcu_node structure's requests, but this
* is necessary to handle the possibility of a rcu_nocb_kthread()
* having awakened during the time that the rcu_node structures
* were being updated for the end of the previous grace period.
*/
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
{
rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
}
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
init_waitqueue_head(&rnp->nocb_gp_wq[0]);
init_waitqueue_head(&rnp->nocb_gp_wq[1]);
}
#ifndef CONFIG_RCU_NOCB_CPU_ALL
/* Is the specified CPU a no-CBs CPU? */
bool rcu_is_nocb_cpu(int cpu)
{
if (have_rcu_nocb_mask)
return cpumask_test_cpu(cpu, rcu_nocb_mask);
return false;
}
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Kick the leader kthread for this NOCB group.
*/
static void wake_nocb_leader(struct rcu_data *rdp, bool force)
{
struct rcu_data *rdp_leader = rdp->nocb_leader;
if (!READ_ONCE(rdp_leader->nocb_kthread))
return;
if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
wake_up(&rdp_leader->nocb_wq);
}
}
/*
* Does the specified CPU need an RCU callback for the specified flavor
* of rcu_barrier()?
*/
static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
{
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
unsigned long ret;
#ifdef CONFIG_PROVE_RCU
struct rcu_head *rhp;
#endif /* #ifdef CONFIG_PROVE_RCU */
/*
* Check count of all no-CBs callbacks awaiting invocation.
* There needs to be a barrier before this function is called,
* but associated with a prior determination that no more
* callbacks would be posted. In the worst case, the first
* barrier in _rcu_barrier() suffices (but the caller cannot
* necessarily rely on this, not a substitute for the caller
* getting the concurrency design right!). There must also be
* a barrier between the following load an posting of a callback
* (if a callback is in fact needed). This is associated with an
* atomic_inc() in the caller.
*/
ret = atomic_long_read(&rdp->nocb_q_count);
#ifdef CONFIG_PROVE_RCU
rhp = READ_ONCE(rdp->nocb_head);
if (!rhp)
rhp = READ_ONCE(rdp->nocb_gp_head);
if (!rhp)
rhp = READ_ONCE(rdp->nocb_follower_head);
/* Having no rcuo kthread but CBs after scheduler starts is bad! */
if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
rcu_scheduler_fully_active) {
/* RCU callback enqueued before CPU first came online??? */
pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
cpu, rhp->func);
WARN_ON_ONCE(1);
}
#endif /* #ifdef CONFIG_PROVE_RCU */
return !!ret;
}
/*
* Enqueue the specified string of rcu_head structures onto the specified
* CPU's no-CBs lists. The CPU is specified by rdp, the head of the
* string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
* counts are supplied by rhcount and rhcount_lazy.
*
* If warranted, also wake up the kthread servicing this CPUs queues.
*/
static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
struct rcu_head *rhp,
struct rcu_head **rhtp,
int rhcount, int rhcount_lazy,
unsigned long flags)
{
int len;
struct rcu_head **old_rhpp;
struct task_struct *t;
/* Enqueue the callback on the nocb list and update counts. */
atomic_long_add(rhcount, &rdp->nocb_q_count);
/* rcu_barrier() relies on ->nocb_q_count add before xchg. */
old_rhpp = xchg(&rdp->nocb_tail, rhtp);
WRITE_ONCE(*old_rhpp, rhp);
atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
/* If we are not being polled and there is a kthread, awaken it ... */
t = READ_ONCE(rdp->nocb_kthread);
if (rcu_nocb_poll || !t) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeNotPoll"));
return;
}
len = atomic_long_read(&rdp->nocb_q_count);
if (old_rhpp == &rdp->nocb_head) {
if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
wake_nocb_leader(rdp, false);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
rdp->nocb_defer_wakeup = RCU_NOGP_WAKE;
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmptyIsDeferred"));
}
rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
/* ... or if many callbacks queued. */
if (!irqs_disabled_flags(flags)) {
wake_nocb_leader(rdp, true);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE;
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvfIsDeferred"));
}
rdp->qlen_last_fqs_check = LONG_MAX / 2;
} else {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
}
return;
}
/*
* This is a helper for __call_rcu(), which invokes this when the normal
* callback queue is inoperable. If this is not a no-CBs CPU, this
* function returns failure back to __call_rcu(), which can complain
* appropriately.
*
* Otherwise, this function queues the callback where the corresponding
* "rcuo" kthread can find it.
*/
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags)
{
if (!rcu_is_nocb_cpu(rdp->cpu))
return false;
__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
if (__is_kfree_rcu_offset((unsigned long)rhp->func))
trace_rcu_kfree_callback(rdp->rsp->name, rhp,
(unsigned long)rhp->func,
-atomic_long_read(&rdp->nocb_q_count_lazy),
-atomic_long_read(&rdp->nocb_q_count));
else
trace_rcu_callback(rdp->rsp->name, rhp,
-atomic_long_read(&rdp->nocb_q_count_lazy),
-atomic_long_read(&rdp->nocb_q_count));
/*
* If called from an extended quiescent state with interrupts
* disabled, invoke the RCU core in order to allow the idle-entry
* deferred-wakeup check to function.
*/
if (irqs_disabled_flags(flags) &&
!rcu_is_watching() &&
cpu_online(smp_processor_id()))
invoke_rcu_core();
return true;
}
/*
* Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
* not a no-CBs CPU.
*/
static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
struct rcu_data *rdp,
unsigned long flags)
{
long ql = rsp->qlen;
long qll = rsp->qlen_lazy;
/* If this is not a no-CBs CPU, tell the caller to do it the old way. */
if (!rcu_is_nocb_cpu(smp_processor_id()))
return false;
rsp->qlen = 0;
rsp->qlen_lazy = 0;
/* First, enqueue the donelist, if any. This preserves CB ordering. */
if (rsp->orphan_donelist != NULL) {
__call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
rsp->orphan_donetail, ql, qll, flags);
ql = qll = 0;
rsp->orphan_donelist = NULL;
rsp->orphan_donetail = &rsp->orphan_donelist;
}
if (rsp->orphan_nxtlist != NULL) {
__call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
rsp->orphan_nxttail, ql, qll, flags);
ql = qll = 0;
rsp->orphan_nxtlist = NULL;
rsp->orphan_nxttail = &rsp->orphan_nxtlist;
}
return true;
}
/*
* If necessary, kick off a new grace period, and either way wait
* for a subsequent grace period to complete.
*/
static void rcu_nocb_wait_gp(struct rcu_data *rdp)
{
unsigned long c;
bool d;
unsigned long flags;
bool needwake;
struct rcu_node *rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
needwake = rcu_start_future_gp(rnp, rdp, &c);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (needwake)
rcu_gp_kthread_wake(rdp->rsp);
/*
* Wait for the grace period. Do so interruptibly to avoid messing
* up the load average.
*/
trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) {
wait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
(d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
if (likely(d))
break;
WARN_ON(signal_pending(current));
trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
}
trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
smp_mb(); /* Ensure that CB invocation happens after GP end. */
}
/*
* Leaders come here to wait for additional callbacks to show up.
* This function does not return until callbacks appear.
*/
static void nocb_leader_wait(struct rcu_data *my_rdp)
{
bool firsttime = true;
bool gotcbs;
struct rcu_data *rdp;
struct rcu_head **tail;
wait_again:
/* Wait for callbacks to appear. */
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
wait_event_interruptible(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep));
/* Memory barrier handled by smp_mb() calls below and repoll. */
} else if (firsttime) {
firsttime = false; /* Don't drown trace log with "Poll"! */
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
}
/*
* Each pass through the following loop checks a follower for CBs.
* We are our own first follower. Any CBs found are moved to
* nocb_gp_head, where they await a grace period.
*/
gotcbs = false;
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
if (!rdp->nocb_gp_head)
continue; /* No CBs here, try next follower. */
/* Move callbacks to wait-for-GP list, which is empty. */
WRITE_ONCE(rdp->nocb_head, NULL);
rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
gotcbs = true;
}
/*
* If there were no callbacks, sleep a bit, rescan after a
* memory barrier, and go retry.
*/
if (unlikely(!gotcbs)) {
if (!rcu_nocb_poll)
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
"WokeEmpty");
WARN_ON(signal_pending(current));
schedule_timeout_interruptible(1);
/* Rescan in case we were a victim of memory ordering. */
my_rdp->nocb_leader_sleep = true;
smp_mb(); /* Ensure _sleep true before scan. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
if (READ_ONCE(rdp->nocb_head)) {
/* Found CB, so short-circuit next wait. */
my_rdp->nocb_leader_sleep = false;
break;
}
goto wait_again;
}
/* Wait for one grace period. */
rcu_nocb_wait_gp(my_rdp);
/*
* We left ->nocb_leader_sleep unset to reduce cache thrashing.
* We set it now, but recheck for new callbacks while
* traversing our follower list.
*/
my_rdp->nocb_leader_sleep = true;
smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
/* Each pass through the following loop wakes a follower, if needed. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
if (READ_ONCE(rdp->nocb_head))
my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
if (!rdp->nocb_gp_head)
continue; /* No CBs, so no need to wake follower. */
/* Append callbacks to follower's "done" list. */
tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
*tail = rdp->nocb_gp_head;
smp_mb__after_atomic(); /* Store *tail before wakeup. */
if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
/*
* List was empty, wake up the follower.
* Memory barriers supplied by atomic_long_add().
*/
wake_up(&rdp->nocb_wq);
}
}
/* If we (the leader) don't have CBs, go wait some more. */
if (!my_rdp->nocb_follower_head)
goto wait_again;
}
/*
* Followers come here to wait for additional callbacks to show up.
* This function does not return until callbacks appear.
*/
static void nocb_follower_wait(struct rcu_data *rdp)
{
bool firsttime = true;
for (;;) {
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
"FollowerSleep");
wait_event_interruptible(rdp->nocb_wq,
READ_ONCE(rdp->nocb_follower_head));
} else if (firsttime) {
/* Don't drown trace log with "Poll"! */
firsttime = false;
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
}
if (smp_load_acquire(&rdp->nocb_follower_head)) {
/* ^^^ Ensure CB invocation follows _head test. */
return;
}
if (!rcu_nocb_poll)
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
"WokeEmpty");
WARN_ON(signal_pending(current));
schedule_timeout_interruptible(1);
}
}
/*
* Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
* callbacks queued by the corresponding no-CBs CPU, however, there is
* an optional leader-follower relationship so that the grace-period
* kthreads don't have to do quite so many wakeups.
*/
static int rcu_nocb_kthread(void *arg)
{
int c, cl;
struct rcu_head *list;
struct rcu_head *next;
struct rcu_head **tail;
struct rcu_data *rdp = arg;
/* Each pass through this loop invokes one batch of callbacks */
for (;;) {
/* Wait for callbacks. */
if (rdp->nocb_leader == rdp)
nocb_leader_wait(rdp);
else
nocb_follower_wait(rdp);
/* Pull the ready-to-invoke callbacks onto local list. */
list = READ_ONCE(rdp->nocb_follower_head);
BUG_ON(!list);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
WRITE_ONCE(rdp->nocb_follower_head, NULL);
tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
/* Each pass through the following loop invokes a callback. */
trace_rcu_batch_start(rdp->rsp->name,
atomic_long_read(&rdp->nocb_q_count_lazy),
atomic_long_read(&rdp->nocb_q_count), -1);
c = cl = 0;
while (list) {
next = list->next;
/* Wait for enqueuing to complete, if needed. */
while (next == NULL && &list->next != tail) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WaitQueue"));
schedule_timeout_interruptible(1);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WokeQueue"));
next = list->next;
}
debug_rcu_head_unqueue(list);
local_bh_disable();
if (__rcu_reclaim(rdp->rsp->name, list))
cl++;
c++;
local_bh_enable();
list = next;
}
trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
smp_mb__before_atomic(); /* _add after CB invocation. */
atomic_long_add(-c, &rdp->nocb_q_count);
atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
rdp->n_nocbs_invoked += c;
}
return 0;
}
/* Is a deferred wakeup of rcu_nocb_kthread() required? */
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
{
return READ_ONCE(rdp->nocb_defer_wakeup);
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
int ndw;
if (!rcu_nocb_need_deferred_wakeup(rdp))
return;
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
}
void __init rcu_init_nohz(void)
{
int cpu;
bool need_rcu_nocb_mask = true;
struct rcu_state *rsp;
#ifdef CONFIG_RCU_NOCB_CPU_NONE
need_rcu_nocb_mask = false;
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
need_rcu_nocb_mask = true;
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
if (!have_rcu_nocb_mask && need_rcu_nocb_mask) {
if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
return;
}
have_rcu_nocb_mask = true;
}
if (!have_rcu_nocb_mask)
return;
#ifdef CONFIG_RCU_NOCB_CPU_ZERO
pr_info("\tOffload RCU callbacks from CPU 0\n");
cpumask_set_cpu(0, rcu_nocb_mask);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
#ifdef CONFIG_RCU_NOCB_CPU_ALL
pr_info("\tOffload RCU callbacks from all CPUs\n");
cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running)
cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
cpumask_and(rcu_nocb_mask, cpu_possible_mask,
rcu_nocb_mask);
}
pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
cpumask_pr_args(rcu_nocb_mask));
if (rcu_nocb_poll)
pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
for_each_rcu_flavor(rsp) {
for_each_cpu(cpu, rcu_nocb_mask)
init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu));
rcu_organize_nocb_kthreads(rsp);
}
}
/* Initialize per-rcu_data variables for no-CBs CPUs. */
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
rdp->nocb_tail = &rdp->nocb_head;
init_waitqueue_head(&rdp->nocb_wq);
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
}
/*
* If the specified CPU is a no-CBs CPU that does not already have its
* rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are
* brought online out of order, this can require re-organizing the
* leader-follower relationships.
*/
static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
{
struct rcu_data *rdp;
struct rcu_data *rdp_last;
struct rcu_data *rdp_old_leader;
struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu);
struct task_struct *t;
/*
* If this isn't a no-CBs CPU or if it already has an rcuo kthread,
* then nothing to do.
*/
if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread)
return;
/* If we didn't spawn the leader first, reorganize! */
rdp_old_leader = rdp_spawn->nocb_leader;
if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) {
rdp_last = NULL;
rdp = rdp_old_leader;
do {
rdp->nocb_leader = rdp_spawn;
if (rdp_last && rdp != rdp_spawn)
rdp_last->nocb_next_follower = rdp;
if (rdp == rdp_spawn) {
rdp = rdp->nocb_next_follower;
} else {
rdp_last = rdp;
rdp = rdp->nocb_next_follower;
rdp_last->nocb_next_follower = NULL;
}
} while (rdp);
rdp_spawn->nocb_next_follower = rdp_old_leader;
}
/* Spawn the kthread for this CPU and RCU flavor. */
t = kthread_run(rcu_nocb_kthread, rdp_spawn,
"rcuo%c/%d", rsp->abbr, cpu);
BUG_ON(IS_ERR(t));
WRITE_ONCE(rdp_spawn->nocb_kthread, t);
}
/*
* If the specified CPU is a no-CBs CPU that does not already have its
* rcuo kthreads, spawn them.
*/
static void rcu_spawn_all_nocb_kthreads(int cpu)
{
struct rcu_state *rsp;
if (rcu_scheduler_fully_active)
for_each_rcu_flavor(rsp)
rcu_spawn_one_nocb_kthread(rsp, cpu);
}
/*
* Once the scheduler is running, spawn rcuo kthreads for all online
* no-CBs CPUs. This assumes that the early_initcall()s happen before
* non-boot CPUs come online -- if this changes, we will need to add
* some mutual exclusion.
*/
static void __init rcu_spawn_nocb_kthreads(void)
{
int cpu;
for_each_online_cpu(cpu)
rcu_spawn_all_nocb_kthreads(cpu);
}
/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
static int rcu_nocb_leader_stride = -1;
module_param(rcu_nocb_leader_stride, int, 0444);
/*
* Initialize leader-follower relationships for all no-CBs CPU.
*/
static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
{
int cpu;
int ls = rcu_nocb_leader_stride;
int nl = 0; /* Next leader. */
struct rcu_data *rdp;
struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
struct rcu_data *rdp_prev = NULL;
if (!have_rcu_nocb_mask)
return;
if (ls == -1) {
ls = int_sqrt(nr_cpu_ids);
rcu_nocb_leader_stride = ls;
}
/*
* Each pass through this loop sets up one rcu_data structure and
* spawns one rcu_nocb_kthread().
*/
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(rsp->rda, cpu);
if (rdp->cpu >= nl) {
/* New leader, set up for followers & next leader. */
nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
rdp->nocb_leader = rdp;
rdp_leader = rdp;
} else {
/* Another follower, link to previous leader. */
rdp->nocb_leader = rdp_leader;
rdp_prev->nocb_next_follower = rdp;
}
rdp_prev = rdp;
}
}
/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
static bool init_nocb_callback_list(struct rcu_data *rdp)
{
if (!rcu_is_nocb_cpu(rdp->cpu))
return false;
/* If there are early-boot callbacks, move them to nocb lists. */
if (rdp->nxtlist) {
rdp->nocb_head = rdp->nxtlist;
rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL];
atomic_long_set(&rdp->nocb_q_count, rdp->qlen);
atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy);
rdp->nxtlist = NULL;
rdp->qlen = 0;
rdp->qlen_lazy = 0;
}
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
return true;
}
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
{
WARN_ON_ONCE(1); /* Should be dead code. */
return false;
}
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
}
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
{
}
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags)
{
return false;
}
static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
struct rcu_data *rdp,
unsigned long flags)
{
return false;
}
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
}
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
{
return false;
}
static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
}
static void rcu_spawn_all_nocb_kthreads(int cpu)
{
}
static void __init rcu_spawn_nocb_kthreads(void)
{
}
static bool init_nocb_callback_list(struct rcu_data *rdp)
{
return false;
}
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
/*
* An adaptive-ticks CPU can potentially execute in kernel mode for an
* arbitrarily long period of time with the scheduling-clock tick turned
* off. RCU will be paying attention to this CPU because it is in the
* kernel, but the CPU cannot be guaranteed to be executing the RCU state
* machine because the scheduling-clock tick has been disabled. Therefore,
* if an adaptive-ticks CPU is failing to respond to the current grace
* period and has not be idle from an RCU perspective, kick it.
*/
static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(cpu))
smp_send_reschedule(cpu);
#endif /* #ifdef CONFIG_NO_HZ_FULL */
}
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
static int full_sysidle_state; /* Current system-idle state. */
#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
/*
* Invoked to note exit from irq or task transition to idle. Note that
* usermode execution does -not- count as idle here! After all, we want
* to detect full-system idle states, not RCU quiescent states and grace
* periods. The caller must have disabled interrupts.
*/
static void rcu_sysidle_enter(int irq)
{
unsigned long j;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
/* If there are no nohz_full= CPUs, no need to track this. */
if (!tick_nohz_full_enabled())
return;
/* Adjust nesting, check for fully idle. */
if (irq) {
rdtp->dynticks_idle_nesting--;
WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
if (rdtp->dynticks_idle_nesting != 0)
return; /* Still not fully idle. */
} else {
if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
DYNTICK_TASK_NEST_VALUE) {
rdtp->dynticks_idle_nesting = 0;
} else {
rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
return; /* Still not fully idle. */
}
}
/* Record start of fully idle period. */
j = jiffies;
WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
smp_mb__before_atomic();
atomic_inc(&rdtp->dynticks_idle);
smp_mb__after_atomic();
WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
}
/*
* Unconditionally force exit from full system-idle state. This is
* invoked when a normal CPU exits idle, but must be called separately
* for the timekeeping CPU (tick_do_timer_cpu). The reason for this
* is that the timekeeping CPU is permitted to take scheduling-clock
* interrupts while the system is in system-idle state, and of course
* rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
* interrupt from any other type of interrupt.
*/
void rcu_sysidle_force_exit(void)
{
int oldstate = READ_ONCE(full_sysidle_state);
int newoldstate;
/*
* Each pass through the following loop attempts to exit full
* system-idle state. If contention proves to be a problem,
* a trylock-based contention tree could be used here.
*/
while (oldstate > RCU_SYSIDLE_SHORT) {
newoldstate = cmpxchg(&full_sysidle_state,
oldstate, RCU_SYSIDLE_NOT);
if (oldstate == newoldstate &&
oldstate == RCU_SYSIDLE_FULL_NOTED) {
rcu_kick_nohz_cpu(tick_do_timer_cpu);
return; /* We cleared it, done! */
}
oldstate = newoldstate;
}
smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
}
/*
* Invoked to note entry to irq or task transition from idle. Note that
* usermode execution does -not- count as idle here! The caller must
* have disabled interrupts.
*/
static void rcu_sysidle_exit(int irq)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
/* If there are no nohz_full= CPUs, no need to track this. */
if (!tick_nohz_full_enabled())
return;
/* Adjust nesting, check for already non-idle. */
if (irq) {
rdtp->dynticks_idle_nesting++;
WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
if (rdtp->dynticks_idle_nesting != 1)
return; /* Already non-idle. */
} else {
/*
* Allow for irq misnesting. Yes, it really is possible
* to enter an irq handler then never leave it, and maybe
* also vice versa. Handle both possibilities.
*/
if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
return; /* Already non-idle. */
} else {
rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
}
}
/* Record end of idle period. */
smp_mb__before_atomic();
atomic_inc(&rdtp->dynticks_idle);
smp_mb__after_atomic();
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
/*
* If we are the timekeeping CPU, we are permitted to be non-idle
* during a system-idle state. This must be the case, because
* the timekeeping CPU has to take scheduling-clock interrupts
* during the time that the system is transitioning to full
* system-idle state. This means that the timekeeping CPU must
* invoke rcu_sysidle_force_exit() directly if it does anything
* more than take a scheduling-clock interrupt.
*/
if (smp_processor_id() == tick_do_timer_cpu)
return;
/* Update system-idle state: We are clearly no longer fully idle! */
rcu_sysidle_force_exit();
}
/*
* Check to see if the current CPU is idle. Note that usermode execution
* does not count as idle. The caller must have disabled interrupts,
* and must be running on tick_do_timer_cpu.
*/
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long *maxj)
{
int cur;
unsigned long j;
struct rcu_dynticks *rdtp = rdp->dynticks;
/* If there are no nohz_full= CPUs, don't check system-wide idleness. */
if (!tick_nohz_full_enabled())
return;
/*
* If some other CPU has already reported non-idle, if this is
* not the flavor of RCU that tracks sysidle state, or if this
* is an offline or the timekeeping CPU, nothing to do.
*/
if (!*isidle || rdp->rsp != rcu_state_p ||
cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
return;
/* Verify affinity of current kthread. */
WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
/* Pick up current idle and NMI-nesting counter and check. */
cur = atomic_read(&rdtp->dynticks_idle);
if (cur & 0x1) {
*isidle = false; /* We are not idle! */
return;
}
smp_mb(); /* Read counters before timestamps. */
/* Pick up timestamps. */
j = READ_ONCE(rdtp->dynticks_idle_jiffies);
/* If this CPU entered idle more recently, update maxj timestamp. */
if (ULONG_CMP_LT(*maxj, j))
*maxj = j;
}
/*
* Is this the flavor of RCU that is handling full-system idle?
*/
static bool is_sysidle_rcu_state(struct rcu_state *rsp)
{
return rsp == rcu_state_p;
}
/*
* Return a delay in jiffies based on the number of CPUs, rcu_node
* leaf fanout, and jiffies tick rate. The idea is to allow larger
* systems more time to transition to full-idle state in order to
* avoid the cache thrashing that otherwise occur on the state variable.
* Really small systems (less than a couple of tens of CPUs) should
* instead use a single global atomically incremented counter, and later
* versions of this will automatically reconfigure themselves accordingly.
*/
static unsigned long rcu_sysidle_delay(void)
{
if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
return 0;
return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
}
/*
* Advance the full-system-idle state. This is invoked when all of
* the non-timekeeping CPUs are idle.
*/
static void rcu_sysidle(unsigned long j)
{
/* Check the current state. */
switch (READ_ONCE(full_sysidle_state)) {
case RCU_SYSIDLE_NOT:
/* First time all are idle, so note a short idle period. */
WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
break;
case RCU_SYSIDLE_SHORT:
/*
* Idle for a bit, time to advance to next state?
* cmpxchg failure means race with non-idle, let them win.
*/
if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
(void)cmpxchg(&full_sysidle_state,
RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
break;
case RCU_SYSIDLE_LONG:
/*
* Do an additional check pass before advancing to full.
* cmpxchg failure means race with non-idle, let them win.
*/
if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
(void)cmpxchg(&full_sysidle_state,
RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
break;
default:
break;
}
}
/*
* Found a non-idle non-timekeeping CPU, so kick the system-idle state
* back to the beginning.
*/
static void rcu_sysidle_cancel(void)
{
smp_mb();
if (full_sysidle_state > RCU_SYSIDLE_SHORT)
WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
}
/*
* Update the sysidle state based on the results of a force-quiescent-state
* scan of the CPUs' dyntick-idle state.
*/
static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
unsigned long maxj, bool gpkt)
{
if (rsp != rcu_state_p)
return; /* Wrong flavor, ignore. */
if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
return; /* Running state machine from timekeeping CPU. */
if (isidle)
rcu_sysidle(maxj); /* More idle! */
else
rcu_sysidle_cancel(); /* Idle is over. */
}
/*
* Wrapper for rcu_sysidle_report() when called from the grace-period
* kthread's context.
*/
static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
unsigned long maxj)
{
/* If there are no nohz_full= CPUs, no need to track this. */
if (!tick_nohz_full_enabled())
return;
rcu_sysidle_report(rsp, isidle, maxj, true);
}
/* Callback and function for forcing an RCU grace period. */
struct rcu_sysidle_head {
struct rcu_head rh;
int inuse;
};
static void rcu_sysidle_cb(struct rcu_head *rhp)
{
struct rcu_sysidle_head *rshp;
/*
* The following memory barrier is needed to replace the
* memory barriers that would normally be in the memory
* allocator.
*/
smp_mb(); /* grace period precedes setting inuse. */
rshp = container_of(rhp, struct rcu_sysidle_head, rh);
WRITE_ONCE(rshp->inuse, 0);
}
/*
* Check to see if the system is fully idle, other than the timekeeping CPU.
* The caller must have disabled interrupts. This is not intended to be
* called unless tick_nohz_full_enabled().
*/
bool rcu_sys_is_idle(void)
{
static struct rcu_sysidle_head rsh;
int rss = READ_ONCE(full_sysidle_state);
if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
return false;
/* Handle small-system case by doing a full scan of CPUs. */
if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
int oldrss = rss - 1;
/*
* One pass to advance to each state up to _FULL.
* Give up if any pass fails to advance the state.
*/
while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
int cpu;
bool isidle = true;
unsigned long maxj = jiffies - ULONG_MAX / 4;
struct rcu_data *rdp;
/* Scan all the CPUs looking for nonidle CPUs. */
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
if (!isidle)
break;
}
rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
oldrss = rss;
rss = READ_ONCE(full_sysidle_state);
}
}
/* If this is the first observation of an idle period, record it. */
if (rss == RCU_SYSIDLE_FULL) {
rss = cmpxchg(&full_sysidle_state,
RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
return rss == RCU_SYSIDLE_FULL;
}
smp_mb(); /* ensure rss load happens before later caller actions. */
/* If already fully idle, tell the caller (in case of races). */
if (rss == RCU_SYSIDLE_FULL_NOTED)
return true;
/*
* If we aren't there yet, and a grace period is not in flight,
* initiate a grace period. Either way, tell the caller that
* we are not there yet. We use an xchg() rather than an assignment
* to make up for the memory barriers that would otherwise be
* provided by the memory allocator.
*/
if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
!rcu_gp_in_progress(rcu_state_p) &&
!rsh.inuse && xchg(&rsh.inuse, 1) == 0)
call_rcu(&rsh.rh, rcu_sysidle_cb);
return false;
}
/*
* Initialize dynticks sysidle state for CPUs coming online.
*/
static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
{
rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
}
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
static void rcu_sysidle_enter(int irq)
{
}
static void rcu_sysidle_exit(int irq)
{
}
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long *maxj)
{
}
static bool is_sysidle_rcu_state(struct rcu_state *rsp)
{
return false;
}
static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
unsigned long maxj)
{
}
static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
{
}
#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
/*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing?
* The idea is to avoid waking up RCU core processing on such a
* CPU unless the grace period has extended for too long.
*
* This code relies on the fact that all NO_HZ_FULL CPUs are also
* CONFIG_RCU_NOCB_CPU CPUs.
*/
static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(smp_processor_id()) &&
(!rcu_gp_in_progress(rsp) ||
ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ)))
return true;
#endif /* #ifdef CONFIG_NO_HZ_FULL */
return false;
}
/*
* Bind the grace-period kthread for the sysidle flavor of RCU to the
* timekeeping CPU.
*/
static void rcu_bind_gp_kthread(void)
{
int __maybe_unused cpu;
if (!tick_nohz_full_enabled())
return;
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
cpu = tick_do_timer_cpu;
if (cpu >= 0 && cpu < nr_cpu_ids)
set_cpus_allowed_ptr(current, cpumask_of(cpu));
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
housekeeping_affine(current);
#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
}
/* Record the current task on dyntick-idle entry. */
static void rcu_dynticks_task_enter(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
/* Record no current task on dyntick-idle exit. */
static void rcu_dynticks_task_exit(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\kernel\/rcu/tree_trace.c
/*
* Read-Copy Update tracing for classic implementation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, you can access it online at
* http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
*
* Papers: http://www.rdrop.com/users/paulmck/RCU
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#define RCU_TREE_NONCORE
#include "tree.h"
DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
static int r_open(struct inode *inode, struct file *file,
const struct seq_operations *op)
{
int ret = seq_open(file, op);
if (!ret) {
struct seq_file *m = (struct seq_file *)file->private_data;
m->private = inode->i_private;
}
return ret;
}
static void *r_start(struct seq_file *m, loff_t *pos)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
*pos = cpumask_next(*pos - 1, cpu_possible_mask);
if ((*pos) < nr_cpu_ids)
return per_cpu_ptr(rsp->rda, *pos);
return NULL;
}
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
(*pos)++;
return r_start(m, pos);
}
static void r_stop(struct seq_file *m, void *v)
{
}
static int show_rcubarrier(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
seq_printf(m, "bcc: %d nbd: %lu\n",
atomic_read(&rsp->barrier_cpu_count),
rsp->n_barrier_done);
return 0;
}
static int rcubarrier_open(struct inode *inode, struct file *file)
{
return single_open(file, show_rcubarrier, inode->i_private);
}
static const struct file_operations rcubarrier_fops = {
.owner = THIS_MODULE,
.open = rcubarrier_open,
.read = seq_read,
.llseek = no_llseek,
.release = single_release,
};
#ifdef CONFIG_RCU_BOOST
static char convert_kthread_status(unsigned int kthread_status)
{
if (kthread_status > RCU_KTHREAD_MAX)
return '?';
return "SRWOY"[kthread_status];
}
#endif /* #ifdef CONFIG_RCU_BOOST */
static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
{
long ql, qll;
if (!rdp->beenonline)
return;
seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ',
ulong2long(rdp->completed), ulong2long(rdp->gpnum),
rdp->passed_quiesce,
rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
rdp->qs_pending);
seq_printf(m, " dt=%d/%llx/%d df=%lu",
atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
seq_printf(m, " of=%lu", rdp->offline_fqs);
rcu_nocb_q_lengths(rdp, &ql, &qll);
qll += rdp->qlen_lazy;
ql += rdp->qlen;
seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
qll, ql,
".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
rdp->nxttail[RCU_NEXT_TAIL]],
".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
rdp->nxttail[RCU_NEXT_READY_TAIL]],
".W"[rdp->nxttail[RCU_DONE_TAIL] !=
rdp->nxttail[RCU_WAIT_TAIL]],
".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
#ifdef CONFIG_RCU_BOOST
seq_printf(m, " kt=%d/%c ktl=%x",
per_cpu(rcu_cpu_has_work, rdp->cpu),
convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
rdp->cpu)),
per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
#endif /* #ifdef CONFIG_RCU_BOOST */
seq_printf(m, " b=%ld", rdp->blimit);
seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
static int show_rcudata(struct seq_file *m, void *v)
{
print_one_rcu_data(m, (struct rcu_data *)v);
return 0;
}
static const struct seq_operations rcudate_op = {
.start = r_start,
.next = r_next,
.stop = r_stop,
.show = show_rcudata,
};
static int rcudata_open(struct inode *inode, struct file *file)
{
return r_open(inode, file, &rcudate_op);
}
static const struct file_operations rcudata_fops = {
.owner = THIS_MODULE,
.open = rcudata_open,
.read = seq_read,
.llseek = no_llseek,
.release = seq_release,
};
static int show_rcuexp(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
atomic_long_read(&rsp->expedited_start),
atomic_long_read(&rsp->expedited_done),
atomic_long_read(&rsp->expedited_wrap),
atomic_long_read(&rsp->expedited_tryfail),
atomic_long_read(&rsp->expedited_workdone1),
atomic_long_read(&rsp->expedited_workdone2),
atomic_long_read(&rsp->expedited_normal),
atomic_long_read(&rsp->expedited_stoppedcpus),
atomic_long_read(&rsp->expedited_done_tries),
atomic_long_read(&rsp->expedited_done_lost),
atomic_long_read(&rsp->expedited_done_exit));
return 0;
}
static int rcuexp_open(struct inode *inode, struct file *file)
{
return single_open(file, show_rcuexp, inode->i_private);
}
static const struct file_operations rcuexp_fops = {
.owner = THIS_MODULE,
.open = rcuexp_open,
.read = seq_read,
.llseek = no_llseek,
.release = single_release,
};
#ifdef CONFIG_RCU_BOOST
static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
{
seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
rnp->grplo, rnp->grphi,
"T."[list_empty(&rnp->blkd_tasks)],
"N."[!rnp->gp_tasks],
"E."[!rnp->exp_tasks],
"B."[!rnp->boost_tasks],
convert_kthread_status(rnp->boost_kthread_status),
rnp->n_tasks_boosted, rnp->n_exp_boosts,
rnp->n_normal_boosts);
seq_printf(m, "j=%04x bt=%04x\n",
(int)(jiffies & 0xffff),
(int)(rnp->boost_time & 0xffff));
seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
rnp->n_balk_blkd_tasks,
rnp->n_balk_exp_gp_tasks,
rnp->n_balk_boost_tasks,
rnp->n_balk_notblocked,
rnp->n_balk_notyet,
rnp->n_balk_nos);
}
static int show_rcu_node_boost(struct seq_file *m, void *unused)
{
struct rcu_node *rnp;
rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
print_one_rcu_node_boost(m, rnp);
return 0;
}
static int rcu_node_boost_open(struct inode *inode, struct file *file)
{
return single_open(file, show_rcu_node_boost, NULL);
}
static const struct file_operations rcu_node_boost_fops = {
.owner = THIS_MODULE,
.open = rcu_node_boost_open,
.read = seq_read,
.llseek = no_llseek,
.release = single_release,
};
#endif /* #ifdef CONFIG_RCU_BOOST */
static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
{
unsigned long gpnum;
int level = 0;
struct rcu_node *rnp;
gpnum = rsp->gpnum;
seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
ulong2long(rsp->completed), ulong2long(gpnum),
rsp->fqs_state,
(long)(rsp->jiffies_force_qs - jiffies),
(int)(jiffies & 0xffff));
seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
level = rnp->level;
}
seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
".G"[rnp->gp_tasks != NULL],
".E"[rnp->exp_tasks != NULL],
".T"[!list_empty(&rnp->blkd_tasks)],
rnp->grplo, rnp->grphi, rnp->grpnum);
}
seq_puts(m, "\n");
}
static int show_rcuhier(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
print_one_rcu_state(m, rsp);
return 0;
}
static int rcuhier_open(struct inode *inode, struct file *file)
{
return single_open(file, show_rcuhier, inode->i_private);
}
static const struct file_operations rcuhier_fops = {
.owner = THIS_MODULE,
.open = rcuhier_open,
.read = seq_read,
.llseek = no_llseek,
.release = single_release,
};
static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
{
unsigned long flags;
unsigned long completed;
unsigned long gpnum;
unsigned long gpage;
unsigned long gpmax;
struct rcu_node *rnp = &rsp->node[0];
raw_spin_lock_irqsave(&rnp->lock, flags);
completed = READ_ONCE(rsp->completed);
gpnum = READ_ONCE(rsp->gpnum);
if (completed == gpnum)
gpage = 0;
else
gpage = jiffies - rsp->gp_start;
gpmax = rsp->gp_max;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
}
static int show_rcugp(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
show_one_rcugp(m, rsp);
return 0;
}
static int rcugp_open(struct inode *inode, struct file *file)
{
return single_open(file, show_rcugp, inode->i_private);
}
static const struct file_operations rcugp_fops = {
.owner = THIS_MODULE,
.open = rcugp_open,
.read = seq_read,
.llseek = no_llseek,
.release = single_release,
};
static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
{
if (!rdp->beenonline)
return;
seq_printf(m, "%3d%cnp=%ld ",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ',
rdp->n_rcu_pending);
seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
rdp->n_rp_qs_pending,
rdp->n_rp_report_qs,
rdp->n_rp_cb_ready,
rdp->n_rp_cpu_needs_gp);
seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n",
rdp->n_rp_gp_completed,
rdp->n_rp_gp_started,
rdp->n_rp_nocb_defer_wakeup,
rdp->n_rp_need_nothing);
}
static int show_rcu_pending(struct seq_file *m, void *v)
{
print_one_rcu_pending(m, (struct rcu_data *)v);
return 0;
}
static const struct seq_operations rcu_pending_op = {
.start = r_start,
.next = r_next,
.stop = r_stop,
.show = show_rcu_pending,
};
static int rcu_pending_open(struct inode *inode, struct file *file)
{
return r_open(inode, file, &rcu_pending_op);
}
static const struct file_operations rcu_pending_fops = {
.owner = THIS_MODULE,
.open = rcu_pending_open,
.read = seq_read,
.llseek = no_llseek,
.release = seq_release,
};
static int show_rcutorture(struct seq_file *m, void *unused)
{
seq_printf(m, "rcutorture test sequence: %lu %s\n",
rcutorture_testseq >> 1,
(rcutorture_testseq & 0x1) ? "(test in progress)" : "");
seq_printf(m, "rcutorture update version number: %lu\n",
rcutorture_vernum);
return 0;
}
static int rcutorture_open(struct inode *inode, struct file *file)
{
return single_open(file, show_rcutorture, NULL);
}
static const struct file_operations rcutorture_fops = {
.owner = THIS_MODULE,
.open = rcutorture_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static struct dentry *rcudir;
static int __init rcutree_trace_init(void)
{
struct rcu_state *rsp;
struct dentry *retval;
struct dentry *rspdir;
rcudir = debugfs_create_dir("rcu", NULL);
if (!rcudir)
goto free_out;
for_each_rcu_flavor(rsp) {
rspdir = debugfs_create_dir(rsp->name, rcudir);
if (!rspdir)
goto free_out;
retval = debugfs_create_file("rcudata", 0444,
rspdir, rsp, &rcudata_fops);
if (!retval)
goto free_out;
retval = debugfs_create_file("rcuexp", 0444,
rspdir, rsp, &rcuexp_fops);
if (!retval)
goto free_out;
retval = debugfs_create_file("rcu_pending", 0444,
rspdir, rsp, &rcu_pending_fops);
if (!retval)
goto free_out;
retval = debugfs_create_file("rcubarrier", 0444,
rspdir, rsp, &rcubarrier_fops);
if (!retval)
goto free_out;
#ifdef CONFIG_RCU_BOOST
if (rsp == &rcu_preempt_state) {
retval = debugfs_create_file("rcuboost", 0444,
rspdir, NULL, &rcu_node_boost_fops);
if (!retval)
goto free_out;
}
#endif
retval = debugfs_create_file("rcugp", 0444,
rspdir, rsp, &rcugp_fops);
if (!retval)
goto free_out;
retval = debugfs_create_file("rcuhier", 0444,
rspdir, rsp, &rcuhier_fops);
if (!retval)
goto free_out;
}
retval = debugfs_create_file("rcutorture", 0444, rcudir,
NULL, &rcutorture_fops);
if (!retval)
goto free_out;
return 0;
free_out:
debugfs_remove_recursive(rcudir);
return 1;
}
static void __exit rcutree_trace_cleanup(void)
{
debugfs_remove_recursive(rcudir);
}
module_init(rcutree_trace_init);
module_exit(rcutree_trace_cleanup);
MODULE_AUTHOR("Paul E. McKenney");
MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
MODULE_LICENSE("GPL");
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\kernel\/rcu/update.c
/*
* Read-Copy Update mechanism for mutual exclusion
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, you can access it online at
* http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2001
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
*
* Based on the original work by Paul McKenney <paulmck@us.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
* http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
* http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
*
* For detailed explanation of Read-Copy Update mechanism see -
* http://lse.sourceforge.net/locking/rcupdate.html
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/export.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/tick.h>
#define CREATE_TRACE_POINTS
#include "rcu.h"
MODULE_ALIAS("rcupdate");
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcupdate."
module_param(rcu_expedited, int, 0);
#ifndef CONFIG_TINY_RCU
static atomic_t rcu_expedited_nesting =
ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
/*
* Should normal grace-period primitives be expedited? Intended for
* use within RCU. Note that this function takes the rcu_expedited
* sysfs/boot variable into account as well as the rcu_expedite_gp()
* nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
* returns false is a -really- bad idea.
*/
bool rcu_gp_is_expedited(void)
{
return rcu_expedited || atomic_read(&rcu_expedited_nesting);
}
EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
/**
* rcu_expedite_gp - Expedite future RCU grace periods
*
* After a call to this function, future calls to synchronize_rcu() and
* friends act as the corresponding synchronize_rcu_expedited() function
* had instead been called.
*/
void rcu_expedite_gp(void)
{
atomic_inc(&rcu_expedited_nesting);
}
EXPORT_SYMBOL_GPL(rcu_expedite_gp);
/**
* rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation
*
* Undo a prior call to rcu_expedite_gp(). If all prior calls to
* rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(),
* and if the rcu_expedited sysfs/boot parameter is not set, then all
* subsequent calls to synchronize_rcu() and friends will return to
* their normal non-expedited behavior.
*/
void rcu_unexpedite_gp(void)
{
atomic_dec(&rcu_expedited_nesting);
}
EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
#endif /* #ifndef CONFIG_TINY_RCU */
/*
* Inform RCU of the end of the in-kernel boot sequence.
*/
void rcu_end_inkernel_boot(void)
{
if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
rcu_unexpedite_gp();
}
#ifdef CONFIG_PREEMPT_RCU
/*
* Preemptible RCU implementation for rcu_read_lock().
* Just increment ->rcu_read_lock_nesting, shared state will be updated
* if we block.
*/
void __rcu_read_lock(void)
{
current->rcu_read_lock_nesting++;
barrier(); /* critical section after entry code. */
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);
/*
* Preemptible RCU implementation for rcu_read_unlock().
* Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
* rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
* invoke rcu_read_unlock_special() to clean up after a context switch
* in an RCU read-side critical section and other special cases.
*/
void __rcu_read_unlock(void)
{
struct task_struct *t = current;
if (t->rcu_read_lock_nesting != 1) {
--t->rcu_read_lock_nesting;
} else {
barrier(); /* critical section before exit code. */
t->rcu_read_lock_nesting = INT_MIN;
barrier(); /* assign before ->rcu_read_unlock_special load */
if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
rcu_read_unlock_special(t);
barrier(); /* ->rcu_read_unlock_special load before assign */
t->rcu_read_lock_nesting = 0;
}
#ifdef CONFIG_PROVE_LOCKING
{
int rrln = READ_ONCE(t->rcu_read_lock_nesting);
WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
}
#endif /* #ifdef CONFIG_PROVE_LOCKING */
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
EXPORT_SYMBOL_GPL(rcu_lock_map);
static struct lock_class_key rcu_bh_lock_key;
struct lockdep_map rcu_bh_lock_map =
STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
static struct lock_class_key rcu_sched_lock_key;
struct lockdep_map rcu_sched_lock_map =
STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
static struct lock_class_key rcu_callback_key;
struct lockdep_map rcu_callback_map =
STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
EXPORT_SYMBOL_GPL(rcu_callback_map);
int notrace debug_lockdep_rcu_enabled(void)
{
return rcu_scheduler_active && debug_locks &&
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
/**
* rcu_read_lock_held() - might we be in RCU read-side critical section?
*
* If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
* read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC,
* this assumes we are in an RCU read-side critical section unless it can
* prove otherwise. This is useful for debug checks in functions that
* require that they be called within an RCU read-side critical section.
*
* Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
* and while lockdep is disabled.
*
* Note that rcu_read_lock() and the matching rcu_read_unlock() must
* occur in the same context, for example, it is illegal to invoke
* rcu_read_unlock() in process context if the matching rcu_read_lock()
* was invoked from within an irq handler.
*
* Note that rcu_read_lock() is disallowed if the CPU is either idle or
* offline from an RCU perspective, so check for those as well.
*/
int rcu_read_lock_held(void)
{
if (!debug_lockdep_rcu_enabled())
return 1;
if (!rcu_is_watching())
return 0;
if (!rcu_lockdep_current_cpu_online())
return 0;
return lock_is_held(&rcu_lock_map);
}
EXPORT_SYMBOL_GPL(rcu_read_lock_held);
/**
* rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
*
* Check for bottom half being disabled, which covers both the
* CONFIG_PROVE_RCU and not cases. Note that if someone uses
* rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
* will show the situation. This is useful for debug checks in functions
* that require that they be called within an RCU read-side critical
* section.
*
* Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
*
* Note that rcu_read_lock() is disallowed if the CPU is either idle or
* offline from an RCU perspective, so check for those as well.
*/
int rcu_read_lock_bh_held(void)
{
if (!debug_lockdep_rcu_enabled())
return 1;
if (!rcu_is_watching())
return 0;
if (!rcu_lockdep_current_cpu_online())
return 0;
return in_softirq() || irqs_disabled();
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/**
* wakeme_after_rcu() - Callback function to awaken a task after grace period
* @head: Pointer to rcu_head member within rcu_synchronize structure
*
* Awaken the corresponding task now that a grace period has elapsed.
*/
void wakeme_after_rcu(struct rcu_head *head)
{
struct rcu_synchronize *rcu;
rcu = container_of(head, struct rcu_synchronize, head);
complete(&rcu->completion);
}
void wait_rcu_gp(call_rcu_func_t crf)
{
struct rcu_synchronize rcu;
init_rcu_head_on_stack(&rcu.head);
init_completion(&rcu.completion);
/* Will wake me after RCU finished. */
crf(&rcu.head, wakeme_after_rcu);
/* Wait for it. */
wait_for_completion(&rcu.completion);
destroy_rcu_head_on_stack(&rcu.head);
}
EXPORT_SYMBOL_GPL(wait_rcu_gp);
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head)
{
debug_object_init(head, &rcuhead_debug_descr);
}
void destroy_rcu_head(struct rcu_head *head)
{
debug_object_free(head, &rcuhead_debug_descr);
}
/*
* fixup_activate is called when:
* - an active object is activated
* - an unknown object is activated (might be a statically initialized object)
* Activation is performed internally by call_rcu().
*/
static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
{
struct rcu_head *head = addr;
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
/*
* This is not really a fixup. We just make sure that it is
* tracked in the object tracker.
*/
debug_object_init(head, &rcuhead_debug_descr);
debug_object_activate(head, &rcuhead_debug_descr);
return 0;
default:
return 1;
}
}
/**
* init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
* @head: pointer to rcu_head structure to be initialized
*
* This function informs debugobjects of a new rcu_head structure that
* has been allocated as an auto variable on the stack. This function
* is not required for rcu_head structures that are statically defined or
* that are dynamically allocated on the heap. This function has no
* effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
*/
void init_rcu_head_on_stack(struct rcu_head *head)
{
debug_object_init_on_stack(head, &rcuhead_debug_descr);
}
EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);
/**
* destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
* @head: pointer to rcu_head structure to be initialized
*
* This function informs debugobjects that an on-stack rcu_head structure
* is about to go out of scope. As with init_rcu_head_on_stack(), this
* function is not required for rcu_head structures that are statically
* defined or that are dynamically allocated on the heap. Also as with
* init_rcu_head_on_stack(), this function has no effect for
* !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
*/
void destroy_rcu_head_on_stack(struct rcu_head *head)
{
debug_object_free(head, &rcuhead_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
struct debug_obj_descr rcuhead_debug_descr = {
.name = "rcu_head",
.fixup_activate = rcuhead_fixup_activate,
};
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
unsigned long secs,
unsigned long c_old, unsigned long c)
{
trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
}
EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#else
#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
do { } while (0)
#endif
#ifdef CONFIG_RCU_STALL_COMMON
#ifdef CONFIG_PROVE_RCU
#define RCU_STALL_DELAY_DELTA (5 * HZ)
#else
#define RCU_STALL_DELAY_DELTA 0
#endif
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_suppress, int, 0644);
module_param(rcu_cpu_stall_timeout, int, 0644);
int rcu_jiffies_till_stall_check(void)
{
int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
/*
* Limit check must be consistent with the Kconfig limits
* for CONFIG_RCU_CPU_STALL_TIMEOUT.
*/
if (till_stall_check < 3) {
WRITE_ONCE(rcu_cpu_stall_timeout, 3);
till_stall_check = 3;
} else if (till_stall_check > 300) {
WRITE_ONCE(rcu_cpu_stall_timeout, 300);
till_stall_check = 300;
}
return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
}
void rcu_sysrq_start(void)
{
if (!rcu_cpu_stall_suppress)
rcu_cpu_stall_suppress = 2;
}
void rcu_sysrq_end(void)
{
if (rcu_cpu_stall_suppress == 2)
rcu_cpu_stall_suppress = 0;
}
static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
{
rcu_cpu_stall_suppress = 1;
return NOTIFY_DONE;
}
static struct notifier_block rcu_panic_block = {
.notifier_call = rcu_panic,
};
static int __init check_cpu_stall_init(void)
{
atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
return 0;
}
early_initcall(check_cpu_stall_init);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
#ifdef CONFIG_TASKS_RCU
/*
* Simple variant of RCU whose quiescent states are voluntary context switch,
* user-space execution, and idle. As such, grace periods can take one good
* long time. There are no read-side primitives similar to rcu_read_lock()
* and rcu_read_unlock() because this implementation is intended to get
* the system into a safe state for some of the manipulations involved in
* tracing and the like. Finally, this implementation does not support
* high call_rcu_tasks() rates from multiple CPUs. If this is required,
* per-CPU callback lists will be needed.
*/
/* Global list of callbacks and associated lock. */
static struct rcu_head *rcu_tasks_cbs_head;
static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_SRCU(tasks_rcu_exit_srcu);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
module_param(rcu_task_stall_timeout, int, 0644);
static void rcu_spawn_tasks_kthread(void);
/*
* Post an RCU-tasks callback. First call must be from process context
* after the scheduler if fully operational.
*/
void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp))
{
unsigned long flags;
bool needwake;
rhp->next = NULL;
rhp->func = func;
raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
needwake = !rcu_tasks_cbs_head;
*rcu_tasks_cbs_tail = rhp;
rcu_tasks_cbs_tail = &rhp->next;
raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
if (needwake) {
rcu_spawn_tasks_kthread();
wake_up(&rcu_tasks_cbs_wq);
}
}
EXPORT_SYMBOL_GPL(call_rcu_tasks);
/**
* synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
*
* Control will return to the caller some time after a full rcu-tasks
* grace period has elapsed, in other words after all currently
* executing rcu-tasks read-side critical sections have elapsed. These
* read-side critical sections are delimited by calls to schedule(),
* cond_resched_rcu_qs(), idle execution, userspace execution, calls
* to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
*
* This is a very specialized primitive, intended only for a few uses in
* tracing and other situations requiring manipulation of function
* preambles and profiling hooks. The synchronize_rcu_tasks() function
* is not (yet) intended for heavy use from multiple CPUs.
*
* Note that this guarantee implies further memory-ordering guarantees.
* On systems with more than one CPU, when synchronize_rcu_tasks() returns,
* each CPU is guaranteed to have executed a full memory barrier since the
* end of its last RCU-tasks read-side critical section whose beginning
* preceded the call to synchronize_rcu_tasks(). In addition, each CPU
* having an RCU-tasks read-side critical section that extends beyond
* the return from synchronize_rcu_tasks() is guaranteed to have executed
* a full memory barrier after the beginning of synchronize_rcu_tasks()
* and before the beginning of that RCU-tasks read-side critical section.
* Note that these guarantees include CPUs that are offline, idle, or
* executing in user mode, as well as CPUs that are executing in the kernel.
*
* Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned
* to its caller on CPU B, then both CPU A and CPU B are guaranteed
* to have executed a full memory barrier during the execution of
* synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU
* (but again only if the system has more than one CPU).
*/
void synchronize_rcu_tasks(void)
{
/* Complain if the scheduler has not started. */
rcu_lockdep_assert(!rcu_scheduler_active,
"synchronize_rcu_tasks called too soon");
/* Wait for the grace period. */
wait_rcu_gp(call_rcu_tasks);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
/**
* rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks.
*
* Although the current implementation is guaranteed to wait, it is not
* obligated to, for example, if there are no pending callbacks.
*/
void rcu_barrier_tasks(void)
{
/* There is only one callback queue, so this is easy. ;-) */
synchronize_rcu_tasks();
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
/* See if tasks are still holding out, complain if so. */
static void check_holdout_task(struct task_struct *t,
bool needreport, bool *firstreport)
{
int cpu;
if (!READ_ONCE(t->rcu_tasks_holdout) ||
t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
!READ_ONCE(t->on_rq) ||
(IS_ENABLED(CONFIG_NO_HZ_FULL) &&
!is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
WRITE_ONCE(t->rcu_tasks_holdout, false);
list_del_init(&t->rcu_tasks_holdout_list);
put_task_struct(t);
return;
}
if (!needreport)
return;
if (*firstreport) {
pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
*firstreport = false;
}
cpu = task_cpu(t);
pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
t, ".I"[is_idle_task(t)],
"N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
t->rcu_tasks_idle_cpu, cpu);
sched_show_task(t);
}
/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
static int __noreturn rcu_tasks_kthread(void *arg)
{
unsigned long flags;
struct task_struct *g, *t;
unsigned long lastreport;
struct rcu_head *list;
struct rcu_head *next;
LIST_HEAD(rcu_tasks_holdouts);
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
housekeeping_affine(current);
/*
* Each pass through the following loop makes one check for
* newly arrived callbacks, and, if there are some, waits for
* one RCU-tasks grace period and then invokes the callbacks.
* This loop is terminated by the system going down. ;-)
*/
for (;;) {
/* Pick up any new callbacks. */
raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
list = rcu_tasks_cbs_head;
rcu_tasks_cbs_head = NULL;
rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
/* If there were none, wait a bit and start over. */
if (!list) {
wait_event_interruptible(rcu_tasks_cbs_wq,
rcu_tasks_cbs_head);
if (!rcu_tasks_cbs_head) {
WARN_ON(signal_pending(current));
schedule_timeout_interruptible(HZ/10);
}
continue;
}
/*
* Wait for all pre-existing t->on_rq and t->nvcsw
* transitions to complete. Invoking synchronize_sched()
* suffices because all these transitions occur with
* interrupts disabled. Without this synchronize_sched(),
* a read-side critical section that started before the
* grace period might be incorrectly seen as having started
* after the grace period.
*
* This synchronize_sched() also dispenses with the
* need for a memory barrier on the first store to
* ->rcu_tasks_holdout, as it forces the store to happen
* after the beginning of the grace period.
*/
synchronize_sched();
/*
* There were callbacks, so we need to wait for an
* RCU-tasks grace period. Start off by scanning
* the task list for tasks that are not already
* voluntarily blocked. Mark these tasks and make
* a list of them in rcu_tasks_holdouts.
*/
rcu_read_lock();
for_each_process_thread(g, t) {
if (t != current && READ_ONCE(t->on_rq) &&
!is_idle_task(t)) {
get_task_struct(t);
t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
WRITE_ONCE(t->rcu_tasks_holdout, true);
list_add(&t->rcu_tasks_holdout_list,
&rcu_tasks_holdouts);
}
}
rcu_read_unlock();
/*
* Wait for tasks that are in the process of exiting.
* This does only part of the job, ensuring that all
* tasks that were previously exiting reach the point
* where they have disabled preemption, allowing the
* later synchronize_sched() to finish the job.
*/
synchronize_srcu(&tasks_rcu_exit_srcu);
/*
* Each pass through the following loop scans the list
* of holdout tasks, removing any that are no longer
* holdouts. When the list is empty, we are done.
*/
lastreport = jiffies;
while (!list_empty(&rcu_tasks_holdouts)) {
bool firstreport;
bool needreport;
int rtst;
struct task_struct *t1;
schedule_timeout_interruptible(HZ);
rtst = READ_ONCE(rcu_task_stall_timeout);
needreport = rtst > 0 &&
time_after(jiffies, lastreport + rtst);
if (needreport)
lastreport = jiffies;
firstreport = true;
WARN_ON(signal_pending(current));
list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts,
rcu_tasks_holdout_list) {
check_holdout_task(t, needreport, &firstreport);
cond_resched();
}
}
/*
* Because ->on_rq and ->nvcsw are not guaranteed
* to have a full memory barriers prior to them in the
* schedule() path, memory reordering on other CPUs could
* cause their RCU-tasks read-side critical sections to
* extend past the end of the grace period. However,
* because these ->nvcsw updates are carried out with
* interrupts disabled, we can use synchronize_sched()
* to force the needed ordering on all such CPUs.
*
* This synchronize_sched() also confines all
* ->rcu_tasks_holdout accesses to be within the grace
* period, avoiding the need for memory barriers for
* ->rcu_tasks_holdout accesses.
*
* In addition, this synchronize_sched() waits for exiting
* tasks to complete their final preempt_disable() region
* of execution, cleaning up after the synchronize_srcu()
* above.
*/
synchronize_sched();
/* Invoke the callbacks. */
while (list) {
next = list->next;
local_bh_disable();
list->func(list);
local_bh_enable();
list = next;
cond_resched();
}
schedule_timeout_uninterruptible(HZ/10);
}
}
/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
static void rcu_spawn_tasks_kthread(void)
{
static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
static struct task_struct *rcu_tasks_kthread_ptr;
struct task_struct *t;
if (READ_ONCE(rcu_tasks_kthread_ptr)) {
smp_mb(); /* Ensure caller sees full kthread. */
return;
}
mutex_lock(&rcu_tasks_kthread_mutex);
if (rcu_tasks_kthread_ptr) {
mutex_unlock(&rcu_tasks_kthread_mutex);
return;
}
t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
BUG_ON(IS_ERR(t));
smp_mb(); /* Ensure others see full kthread. */
WRITE_ONCE(rcu_tasks_kthread_ptr, t);
mutex_unlock(&rcu_tasks_kthread_mutex);
}
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_PROVE_RCU
/*
* Early boot self test parameters, one for each flavor
*/
static bool rcu_self_test;
static bool rcu_self_test_bh;
static bool rcu_self_test_sched;
module_param(rcu_self_test, bool, 0444);
module_param(rcu_self_test_bh, bool, 0444);
module_param(rcu_self_test_sched, bool, 0444);
static int rcu_self_test_counter;
static void test_callback(struct rcu_head *r)
{
rcu_self_test_counter++;
pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
}
static void early_boot_test_call_rcu(void)
{
static struct rcu_head head;
call_rcu(&head, test_callback);
}
static void early_boot_test_call_rcu_bh(void)
{
static struct rcu_head head;
call_rcu_bh(&head, test_callback);
}
static void early_boot_test_call_rcu_sched(void)
{
static struct rcu_head head;
call_rcu_sched(&head, test_callback);
}
void rcu_early_boot_tests(void)
{
pr_info("Running RCU self tests\n");
if (rcu_self_test)
early_boot_test_call_rcu();
if (rcu_self_test_bh)
early_boot_test_call_rcu_bh();
if (rcu_self_test_sched)
early_boot_test_call_rcu_sched();
}
static int rcu_verify_early_boot_tests(void)
{
int ret = 0;
int early_boot_test_counter = 0;
if (rcu_self_test) {
early_boot_test_counter++;
rcu_barrier();
}
if (rcu_self_test_bh) {
early_boot_test_counter++;
rcu_barrier_bh();
}
if (rcu_self_test_sched) {
early_boot_test_counter++;
rcu_barrier_sched();
}
if (rcu_self_test_counter != early_boot_test_counter) {
WARN_ON(1);
ret = -1;
}
return ret;
}
late_initcall(rcu_verify_early_boot_tests);
#else
void rcu_early_boot_tests(void) {}
#endif /* CONFIG_PROVE_RCU */
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\kernel\/reboot.c
/*
* linux/kernel/reboot.c
*
* Copyright (C) 2013 Linus Torvalds
*/
#define pr_fmt(fmt) "reboot: " fmt
#include <linux/ctype.h>
#include <linux/export.h>
#include <linux/kexec.h>
#include <linux/kmod.h>
#include <linux/kmsg_dump.h>
#include <linux/reboot.h>
#include <linux/suspend.h>
#include <linux/syscalls.h>
#include <linux/syscore_ops.h>
#include <linux/uaccess.h>
/*
* this indicates whether you can reboot with ctrl-alt-del: the default is yes
*/
int C_A_D = 1;
struct pid *cad_pid;
EXPORT_SYMBOL(cad_pid);
#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32)
#define DEFAULT_REBOOT_MODE = REBOOT_HARD
#else
#define DEFAULT_REBOOT_MODE
#endif
enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
/*
* This variable is used privately to keep track of whether or not
* reboot_type is still set to its default value (i.e., reboot= hasn't
* been set on the command line). This is needed so that we can
* suppress DMI scanning for reboot quirks. Without it, it's
* impossible to override a faulty reboot quirk without recompiling.
*/
int reboot_default = 1;
int reboot_cpu;
enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
/*
* If set, this is used for preparing the system to power off.
*/
void (*pm_power_off_prepare)(void);
/**
* emergency_restart - reboot the system
*
* Without shutting down any hardware or taking any locks
* reboot the system. This is called when we know we are in
* trouble so this is our best effort to reboot. This is
* safe to call in interrupt context.
*/
void emergency_restart(void)
{
kmsg_dump(KMSG_DUMP_EMERG);
machine_emergency_restart();
}
EXPORT_SYMBOL_GPL(emergency_restart);
void kernel_restart_prepare(char *cmd)
{
blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
system_state = SYSTEM_RESTART;
usermodehelper_disable();
device_shutdown();
}
/**
* register_reboot_notifier - Register function to be called at reboot time
* @nb: Info about notifier function to be called
*
* Registers a function with the list of functions
* to be called at reboot time.
*
* Currently always returns zero, as blocking_notifier_chain_register()
* always returns zero.
*/
int register_reboot_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&reboot_notifier_list, nb);
}
EXPORT_SYMBOL(register_reboot_notifier);
/**
* unregister_reboot_notifier - Unregister previously registered reboot notifier
* @nb: Hook to be unregistered
*
* Unregisters a previously registered reboot
* notifier function.
*
* Returns zero on success, or %-ENOENT on failure.
*/
int unregister_reboot_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
}
EXPORT_SYMBOL(unregister_reboot_notifier);
/*
* Notifier list for kernel code which wants to be called
* to restart the system.
*/
static ATOMIC_NOTIFIER_HEAD(restart_handler_list);
/**
* register_restart_handler - Register function to be called to reset
* the system
* @nb: Info about handler function to be called
* @nb->priority: Handler priority. Handlers should follow the
* following guidelines for setting priorities.
* 0: Restart handler of last resort,
* with limited restart capabilities
* 128: Default restart handler; use if no other
* restart handler is expected to be available,
* and/or if restart functionality is
* sufficient to restart the entire system
* 255: Highest priority restart handler, will
* preempt all other restart handlers
*
* Registers a function with code to be called to restart the
* system.
*
* Registered functions will be called from machine_restart as last
* step of the restart sequence (if the architecture specific
* machine_restart function calls do_kernel_restart - see below
* for details).
* Registered functions are expected to restart the system immediately.
* If more than one function is registered, the restart handler priority
* selects which function will be called first.
*
* Restart handlers are expected to be registered from non-architecture
* code, typically from drivers. A typical use case would be a system
* where restart functionality is provided through a watchdog. Multiple
* restart handlers may exist; for example, one restart handler might
* restart the entire system, while another only restarts the CPU.
* In such cases, the restart handler which only restarts part of the
* hardware is expected to register with low priority to ensure that
* it only runs if no other means to restart the system is available.
*
* Currently always returns zero, as atomic_notifier_chain_register()
* always returns zero.
*/
int register_restart_handler(struct notifier_block *nb)
{
return atomic_notifier_chain_register(&restart_handler_list, nb);
}
EXPORT_SYMBOL(register_restart_handler);
/**
* unregister_restart_handler - Unregister previously registered
* restart handler
* @nb: Hook to be unregistered
*
* Unregisters a previously registered restart handler function.
*
* Returns zero on success, or %-ENOENT on failure.
*/
int unregister_restart_handler(struct notifier_block *nb)
{
return atomic_notifier_chain_unregister(&restart_handler_list, nb);
}
EXPORT_SYMBOL(unregister_restart_handler);
/**
* do_kernel_restart - Execute kernel restart handler call chain
*
* Calls functions registered with register_restart_handler.
*
* Expected to be called from machine_restart as last step of the restart
* sequence.
*
* Restarts the system immediately if a restart handler function has been
* registered. Otherwise does nothing.
*/
void do_kernel_restart(char *cmd)
{
atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
}
void migrate_to_reboot_cpu(void)
{
/* The boot cpu is always logical cpu 0 */
int cpu = reboot_cpu;
cpu_hotplug_disable();
/* Make certain the cpu I'm about to reboot on is online */
if (!cpu_online(cpu))
cpu = cpumask_first(cpu_online_mask);
/* Prevent races with other tasks migrating this task */
current->flags |= PF_NO_SETAFFINITY;
/* Make certain I only run on the appropriate processor */
set_cpus_allowed_ptr(current, cpumask_of(cpu));
}
/**
* kernel_restart - reboot the system
* @cmd: pointer to buffer containing command to execute for restart
* or %NULL
*
* Shutdown everything and perform a clean reboot.
* This is not safe to call in interrupt context.
*/
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
migrate_to_reboot_cpu();
syscore_shutdown();
if (!cmd)
pr_emerg("Restarting system\n");
else
pr_emerg("Restarting system with command '%s'\n", cmd);
kmsg_dump(KMSG_DUMP_RESTART);
machine_restart(cmd);
}
EXPORT_SYMBOL_GPL(kernel_restart);
static void kernel_shutdown_prepare(enum system_states state)
{
blocking_notifier_call_chain(&reboot_notifier_list,
(state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
system_state = state;
usermodehelper_disable();
device_shutdown();
}
/**
* kernel_halt - halt the system
*
* Shutdown everything and perform a clean system halt.
*/
void kernel_halt(void)
{
kernel_shutdown_prepare(SYSTEM_HALT);
migrate_to_reboot_cpu();
syscore_shutdown();
pr_emerg("System halted\n");
kmsg_dump(KMSG_DUMP_HALT);
machine_halt();
}
EXPORT_SYMBOL_GPL(kernel_halt);
/**
* kernel_power_off - power_off the system
*
* Shutdown everything and perform a clean system power_off.
*/
void kernel_power_off(void)
{
kernel_shutdown_prepare(SYSTEM_POWER_OFF);
if (pm_power_off_prepare)
pm_power_off_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
pr_emerg("Power down\n");
kmsg_dump(KMSG_DUMP_POWEROFF);
machine_power_off();
}
EXPORT_SYMBOL_GPL(kernel_power_off);
static DEFINE_MUTEX(reboot_mutex);
/*
* Reboot system call: for obvious reasons only root may call it,
* and even root needs to set up some magic numbers in the registers
* so that some mistake won't make this reboot the whole machine.
* You can also set the meaning of the ctrl-alt-del-key here.
*
* reboot doesn't sync: do that yourself before calling this.
*/
SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
void __user *, arg)
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
char buffer[256];
int ret = 0;
/* We only trust the superuser with rebooting the system. */
if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
return -EPERM;
/* For safety, we require "magic" arguments. */
if (magic1 != LINUX_REBOOT_MAGIC1 ||
(magic2 != LINUX_REBOOT_MAGIC2 &&
magic2 != LINUX_REBOOT_MAGIC2A &&
magic2 != LINUX_REBOOT_MAGIC2B &&
magic2 != LINUX_REBOOT_MAGIC2C))
return -EINVAL;
/*
* If pid namespaces are enabled and the current task is in a child
* pid_namespace, the command is handled by reboot_pid_ns() which will
* call do_exit().
*/
ret = reboot_pid_ns(pid_ns, cmd);
if (ret)
return ret;
/* Instead of trying to make the power_off code look like
* halt when pm_power_off is not set do it the easy way.
*/
if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
cmd = LINUX_REBOOT_CMD_HALT;
mutex_lock(&reboot_mutex);
switch (cmd) {
case LINUX_REBOOT_CMD_RESTART:
kernel_restart(NULL);
break;
case LINUX_REBOOT_CMD_CAD_ON:
C_A_D = 1;
break;
case LINUX_REBOOT_CMD_CAD_OFF:
C_A_D = 0;
break;
case LINUX_REBOOT_CMD_HALT:
kernel_halt();
do_exit(0);
panic("cannot halt");
case LINUX_REBOOT_CMD_POWER_OFF:
kernel_power_off();
do_exit(0);
break;
case LINUX_REBOOT_CMD_RESTART2:
ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1);
if (ret < 0) {
ret = -EFAULT;
break;
}
buffer[sizeof(buffer) - 1] = '\0';
kernel_restart(buffer);
break;
#ifdef CONFIG_KEXEC
case LINUX_REBOOT_CMD_KEXEC:
ret = kernel_kexec();
break;
#endif
#ifdef CONFIG_HIBERNATION
case LINUX_REBOOT_CMD_SW_SUSPEND:
ret = hibernate();
break;
#endif
default:
ret = -EINVAL;
break;
}
mutex_unlock(&reboot_mutex);
return ret;
}
static void deferred_cad(struct work_struct *dummy)
{
kernel_restart(NULL);
}
/*
* This function gets called by ctrl-alt-del - ie the keyboard interrupt.
* As it's called within an interrupt, it may NOT sync: the only choice
* is whether to reboot at once, or just ignore the ctrl-alt-del.
*/
void ctrl_alt_del(void)
{
static DECLARE_WORK(cad_work, deferred_cad);
if (C_A_D)
schedule_work(&cad_work);
else
kill_cad_pid(SIGINT, 1);
}
char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
static const char reboot_cmd[] = "/sbin/reboot";
static int run_cmd(const char *cmd)
{
char **argv;
static char *envp[] = {
"HOME=/",
"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
NULL
};
int ret;
argv = argv_split(GFP_KERNEL, cmd, NULL);
if (argv) {
ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
argv_free(argv);
} else {
ret = -ENOMEM;
}
return ret;
}
static int __orderly_reboot(void)
{
int ret;
ret = run_cmd(reboot_cmd);
if (ret) {
pr_warn("Failed to start orderly reboot: forcing the issue\n");
emergency_sync();
kernel_restart(NULL);
}
return ret;
}
static int __orderly_poweroff(bool force)
{
int ret;
ret = run_cmd(poweroff_cmd);
if (ret && force) {
pr_warn("Failed to start orderly shutdown: forcing the issue\n");
/*
* I guess this should try to kick off some daemon to sync and
* poweroff asap. Or not even bother syncing if we're doing an
* emergency shutdown?
*/
emergency_sync();
kernel_power_off();
}
return ret;
}
static bool poweroff_force;
static void poweroff_work_func(struct work_struct *work)
{
__orderly_poweroff(poweroff_force);
}
static DECLARE_WORK(poweroff_work, poweroff_work_func);
/**
* orderly_poweroff - Trigger an orderly system poweroff
* @force: force poweroff if command execution fails
*
* This may be called from any context to trigger a system shutdown.
* If the orderly shutdown fails, it will force an immediate shutdown.
*/
void orderly_poweroff(bool force)
{
if (force) /* do not override the pending "true" */
poweroff_force = true;
schedule_work(&poweroff_work);
}
EXPORT_SYMBOL_GPL(orderly_poweroff);
static void reboot_work_func(struct work_struct *work)
{
__orderly_reboot();
}
static DECLARE_WORK(reboot_work, reboot_work_func);
/**
* orderly_reboot - Trigger an orderly system reboot
*
* This may be called from any context to trigger a system reboot.
* If the orderly reboot fails, it will force an immediate reboot.
*/
void orderly_reboot(void)
{
schedule_work(&reboot_work);
}
EXPORT_SYMBOL_GPL(orderly_reboot);
static int __init reboot_setup(char *str)
{
for (;;) {
/*
* Having anything passed on the command line via
* reboot= will cause us to disable DMI checking
* below.
*/
reboot_default = 0;
switch (*str) {
case 'w':
reboot_mode = REBOOT_WARM;
break;
case 'c':
reboot_mode = REBOOT_COLD;
break;
case 'h':
reboot_mode = REBOOT_HARD;
break;
case 's':
{
int rc;
if (isdigit(*(str+1))) {
rc = kstrtoint(str+1, 0, &reboot_cpu);
if (rc)
return rc;
} else if (str[1] == 'm' && str[2] == 'p' &&
isdigit(*(str+3))) {
rc = kstrtoint(str+3, 0, &reboot_cpu);
if (rc)
return rc;
} else
reboot_mode = REBOOT_SOFT;
break;
}
case 'g':
reboot_mode = REBOOT_GPIO;
break;
case 'b':
case 'a':
case 'k':
case 't':
case 'e':
case 'p':
reboot_type = *str;
break;
case 'f':
reboot_force = 1;
break;
}
str = strchr(str, ',');
if (str)
str++;
else
break;
}
return 1;
}
__setup("reboot=", reboot_setup);
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\kernel\/relay.c
/*
* Public API and common code for kernel->userspace relay file support.
*
* See Documentation/filesystems/relay.txt for an overview.
*
* Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
* Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
*
* Moved to kernel/relay.c by Paul Mundt, 2006.
* November 2006 - CPU hotplug support by Mathieu Desnoyers
* (mathieu.desnoyers@polymtl.ca)
*
* This file is released under the GPL.
*/
#include <linux/errno.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/string.h>
#include <linux/relay.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/splice.h>
/* list of open channels, for cpu hotplug */
static DEFINE_MUTEX(relay_channels_mutex);
static LIST_HEAD(relay_channels);
/*
* close() vm_op implementation for relay file mapping.
*/
static void relay_file_mmap_close(struct vm_area_struct *vma)
{
struct rchan_buf *buf = vma->vm_private_data;
buf->chan->cb->buf_unmapped(buf, vma->vm_file);
}
/*
* fault() vm_op implementation for relay file mapping.
*/
static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page;
struct rchan_buf *buf = vma->vm_private_data;
pgoff_t pgoff = vmf->pgoff;
if (!buf)
return VM_FAULT_OOM;
page = vmalloc_to_page(buf->start + (pgoff << PAGE_SHIFT));
if (!page)
return VM_FAULT_SIGBUS;
get_page(page);
vmf->page = page;
return 0;
}
/*
* vm_ops for relay file mappings.
*/
static const struct vm_operations_struct relay_file_mmap_ops = {
.fault = relay_buf_fault,
.close = relay_file_mmap_close,
};
/*
* allocate an array of pointers of struct page
*/
static struct page **relay_alloc_page_array(unsigned int n_pages)
{
const size_t pa_size = n_pages * sizeof(struct page *);
if (pa_size > PAGE_SIZE)
return vzalloc(pa_size);
return kzalloc(pa_size, GFP_KERNEL);
}
/*
* free an array of pointers of struct page
*/
static void relay_free_page_array(struct page **array)
{
kvfree(array);
}
/**
* relay_mmap_buf: - mmap channel buffer to process address space
* @buf: relay channel buffer
* @vma: vm_area_struct describing memory to be mapped
*
* Returns 0 if ok, negative on error
*
* Caller should already have grabbed mmap_sem.
*/
static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
{
unsigned long length = vma->vm_end - vma->vm_start;
struct file *filp = vma->vm_file;
if (!buf)
return -EBADF;
if (length != (unsigned long)buf->chan->alloc_size)
return -EINVAL;
vma->vm_ops = &relay_file_mmap_ops;
vma->vm_flags |= VM_DONTEXPAND;
vma->vm_private_data = buf;
buf->chan->cb->buf_mapped(buf, filp);
return 0;
}
/**
* relay_alloc_buf - allocate a channel buffer
* @buf: the buffer struct
* @size: total size of the buffer
*
* Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
* passed in size will get page aligned, if it isn't already.
*/
static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
{
void *mem;
unsigned int i, j, n_pages;
*size = PAGE_ALIGN(*size);
n_pages = *size >> PAGE_SHIFT;
buf->page_array = relay_alloc_page_array(n_pages);
if (!buf->page_array)
return NULL;
for (i = 0; i < n_pages; i++) {
buf->page_array[i] = alloc_page(GFP_KERNEL);
if (unlikely(!buf->page_array[i]))
goto depopulate;
set_page_private(buf->page_array[i], (unsigned long)buf);
}
mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
if (!mem)
goto depopulate;
memset(mem, 0, *size);
buf->page_count = n_pages;
return mem;
depopulate:
for (j = 0; j < i; j++)
__free_page(buf->page_array[j]);
relay_free_page_array(buf->page_array);
return NULL;
}
/**
* relay_create_buf - allocate and initialize a channel buffer
* @chan: the relay channel
*
* Returns channel buffer if successful, %NULL otherwise.
*/
static struct rchan_buf *relay_create_buf(struct rchan *chan)
{
struct rchan_buf *buf;
if (chan->n_subbufs > UINT_MAX / sizeof(size_t *))
return NULL;
buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
if (!buf)
return NULL;
buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
if (!buf->padding)
goto free_buf;
buf->start = relay_alloc_buf(buf, &chan->alloc_size);
if (!buf->start)
goto free_buf;
buf->chan = chan;
kref_get(&buf->chan->kref);
return buf;
free_buf:
kfree(buf->padding);
kfree(buf);
return NULL;
}
/**
* relay_destroy_channel - free the channel struct
* @kref: target kernel reference that contains the relay channel
*
* Should only be called from kref_put().
*/
static void relay_destroy_channel(struct kref *kref)
{
struct rchan *chan = container_of(kref, struct rchan, kref);
kfree(chan);
}
/**
* relay_destroy_buf - destroy an rchan_buf struct and associated buffer
* @buf: the buffer struct
*/
static void relay_destroy_buf(struct rchan_buf *buf)
{
struct rchan *chan = buf->chan;
unsigned int i;
if (likely(buf->start)) {
vunmap(buf->start);
for (i = 0; i < buf->page_count; i++)
__free_page(buf->page_array[i]);
relay_free_page_array(buf->page_array);
}
chan->buf[buf->cpu] = NULL;
kfree(buf->padding);
kfree(buf);
kref_put(&chan->kref, relay_destroy_channel);
}
/**
* relay_remove_buf - remove a channel buffer
* @kref: target kernel reference that contains the relay buffer
*
* Removes the file from the filesystem, which also frees the
* rchan_buf_struct and the channel buffer. Should only be called from
* kref_put().
*/
static void relay_remove_buf(struct kref *kref)
{
struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
relay_destroy_buf(buf);
}
/**
* relay_buf_empty - boolean, is the channel buffer empty?
* @buf: channel buffer
*
* Returns 1 if the buffer is empty, 0 otherwise.
*/
static int relay_buf_empty(struct rchan_buf *buf)
{
return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
}
/**
* relay_buf_full - boolean, is the channel buffer full?
* @buf: channel buffer
*
* Returns 1 if the buffer is full, 0 otherwise.
*/
int relay_buf_full(struct rchan_buf *buf)
{
size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
return (ready >= buf->chan->n_subbufs) ? 1 : 0;
}
EXPORT_SYMBOL_GPL(relay_buf_full);
/*
* High-level relay kernel API and associated functions.
*/
/*
* rchan_callback implementations defining default channel behavior. Used
* in place of corresponding NULL values in client callback struct.
*/
/*
* subbuf_start() default callback. Does nothing.
*/
static int subbuf_start_default_callback (struct rchan_buf *buf,
void *subbuf,
void *prev_subbuf,
size_t prev_padding)
{
if (relay_buf_full(buf))
return 0;
return 1;
}
/*
* buf_mapped() default callback. Does nothing.
*/
static void buf_mapped_default_callback(struct rchan_buf *buf,
struct file *filp)
{
}
/*
* buf_unmapped() default callback. Does nothing.
*/
static void buf_unmapped_default_callback(struct rchan_buf *buf,
struct file *filp)
{
}
/*
* create_buf_file_create() default callback. Does nothing.
*/
static struct dentry *create_buf_file_default_callback(const char *filename,
struct dentry *parent,
umode_t mode,
struct rchan_buf *buf,
int *is_global)
{
return NULL;
}
/*
* remove_buf_file() default callback. Does nothing.
*/
static int remove_buf_file_default_callback(struct dentry *dentry)
{
return -EINVAL;
}
/* relay channel default callbacks */
static struct rchan_callbacks default_channel_callbacks = {
.subbuf_start = subbuf_start_default_callback,
.buf_mapped = buf_mapped_default_callback,
.buf_unmapped = buf_unmapped_default_callback,
.create_buf_file = create_buf_file_default_callback,
.remove_buf_file = remove_buf_file_default_callback,
};
/**
* wakeup_readers - wake up readers waiting on a channel
* @data: contains the channel buffer
*
* This is the timer function used to defer reader waking.
*/
static void wakeup_readers(unsigned long data)
{
struct rchan_buf *buf = (struct rchan_buf *)data;
wake_up_interruptible(&buf->read_wait);
}
/**
* __relay_reset - reset a channel buffer
* @buf: the channel buffer
* @init: 1 if this is a first-time initialization
*
* See relay_reset() for description of effect.
*/
static void __relay_reset(struct rchan_buf *buf, unsigned int init)
{
size_t i;
if (init) {
init_waitqueue_head(&buf->read_wait);
kref_init(&buf->kref);
setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
} else
del_timer_sync(&buf->timer);
buf->subbufs_produced = 0;
buf->subbufs_consumed = 0;
buf->bytes_consumed = 0;
buf->finalized = 0;
buf->data = buf->start;
buf->offset = 0;
for (i = 0; i < buf->chan->n_subbufs; i++)
buf->padding[i] = 0;
buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
}
/**
* relay_reset - reset the channel
* @chan: the channel
*
* This has the effect of erasing all data from all channel buffers
* and restarting the channel in its initial state. The buffers
* are not freed, so any mappings are still in effect.
*
* NOTE. Care should be taken that the channel isn't actually
* being used by anything when this call is made.
*/
void relay_reset(struct rchan *chan)
{
unsigned int i;
if (!chan)
return;
if (chan->is_global && chan->buf[0]) {
__relay_reset(chan->buf[0], 0);
return;
}
mutex_lock(&relay_channels_mutex);
for_each_possible_cpu(i)
if (chan->buf[i])
__relay_reset(chan->buf[i], 0);
mutex_unlock(&relay_channels_mutex);
}
EXPORT_SYMBOL_GPL(relay_reset);
static inline void relay_set_buf_dentry(struct rchan_buf *buf,
struct dentry *dentry)
{
buf->dentry = dentry;
d_inode(buf->dentry)->i_size = buf->early_bytes;
}
static struct dentry *relay_create_buf_file(struct rchan *chan,
struct rchan_buf *buf,
unsigned int cpu)
{
struct dentry *dentry;
char *tmpname;
tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
if (!tmpname)
return NULL;
snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
/* Create file in fs */
dentry = chan->cb->create_buf_file(tmpname, chan->parent,
S_IRUSR, buf,
&chan->is_global);
kfree(tmpname);
return dentry;
}
/*
* relay_open_buf - create a new relay channel buffer
*
* used by relay_open() and CPU hotplug.
*/
static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
{
struct rchan_buf *buf = NULL;
struct dentry *dentry;
if (chan->is_global)
return chan->buf[0];
buf = relay_create_buf(chan);
if (!buf)
return NULL;
if (chan->has_base_filename) {
dentry = relay_create_buf_file(chan, buf, cpu);
if (!dentry)
goto free_buf;
relay_set_buf_dentry(buf, dentry);
}
buf->cpu = cpu;
__relay_reset(buf, 1);
if(chan->is_global) {
chan->buf[0] = buf;
buf->cpu = 0;
}
return buf;
free_buf:
relay_destroy_buf(buf);
return NULL;
}
/**
* relay_close_buf - close a channel buffer
* @buf: channel buffer
*
* Marks the buffer finalized and restores the default callbacks.
* The channel buffer and channel buffer data structure are then freed
* automatically when the last reference is given up.
*/
static void relay_close_buf(struct rchan_buf *buf)
{
buf->finalized = 1;
del_timer_sync(&buf->timer);
buf->chan->cb->remove_buf_file(buf->dentry);
kref_put(&buf->kref, relay_remove_buf);
}
static void setup_callbacks(struct rchan *chan,
struct rchan_callbacks *cb)
{
if (!cb) {
chan->cb = &default_channel_callbacks;
return;
}
if (!cb->subbuf_start)
cb->subbuf_start = subbuf_start_default_callback;
if (!cb->buf_mapped)
cb->buf_mapped = buf_mapped_default_callback;
if (!cb->buf_unmapped)
cb->buf_unmapped = buf_unmapped_default_callback;
if (!cb->create_buf_file)
cb->create_buf_file = create_buf_file_default_callback;
if (!cb->remove_buf_file)
cb->remove_buf_file = remove_buf_file_default_callback;
chan->cb = cb;
}
/**
* relay_hotcpu_callback - CPU hotplug callback
* @nb: notifier block
* @action: hotplug action to take
* @hcpu: CPU number
*
* Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
*/
static int relay_hotcpu_callback(struct notifier_block *nb,
unsigned long action,
void *hcpu)
{
unsigned int hotcpu = (unsigned long)hcpu;
struct rchan *chan;
switch(action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
mutex_lock(&relay_channels_mutex);
list_for_each_entry(chan, &relay_channels, list) {
if (chan->buf[hotcpu])
continue;
chan->buf[hotcpu] = relay_open_buf(chan, hotcpu);
if(!chan->buf[hotcpu]) {
printk(KERN_ERR
"relay_hotcpu_callback: cpu %d buffer "
"creation failed\n", hotcpu);
mutex_unlock(&relay_channels_mutex);
return notifier_from_errno(-ENOMEM);
}
}
mutex_unlock(&relay_channels_mutex);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
/* No need to flush the cpu : will be flushed upon
* final relay_flush() call. */
break;
}
return NOTIFY_OK;
}
/**
* relay_open - create a new relay channel
* @base_filename: base name of files to create, %NULL for buffering only
* @parent: dentry of parent directory, %NULL for root directory or buffer
* @subbuf_size: size of sub-buffers
* @n_subbufs: number of sub-buffers
* @cb: client callback functions
* @private_data: user-defined data
*
* Returns channel pointer if successful, %NULL otherwise.
*
* Creates a channel buffer for each cpu using the sizes and
* attributes specified. The created channel buffer files
* will be named base_filename0...base_filenameN-1. File
* permissions will be %S_IRUSR.
*/
struct rchan *relay_open(const char *base_filename,
struct dentry *parent,
size_t subbuf_size,
size_t n_subbufs,
struct rchan_callbacks *cb,
void *private_data)
{
unsigned int i;
struct rchan *chan;
if (!(subbuf_size && n_subbufs))
return NULL;
if (subbuf_size > UINT_MAX / n_subbufs)
return NULL;
chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
if (!chan)
return NULL;
chan->version = RELAYFS_CHANNEL_VERSION;
chan->n_subbufs = n_subbufs;
chan->subbuf_size = subbuf_size;
chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs);
chan->parent = parent;
chan->private_data = private_data;
if (base_filename) {
chan->has_base_filename = 1;
strlcpy(chan->base_filename, base_filename, NAME_MAX);
}
setup_callbacks(chan, cb);
kref_init(&chan->kref);
mutex_lock(&relay_channels_mutex);
for_each_online_cpu(i) {
chan->buf[i] = relay_open_buf(chan, i);
if (!chan->buf[i])
goto free_bufs;
}
list_add(&chan->list, &relay_channels);
mutex_unlock(&relay_channels_mutex);
return chan;
free_bufs:
for_each_possible_cpu(i) {
if (chan->buf[i])
relay_close_buf(chan->buf[i]);
}
kref_put(&chan->kref, relay_destroy_channel);
mutex_unlock(&relay_channels_mutex);
return NULL;
}
EXPORT_SYMBOL_GPL(relay_open);
struct rchan_percpu_buf_dispatcher {
struct rchan_buf *buf;
struct dentry *dentry;
};
/* Called in atomic context. */
static void __relay_set_buf_dentry(void *info)
{
struct rchan_percpu_buf_dispatcher *p = info;
relay_set_buf_dentry(p->buf, p->dentry);
}
/**
* relay_late_setup_files - triggers file creation
* @chan: channel to operate on
* @base_filename: base name of files to create
* @parent: dentry of parent directory, %NULL for root directory
*
* Returns 0 if successful, non-zero otherwise.
*
* Use to setup files for a previously buffer-only channel.
* Useful to do early tracing in kernel, before VFS is up, for example.
*/
int relay_late_setup_files(struct rchan *chan,
const char *base_filename,
struct dentry *parent)
{
int err = 0;
unsigned int i, curr_cpu;
unsigned long flags;
struct dentry *dentry;
struct rchan_percpu_buf_dispatcher disp;
if (!chan || !base_filename)
return -EINVAL;
strlcpy(chan->base_filename, base_filename, NAME_MAX);
mutex_lock(&relay_channels_mutex);
/* Is chan already set up? */
if (unlikely(chan->has_base_filename)) {
mutex_unlock(&relay_channels_mutex);
return -EEXIST;
}
chan->has_base_filename = 1;
chan->parent = parent;
curr_cpu = get_cpu();
/*
* The CPU hotplug notifier ran before us and created buffers with
* no files associated. So it's safe to call relay_setup_buf_file()
* on all currently online CPUs.
*/
for_each_online_cpu(i) {
if (unlikely(!chan->buf[i])) {
WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
err = -EINVAL;
break;
}
dentry = relay_create_buf_file(chan, chan->buf[i], i);
if (unlikely(!dentry)) {
err = -EINVAL;
break;
}
if (curr_cpu == i) {
local_irq_save(flags);
relay_set_buf_dentry(chan->buf[i], dentry);
local_irq_restore(flags);
} else {
disp.buf = chan->buf[i];
disp.dentry = dentry;
smp_mb();
/* relay_channels_mutex must be held, so wait. */
err = smp_call_function_single(i,
__relay_set_buf_dentry,
&disp, 1);
}
if (unlikely(err))
break;
}
put_cpu();
mutex_unlock(&relay_channels_mutex);
return err;
}
/**
* relay_switch_subbuf - switch to a new sub-buffer
* @buf: channel buffer
* @length: size of current event
*
* Returns either the length passed in or 0 if full.
*
* Performs sub-buffer-switch tasks such as invoking callbacks,
* updating padding counts, waking up readers, etc.
*/
size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
{
void *old, *new;
size_t old_subbuf, new_subbuf;
if (unlikely(length > buf->chan->subbuf_size))
goto toobig;
if (buf->offset != buf->chan->subbuf_size + 1) {
buf->prev_padding = buf->chan->subbuf_size - buf->offset;
old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
buf->padding[old_subbuf] = buf->prev_padding;
buf->subbufs_produced++;
if (buf->dentry)
d_inode(buf->dentry)->i_size +=
buf->chan->subbuf_size -
buf->padding[old_subbuf];
else
buf->early_bytes += buf->chan->subbuf_size -
buf->padding[old_subbuf];
smp_mb();
if (waitqueue_active(&buf->read_wait))
/*
* Calling wake_up_interruptible() from here
* will deadlock if we happen to be logging
* from the scheduler (trying to re-grab
* rq->lock), so defer it.
*/
mod_timer(&buf->timer, jiffies + 1);
}
old = buf->data;
new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
new = buf->start + new_subbuf * buf->chan->subbuf_size;
buf->offset = 0;
if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
buf->offset = buf->chan->subbuf_size + 1;
return 0;
}
buf->data = new;
buf->padding[new_subbuf] = 0;
if (unlikely(length + buf->offset > buf->chan->subbuf_size))
goto toobig;
return length;
toobig:
buf->chan->last_toobig = length;
return 0;
}
EXPORT_SYMBOL_GPL(relay_switch_subbuf);
/**
* relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
* @chan: the channel
* @cpu: the cpu associated with the channel buffer to update
* @subbufs_consumed: number of sub-buffers to add to current buf's count
*
* Adds to the channel buffer's consumed sub-buffer count.
* subbufs_consumed should be the number of sub-buffers newly consumed,
* not the total consumed.
*
* NOTE. Kernel clients don't need to call this function if the channel
* mode is 'overwrite'.
*/
void relay_subbufs_consumed(struct rchan *chan,
unsigned int cpu,
size_t subbufs_consumed)
{
struct rchan_buf *buf;
if (!chan)
return;
if (cpu >= NR_CPUS || !chan->buf[cpu] ||
subbufs_consumed > chan->n_subbufs)
return;
buf = chan->buf[cpu];
if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
buf->subbufs_consumed = buf->subbufs_produced;
else
buf->subbufs_consumed += subbufs_consumed;
}
EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
/**
* relay_close - close the channel
* @chan: the channel
*
* Closes all channel buffers and frees the channel.
*/
void relay_close(struct rchan *chan)
{
unsigned int i;
if (!chan)
return;
mutex_lock(&relay_channels_mutex);
if (chan->is_global && chan->buf[0])
relay_close_buf(chan->buf[0]);
else
for_each_possible_cpu(i)
if (chan->buf[i])
relay_close_buf(chan->buf[i]);
if (chan->last_toobig)
printk(KERN_WARNING "relay: one or more items not logged "
"[item size (%Zd) > sub-buffer size (%Zd)]\n",
chan->last_toobig, chan->subbuf_size);
list_del(&chan->list);
kref_put(&chan->kref, relay_destroy_channel);
mutex_unlock(&relay_channels_mutex);
}
EXPORT_SYMBOL_GPL(relay_close);
/**
* relay_flush - close the channel
* @chan: the channel
*
* Flushes all channel buffers, i.e. forces buffer switch.
*/
void relay_flush(struct rchan *chan)
{
unsigned int i;
if (!chan)
return;
if (chan->is_global && chan->buf[0]) {
relay_switch_subbuf(chan->buf[0], 0);
return;
}
mutex_lock(&relay_channels_mutex);
for_each_possible_cpu(i)
if (chan->buf[i])
relay_switch_subbuf(chan->buf[i], 0);
mutex_unlock(&relay_channels_mutex);
}
EXPORT_SYMBOL_GPL(relay_flush);
/**
* relay_file_open - open file op for relay files
* @inode: the inode
* @filp: the file
*
* Increments the channel buffer refcount.
*/
static int relay_file_open(struct inode *inode, struct file *filp)
{
struct rchan_buf *buf = inode->i_private;
kref_get(&buf->kref);
filp->private_data = buf;
return nonseekable_open(inode, filp);
}
/**
* relay_file_mmap - mmap file op for relay files
* @filp: the file
* @vma: the vma describing what to map
*
* Calls upon relay_mmap_buf() to map the file into user space.
*/
static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct rchan_buf *buf = filp->private_data;
return relay_mmap_buf(buf, vma);
}
/**
* relay_file_poll - poll file op for relay files
* @filp: the file
* @wait: poll table
*
* Poll implemention.
*/
static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
{
unsigned int mask = 0;
struct rchan_buf *buf = filp->private_data;
if (buf->finalized)
return POLLERR;
if (filp->f_mode & FMODE_READ) {
poll_wait(filp, &buf->read_wait, wait);
if (!relay_buf_empty(buf))
mask |= POLLIN | POLLRDNORM;
}
return mask;
}
/**
* relay_file_release - release file op for relay files
* @inode: the inode
* @filp: the file
*
* Decrements the channel refcount, as the filesystem is
* no longer using it.
*/
static int relay_file_release(struct inode *inode, struct file *filp)
{
struct rchan_buf *buf = filp->private_data;
kref_put(&buf->kref, relay_remove_buf);
return 0;
}
/*
* relay_file_read_consume - update the consumed count for the buffer
*/
static void relay_file_read_consume(struct rchan_buf *buf,
size_t read_pos,
size_t bytes_consumed)
{
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t read_subbuf;
if (buf->subbufs_produced == buf->subbufs_consumed &&
buf->offset == buf->bytes_consumed)
return;
if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
relay_subbufs_consumed(buf->chan, buf->cpu, 1);
buf->bytes_consumed = 0;
}
buf->bytes_consumed += bytes_consumed;
if (!read_pos)
read_subbuf = buf->subbufs_consumed % n_subbufs;
else
read_subbuf = read_pos / buf->chan->subbuf_size;
if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
(buf->offset == subbuf_size))
return;
relay_subbufs_consumed(buf->chan, buf->cpu, 1);
buf->bytes_consumed = 0;
}
}
/*
* relay_file_read_avail - boolean, are there unconsumed bytes available?
*/
static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
{
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t produced = buf->subbufs_produced;
size_t consumed = buf->subbufs_consumed;
relay_file_read_consume(buf, read_pos, 0);
consumed = buf->subbufs_consumed;
if (unlikely(buf->offset > subbuf_size)) {
if (produced == consumed)
return 0;
return 1;
}
if (unlikely(produced - consumed >= n_subbufs)) {
consumed = produced - n_subbufs + 1;
buf->subbufs_consumed = consumed;
buf->bytes_consumed = 0;
}
produced = (produced % n_subbufs) * subbuf_size + buf->offset;
consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
if (consumed > produced)
produced += n_subbufs * subbuf_size;
if (consumed == produced) {
if (buf->offset == subbuf_size &&
buf->subbufs_produced > buf->subbufs_consumed)
return 1;
return 0;
}
return 1;
}
/**
* relay_file_read_subbuf_avail - return bytes available in sub-buffer
* @read_pos: file read position
* @buf: relay channel buffer
*/
static size_t relay_file_read_subbuf_avail(size_t read_pos,
struct rchan_buf *buf)
{
size_t padding, avail = 0;
size_t read_subbuf, read_offset, write_subbuf, write_offset;
size_t subbuf_size = buf->chan->subbuf_size;
write_subbuf = (buf->data - buf->start) / subbuf_size;
write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
read_subbuf = read_pos / subbuf_size;
read_offset = read_pos % subbuf_size;
padding = buf->padding[read_subbuf];
if (read_subbuf == write_subbuf) {
if (read_offset + padding < write_offset)
avail = write_offset - (read_offset + padding);
} else
avail = (subbuf_size - padding) - read_offset;
return avail;
}
/**
* relay_file_read_start_pos - find the first available byte to read
* @read_pos: file read position
* @buf: relay channel buffer
*
* If the @read_pos is in the middle of padding, return the
* position of the first actually available byte, otherwise
* return the original value.
*/
static size_t relay_file_read_start_pos(size_t read_pos,
struct rchan_buf *buf)
{
size_t read_subbuf, padding, padding_start, padding_end;
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t consumed = buf->subbufs_consumed % n_subbufs;
if (!read_pos)
read_pos = consumed * subbuf_size + buf->bytes_consumed;
read_subbuf = read_pos / subbuf_size;
padding = buf->padding[read_subbuf];
padding_start = (read_subbuf + 1) * subbuf_size - padding;
padding_end = (read_subbuf + 1) * subbuf_size;
if (read_pos >= padding_start && read_pos < padding_end) {
read_subbuf = (read_subbuf + 1) % n_subbufs;
read_pos = read_subbuf * subbuf_size;
}
return read_pos;
}
/**
* relay_file_read_end_pos - return the new read position
* @read_pos: file read position
* @buf: relay channel buffer
* @count: number of bytes to be read
*/
static size_t relay_file_read_end_pos(struct rchan_buf *buf,
size_t read_pos,
size_t count)
{
size_t read_subbuf, padding, end_pos;
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
read_subbuf = read_pos / subbuf_size;
padding = buf->padding[read_subbuf];
if (read_pos % subbuf_size + count + padding == subbuf_size)
end_pos = (read_subbuf + 1) * subbuf_size;
else
end_pos = read_pos + count;
if (end_pos >= subbuf_size * n_subbufs)
end_pos = 0;
return end_pos;
}
/*
* subbuf_read_actor - read up to one subbuf's worth of data
*/
static int subbuf_read_actor(size_t read_start,
struct rchan_buf *buf,
size_t avail,
read_descriptor_t *desc)
{
void *from;
int ret = 0;
from = buf->start + read_start;
ret = avail;
if (copy_to_user(desc->arg.buf, from, avail)) {
desc->error = -EFAULT;
ret = 0;
}
desc->arg.data += ret;
desc->written += ret;
desc->count -= ret;
return ret;
}
typedef int (*subbuf_actor_t) (size_t read_start,
struct rchan_buf *buf,
size_t avail,
read_descriptor_t *desc);
/*
* relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
*/
static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
subbuf_actor_t subbuf_actor,
read_descriptor_t *desc)
{
struct rchan_buf *buf = filp->private_data;
size_t read_start, avail;
int ret;
if (!desc->count)
return 0;
mutex_lock(&file_inode(filp)->i_mutex);
do {
if (!relay_file_read_avail(buf, *ppos))
break;
read_start = relay_file_read_start_pos(*ppos, buf);
avail = relay_file_read_subbuf_avail(read_start, buf);
if (!avail)
break;
avail = min(desc->count, avail);
ret = subbuf_actor(read_start, buf, avail, desc);
if (desc->error < 0)
break;
if (ret) {
relay_file_read_consume(buf, read_start, ret);
*ppos = relay_file_read_end_pos(buf, read_start, ret);
}
} while (desc->count && ret);
mutex_unlock(&file_inode(filp)->i_mutex);
return desc->written;
}
static ssize_t relay_file_read(struct file *filp,
char __user *buffer,
size_t count,
loff_t *ppos)
{
read_descriptor_t desc;
desc.written = 0;
desc.count = count;
desc.arg.buf = buffer;
desc.error = 0;
return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
}
static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
{
rbuf->bytes_consumed += bytes_consumed;
if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
}
}
static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct rchan_buf *rbuf;
rbuf = (struct rchan_buf *)page_private(buf->page);
relay_consume_bytes(rbuf, buf->private);
}
static const struct pipe_buf_operations relay_pipe_buf_ops = {
.can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = relay_pipe_buf_release,
.steal = generic_pipe_buf_steal,
.get = generic_pipe_buf_get,
};
static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
{
}
/*
* subbuf_splice_actor - splice up to one subbuf's worth of data
*/
static ssize_t subbuf_splice_actor(struct file *in,
loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len,
unsigned int flags,
int *nonpad_ret)
{
unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
struct rchan_buf *rbuf = in->private_data;
unsigned int subbuf_size = rbuf->chan->subbuf_size;
uint64_t pos = (uint64_t) *ppos;
uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size;
size_t read_start = (size_t) do_div(pos, alloc_size);
size_t read_subbuf = read_start / subbuf_size;
size_t padding = rbuf->padding[read_subbuf];
size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
struct page *pages[PIPE_DEF_BUFFERS];
struct partial_page partial[PIPE_DEF_BUFFERS];
struct splice_pipe_desc spd = {
.pages = pages,
.nr_pages = 0,
.nr_pages_max = PIPE_DEF_BUFFERS,
.partial = partial,
.flags = flags,
.ops = &relay_pipe_buf_ops,
.spd_release = relay_page_release,
};
ssize_t ret;
if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
return 0;
if (splice_grow_spd(pipe, &spd))
return -ENOMEM;
/*
* Adjust read len, if longer than what is available
*/
if (len > (subbuf_size - read_start % subbuf_size))
len = subbuf_size - read_start % subbuf_size;
subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
pidx = (read_start / PAGE_SIZE) % subbuf_pages;
poff = read_start & ~PAGE_MASK;
nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
unsigned int this_len, this_end, private;
unsigned int cur_pos = read_start + total_len;
if (!len)
break;
this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
private = this_len;
spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
spd.partial[spd.nr_pages].offset = poff;
this_end = cur_pos + this_len;
if (this_end >= nonpad_end) {
this_len = nonpad_end - cur_pos;
private = this_len + padding;
}
spd.partial[spd.nr_pages].len = this_len;
spd.partial[spd.nr_pages].private = private;
len -= this_len;
total_len += this_len;
poff = 0;
pidx = (pidx + 1) % subbuf_pages;
if (this_end >= nonpad_end) {
spd.nr_pages++;
break;
}
}
ret = 0;
if (!spd.nr_pages)
goto out;
ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
if (ret < 0 || ret < total_len)
goto out;
if (read_start + ret == nonpad_end)
ret += padding;
out:
splice_shrink_spd(&spd);
return ret;
}
static ssize_t relay_file_splice_read(struct file *in,
loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len,
unsigned int flags)
{
ssize_t spliced;
int ret;
int nonpad_ret = 0;
ret = 0;
spliced = 0;
while (len && !spliced) {
ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
if (ret < 0)
break;
else if (!ret) {
if (flags & SPLICE_F_NONBLOCK)
ret = -EAGAIN;
break;
}
*ppos += ret;
if (ret > len)
len = 0;
else
len -= ret;
spliced += nonpad_ret;
nonpad_ret = 0;
}
if (spliced)
return spliced;
return ret;
}
const struct file_operations relay_file_operations = {
.open = relay_file_open,
.poll = relay_file_poll,
.mmap = relay_file_mmap,
.read = relay_file_read,
.llseek = no_llseek,
.release = relay_file_release,
.splice_read = relay_file_splice_read,
};
EXPORT_SYMBOL_GPL(relay_file_operations);
static __init int relay_init(void)
{
hotcpu_notifier(relay_hotcpu_callback, 0);
return 0;
}
early_initcall(relay_init);
C:\Users\Admin\Desktop\linux-4.2.y-new\linux-4.2.y\kernel\/resource.c
/*
* linux/kernel/resource.c
*
* Copyright (C) 1999 Linus Torvalds
* Copyright (C) 1999 Martin Mares <mj@ucw.cz>
*
* Arbitrary resource management.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/export.h>
#include <linux/errno.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/resource_ext.h>
#include <asm/io.h>
struct resource ioport_resource = {
.name = "PCI IO",
.start = 0,
.end = IO_SPACE_LIMIT,
.flags = IORESOURCE_IO,
};
EXPORT_SYMBOL(ioport_resource);
struct resource iomem_resource = {
.name = "PCI mem",
.start = 0,
.end = -1,
.flags = IORESOURCE_MEM,
};
EXPORT_SYMBOL(iomem_resource);
/* constraints to be met while allocating resources */
struct resource_constraint {
resource_size_t min, max, align;
resource_size_t (*alignf)(void *, const struct resource *,
resource_size_t, resource_size_t);
void *alignf_data;
};
static DEFINE_RWLOCK(resource_lock);
/*
* For memory hotplug, there is no way to free resource entries allocated
* by boot mem after the system is up. So for reusing the resource entry
* we need to remember the resource.
*/
static struct resource *bootmem_resource_free;
static DEFINE_SPINLOCK(bootmem_resource_lock);
static struct resource *next_resource(struct resource *p, bool sibling_only)
{
/* Caller wants to traverse through siblings only */
if (sibling_only)
return p->sibling;
if (p->child)
return p->child;
while (!p->sibling && p->parent)
p = p->parent;
return p->sibling;
}
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
(*pos)++;
return (void *)next_resource(p, false);
}
#ifdef CONFIG_PROC_FS
enum { MAX_IORES_LEVEL = 5 };
static void *r_start(struct seq_file *m, loff_t *pos)
__acquires(resource_lock)
{
struct resource *p = m->private;
loff_t l = 0;
read_lock(&resource_lock);
for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
;
return p;
}
static void r_stop(struct seq_file *m, void *v)
__releases(resource_lock)
{
read_unlock(&resource_lock);
}
static int r_show(struct seq_file *m, void *v)
{
struct resource *root = m->private;
struct resource *r = v, *p;
int width = root->end < 0x10000 ? 4 : 8;
int depth;
for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
if (p->parent == root)
break;
seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
depth * 2, "",
width, (unsigned long long) r->start,
width, (unsigned long long) r->end,
r->name ? r->name : "<BAD>");
return 0;
}
static const struct seq_operations resource_op = {
.start = r_start,
.next = r_next,
.stop = r_stop,
.show = r_show,
};
static int ioports_open(struct inode *inode, struct file *file)
{
int res = seq_open(file, &resource_op);
if (!res) {
struct seq_file *m = file->private_data;
m->private = &ioport_resource;
}
return res;
}
static int iomem_open(struct inode *inode, struct file *file)
{
int res = seq_open(file, &resource_op);
if (!res) {
struct seq_file *m = file->private_data;
m->private = &iomem_resource;
}
return res;
}
static const struct file_operations proc_ioports_operations = {
.open = ioports_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static const struct file_operations proc_iomem_operations = {
.open = iomem_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static int __init ioresources_init(void)
{
proc_create("ioports", 0, NULL, &proc_ioports_operations);
proc_create("iomem", 0, NULL, &proc_iomem_operations);
return 0;
}
__initcall(ioresources_init);
#endif /* CONFIG_PROC_FS */
static void free_resource(struct resource *res)
{
if (!res)
return;
if (!PageSlab(virt_to_head_page(res))) {
spin_lock(&bootmem_resource_lock);
res->sibling = bootmem_resource_free;
bootmem_resource_free = res;
spin_unlock(&bootmem_resource_lock);
} else {
kfree(res);
}
}
static struct resource *alloc_resource(gfp_t flags)
{
struct resource *res = NULL;
spin_lock(&bootmem_resource_lock);
if (bootmem_resource_free) {
res = bootmem_resource_free;
bootmem_resource_free = res->sibling;
}
spin_unlock(&bootmem_resource_lock);
if (res)
memset(res, 0, sizeof(struct resource));
else
res = kzalloc(sizeof(struct resource), flags);
return res;
}
/* Return the conflict entry if you can't request it */
static struct resource * __request_resource(struct resource *root, struct resource *new)
{
resource_size_t start = new->start;
resource_size_t end = new->end;
struct resource *tmp, **p;
if (end < start)
return root;
if (start < root->start)
return root;
if (end > root->end)
return root;
p = &root->child;
for (;;) {
tmp = *p;
if (!tmp || tmp->start > end) {
new->sibling = tmp;
*p = new;
new->parent = root;
return NULL;
}
p = &tmp->sibling;
if (tmp->end < start)
continue;
return tmp;
}
}
static int __release_resource(struct resource *old)
{
struct resource *tmp, **p;
p = &old->parent->child;
for (;;) {
tmp = *p;
if (!tmp)
break;
if (tmp == old) {
*p = tmp->sibling;
old->parent = NULL;
return 0;
}
p = &tmp->sibling;
}
return -EINVAL;
}
static void __release_child_resources(struct resource *r)
{
struct resource *tmp, *p;
resource_size_t size;
p = r->child;
r->child = NULL;
while (p) {
tmp = p;
p = p->sibling;
tmp->parent = NULL;
tmp->sibling = NULL;
__release_child_resources(tmp);
printk(KERN_DEBUG "release child resource %pR\n", tmp);
/* need to restore size, and keep flags */
size = resource_size(tmp);
tmp->start = 0;
tmp->end = size - 1;
}
}
kkkkkkk31
最新推荐文章于 2024-08-30 14:11:28 发布