The Linux Kernel Module Programming Guide - part2

最新推荐文章于 2024-07-14 13:21:09 发布

maimang09

最新推荐文章于 2024-07-14 13:21:09 发布

阅读量184

点赞数

文章标签： java 开发语言

原文链接：https://sysprog21.github.io/lkmpg/

版权

12 Avoiding Collisions and Deadlocks

If processes running on different CPUs or in different threads try to access the same memory, then it is possible that strange things can happen or your system can lock up. To avoid this, various types of mutual exclusion kernel functions are available. These indicate if a section of code is "locked" or "unlocked" so that simultaneous attempts to run it can not happen.

12.1 Mutex

You can use kernel mutexes (mutual exclusions) in much the same manner that you might deploy them in userland. This may be all that is needed to avoid collisions in most cases.

1/* 
2 * example_mutex.c 
3 */ 
4#include <linux/module.h> 
5#include <linux/mutex.h> 
6#include <linux/printk.h> 
7 
8static DEFINE_MUTEX(mymutex); 
9 
10static int __init example_mutex_init(void) 
11{ 
12    int ret; 
13 
14    pr_info("example_mutex init\n"); 
15 
16    ret = mutex_trylock(&mymutex); 
17    if (ret != 0) { 
18        pr_info("mutex is locked\n"); 
19 
20        if (mutex_is_locked(&mymutex) == 0) 
21            pr_info("The mutex failed to lock!\n"); 
22 
23        mutex_unlock(&mymutex); 
24        pr_info("mutex is unlocked\n"); 
25    } else 
26        pr_info("Failed to lock\n"); 
27 
28    return 0; 
29} 
30 
31static void __exit example_mutex_exit(void) 
32{ 
33    pr_info("example_mutex exit\n"); 
34} 
35 
36module_init(example_mutex_init); 
37module_exit(example_mutex_exit); 
38 
39MODULE_DESCRIPTION("Mutex example"); 
40MODULE_LICENSE("GPL");

12.2 Spinlocks

As the name suggests, spinlocks lock up the CPU that the code is running on, taking 100% of its resources. Because of this you should only use the spinlock mechanism around code which is likely to take no more than a few milliseconds to run and so will not noticeably slow anything down from the user’s point of view.

The example here is "irq safe" in that if interrupts happen during the lock then they will not be forgotten and will activate when the unlock happens, using the flags variable to retain their state.

1/* 
2 * example_spinlock.c 
3 */ 
4#include <linux/init.h> 
5#include <linux/module.h> 
6#include <linux/printk.h> 
7#include <linux/spinlock.h> 
8 
9static DEFINE_SPINLOCK(sl_static); 
10static spinlock_t sl_dynamic; 
11 
12static void example_spinlock_static(void) 
13{ 
14    unsigned long flags; 
15 
16    spin_lock_irqsave(&sl_static, flags); 
17    pr_info("Locked static spinlock\n"); 
18 
19    /* Do something or other safely. Because this uses 100% CPU time, this 
20     * code should take no more than a few milliseconds to run. 
21     */ 
22 
23    spin_unlock_irqrestore(&sl_static, flags); 
24    pr_info("Unlocked static spinlock\n"); 
25} 
26 
27static void example_spinlock_dynamic(void) 
28{ 
29    unsigned long flags; 
30 
31    spin_lock_init(&sl_dynamic); 
32    spin_lock_irqsave(&sl_dynamic, flags); 
33    pr_info("Locked dynamic spinlock\n"); 
34 
35    /* Do something or other safely. Because this uses 100% CPU time, this 
36     * code should take no more than a few milliseconds to run. 
37     */ 
38 
39    spin_unlock_irqrestore(&sl_dynamic, flags); 
40    pr_info("Unlocked dynamic spinlock\n"); 
41} 
42 
43static int __init example_spinlock_init(void) 
44{ 
45    pr_info("example spinlock started\n"); 
46 
47    example_spinlock_static(); 
48    example_spinlock_dynamic(); 
49 
50    return 0; 
51} 
52 
53static void __exit example_spinlock_exit(void) 
54{ 
55    pr_info("example spinlock exit\n"); 
56} 
57 
58module_init(example_spinlock_init); 
59module_exit(example_spinlock_exit); 
60 
61MODULE_DESCRIPTION("Spinlock example"); 
62MODULE_LICENSE("GPL");

12.3 Read and write locks

Read and write locks are specialised kinds of spinlocks so that you can exclusively read from something or write to something. Like the earlier spinlocks example, the one below shows an "irq safe" situation in which if other functions were triggered from irqs which might also read and write to whatever you are concerned with then they would not disrupt the logic. As before it is a good idea to keep anything done within the lock as short as possible so that it does not hang up the system and cause users to start revolting against the tyranny of your module.

1/* 
2 * example_rwlock.c 
3 */ 
4#include <linux/module.h> 
5#include <linux/printk.h> 
6#include <linux/rwlock.h> 
7 
8static DEFINE_RWLOCK(myrwlock); 
9 
10static void example_read_lock(void) 
11{ 
12    unsigned long flags; 
13 
14    read_lock_irqsave(&myrwlock, flags); 
15    pr_info("Read Locked\n"); 
16 
17    /* Read from something */ 
18 
19    read_unlock_irqrestore(&myrwlock, flags); 
20    pr_info("Read Unlocked\n"); 
21} 
22 
23static void example_write_lock(void) 
24{ 
25    unsigned long flags; 
26 
27    write_lock_irqsave(&myrwlock, flags); 
28    pr_info("Write Locked\n"); 
29 
30    /* Write to something */ 
31 
32    write_unlock_irqrestore(&myrwlock, flags); 
33    pr_info("Write Unlocked\n"); 
34} 
35 
36static int __init example_rwlock_init(void) 
37{ 
38    pr_info("example_rwlock started\n"); 
39 
40    example_read_lock(); 
41    example_write_lock(); 
42 
43    return 0; 
44} 
45 
46static void __exit example_rwlock_exit(void) 
47{ 
48    pr_info("example_rwlock exit\n"); 
49} 
50 
51module_init(example_rwlock_init); 
52module_exit(example_rwlock_exit); 
53 
54MODULE_DESCRIPTION("Read/Write locks example"); 
55MODULE_LICENSE("GPL");

Of course, if you know for sure that there are no functions triggered by irqs which could possibly interfere with your logic then you can use the simpler read_lock(&myrwlock) and read_unlock(&myrwlock) or the corresponding write functions.

12.4 Atomic operations

If you are doing simple arithmetic: adding, subtracting or bitwise operations, then there is another way in the multi-CPU and multi-hyperthreaded world to stop other parts of the system from messing with your mojo. By using atomic operations you can be confident that your addition, subtraction or bit flip did actually happen and was not overwritten by some other shenanigans. An example is shown below.

1/* 
2 * example_atomic.c 
3 */ 
4#include <linux/atomic.h> 
5#include <linux/bitops.h> 
6#include <linux/module.h> 
7#include <linux/printk.h> 
8 
/* printf-style conversion for one byte rendered as eight '0'/'1' characters. */
#define BYTE_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c"
/*
 * Expands to the eight character arguments that BYTE_TO_BINARY_PATTERN
 * consumes, most significant bit first.
 *
 * The argument is parenthesised at every use so that an expression such as
 * (a | b) is masked as a whole rather than being split by operator
 * precedence. It is evaluated eight times, so it must have no side effects.
 */
#define BYTE_TO_BINARY(byte)                                                   \
    (((byte) & 0x80) ? '1' : '0'), (((byte) & 0x40) ? '1' : '0'),              \
        (((byte) & 0x20) ? '1' : '0'), (((byte) & 0x10) ? '1' : '0'),          \
        (((byte) & 0x08) ? '1' : '0'), (((byte) & 0x04) ? '1' : '0'),          \
        (((byte) & 0x02) ? '1' : '0'), (((byte) & 0x01) ? '1' : '0')
15 
16static void atomic_add_subtract(void) 
17{ 
18    atomic_t debbie; 
19    atomic_t chris = ATOMIC_INIT(50); 
20 
21    atomic_set(&debbie, 45); 
22 
23    /* subtract one */ 
24    atomic_dec(&debbie); 
25 
26    atomic_add(7, &debbie); 
27 
28    /* add one */ 
29    atomic_inc(&debbie); 
30 
31    pr_info("chris: %d, debbie: %d\n", atomic_read(&chris), 
32            atomic_read(&debbie)); 
33} 
34 
35static void atomic_bitwise(void) 
36{ 
37    unsigned long word = 0; 
38 
39    pr_info("Bits 0: " BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(word)); 
40    set_bit(3, &word); 
41    set_bit(5, &word); 
42    pr_info("Bits 1: " BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(word)); 
43    clear_bit(5, &word); 
44    pr_info("Bits 2: " BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(word)); 
45    change_bit(3, &word); 
46 
47    pr_info("Bits 3: " BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(word)); 
48    if (test_and_set_bit(3, &word)) 
49        pr_info("wrong\n"); 
50    pr_info("Bits 4: " BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(word)); 
51 
52    word = 255; 
53    pr_info("Bits 5: " BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(word)); 
54} 
55 
56static int __init example_atomic_init(void) 
57{ 
58    pr_info("example_atomic started\n"); 
59 
60    atomic_add_subtract(); 
61    atomic_bitwise(); 
62 
63    return 0; 
64} 
65 
66static void __exit example_atomic_exit(void) 
67{ 
68    pr_info("example_atomic exit\n"); 
69} 
70 
71module_init(example_atomic_init); 
72module_exit(example_atomic_exit); 
73 
74MODULE_DESCRIPTION("Atomic operations example"); 
75MODULE_LICENSE("GPL");

Before the C11 standard adopted built-in atomic types, the kernel already provided a small set of atomic types implemented with a good deal of tricky architecture-specific code. Implementing the kernel's atomic types with C11 atomics might allow the kernel to throw away that architecture-specific code and would make the kernel code friendlier to people who understand the standard. But there are some problems, such as the fact that the kernel's memory model does not match the model defined by C11 atomics. For further details, see the kernel documentation of atomic types and the LWN.net articles "Time to move to C11 atomics?" and "Atomic usage patterns in the kernel".

13 Replacing Print Macros

13.1 Replacement

In Section 1.7, it was noted that the X Window System and kernel module programming are not conducive to integration. This remains valid during the development of kernel modules. However, in practical scenarios, the necessity emerges to relay messages to the tty (teletype) originating the module load command.

The term “tty” originates from teletype, which initially referred to a combined keyboard-printer for Unix system communication. Today, it signifies a text stream abstraction employed by Unix programs, encompassing physical terminals, xterms in X displays, and network connections like SSH.

To achieve this, the “current” pointer is leveraged to access the active task’s tty structure. Within this structure lies a pointer to a string write function, facilitating the string’s transmission to the tty.

1/* 
2 * print_string.c - Send output to the tty we're running on, regardless if 
3 * it is through X11, telnet, etc.  We do this by printing the string to the 
4 * tty associated with the current task. 
5 */ 
6#include <linux/init.h> 
7#include <linux/kernel.h> 
8#include <linux/module.h> 
9#include <linux/sched.h> /* For current */ 
10#include <linux/tty.h> /* For the tty declarations */ 
11 
12static void print_string(char *str) 
13{ 
14    /* The tty for the current task */ 
15    struct tty_struct *my_tty = get_current_tty(); 
16 
17    /* If my_tty is NULL, the current task has no tty you can print to (i.e., 
18     * if it is a daemon). If so, there is nothing we can do. 
19     */ 
20    if (my_tty) { 
21        const struct tty_operations *ttyops = my_tty->driver->ops; 
22        /* my_tty->driver is a struct which holds the tty's functions, 
23         * one of which (write) is used to write strings to the tty. 
24         * It can be used to take a string either from the user's or 
25         * kernel's memory segment. 
26         * 
27         * The function's 1st parameter is the tty to write to, because the 
28         * same function would normally be used for all tty's of a certain 
29         * type. 
30         * The 2nd parameter is a pointer to a string. 
31         * The 3rd parameter is the length of the string. 
32         * 
33         * As you will see below, sometimes it's necessary to use 
34         * preprocessor stuff to create code that works for different 
35         * kernel versions. The (naive) approach we've taken here does not 
36         * scale well. The right way to deal with this is described in 
37         * section 2 of 
38         * linux/Documentation/SubmittingPatches 
39         */ 
40        (ttyops->write)(my_tty, /* The tty itself */ 
41                        str, /* String */ 
42                        strlen(str)); /* Length */ 
43 
44        /* ttys were originally hardware devices, which (usually) strictly 
45         * followed the ASCII standard. In ASCII, to move to a new line you 
46         * need two characters, a carriage return and a line feed. On Unix, 
47         * the ASCII line feed is used for both purposes - so we can not 
48         * just use \n, because it would not have a carriage return and the 
49         * next line will start at the column right after the line feed. 
50         * 
51         * This is why text files are different between Unix and MS Windows. 
52         * In CP/M and derivatives, like MS-DOS and MS Windows, the ASCII 
53         * standard was strictly adhered to, and therefore a newline requires 
54         * both a LF and a CR. 
55         */ 
56        (ttyops->write)(my_tty, "\015\012", 2); 
57    } 
58} 
59 
60static int __init print_string_init(void) 
61{ 
62    print_string("The module has been inserted.  Hello world!"); 
63    return 0; 
64} 
65 
66static void __exit print_string_exit(void) 
67{ 
68    print_string("The module has been removed.  Farewell world!"); 
69} 
70 
71module_init(print_string_init); 
72module_exit(print_string_exit); 
73 
74MODULE_LICENSE("GPL");

13.2 Flashing keyboard LEDs

In certain conditions, you may desire a simpler and more direct way to communicate to the external world. Flashing keyboard LEDs can be such a solution: It is an immediate way to attract attention or to display a status condition. Keyboard LEDs are present on every hardware, they are always visible, they do not need any setup, and their use is rather simple and non-intrusive, compared to writing to a tty or a file.

From v4.14 to v4.15, the timer API went through a series of changes to improve memory safety. A buffer overflow in the area of a timer_list structure may be able to overwrite the function and data fields, providing the attacker with a way to use return-oriented programming (ROP) to call arbitrary functions within the kernel. Also, a callback prototype that takes an unsigned long argument prevents the compiler from performing any type checking on what is passed. Furthermore, a prototype with an unsigned long argument may be an obstacle to the forward-edge protection of control-flow integrity. Thus, it is better to give timer callbacks a unique prototype that separates them from the large group of functions taking an unsigned long argument. The timer callback should be passed a pointer to the timer_list structure rather than an unsigned long argument. The caller then wraps all the information the callback needs, including the timer_list structure, into a larger structure, and the callback uses the container_of macro to get at it instead of decoding the unsigned long value. For more information see: Improving the kernel timers API.

Before Linux v4.14, setup_timer was used to initialize the timer and the timer_list structure looked like:

1struct timer_list { 
2    unsigned long expires; 
3    void (*function)(unsigned long); 
4    unsigned long data; 
5    u32 flags; 
6    /* ... */ 
7}; 
8 
9void setup_timer(struct timer_list *timer, void (*callback)(unsigned long), 
10                 unsigned long data);

Since Linux v4.14, timer_setup has been adopted and the kernel was converted step by step from setup_timer to timer_setup. One of the reasons a new function was introduced, rather than changing the existing one, is that the new interface needed to coexist with the old one during the transition. Moreover, timer_setup was at first implemented on top of setup_timer.

1void timer_setup(struct timer_list *timer, 
2                 void (*callback)(struct timer_list *), unsigned int flags);

The setup_timer was then removed since v4.15. As a result, the timer_list structure had changed to the following.

1struct timer_list { 
2    unsigned long expires; 
3    void (*function)(struct timer_list *); 
4    u32 flags; 
5    /* ... */ 
6};

The following source code illustrates a minimal kernel module which, when loaded, starts blinking the keyboard LEDs until it is unloaded.

1/* 
2 * kbleds.c - Blink keyboard leds until the module is unloaded. 
3 */ 
4 
5#include <linux/init.h> 
6#include <linux/kd.h> /* For KDSETLED */ 
7#include <linux/module.h> 
8#include <linux/tty.h> /* For tty_struct */ 
9#include <linux/vt.h> /* For MAX_NR_CONSOLES */ 
10#include <linux/vt_kern.h> /* for fg_console */ 
11#include <linux/console_struct.h> /* For vc_cons */ 
12 
13MODULE_DESCRIPTION("Example module illustrating the use of Keyboard LEDs."); 
14 
15static struct timer_list my_timer; 
16static struct tty_driver *my_driver; 
17static unsigned long kbledstatus = 0; 
18 
/* Interval between LED toggles, in jiffies (one fifth of HZ). Parenthesised
 * so the macro stays a single operand wherever it is expanded, e.g. in
 * x / BLINK_DELAY or x * BLINK_DELAY.
 */
#define BLINK_DELAY (HZ / 5)
/* KDSETLED argument used to light all the keyboard LEDs. */
#define ALL_LEDS_ON 0x07
/* KDSETLED argument used to hand the LEDs back to the keyboard state. */
#define RESTORE_LEDS 0xFF
22 
23/* Function my_timer_func blinks the keyboard LEDs periodically by invoking 
24 * command KDSETLED of ioctl() on the keyboard driver. To learn more on virtual 
25 * terminal ioctl operations, please see file: 
26 *   drivers/tty/vt/vt_ioctl.c, function vt_ioctl(). 
27 * 
28 * The argument to KDSETLED is alternatively set to 7 (thus causing the led 
29 * mode to be set to LED_SHOW_IOCTL, and all the leds are lit) and to 0xFF 
30 * (any value above 7 switches back the led mode to LED_SHOW_FLAGS, thus 
31 * the LEDs reflect the actual keyboard status).  To learn more on this, 
32 * please see file: drivers/tty/vt/keyboard.c, function setledstate(). 
33 */ 
34static void my_timer_func(struct timer_list *unused) 
35{ 
36    struct tty_struct *t = vc_cons[fg_console].d->port.tty; 
37 
38    if (kbledstatus == ALL_LEDS_ON) 
39        kbledstatus = RESTORE_LEDS; 
40    else 
41        kbledstatus = ALL_LEDS_ON; 
42 
43    (my_driver->ops->ioctl)(t, KDSETLED, kbledstatus); 
44 
45    my_timer.expires = jiffies + BLINK_DELAY; 
46    add_timer(&my_timer); 
47} 
48 
49static int __init kbleds_init(void) 
50{ 
51    int i; 
52 
53    pr_info("kbleds: loading\n"); 
54    pr_info("kbleds: fgconsole is %x\n", fg_console); 
55    for (i = 0; i < MAX_NR_CONSOLES; i++) { 
56        if (!vc_cons[i].d) 
57            break; 
58        pr_info("poet_atkm: console[%i/%i] #%i, tty %p\n", i, MAX_NR_CONSOLES, 
59                vc_cons[i].d->vc_num, (void *)vc_cons[i].d->port.tty); 
60    } 
61    pr_info("kbleds: finished scanning consoles\n"); 
62 
63    my_driver = vc_cons[fg_console].d->port.tty->driver; 
64    pr_info("kbleds: tty driver name %s\n", my_driver->driver_name); 
65 
66    /* Set up the LED blink timer the first time. */ 
67    timer_setup(&my_timer, my_timer_func, 0); 
68    my_timer.expires = jiffies + BLINK_DELAY; 
69    add_timer(&my_timer); 
70 
71    return 0; 
72} 
73 
74static void __exit kbleds_cleanup(void) 
75{ 
76    pr_info("kbleds: unloading...\n"); 
77    del_timer(&my_timer); 
78    (my_driver->ops->ioctl)(vc_cons[fg_console].d->port.tty, KDSETLED, 
79                            RESTORE_LEDS); 
80} 
81 
82module_init(kbleds_init); 
83module_exit(kbleds_cleanup); 
84 
85MODULE_LICENSE("GPL");

If none of the examples in this chapter fit your debugging needs, there might yet be some other tricks to try. Ever wondered what CONFIG_LL_DEBUG in make menuconfig is good for? If you activate that you get low level access to the serial port. While this might not sound very powerful by itself, you can patch kernel/printk.c or any other essential syscall to print ASCII characters, thus making it possible to trace virtually everything what your code does over a serial line. If you find yourself porting the kernel to some new and former unsupported architecture, this is usually amongst the first things that should be implemented. Logging over a netconsole might also be worth a try.

While you have seen lots of stuff that can be used to aid debugging here, there are some things to be aware of. Debugging is almost always intrusive. Adding debug code can change the situation enough to make the bug seem to disappear. Thus, you should keep debug code to a minimum and make sure it does not show up in production code.

14 Scheduling Tasks

There are two main ways of running tasks: tasklets and work queues. Tasklets are a quick and easy way of scheduling a single function to be run, for example when triggered from an interrupt, whereas work queues are more complicated but also better suited to running multiple things in a sequence.

14.1 Tasklets

Here is an example tasklet module. The tasklet_fn function runs for a few seconds. In the meantime, execution of the example_tasklet_init function may continue to the exit point, depending on whether it is interrupted by softirq.

1/* 
2 * example_tasklet.c 
3 */ 
4#include <linux/delay.h> 
5#include <linux/interrupt.h> 
6#include <linux/module.h> 
7#include <linux/printk.h> 
8 
9/* Macro DECLARE_TASKLET_OLD exists for compatibility. 
10 * See https://lwn.net/Articles/830964/ 
11 */ 
12#ifndef DECLARE_TASKLET_OLD 
13#define DECLARE_TASKLET_OLD(arg1, arg2) DECLARE_TASKLET(arg1, arg2, 0L) 
14#endif 
15 
/*
 * Tasklet body: log a start marker, busy-wait for 5000 ms with mdelay() to
 * stand in for real work, then log an end marker. The data argument is
 * unused.
 */
static void tasklet_fn(unsigned long data)
{
    pr_info("Example tasklet starts\n");

    mdelay(5000);

    pr_info("Example tasklet ends\n");
}
22 
23static DECLARE_TASKLET_OLD(mytask, tasklet_fn); 
24 
25static int __init example_tasklet_init(void) 
26{ 
27    pr_info("tasklet example init\n"); 
28    tasklet_schedule(&mytask); 
29    mdelay(200); 
30    pr_info("Example tasklet init continues...\n"); 
31    return 0; 
32} 
33 
34static void __exit example_tasklet_exit(void) 
35{ 
36    pr_info("tasklet example exit\n"); 
37    tasklet_kill(&mytask); 
38} 
39 
40module_init(example_tasklet_init); 
41module_exit(example_tasklet_exit); 
42 
43MODULE_DESCRIPTION("Tasklet example"); 
44MODULE_LICENSE("GPL");

So with this example loaded dmesg should show:

tasklet example init
Example tasklet starts
Example tasklet init continues...
Example tasklet ends

Although tasklets are easy to use, they come with several drawbacks, and developers are discussing getting rid of tasklets in the Linux kernel. The tasklet callback runs in atomic context, inside a software interrupt, meaning that it cannot sleep or access user-space data, so not all work can be done in a tasklet handler. Also, the kernel only allows one instance of any given tasklet to be running at any given time; multiple different tasklet callbacks can run in parallel.

In recent kernels, tasklets can be replaced by workqueues, timers, or threaded interrupts. While the removal of tasklets remains a longer-term goal, the current kernel contains more than a hundred uses of tasklets. Now developers are proceeding with the API changes and the macro DECLARE_TASKLET_OLD exists for compatibility. For further information, see Modernizing the tasklet API [LWN.net].

14.2 Work queues

To add a task to the scheduler we can use a workqueue. The kernel then uses the Completely Fair Scheduler (CFS) to execute work within the queue.

1/* 
2 * sched.c 
3 */ 
4#include <linux/init.h> 
5#include <linux/module.h> 
6#include <linux/workqueue.h> 
7 
8static struct workqueue_struct *queue = NULL; 
9static struct work_struct work; 
10 
/* Work item body: only logs that it ran. The work_struct argument is unused. */
static void work_handler(struct work_struct *data)
{
    pr_info("work handler function.\n");
}
15 
16static int __init sched_init(void) 
17{ 
18    queue = alloc_workqueue("HELLOWORLD", WQ_UNBOUND, 1); 
19    INIT_WORK(&work, work_handler); 
20    schedule_work(&work); 
21    return 0; 
22} 
23 
24static void __exit sched_exit(void) 
25{ 
26    destroy_workqueue(queue); 
27} 
28 
29module_init(sched_init); 
30module_exit(sched_exit); 
31 
32MODULE_LICENSE("GPL"); 
33MODULE_DESCRIPTION("Workqueue example");

15 Interrupt Handlers

15.1 Interrupt Handlers

Except for the last chapter, everything we did in the kernel so far we have done as a response to a process asking for it, either by dealing with a special file, sending an ioctl() , or issuing a system call. But the job of the kernel is not just to respond to process requests. Another job, which is every bit as important, is to speak to the hardware connected to the machine.

There are two types of interaction between the CPU and the rest of the computer’s hardware. The first type is when the CPU gives orders to the hardware, the other is when the hardware needs to tell the CPU something. The second, called interrupts, is much harder to implement because it has to be dealt with when convenient for the hardware, not the CPU. Hardware devices typically have a very small amount of RAM, and if you do not read their information when available, it is lost.

Under Linux, hardware interrupts are called IRQ’s (Interrupt ReQuests). There are two types of IRQ’s, short and long. A short IRQ is one which is expected to take a very short period of time, during which the rest of the machine will be blocked and no other interrupts will be handled. A long IRQ is one which can take longer, and during which other interrupts may occur (but not interrupts from the same device). If at all possible, it is better to declare an interrupt handler to be long.

When the CPU receives an interrupt, it stops whatever it is doing (unless it is processing a more important interrupt, in which case it will deal with this one only when the more important one is done), saves certain parameters on the stack and calls the interrupt handler. This means that certain things are not allowed in the interrupt handler itself, because the system is in an unknown state. Linux kernel solves the problem by splitting interrupt handling into two parts. The first part executes right away and masks the interrupt line. Hardware interrupts must be handled quickly, and that is why we need the second part to handle the heavy work deferred from an interrupt handler. Historically, BH (Linux naming for Bottom Halves) statistically book-keeps the deferred functions. Softirq and its higher level abstraction, Tasklet, replace BH since Linux 2.3.

The way to implement this is to call request_irq() to get your interrupt handler called when the relevant IRQ is received.

In practice IRQ handling can be a bit more complex. Hardware is often designed in a way that chains two interrupt controllers, so that all the IRQs from interrupt controller B are cascaded to a certain IRQ from interrupt controller A. Of course, that requires that the kernel finds out which IRQ it really was afterwards and that adds overhead. Other architectures offer some special, very low overhead, so called "fast IRQ" or FIQs. To take advantage of them requires handlers to be written in assembly language, so they do not really fit into the kernel. They can be made to work similar to the others, but after that procedure, they are no longer any faster than "common" IRQs. SMP enabled kernels running on systems with more than one processor need to solve another truckload of problems. It is not enough to know if a certain IRQs has happened, it’s also important to know what CPU(s) it was for. People still interested in more details, might want to refer to "APIC" now.

The request_irq() function receives the IRQ number, the interrupt handler function, flags, a name for /proc/interrupts and a parameter to be passed to the interrupt handler. Usually there is a certain number of IRQs available. How many IRQs there are is hardware-dependent. The flags can include IRQF_SHARED (named SA_SHIRQ in old kernels) to indicate you are willing to share the IRQ with other interrupt handlers (usually because a number of hardware devices sit on the same IRQ); old kernels also accepted SA_INTERRUPT to indicate a fast interrupt, a flag that current kernels no longer provide. This function will only succeed if there is not already a handler on this IRQ, or if you are both willing to share.

15.2 Detecting button presses

Many popular single board computers, such as Raspberry Pi or Beagleboards, have a bunch of GPIO pins. Attaching buttons to those and then having a button press do something is a classic case in which you might need to use interrupts, so that instead of having the CPU waste time and battery power polling for a change in input state, it is better for the input to trigger the CPU to then run a particular handling function.

Here is an example where buttons are connected to GPIO numbers 17 and 18 and an LED is connected to GPIO 4. You can change those numbers to whatever is appropriate for your board.

1/* 
2 * intrpt.c - Handling GPIO with interrupts 
3 * 
4 * Based upon the RPi example by Stefan Wendler (devnull@kaltpost.de) 
5 * from: 
6 *   https://github.com/wendlers/rpi-kmod-samples 
7 * 
8 * Press one button to turn on a LED and another to turn it off. 
9 */ 
10 
11#include <linux/gpio.h> 
12#include <linux/interrupt.h> 
13#include <linux/kernel.h> /* for ARRAY_SIZE() */ 
14#include <linux/module.h> 
15#include <linux/printk.h> 
16 
17static int button_irqs[] = { -1, -1 }; 
18 
19/* Define GPIOs for LEDs. 
20 * TODO: Change the numbers for the GPIO on your board. 
21 */ 
22static struct gpio leds[] = { { 4, GPIOF_OUT_INIT_LOW, "LED 1" } }; 
23 
24/* Define GPIOs for BUTTONS 
25 * TODO: Change the numbers for the GPIO on your board. 
26 */ 
27static struct gpio buttons[] = { { 17, GPIOF_IN, "LED 1 ON BUTTON" }, 
28                                 { 18, GPIOF_IN, "LED 1 OFF BUTTON" } }; 
29 
30/* interrupt function triggered when a button is pressed. */ 
31static irqreturn_t button_isr(int irq, void *data) 
32{ 
33    /* first button */ 
34    if (irq == button_irqs[0] && !gpio_get_value(leds[0].gpio)) 
35        gpio_set_value(leds[0].gpio, 1); 
36    /* second button */ 
37    else if (irq == button_irqs[1] && gpio_get_value(leds[0].gpio)) 
38        gpio_set_value(leds[0].gpio, 0); 
39 
40    return IRQ_HANDLED; 
41} 
42 
43static int __init intrpt_init(void) 
44{ 
45    int ret = 0; 
46 
47    pr_info("%s\n", __func__); 
48 
49    /* register LED gpios */ 
50    ret = gpio_request_array(leds, ARRAY_SIZE(leds)); 
51 
52    if (ret) { 
53        pr_err("Unable to request GPIOs for LEDs: %d\n", ret); 
54        return ret; 
55    } 
56 
57    /* register BUTTON gpios */ 
58    ret = gpio_request_array(buttons, ARRAY_SIZE(buttons)); 
59 
60    if (ret) { 
61        pr_err("Unable to request GPIOs for BUTTONs: %d\n", ret); 
62        goto fail1; 
63    } 
64 
65    pr_info("Current button1 value: %d\n", gpio_get_value(buttons[0].gpio)); 
66 
67    ret = gpio_to_irq(buttons[0].gpio); 
68 
69    if (ret < 0) { 
70        pr_err("Unable to request IRQ: %d\n", ret); 
71        goto fail2; 
72    } 
73 
74    button_irqs[0] = ret; 
75 
76    pr_info("Successfully requested BUTTON1 IRQ # %d\n", button_irqs[0]); 
77 
78    ret = request_irq(button_irqs[0], button_isr, 
79                      IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, 
80                      "gpiomod#button1", NULL); 
81 
82    if (ret) { 
83        pr_err("Unable to request IRQ: %d\n", ret); 
84        goto fail2; 
85    } 
86 
87    ret = gpio_to_irq(buttons[1].gpio); 
88 
89    if (ret < 0) { 
90        pr_err("Unable to request IRQ: %d\n", ret); 
91        goto fail2; 
92    } 
93 
94    button_irqs[1] = ret; 
95 
96    pr_info("Successfully requested BUTTON2 IRQ # %d\n", button_irqs[1]); 
97 
98    ret = request_irq(button_irqs[1], button_isr, 
99                      IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, 
100                      "gpiomod#button2", NULL); 
101 
102    if (ret) { 
103        pr_err("Unable to request IRQ: %d\n", ret); 
104        goto fail3; 
105    } 
106 
107    return 0; 
108 
109/* cleanup what has been setup so far */ 
110fail3: 
111    free_irq(button_irqs[0], NULL); 
112 
113fail2: 
114    gpio_free_array(buttons, ARRAY_SIZE(leds)); 
115 
116fail1: 
117    gpio_free_array(leds, ARRAY_SIZE(leds)); 
118 
119    return ret; 
120} 
121 
122static void __exit intrpt_exit(void) 
123{ 
124    int i; 
125 
126    pr_info("%s\n", __func__); 
127 
128    /* free irqs */ 
129    free_irq(button_irqs[0], NULL); 
130    free_irq(button_irqs[1], NULL); 
131 
132    /* turn all LEDs off */ 
133    for (i = 0; i < ARRAY_SIZE(leds); i++) 
134        gpio_set_value(leds[i].gpio, 0); 
135 
136    /* unregister */ 
137    gpio_free_array(leds, ARRAY_SIZE(leds)); 
138    gpio_free_array(buttons, ARRAY_SIZE(buttons)); 
139} 
140 
141module_init(intrpt_init); 
142module_exit(intrpt_exit); 
143 
144MODULE_LICENSE("GPL"); 
145MODULE_DESCRIPTION("Handle some GPIO interrupts");

15.3 Bottom Half

Suppose you want to do a bunch of stuff inside of an interrupt routine. A common way to do that without rendering the interrupt unavailable for a significant duration is to combine it with a tasklet. This pushes the bulk of the work off into the scheduler.

The example below modifies the previous example to also run an additional task when an interrupt is triggered.

1/* 
2 * bottomhalf.c - Top and bottom half interrupt handling 
3 * 
4 * Based upon the RPi example by Stefan Wendler (devnull@kaltpost.de) 
5 * from: 
6 *    https://github.com/wendlers/rpi-kmod-samples 
7 * 
8 * Press one button to turn on an LED and another to turn it off 
9 */ 
10 
11#include <linux/delay.h> 
12#include <linux/gpio.h> 
13#include <linux/interrupt.h> 
14#include <linux/module.h> 
15#include <linux/printk.h> 
16#include <linux/init.h> 
17 
18/* Macro DECLARE_TASKLET_OLD exists for compatibility. 
19 * See https://lwn.net/Articles/830964/ 
20 */ 
21#ifndef DECLARE_TASKLET_OLD 
22#define DECLARE_TASKLET_OLD(arg1, arg2) DECLARE_TASKLET(arg1, arg2, 0L) 
23#endif 
24 
25static int button_irqs[] = { -1, -1 }; 
26 
27/* Define GPIOs for LEDs. 
28 * TODO: Change the numbers for the GPIO on your board. 
29 */ 
30static struct gpio leds[] = { { 4, GPIOF_OUT_INIT_LOW, "LED 1" } }; 
31 
32/* Define GPIOs for BUTTONS 
33 * TODO: Change the numbers for the GPIO on your board. 
34 */ 
35static struct gpio buttons[] = { 
36    { 17, GPIOF_IN, "LED 1 ON BUTTON" }, 
37    { 18, GPIOF_IN, "LED 1 OFF BUTTON" }, 
38}; 
39 
/* Bottom half: the deferred part of the button handling.
 * The 500 ms busy-wait stands in for a non-trivial amount of processing;
 * the data argument is not used.
 */
static void bottomhalf_tasklet_fn(unsigned long data)
{
    pr_info("Bottom half tasklet starts\n");
    mdelay(500); /* do something which takes a while */
    pr_info("Bottom half tasklet ends\n");
}
48 
49static DECLARE_TASKLET_OLD(buttontask, bottomhalf_tasklet_fn); 
50 
51/* interrupt function triggered when a button is pressed */ 
52static irqreturn_t button_isr(int irq, void *data) 
53{ 
54    /* Do something quickly right now */ 
55    if (irq == button_irqs[0] && !gpio_get_value(leds[0].gpio)) 
56        gpio_set_value(leds[0].gpio, 1); 
57    else if (irq == button_irqs[1] && gpio_get_value(leds[0].gpio)) 
58        gpio_set_value(leds[0].gpio, 0); 
59 
60    /* Do the rest at leisure via the scheduler */ 
61    tasklet_schedule(&buttontask); 
62 
63    return IRQ_HANDLED; 
64} 
65 
66static int __init bottomhalf_init(void) 
67{ 
68    int ret = 0; 
69 
70    pr_info("%s\n", __func__); 
71 
72    /* register LED gpios */ 
73    ret = gpio_request_array(leds, ARRAY_SIZE(leds)); 
74 
75    if (ret) { 
76        pr_err("Unable to request GPIOs for LEDs: %d\n", ret); 
77        return ret; 
78    } 
79 
80    /* register BUTTON gpios */ 
81    ret = gpio_request_array(buttons, ARRAY_SIZE(buttons)); 
82 
83    if (ret) { 
84        pr_err("Unable to request GPIOs for BUTTONs: %d\n", ret); 
85        goto fail1; 
86    } 
87 
88    pr_info("Current button1 value: %d\n", gpio_get_value(buttons[0].gpio)); 
89 
90    ret = gpio_to_irq(buttons[0].gpio); 
91 
92    if (ret < 0) { 
93        pr_err("Unable to request IRQ: %d\n", ret); 
94        goto fail2; 
95    } 
96 
97    button_irqs[0] = ret; 
98 
99    pr_info("Successfully requested BUTTON1 IRQ # %d\n", button_irqs[0]); 
100 
101    ret = request_irq(button_irqs[0], button_isr, 
102                      IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, 
103                      "gpiomod#button1", NULL); 
104 
105    if (ret) { 
106        pr_err("Unable to request IRQ: %d\n", ret); 
107        goto fail2; 
108    } 
109 
110    ret = gpio_to_irq(buttons[1].gpio); 
111 
112    if (ret < 0) { 
113        pr_err("Unable to request IRQ: %d\n", ret); 
114        goto fail2; 
115    } 
116 
117    button_irqs[1] = ret; 
118 
119    pr_info("Successfully requested BUTTON2 IRQ # %d\n", button_irqs[1]); 
120 
121    ret = request_irq(button_irqs[1], button_isr, 
122                      IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, 
123                      "gpiomod#button2", NULL); 
124 
125    if (ret) { 
126        pr_err("Unable to request IRQ: %d\n", ret); 
127        goto fail3; 
128    } 
129 
130    return 0; 
131 
132/* cleanup what has been setup so far */ 
133fail3: 
134    free_irq(button_irqs[0], NULL); 
135 
136fail2: 
137    gpio_free_array(buttons, ARRAY_SIZE(leds)); 
138 
139fail1: 
140    gpio_free_array(leds, ARRAY_SIZE(leds)); 
141 
142    return ret; 
143} 
144 
145static void __exit bottomhalf_exit(void) 
146{ 
147    int i; 
148 
149    pr_info("%s\n", __func__); 
150 
151    /* free irqs */ 
152    free_irq(button_irqs[0], NULL); 
153    free_irq(button_irqs[1], NULL); 
154 
155    /* turn all LEDs off */ 
156    for (i = 0; i < ARRAY_SIZE(leds); i++) 
157        gpio_set_value(leds[i].gpio, 0); 
158 
159    /* unregister */ 
160    gpio_free_array(leds, ARRAY_SIZE(leds)); 
161    gpio_free_array(buttons, ARRAY_SIZE(buttons)); 
162} 
163 
164module_init(bottomhalf_init); 
165module_exit(bottomhalf_exit); 
166 
167MODULE_LICENSE("GPL"); 
168MODULE_DESCRIPTION("Interrupt with top and bottom half");

16 Crypto

At the dawn of the internet, everybody trusted everybody completely…but that did not work out so well. When this guide was originally written, it was a more innocent era in which almost nobody actually gave a damn about crypto - least of all kernel developers. That is certainly no longer the case now. To handle crypto stuff, the kernel has its own API enabling common methods of encryption, decryption and your favourite hash functions.

16.1 Hash functions

Calculating and checking the hashes of things is a common operation. Here is a demonstration of how to calculate a sha256 hash within a kernel module. To provide the sha256 algorithm support, make sure CONFIG_CRYPTO_SHA256 is enabled in kernel.

1/* 
2 * cryptosha256.c 
3 */ 
4#include <crypto/internal/hash.h> 
5#include <linux/module.h> 
6 
7#define SHA256_LENGTH 32 
8 
9static void show_hash_result(char *plaintext, char *hash_sha256) 
10{ 
11    int i; 
12    char str[SHA256_LENGTH * 2 + 1]; 
13 
14    pr_info("sha256 test for string: \"%s\"\n", plaintext); 
15    for (i = 0; i < SHA256_LENGTH; i++) 
16        sprintf(&str[i * 2], "%02x", (unsigned char)hash_sha256[i]); 
17    str[i * 2] = 0; 
18    pr_info("%s\n", str); 
19} 
20 
21static int __init cryptosha256_init(void) 
22{ 
23    char *plaintext = "This is a test"; 
24    char hash_sha256[SHA256_LENGTH]; 
25    struct crypto_shash *sha256; 
26    struct shash_desc *shash; 
27 
28    sha256 = crypto_alloc_shash("sha256", 0, 0); 
29    if (IS_ERR(sha256)) { 
30        pr_err( 
31            "%s(): Failed to allocate sha256 algorithm, enable CONFIG_CRYPTO_SHA256 and try again.\n", 
32            __func__); 
33        return -1; 
34    } 
35 
36    shash = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(sha256), 
37                    GFP_KERNEL); 
38    if (!shash) 
39        return -ENOMEM; 
40 
41    shash->tfm = sha256; 
42 
43    if (crypto_shash_init(shash)) 
44        return -1; 
45 
46    if (crypto_shash_update(shash, plaintext, strlen(plaintext))) 
47        return -1; 
48 
49    if (crypto_shash_final(shash, hash_sha256)) 
50        return -1; 
51 
52    kfree(shash); 
53    crypto_free_shash(sha256); 
54 
55    show_hash_result(plaintext, hash_sha256); 
56 
57    return 0; 
58} 
59 
60static void __exit cryptosha256_exit(void) 
61{ 
62} 
63 
64module_init(cryptosha256_init); 
65module_exit(cryptosha256_exit); 
66 
67MODULE_DESCRIPTION("sha256 hash test"); 
68MODULE_LICENSE("GPL");

Install the module:

1sudo insmod cryptosha256.ko 
2sudo dmesg

And you should see that the hash was calculated for the test string.

Finally, remove the test module:

1sudo rmmod cryptosha256

16.2 Symmetric key encryption

Here is an example of symmetrically encrypting a string using the AES algorithm and a password.

1/* 
2 * cryptosk.c 
3 */ 
4#include <crypto/internal/skcipher.h> 
5#include <linux/crypto.h> 
6#include <linux/module.h> 
7#include <linux/random.h> 
8#include <linux/scatterlist.h> 
9 
10#define SYMMETRIC_KEY_LENGTH 32 
11#define CIPHER_BLOCK_SIZE 16 
12 
13struct tcrypt_result { 
14    struct completion completion; 
15    int err; 
16}; 
17 
18struct skcipher_def { 
19    struct scatterlist sg; 
20    struct crypto_skcipher *tfm; 
21    struct skcipher_request *req; 
22    struct tcrypt_result result; 
23    char *scratchpad; 
24    char *ciphertext; 
25    char *ivdata; 
26}; 
27 
28static struct skcipher_def sk; 
29 
30static void test_skcipher_finish(struct skcipher_def *sk) 
31{ 
32    if (sk->tfm) 
33        crypto_free_skcipher(sk->tfm); 
34    if (sk->req) 
35        skcipher_request_free(sk->req); 
36    if (sk->ivdata) 
37        kfree(sk->ivdata); 
38    if (sk->scratchpad) 
39        kfree(sk->scratchpad); 
40    if (sk->ciphertext) 
41        kfree(sk->ciphertext); 
42} 
43 
44static int test_skcipher_result(struct skcipher_def *sk, int rc) 
45{ 
46    switch (rc) { 
47    case 0: 
48        break; 
49    case -EINPROGRESS || -EBUSY: 
50        rc = wait_for_completion_interruptible(&sk->result.completion); 
51        if (!rc && !sk->result.err) { 
52            reinit_completion(&sk->result.completion); 
53            break; 
54        } 
55        pr_info("skcipher encrypt returned with %d result %d\n", rc, 
56                sk->result.err); 
57        break; 
58    default: 
59        pr_info("skcipher encrypt returned with %d result %d\n", rc, 
60                sk->result.err); 
61        break; 
62    } 
63 
64    init_completion(&sk->result.completion); 
65 
66    return rc; 
67} 
68 
69static void test_skcipher_callback(struct crypto_async_request *req, int error) 
70{ 
71    struct tcrypt_result *result = req->data; 
72 
73    if (error == -EINPROGRESS) 
74        return; 
75 
76    result->err = error; 
77    complete(&result->completion); 
78    pr_info("Encryption finished successfully\n"); 
79 
80    /* decrypt data */ 
81#if 0 
82    memset((void*)sk.scratchpad, '-', CIPHER_BLOCK_SIZE); 
83    ret = crypto_skcipher_decrypt(sk.req); 
84    ret = test_skcipher_result(&sk, ret); 
85    if (ret) 
86        return; 
87 
88    sg_copy_from_buffer(&sk.sg, 1, sk.scratchpad, CIPHER_BLOCK_SIZE); 
89    sk.scratchpad[CIPHER_BLOCK_SIZE-1] = 0; 
90 
91    pr_info("Decryption request successful\n"); 
92    pr_info("Decrypted: %s\n", sk.scratchpad); 
93#endif 
94} 
95 
96static int test_skcipher_encrypt(char *plaintext, char *password, 
97                                 struct skcipher_def *sk) 
98{ 
99    int ret = -EFAULT; 
100    unsigned char key[SYMMETRIC_KEY_LENGTH]; 
101 
102    if (!sk->tfm) { 
103        sk->tfm = crypto_alloc_skcipher("cbc-aes-aesni", 0, 0); 
104        if (IS_ERR(sk->tfm)) { 
105            pr_info("could not allocate skcipher handle\n"); 
106            return PTR_ERR(sk->tfm); 
107        } 
108    } 
109 
110    if (!sk->req) { 
111        sk->req = skcipher_request_alloc(sk->tfm, GFP_KERNEL); 
112        if (!sk->req) { 
113            pr_info("could not allocate skcipher request\n"); 
114            ret = -ENOMEM; 
115            goto out; 
116        } 
117    } 
118 
119    skcipher_request_set_callback(sk->req, CRYPTO_TFM_REQ_MAY_BACKLOG, 
120                                  test_skcipher_callback, &sk->result); 
121 
122    /* clear the key */ 
123    memset((void *)key, '\0', SYMMETRIC_KEY_LENGTH); 
124 
125    /* Use the world's favourite password */ 
126    sprintf((char *)key, "%s", password); 
127 
128    /* AES 256 with given symmetric key */ 
129    if (crypto_skcipher_setkey(sk->tfm, key, SYMMETRIC_KEY_LENGTH)) { 
130        pr_info("key could not be set\n"); 
131        ret = -EAGAIN; 
132        goto out; 
133    } 
134    pr_info("Symmetric key: %s\n", key); 
135    pr_info("Plaintext: %s\n", plaintext); 
136 
137    if (!sk->ivdata) { 
138        /* see https://en.wikipedia.org/wiki/Initialization_vector */ 
139        sk->ivdata = kmalloc(CIPHER_BLOCK_SIZE, GFP_KERNEL); 
140        if (!sk->ivdata) { 
141            pr_info("could not allocate ivdata\n"); 
142            goto out; 
143        } 
144        get_random_bytes(sk->ivdata, CIPHER_BLOCK_SIZE); 
145    } 
146 
147    if (!sk->scratchpad) { 
148        /* The text to be encrypted */ 
149        sk->scratchpad = kmalloc(CIPHER_BLOCK_SIZE, GFP_KERNEL); 
150        if (!sk->scratchpad) { 
151            pr_info("could not allocate scratchpad\n"); 
152            goto out; 
153        } 
154    } 
155    sprintf((char *)sk->scratchpad, "%s", plaintext); 
156 
157    sg_init_one(&sk->sg, sk->scratchpad, CIPHER_BLOCK_SIZE); 
158    skcipher_request_set_crypt(sk->req, &sk->sg, &sk->sg, CIPHER_BLOCK_SIZE, 
159                               sk->ivdata); 
160    init_completion(&sk->result.completion); 
161 
162    /* encrypt data */ 
163    ret = crypto_skcipher_encrypt(sk->req); 
164    ret = test_skcipher_result(sk, ret); 
165    if (ret) 
166        goto out; 
167 
168    pr_info("Encryption request successful\n"); 
169 
170out: 
171    return ret; 
172} 
173 
174static int __init cryptoapi_init(void) 
175{ 
176    /* The world's favorite password */ 
177    char *password = "password123"; 
178 
179    sk.tfm = NULL; 
180    sk.req = NULL; 
181    sk.scratchpad = NULL; 
182    sk.ciphertext = NULL; 
183    sk.ivdata = NULL; 
184 
185    test_skcipher_encrypt("Testing", password, &sk); 
186    return 0; 
187} 
188 
189static void __exit cryptoapi_exit(void) 
190{ 
191    test_skcipher_finish(&sk); 
192} 
193 
194module_init(cryptoapi_init); 
195module_exit(cryptoapi_exit); 
196 
197MODULE_DESCRIPTION("Symmetric key encryption example"); 
198MODULE_LICENSE("GPL");

17 Virtual Input Device Driver

An input device driver is a module that lets the kernel communicate with an interaction device by means of events. For example, a keyboard sends press and release events to tell the kernel what we want to do. The input device driver allocates a new input structure with input_allocate_device() and sets up the input bitfields, device id, version, etc. After that, it registers the device by calling input_register_device().

Here is an example, vinput: an API that allows easy development of virtual input drivers. A driver needs to export a struct vinput_device that contains the virtual device name and a vinput_ops structure that describes:

the init function: init()
the input event injection function: send()
the readback function: read()

Then calling vinput_register() adds the new device to the list of supported virtual input devices, and vinput_unregister() removes it again.

1int init(struct vinput *);

This function is passed a struct vinput already initialized with an allocated struct input_dev. The init() function is responsible for initializing the capabilities of the input device and registering it.

1int send(struct vinput *, char *, int);

This function will receive a user string to interpret and inject the event using the input_report_XXXX or input_event call. The string is already copied from user.

1int read(struct vinput *, char *, int);

This function is used for debugging and should fill the buffer parameter with the last event sent in the virtual input device format. The buffer will then be copied to user.

vinput devices are created and destroyed using sysfs. And, event injection is done through a /dev node. The device name will be used by the userland to export a new virtual input device.

The class_attribute structure is similar to other attribute types we talked about in section 8:

1struct class_attribute { 
2    struct attribute attr; 
3    ssize_t (*show)(struct class *class, struct class_attribute *attr, 
4                    char *buf); 
5    ssize_t (*store)(struct class *class, struct class_attribute *attr, 
6                    const char *buf, size_t count); 
7};

In vinput.c, the macro CLASS_ATTR_WO(export/unexport) defined in include/linux/device.h (in this case, device.h is included in include/linux/input.h) will generate the class_attribute structures which are named class_attr_export/unexport. They are then put into the vinput_class_attrs array, and the macro ATTRIBUTE_GROUPS(vinput_class) generates the struct attribute_group vinput_class_group together with the array vinput_class_groups, which is assigned to the class_groups field of vinput_class. Finally, call class_register(&vinput_class) to create the attributes in sysfs.

To create a vinputX sysfs entry and /dev node:

1echo "vkbd" | sudo tee /sys/class/vinput/export

To unexport the device, just echo its id in unexport:

1echo "0" | sudo tee /sys/class/vinput/unexport

1/* 
2 * vinput.h 
3 */ 
4 
5#ifndef VINPUT_H 
6#define VINPUT_H 
7 
8#include <linux/input.h> 
9#include <linux/spinlock.h> 
10 
11#define VINPUT_MAX_LEN 128 
12#define MAX_VINPUT 32 
13#define VINPUT_MINORS MAX_VINPUT 
14 
15#define dev_to_vinput(dev) container_of(dev, struct vinput, dev) 
16 
17struct vinput_device; 
18 
19struct vinput { 
20    long id; 
21    long devno; 
22    long last_entry; 
23    spinlock_t lock; 
24 
25    void *priv_data; 
26 
27    struct device dev; 
28    struct list_head list; 
29    struct input_dev *input; 
30    struct vinput_device *type; 
31}; 
32 
/* Callbacks a virtual device type provides to the vinput core. */
struct vinput_ops {
    /* set up and register vinput->input; called when an instance is exported */
    int (*init)(struct vinput *);
    /* optional (may be NULL); called after the input device was unregistered */
    int (*kill)(struct vinput *);
    /* interpret a string written to the /dev node and inject the event */
    int (*send)(struct vinput *, char *, int);
    /* fill the buffer with the last event; result is copied to user space */
    int (*read)(struct vinput *, char *, int);
};
39 
40struct vinput_device { 
41    char name[16]; 
42    struct list_head list; 
43    struct vinput_ops *ops; 
44}; 
45 
46int vinput_register(struct vinput_device *dev); 
47void vinput_unregister(struct vinput_device *dev); 
48 
49#endif

1/* 
2 * vinput.c 
3 */ 
4 
#include <linux/cdev.h>
#include <linux/fs.h>
#include <linux/input.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>

#include <asm/uaccess.h>

#include "vinput.h"
14 
15#define DRIVER_NAME "vinput" 
16 
17#define dev_to_vinput(dev) container_of(dev, struct vinput, dev) 
18 
19static DECLARE_BITMAP(vinput_ids, VINPUT_MINORS); 
20 
21static LIST_HEAD(vinput_devices); 
22static LIST_HEAD(vinput_vdevices); 
23 
24static int vinput_dev; 
25static struct spinlock vinput_lock; 
26static struct class vinput_class; 
27 
28/* Search the name of vinput device in the vinput_devices linked list, 
29 * which added at vinput_register(). 
30 */ 
31static struct vinput_device *vinput_get_device_by_type(const char *type) 
32{ 
33    int found = 0; 
34    struct vinput_device *vinput; 
35    struct list_head *curr; 
36 
37    spin_lock(&vinput_lock); 
38    list_for_each (curr, &vinput_devices) { 
39        vinput = list_entry(curr, struct vinput_device, list); 
40        if (vinput && strncmp(type, vinput->name, strlen(vinput->name)) == 0) { 
41            found = 1; 
42            break; 
43        } 
44    } 
45    spin_unlock(&vinput_lock); 
46 
47    if (found) 
48        return vinput; 
49    return ERR_PTR(-ENODEV); 
50} 
51 
52/* Search the id of virtual device in the vinput_vdevices linked list, 
53 * which added at vinput_alloc_vdevice(). 
54 */ 
55static struct vinput *vinput_get_vdevice_by_id(long id) 
56{ 
57    struct vinput *vinput = NULL; 
58    struct list_head *curr; 
59 
60    spin_lock(&vinput_lock); 
61    list_for_each (curr, &vinput_vdevices) { 
62        vinput = list_entry(curr, struct vinput, list); 
63        if (vinput && vinput->id == id) 
64            break; 
65    } 
66    spin_unlock(&vinput_lock); 
67 
68    if (vinput && vinput->id == id) 
69        return vinput; 
70    return ERR_PTR(-ENODEV); 
71} 
72 
73static int vinput_open(struct inode *inode, struct file *file) 
74{ 
75    int err = 0; 
76    struct vinput *vinput = NULL; 
77 
78    vinput = vinput_get_vdevice_by_id(iminor(inode)); 
79 
80    if (IS_ERR(vinput)) 
81        err = PTR_ERR(vinput); 
82    else 
83        file->private_data = vinput; 
84 
85    return err; 
86} 
87 
/* release(): nothing was acquired in vinput_open(), so nothing to undo. */
static int vinput_release(struct inode *inode, struct file *file)
{
    return 0;
}
92 
93static ssize_t vinput_read(struct file *file, char __user *buffer, size_t count, 
94                           loff_t *offset) 
95{ 
96    int len; 
97    char buff[VINPUT_MAX_LEN + 1]; 
98    struct vinput *vinput = file->private_data; 
99 
100    len = vinput->type->ops->read(vinput, buff, count); 
101 
102    if (*offset > len) 
103        count = 0; 
104    else if (count + *offset > VINPUT_MAX_LEN) 
105        count = len - *offset; 
106 
107    if (raw_copy_to_user(buffer, buff + *offset, count)) 
108        count = -EFAULT; 
109 
110    *offset += count; 
111 
112    return count; 
113} 
114 
115static ssize_t vinput_write(struct file *file, const char __user *buffer, 
116                            size_t count, loff_t *offset) 
117{ 
118    char buff[VINPUT_MAX_LEN + 1]; 
119    struct vinput *vinput = file->private_data; 
120 
121    memset(buff, 0, sizeof(char) * (VINPUT_MAX_LEN + 1)); 
122 
123    if (count > VINPUT_MAX_LEN) { 
124        dev_warn(&vinput->dev, "Too long. %d bytes allowed\n", VINPUT_MAX_LEN); 
125        return -EINVAL; 
126    } 
127 
128    if (raw_copy_from_user(buff, buffer, count)) 
129        return -EFAULT; 
130 
131    return vinput->type->ops->send(vinput, buff, count); 
132} 
133 
134static const struct file_operations vinput_fops = { 
135    .owner = THIS_MODULE, 
136    .open = vinput_open, 
137    .release = vinput_release, 
138    .read = vinput_read, 
139    .write = vinput_write, 
140}; 
141 
142static void vinput_unregister_vdevice(struct vinput *vinput) 
143{ 
144    input_unregister_device(vinput->input); 
145    if (vinput->type->ops->kill) 
146        vinput->type->ops->kill(vinput); 
147} 
148 
149static void vinput_destroy_vdevice(struct vinput *vinput) 
150{ 
151    /* Remove from the list first */ 
152    spin_lock(&vinput_lock); 
153    list_del(&vinput->list); 
154    clear_bit(vinput->id, vinput_ids); 
155    spin_unlock(&vinput_lock); 
156 
157    module_put(THIS_MODULE); 
158 
159    kfree(vinput); 
160} 
161 
162static void vinput_release_dev(struct device *dev) 
163{ 
164    struct vinput *vinput = dev_to_vinput(dev); 
165    int id = vinput->id; 
166 
167    vinput_destroy_vdevice(vinput); 
168 
169    pr_debug("released vinput%d.\n", id); 
170} 
171 
172static struct vinput *vinput_alloc_vdevice(void) 
173{ 
174    int err; 
175    struct vinput *vinput = kzalloc(sizeof(struct vinput), GFP_KERNEL); 
176 
177    try_module_get(THIS_MODULE); 
178 
179    memset(vinput, 0, sizeof(struct vinput)); 
180 
181    spin_lock_init(&vinput->lock); 
182 
183    spin_lock(&vinput_lock); 
184    vinput->id = find_first_zero_bit(vinput_ids, VINPUT_MINORS); 
185    if (vinput->id >= VINPUT_MINORS) { 
186        err = -ENOBUFS; 
187        goto fail_id; 
188    } 
189    set_bit(vinput->id, vinput_ids); 
190    list_add(&vinput->list, &vinput_vdevices); 
191    spin_unlock(&vinput_lock); 
192 
193    /* allocate the input device */ 
194    vinput->input = input_allocate_device(); 
195    if (vinput->input == NULL) { 
196        pr_err("vinput: Cannot allocate vinput input device\n"); 
197        err = -ENOMEM; 
198        goto fail_input_dev; 
199    } 
200 
201    /* initialize device */ 
202    vinput->dev.class = &vinput_class; 
203    vinput->dev.release = vinput_release_dev; 
204    vinput->dev.devt = MKDEV(vinput_dev, vinput->id); 
205    dev_set_name(&vinput->dev, DRIVER_NAME "%lu", vinput->id); 
206 
207    return vinput; 
208 
209fail_input_dev: 
210    spin_lock(&vinput_lock); 
211    list_del(&vinput->list); 
212fail_id: 
213    spin_unlock(&vinput_lock); 
214    module_put(THIS_MODULE); 
215    kfree(vinput); 
216 
217    return ERR_PTR(err); 
218} 
219 
220static int vinput_register_vdevice(struct vinput *vinput) 
221{ 
222    int err = 0; 
223 
224    /* register the input device */ 
225    vinput->input->name = vinput->type->name; 
226    vinput->input->phys = "vinput"; 
227    vinput->input->dev.parent = &vinput->dev; 
228 
229    vinput->input->id.bustype = BUS_VIRTUAL; 
230    vinput->input->id.product = 0x0000; 
231    vinput->input->id.vendor = 0x0000; 
232    vinput->input->id.version = 0x0000; 
233 
234    err = vinput->type->ops->init(vinput); 
235 
236    if (err == 0) 
237        dev_info(&vinput->dev, "Registered virtual input %s %ld\n", 
238                 vinput->type->name, vinput->id); 
239 
240    return err; 
241} 
242 
243static ssize_t export_store(struct class *class, struct class_attribute *attr, 
244                            const char *buf, size_t len) 
245{ 
246    int err; 
247    struct vinput *vinput; 
248    struct vinput_device *device; 
249 
250    device = vinput_get_device_by_type(buf); 
251    if (IS_ERR(device)) { 
252        pr_info("vinput: This virtual device isn't registered\n"); 
253        err = PTR_ERR(device); 
254        goto fail; 
255    } 
256 
257    vinput = vinput_alloc_vdevice(); 
258    if (IS_ERR(vinput)) { 
259        err = PTR_ERR(vinput); 
260        goto fail; 
261    } 
262 
263    vinput->type = device; 
264    err = device_register(&vinput->dev); 
265    if (err < 0) 
266        goto fail_register; 
267 
268    err = vinput_register_vdevice(vinput); 
269    if (err < 0) 
270        goto fail_register_vinput; 
271 
272    return len; 
273 
274fail_register_vinput: 
275    device_unregister(&vinput->dev); 
276fail_register: 
277    vinput_destroy_vdevice(vinput); 
278fail: 
279    return err; 
280} 
281/* This macro generates the class_attr_export structure, which refers to export_store() */ 
282static CLASS_ATTR_WO(export); 
283 
284static ssize_t unexport_store(struct class *class, struct class_attribute *attr, 
285                              const char *buf, size_t len) 
286{ 
287    int err; 
288    unsigned long id; 
289    struct vinput *vinput; 
290 
291    err = kstrtol(buf, 10, &id); 
292    if (err) { 
293        err = -EINVAL; 
294        goto failed; 
295    } 
296 
297    vinput = vinput_get_vdevice_by_id(id); 
298    if (IS_ERR(vinput)) { 
299        pr_err("vinput: No such vinput device %ld\n", id); 
300        err = PTR_ERR(vinput); 
301        goto failed; 
302    } 
303 
304    vinput_unregister_vdevice(vinput); 
305    device_unregister(&vinput->dev); 
306 
307    return len; 
308failed: 
309    return err; 
310} 
311/* This macro generates the class_attr_unexport structure, which refers to unexport_store() */ 
312static CLASS_ATTR_WO(unexport); 
313 
314static struct attribute *vinput_class_attrs[] = { 
315    &class_attr_export.attr, 
316    &class_attr_unexport.attr, 
317    NULL, 
318}; 
319 
320/* This macro generates vinput_class_groups structure */ 
321ATTRIBUTE_GROUPS(vinput_class); 
322 
323static struct class vinput_class = { 
324    .name = "vinput", 
325    .owner = THIS_MODULE, 
326    .class_groups = vinput_class_groups, 
327}; 
328 
329int vinput_register(struct vinput_device *dev) 
330{ 
331    spin_lock(&vinput_lock); 
332    list_add(&dev->list, &vinput_devices); 
333    spin_unlock(&vinput_lock); 
334 
335    pr_info("vinput: registered new virtual input device '%s'\n", dev->name); 
336 
337    return 0; 
338} 
339EXPORT_SYMBOL(vinput_register); 
340 
341void vinput_unregister(struct vinput_device *dev) 
342{ 
343    struct list_head *curr, *next; 
344 
345    /* Remove from the list first */ 
346    spin_lock(&vinput_lock); 
347    list_del(&dev->list); 
348    spin_unlock(&vinput_lock); 
349 
350    /* unregister all devices of this type */ 
351    list_for_each_safe (curr, next, &vinput_vdevices) { 
352        struct vinput *vinput = list_entry(curr, struct vinput, list); 
353        if (vinput && vinput->type == dev) { 
354            vinput_unregister_vdevice(vinput); 
355            device_unregister(&vinput->dev); 
356        } 
357    } 
358 
359    pr_info("vinput: unregistered virtual input device '%s'\n", dev->name); 
360} 
361EXPORT_SYMBOL(vinput_unregister); 
362 
363static int __init vinput_init(void) 
364{ 
365    int err = 0; 
366 
367    pr_info("vinput: Loading virtual input driver\n"); 
368 
369    vinput_dev = register_chrdev(0, DRIVER_NAME, &vinput_fops); 
370    if (vinput_dev < 0) { 
371        pr_err("vinput: Unable to allocate char dev region\n"); 
372        err = vinput_dev; 
373        goto failed_alloc; 
374    } 
375 
376    spin_lock_init(&vinput_lock); 
377 
378    err = class_register(&vinput_class); 
379    if (err < 0) { 
380        pr_err("vinput: Unable to register vinput class\n"); 
381        goto failed_class; 
382    } 
383 
384    return 0; 
385failed_class: 
386    class_unregister(&vinput_class); 
387failed_alloc: 
388    return err; 
389} 
390 
391static void __exit vinput_end(void) 
392{ 
393    pr_info("vinput: Unloading virtual input driver\n"); 
394 
395    unregister_chrdev(vinput_dev, DRIVER_NAME); 
396    class_unregister(&vinput_class); 
397} 
398 
399module_init(vinput_init); 
400module_exit(vinput_end); 
401 
402MODULE_LICENSE("GPL"); 
403MODULE_DESCRIPTION("Emulate input events");

Here the virtual keyboard is one example of how to use vinput. It supports all KEY_MAX keycodes. The injection format is the KEY_CODE as defined in include/linux/input.h. A positive value means KEY_PRESS while a negative value means KEY_RELEASE. The keyboard supports repetition when the key stays pressed for too long. The following demonstrates how the simulation works.

Simulate a key press on "g" ( KEY_G = 34):

1echo "+34" | sudo tee /dev/vinput0

Simulate a key release on "g" ( KEY_G = 34):

1echo "-34" | sudo tee /dev/vinput0

1/* 
2 * vkbd.c 
3 */ 
4 
5#include <linux/init.h> 
6#include <linux/input.h> 
7#include <linux/module.h> 
8#include <linux/spinlock.h> 
9 
10#include "vinput.h" 
11 
12#define VINPUT_KBD "vkbd" 
13#define VINPUT_RELEASE 0 
14#define VINPUT_PRESS 1 
15 
16static unsigned short vkeymap[KEY_MAX]; 
17 
18static int vinput_vkbd_init(struct vinput *vinput) 
19{ 
20    int i; 
21 
22    /* Set up the input bitfield */ 
23    vinput->input->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); 
24    vinput->input->keycodesize = sizeof(unsigned short); 
25    vinput->input->keycodemax = KEY_MAX; 
26    vinput->input->keycode = vkeymap; 
27 
28    for (i = 0; i < KEY_MAX; i++) 
29        set_bit(vkeymap[i], vinput->input->keybit); 
30 
31    /* vinput will help us allocate new input device structure via 
32     * input_allocate_device(). So, we can register it straightforwardly. 
33     */ 
34    return input_register_device(vinput->input); 
35} 
36 
37static int vinput_vkbd_read(struct vinput *vinput, char *buff, int len) 
38{ 
39    spin_lock(&vinput->lock); 
40    len = snprintf(buff, len, "%+ld\n", vinput->last_entry); 
41    spin_unlock(&vinput->lock); 
42 
43    return len; 
44} 
45 
46static int vinput_vkbd_send(struct vinput *vinput, char *buff, int len) 
47{ 
48    int ret; 
49    long key = 0; 
50    short type = VINPUT_PRESS; 
51 
52    /* Determine which event was received (press or release) 
53     * and store the state. 
54     */ 
55    if (buff[0] == '+') 
56        ret = kstrtol(buff + 1, 10, &key); 
57    else 
58        ret = kstrtol(buff, 10, &key); 
59    if (ret) 
60        dev_err(&vinput->dev, "error during kstrtol: -%d\n", ret); 
61    spin_lock(&vinput->lock); 
62    vinput->last_entry = key; 
63    spin_unlock(&vinput->lock); 
64 
65    if (key < 0) { 
66        type = VINPUT_RELEASE; 
67        key = -key; 
68    } 
69 
70    dev_info(&vinput->dev, "Event %s code %ld\n", 
71             (type == VINPUT_RELEASE) ? "VINPUT_RELEASE" : "VINPUT_PRESS", key); 
72 
73    /* Report the state received to input subsystem. */ 
74    input_report_key(vinput->input, key, type); 
75    /* Tell input subsystem that it finished the report. */ 
76    input_sync(vinput->input); 
77 
78    return len; 
79} 
80 
81static struct vinput_ops vkbd_ops = { 
82    .init = vinput_vkbd_init, 
83    .send = vinput_vkbd_send, 
84    .read = vinput_vkbd_read, 
85}; 
86 
87static struct vinput_device vkbd_dev = { 
88    .name = VINPUT_KBD, 
89    .ops = &vkbd_ops, 
90}; 
91 
92static int __init vkbd_init(void) 
93{ 
94    int i; 
95 
96    for (i = 0; i < KEY_MAX; i++) 
97        vkeymap[i] = i; 
98    return vinput_register(&vkbd_dev); 
99} 
100 
101static void __exit vkbd_end(void) 
102{ 
103    vinput_unregister(&vkbd_dev); 
104} 
105 
106module_init(vkbd_init); 
107module_exit(vkbd_end); 
108 
109MODULE_LICENSE("GPL"); 
110MODULE_DESCRIPTION("Emulate keyboard input events through /dev/vinput");

18 Standardizing the interfaces: The Device Model

Up to this point we have seen all kinds of modules doing all kinds of things, but there was no consistency in their interfaces with the rest of the kernel. To impose some consistency, so that there is at minimum a standardized way to start, suspend and resume a device, a device model was added. An example is shown below, and you can use this as a template to add your own suspend, resume or other interface functions.

1/* 
2 * devicemodel.c 
3 */ 
4#include <linux/kernel.h> 
5#include <linux/module.h> 
6#include <linux/platform_device.h> 
7 
/* Payload the probe routine expects to find in dev.platform_data. */
struct devicemodel_data {
    char *greeting; /* text printed by the probe routine */
    int number;     /* value printed next to the greeting */
};
12 
13static int devicemodel_probe(struct platform_device *dev) 
14{ 
15    struct devicemodel_data *pd = 
16        (struct devicemodel_data *)(dev->dev.platform_data); 
17 
18    pr_info("devicemodel probe\n"); 
19    pr_info("devicemodel greeting: %s; %d\n", pd->greeting, pd->number); 
20 
21    /* Your device initialization code */ 
22 
23    return 0; 
24} 
25 
/*
 * Remove callback of the platform driver; only logs a message.
 *
 * Returns 0.
 */
static int devicemodel_remove(struct platform_device *dev)
{
    pr_info("devicemodel example removed\n");

    /* Your device removal code */

    return 0;
}
34 
/*
 * Power-management callback used for the suspend, poweroff and freeze
 * transitions; only logs a message.
 *
 * Returns 0.
 */
static int devicemodel_suspend(struct device *dev)
{
    pr_info("devicemodel example suspend\n");

    /* Your device suspend code */

    return 0;
}
43 
/*
 * Power-management callback used for the resume, thaw and restore
 * transitions; only logs a message.
 *
 * Returns 0.
 */
static int devicemodel_resume(struct device *dev)
{
    pr_info("devicemodel example resume\n");

    /* Your device resume code */

    return 0;
}
52 
53static const struct dev_pm_ops devicemodel_pm_ops = { 
54    .suspend = devicemodel_suspend, 
55    .resume = devicemodel_resume, 
56    .poweroff = devicemodel_suspend, 
57    .freeze = devicemodel_suspend, 
58    .thaw = devicemodel_resume, 
59    .restore = devicemodel_resume, 
60}; 
61 
62static struct platform_driver devicemodel_driver = { 
63    .driver = 
64        { 
65            .name = "devicemodel_example", 
66            .pm = &devicemodel_pm_ops, 
67        }, 
68    .probe = devicemodel_probe, 
69    .remove = devicemodel_remove, 
70}; 
71 
72static int __init devicemodel_init(void) 
73{ 
74    int ret; 
75 
76    pr_info("devicemodel init\n"); 
77 
78    ret = platform_driver_register(&devicemodel_driver); 
79 
80    if (ret) { 
81        pr_err("Unable to register driver\n"); 
82        return ret; 
83    } 
84 
85    return 0; 
86} 
87 
88static void __exit devicemodel_exit(void) 
89{ 
90    pr_info("devicemodel exit\n"); 
91    platform_driver_unregister(&devicemodel_driver); 
92} 
93 
94module_init(devicemodel_init); 
95module_exit(devicemodel_exit); 
96 
97MODULE_LICENSE("GPL"); 
98MODULE_DESCRIPTION("Linux Device Model example");

19 Optimizations

19.1 Likely and Unlikely conditions

Sometimes you might want your code to run as quickly as possible, especially if it is handling an interrupt or doing something which might cause noticeable latency. If your code contains boolean conditions and if you know that the conditions are almost always likely to evaluate as either true or false , then you can allow the compiler to optimize for this using the likely and unlikely macros. For example, when allocating memory you are almost always expecting this to succeed.

1bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx); 
2if (unlikely(!bvl)) { 
3    mempool_free(bio, bio_pool); 
4    bio = NULL; 
5    goto out; 
6}

When the unlikely macro is used, the compiler alters its machine instruction output, so that it continues along the false branch and only jumps if the condition is true. That avoids flushing the processor pipeline. The opposite happens if you use the likely macro.

19.2 Static keys

Static keys allow us to enable or disable kernel code paths based on the runtime state of a key. Their APIs have been available since 2010 (most architectures are already supported) and use self-modifying code to eliminate the overhead of cache and branch prediction. The most typical use case of static keys is for performance-sensitive kernel code, such as tracepoints, context switching, networking, etc. These hot paths of the kernel often contain branches and can be optimized easily using this technique. Before we can use static keys in the kernel, we need to make sure that gcc supports asm goto inline assembly, and the following kernel configurations are set:

1CONFIG_JUMP_LABEL=y 
2CONFIG_HAVE_ARCH_JUMP_LABEL=y 
3CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y

To declare a static key, we need to define a global variable using the DEFINE_STATIC_KEY_FALSE or DEFINE_STATIC_KEY_TRUE macro defined in include/linux/jump_label.h. This macro initializes the key with the given initial value, which is either false or true, respectively. For example, to declare a static key with an initial value of false, we can use the following code:

1DEFINE_STATIC_KEY_FALSE(fkey);

Once the static key has been declared, we need to add branching code to the module that uses the static key. For example, the code includes a fastpath, where a no-op instruction will be generated at compile time as the key is initialized to false and the branch is unlikely to be taken.

1pr_info("fastpath 1\n"); 
2if (static_branch_unlikely(&fkey)) 
3    pr_alert("do unlikely thing\n"); 
4pr_info("fastpath 2\n");

If the key is enabled at runtime by calling static_branch_enable(&fkey) , the fastpath will be patched with an unconditional jump instruction to the slowpath code pr_alert , so the branch will always be taken until the key is disabled again.

The following kernel module derived from chardev.c, demonstrates how the static key works.

1/* 
2 * static_key.c 
3 */ 
4 
5#include <linux/atomic.h> 
6#include <linux/device.h> 
7#include <linux/fs.h> 
8#include <linux/kernel.h> /* for sprintf() */ 
9#include <linux/module.h> 
10#include <linux/printk.h> 
11#include <linux/types.h> 
12#include <linux/uaccess.h> /* for get_user and put_user */ 
13#include <linux/jump_label.h> /* for static key macros */ 
14 
15#include <asm/errno.h> 
16 
17static int device_open(struct inode *inode, struct file *file); 
18static int device_release(struct inode *inode, struct file *file); 
19static ssize_t device_read(struct file *file, char __user *buf, size_t count, 
20                           loff_t *ppos); 
21static ssize_t device_write(struct file *file, const char __user *buf, 
22                            size_t count, loff_t *ppos); 
23 
24#define SUCCESS 0 
25#define DEVICE_NAME "key_state" 
26#define BUF_LEN 10 
27 
28static int major; 
29 
30enum { 
31    CDEV_NOT_USED = 0, 
32    CDEV_EXCLUSIVE_OPEN = 1, 
33}; 
34 
35static atomic_t already_open = ATOMIC_INIT(CDEV_NOT_USED); 
36 
37static char msg[BUF_LEN + 1]; 
38 
39static struct class *cls; 
40 
41static DEFINE_STATIC_KEY_FALSE(fkey); 
42 
43static struct file_operations chardev_fops = { 
44    .owner = THIS_MODULE, 
45    .open = device_open, 
46    .release = device_release, 
47    .read = device_read, 
48    .write = device_write, 
49}; 
50 
51static int __init chardev_init(void) 
52{ 
53    major = register_chrdev(0, DEVICE_NAME, &chardev_fops); 
54    if (major < 0) { 
55        pr_alert("Registering char device failed with %d\n", major); 
56        return major; 
57    } 
58 
59    pr_info("I was assigned major number %d\n", major); 
60 
61    cls = class_create(THIS_MODULE, DEVICE_NAME); 
62 
63    device_create(cls, NULL, MKDEV(major, 0), NULL, DEVICE_NAME); 
64 
65    pr_info("Device created on /dev/%s\n", DEVICE_NAME); 
66 
67    return SUCCESS; 
68} 
69 
70static void __exit chardev_exit(void) 
71{ 
72    device_destroy(cls, MKDEV(major, 0)); 
73    class_destroy(cls); 
74 
75    /* Unregister the device */ 
76    unregister_chrdev(major, DEVICE_NAME); 
77} 
78 
79/* Methods */ 
80 
81/** 
82 * Called when a process tried to open the device file, like 
83 * cat /dev/key_state 
84 */ 
85static int device_open(struct inode *inode, struct file *file) 
86{ 
87    if (atomic_cmpxchg(&already_open, CDEV_NOT_USED, CDEV_EXCLUSIVE_OPEN)) 
88        return -EBUSY; 
89 
90    sprintf(msg, static_key_enabled(&fkey) ? "enabled\n" : "disabled\n"); 
91 
92    pr_info("fastpath 1\n"); 
93    if (static_branch_unlikely(&fkey)) 
94        pr_alert("do unlikely thing\n"); 
95    pr_info("fastpath 2\n"); 
96 
97    try_module_get(THIS_MODULE); 
98 
99    return SUCCESS; 
100} 
101 
102/** 
103 * Called when a process closes the device file 
104 */ 
105static int device_release(struct inode *inode, struct file *file) 
106{ 
107    /* We are now ready for our next caller. */ 
108    atomic_set(&already_open, CDEV_NOT_USED); 
109 
110    /** 
111     * Decrement the usage count, or else once you opened the file, you will 
112     * never get rid of the module. 
113     */ 
114    module_put(THIS_MODULE); 
115 
116    return SUCCESS; 
117} 
118 
119/** 
120 * Called when a process, which already opened the dev file, attempts to 
121 * read from it. 
122 */ 
123static ssize_t device_read(struct file *filp, /* see include/linux/fs.h */ 
124                           char __user *buffer, /* buffer to fill with data */ 
125                           size_t length, /* length of the buffer */ 
126                           loff_t *offset) 
127{ 
128    /* Number of the bytes actually written to the buffer */ 
129    int bytes_read = 0; 
130    const char *msg_ptr = msg; 
131 
132    if (!*(msg_ptr + *offset)) { /* We are at the end of the message */ 
133        *offset = 0; /* reset the offset */ 
134        return 0; /* signify end of file */ 
135    } 
136 
137    msg_ptr += *offset; 
138 
139    /* Actually put the data into the buffer */ 
140    while (length && *msg_ptr) { 
141        /** 
142         * The buffer is in the user data segment, not the kernel 
143         * segment so "*" assignment won't work. We have to use 
144         * put_user which copies data from the kernel data segment to 
145         * the user data segment. 
146         */ 
147        put_user(*(msg_ptr++), buffer++); 
148        length--; 
149        bytes_read++; 
150    } 
151 
152    *offset += bytes_read; 
153 
154    /* Most read functions return the number of bytes put into the buffer. */ 
155    return bytes_read; 
156} 
157 
158/* Called when a process writes to dev file; echo "enable" > /dev/key_state */ 
159static ssize_t device_write(struct file *filp, const char __user *buffer, 
160                            size_t length, loff_t *offset) 
161{ 
162    char command[10]; 
163 
164    if (length > 10) { 
165        pr_err("command exceeded 10 char\n"); 
166        return -EINVAL; 
167    } 
168 
169    if (copy_from_user(command, buffer, length)) 
170        return -EFAULT; 
171 
172    if (strncmp(command, "enable", strlen("enable")) == 0) 
173        static_branch_enable(&fkey); 
174    else if (strncmp(command, "disable", strlen("disable")) == 0) 
175        static_branch_disable(&fkey); 
176    else { 
177        pr_err("Invalid command: %s\n", command); 
178        return -EINVAL; 
179    } 
180 
181    /* Again, return the number of input characters used. */ 
182    return length; 
183} 
184 
185module_init(chardev_init); 
186module_exit(chardev_exit); 
187 
188MODULE_LICENSE("GPL");

To check the state of the static key, we can use the /dev/key_state interface.

1cat /dev/key_state

This will display the current state of the key, which is disabled by default.

To change the state of the static key, we can perform a write operation on the file:

1echo enable > /dev/key_state

This will enable the static key, causing the code path to switch from the fastpath to the slowpath.

In some cases, the key is enabled or disabled at initialization and never changed afterwards. In that situation we can declare the static key as read-only, which means that it can only be toggled in the module init function. To declare a read-only static key, we can use the DEFINE_STATIC_KEY_FALSE_RO or DEFINE_STATIC_KEY_TRUE_RO macro instead. Attempts to change the key at runtime will result in a page fault. For more information, see Static keys.

20 Common Pitfalls

20.1 Using standard libraries

You can not do that. In a kernel module, you can only use kernel functions which are the functions you can see in /proc/kallsyms.

20.2 Disabling interrupts

You might need to do this for a short time and that is OK, but if you do not enable them afterwards, your system will be stuck and you will have to power it off.

21 Where To Go From Here?

For people seriously interested in kernel programming, I recommend kernelnewbies.org and the Documentation subdirectory within the kernel source code which is not always easy to understand but can be a starting point for further investigation. Also, as Linus Torvalds said, the best way to learn the kernel is to read the source code yourself.

If you would like to contribute to this guide or notice anything glaringly wrong, please create an issue at https://github.com/sysprog21/lkmpg. Your pull requests will be appreciated.

Happy hacking!

1The goal of threaded interrupts is to push more of the work to separate threads, so that the minimum needed for acknowledging an interrupt is reduced, and therefore the time spent handling the interrupt (where it can’t handle any other interrupts at the same time) is reduced. See Moving interrupts to threads [LWN.net].