linux软中断和系统调用深入研究

最新推荐文章于 2024-08-16 11:40:30 发布

加油2019

最新推荐文章于 2024-08-16 11:40:30 发布

阅读量2.1k

点赞数 1

分类专栏：小张学inux内核文章标签： linux 软中断系统调用

本文链接：https://blog.csdn.net/qq_40036519/article/details/111769180

版权

小张学inux内核专栏收录该内容

23 篇文章 11 订阅

订阅专栏

arm软中断模式

arm7种模式
在这里插入图片描述
有中断模式，但是并没有软中断模式。那么arm的软中断是什么呢？
arm的软中断是arm从用户模式切换到特权模式，也就是linux中从用户态切换到内核态的过程。
swi命令触发软中断

linux系统中，swi异常向量代码：

linux系统调用

x86架构通过int 0x80指令触发软中断（中断向量号为0x80，即128）来实现系统调用；
arm架构使用swi指令，使处理器产生软中断异常，从用户模式陷入特权模式（SVC模式），执行异常向量表中的swi异常向量。

软中断的异常向量

arm中异常向量表：

异常类型	偏移地址（低）	偏移地址（高）
复位	0x00000000	0xffff0000
未定义指令	0x00000004	0xffff0004
软中断	0x00000008	0xffff0008
预取指令终止	0x0000000c	0xffff000c
数据终止	0x00000010	0xffff0010
保留	0x00000014	0xffff0014
中断请求（IRQ）	0x00000018	0xffff0018
快速中断请求（FIQ）	0x0000001c	0xffff001c

软中断的中断向量位于arm异常向量表中偏移为0x08的地址。
来看linux代码的处理函数

/*
 * Exception vector table: one entry per exception, in vector order.
 * Every entry is a branch except the third one (offset 0x08, the
 * software-interrupt slot), which loads pc from a literal word instead.
 */
.L__vectors_start:
	W(b)	vector_rst
	W(b)	vector_und
	W(ldr)	pc, .L__vectors_start + 0x1000   /* software-interrupt (SWI) handler: pc is loaded from the word 0x1000 past the table start */
	W(b)	vector_pabt
	W(b)	vector_dabt
	W(b)	vector_addrexcptn
	W(b)	vector_irq
	W(b)	vector_fiq

软中断异常的handle的地址，保存在异常向量表后0x1000的位置，即异常向量表之后刚好一个page(4K)处。linux内核空间的内存分布如下图所示。
在这里插入图片描述
vector段是向量表段，在编译链接时被链接在代码段之后；linux内核初始化时会将vector段复制到单独分配的向量页，并把该页映射到高端向量地址0xffff0000，之后机器异常入口就设置为0xffff0000。所以存在两个vector段（链接位置一份，运行位置一份）。.stubs段紧随其后一页，从0xffff1000开始。

内存管理中的零页不在物理内存的第一块(有mmu的系统)

关于内存管理子系统中零页的地址问题：
ZERO_PAGE，零页。linux中，在分配内存时遵从一个原则：写时分配（COW，写时复制，在进程创建时有讲到）。linux分配一块内存后，一开始只会映射到零页上，只有当往这块内存写内容时才会真正分配内存。
而这个零页的位置，在没有mmu的系统中才位于物理内存开始处，而在有mmu的系统中是当内存管理子系统初始化时会分配一个页作为零页。所以零页不在物理内存开始的第一页。

/* pgtable-nommu.h: this definition resolves the zero page through virt_to_page(0) */
#define ZERO_PAGE(vaddr)	(virt_to_page(0))
/* pgtable.h: this definition returns the empty_zero_page pointer declared just below */
extern struct page *empty_zero_page;
#define ZERO_PAGE(vaddr)	(empty_zero_page)

empty_zero_page即为ZERO_PAGE，在paging_init中初始化

/*
 * Abridged excerpt of paging_init(); unrelated code is elided with "...".
 * The lines kept here show how empty_zero_page gets its value.
 */
void __init paging_init(const struct machine_desc *mdesc)
{
...
	/* Request one PAGE_SIZE buffer from early_alloc() for the zero page. */
	zero_page = early_alloc(PAGE_SIZE);
	bootmem_init();
	/* Record the struct page of that buffer; ZERO_PAGE() returns it. */
	empty_zero_page = virt_to_page(zero_page);
...
}

再说内核空间内存分布：
linux链接脚本vmlinux.lds.S中有各内存区的说明

/*
 * Abridged excerpt of the kernel linker script's SECTIONS command.
 * The excerpt stops before the closing brace (see the trailing dots).
 */
SECTIONS
{
	. = PAGE_OFFSET + TEXT_OFFSET;    /* PAGE_OFFSET: start of the kernel address space. NOTE(review): the article says TEXT_OFFSET reserves one page for the vector table - confirm, the value is not shown here */

	/* (some empty sections were removed from this excerpt) */
	.text : {			/* Real text segment		*/
		_stext = .;		/* Text and read-only data	*/
		ARM_TEXT
	}

	_etext = .;			/* End of text section */

	RO_DATA(PAGE_SIZE)    /* read-only data */

	. = ALIGN(4);
	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
		__start___ex_table = .;
		ARM_MMU_KEEP(*(__ex_table))
		__stop___ex_table = .;
	}

#ifdef CONFIG_ARM_UNWIND
	ARM_UNWIND_SECTIONS
#endif

	NOTES

#ifdef CONFIG_STRICT_KERNEL_RWX
	. = ALIGN(1<<SECTION_SHIFT);
#else
	. = ALIGN(PAGE_SIZE);
#endif
	__init_begin = .;      /* marks the beginning of the init area */

	ARM_VECTORS    /* exception vector table (and stubs) */
	INIT_TEXT_SECTION(8)
	.exit.text : {
		ARM_EXIT_KEEP(EXIT_TEXT)
	}
	.init.proc.info : {
		ARM_CPU_DISCARD(PROC_INFO)
	}
	.init.arch.info : {
		__arch_info_begin = .;
		*(.arch.info.init)
		__arch_info_end = .;
	}
	.......

vmlinux.lds.h中

/*
 * ARM_VECTORS - place the exception vectors and their stubs.
 *
 * .vectors is given the run-time address 0xffff0000 while its load address
 * stays at the current location (__vectors_start).  .stubs is placed 0x1000
 * above the run-time address of .vectors, again loaded in line at
 * __stubs_start.  After each output section the location counter is moved
 * back to "load position + section size" so later sections continue in the
 * image.  __vectors_start/__vectors_end and __stubs_start/__stubs_end
 * bracket the two load regions.
 *
 * Comments must sit before the trailing backslash: any text after the
 * backslash stops it from being a line continuation and truncates the macro.
 */
#define ARM_VECTORS							\
	__vectors_start = .;						\
	.vectors 0xffff0000 : AT(__vectors_start) {			\
		*(.vectors)		/* .vectors input sections */	\
	}								\
	. = __vectors_start + SIZEOF(.vectors);				\
	__vectors_end = .;						\
									\
	__stubs_start = .;						\
	.stubs ADDR(.vectors) + 0x1000 : AT(__stubs_start) {		\
		*(.stubs)	/* .stubs: holds the vector_swi word */	\
	}								\
	. = __stubs_start + SIZEOF(.stubs);				\
	__stubs_end = .;						\
									\
	PROVIDE(vector_fiq_offset = vector_fiq - ADDR(.vectors));

vector段之后0x1000处为.stubs段的开始。来看看stubs段放了什么东西。
继续回到entry-armv.S

	.section .stubs, "ax", %progbits       /* switch to the .stubs section */
	@ This must be the first word
	.word	vector_swi                    /* stores the address of vector_swi */

找到了软中断的入口，vector_swi函数。

软中断的处理入口vector_swi

进入vector_swi函数：

/*=============================================================================
 * SWI handler
 *-----------------------------------------------------------------------------
 */

	.align	5
ENTRY(vector_swi)
/* Save the interrupted context (code elided in this excerpt) */
.....
	/*
	 * Get the system call number.
	 */
/* OABI and EABI obtain the syscall number differently: with EABI it is in r7 (scno). NOTE(review): the article says OABI keeps it in r10 - the elided code is not shown here, confirm */
	 /* Pure EABI user space always put syscall number into scno (r7).
	 */

	/* saved_psr and saved_pc are now dead */

	uaccess_disable tbl
	/* Load the address of the syscall table into tbl */
	adr	tbl, sys_call_table		@ load syscall table pointer
	....
	/* Fetch the thread_info of the currently running task into tsk */
	get_thread_info tsk
	
	/*
	 * Reload the registers that may have been corrupted on entry to
	 * the syscall assembly (by tracing or context tracking.)
	 */
 TRACE(	ldmia	sp, {r0 - r3}		)

local_restart:
	/* Load the task's flags into r10 to test whether syscall tracing is on */
	ldr	r10, [tsk, #TI_FLAGS]		@ check for syscall tracing
	stmdb	sp!, {r4, r5}			@ push fifth and sixth args

	tst	r10, #_TIF_SYSCALL_WORK		@ are we tracing syscalls?
	bne	__sys_trace
	/* Dispatch to the syscall handler; invoke_syscall is an assembler macro from entry-header.S */
	/* tbl:  syscall table
	* scno: syscall number
	* r10:  the register loaded with the thread flags above
	* __ret_fast_syscall: return path to user space (per the article: restores
	*       the saved context and checks for preemption/rescheduling)
	*/
	invoke_syscall tbl, scno, r10, __ret_fast_syscall
...
	/* Exit path of the fast syscall return */
	b	ret_fast_syscall
#endif
ENDPROC(vector_swi)

entry-header.S中给寄存器起了别名

@ Register aliases used by the syscall entry code.
@ Note that tbl and why are both names for r8.
scno	.req	r7		@ syscall number
tbl	.req	r8		@ syscall table pointer
why	.req	r8		@ Linux syscall (!= 0)
tsk	.req	r9		@ current thread_info

关于系统调用号的获取，OABI和EABI有所不同，见
https://www.cnblogs.com/DF11G/p/10172520.html

OABI方式系统调用
SWI{cond} immed_24
immed_24: 24位立即数，指定了系统调用号，参数用通用寄存器传递

MOV R0, #34        @ argument passed in a general-purpose register
SWI 12             @ OABI: the syscall number is the 24-bit immediate of SWI

EABI方式系统调用

MOV R7, #34        @ EABI: the syscall number goes in r7
SWI 0X0            @ the immediate is 0; the number is taken from r7

系统调用号由R7寄存器决定。

系统调用号和入口

sys_call_table的定义
entry-common.S中

/* Here a COMPAT entry expands to a plain "syscall nr, native"; the compat argument is dropped. */
#define COMPAT(nr, native, compat) syscall nr, native
/* Pull in the EABI or the OABI call list, depending on CONFIG_AEABI. */
#ifdef CONFIG_AEABI
#include <calls-eabi.S>
#else
#include <calls-oabi.S>
#endif
#undef COMPAT
	/* Close the table under the name sys_call_table. */
	syscall_table_end sys_call_table

以eabi为例

NATIVE(0, sys_restart_syscall)
NATIVE(1, sys_exit)
NATIVE(2, sys_fork)
NATIVE(3, sys_read)
NATIVE(4, sys_write)
NATIVE(5, sys_open)
NATIVE(6, sys_close)
NATIVE(8, sys_creat)
NATIVE(9, sys_link)
NATIVE(10, sys_unlink)
NATIVE(11, sys_execve)
NATIVE(12, sys_chdir)
NATIVE(14, sys_mknod)
NATIVE(15, sys_chmod)
NATIVE(16, sys_lchown16)
NATIVE(19, sys_lseek)
NATIVE(20, sys_getpid)

系统调用函数定义
include/linux/syscalls.h中定义
SYSCALL_DEFINE* 宏

系统调用对性能的影响在哪？

系统调用是通过软中断陷入内核的，然后执行注册的软中断处理函数。
其过程中存在如下消耗：

arm模式切换，陷入内核，需要保存上下文，所以频繁的系统调用，会放大这一开销，降低代码执行效率。
系统调用退出时会检查是否需要调度，因此频繁的系统调用更容易触发调度，进程可能得不到足够的运行时间。
最主要的消耗还是保存/恢复现场的消耗。

linux陷入内核的几种方式

陷入linux内核(SVC模式)的方式只有硬件中断和软中断两种。
硬件中断陷入内核的arm处理器模式的切换过程

usr -> 硬件中断(irq模式) -> svc模式；只是短暂地经历irq模式，主要的处理还是在svc模式下。
软中断的切换过程，就是系统调用：
usr->系统调用(SVC)

linux软中断

此软中断非彼软中断！！！
linux内核的软中断是纯软件的实现，和arm的软中断区分开来。和系统调用的软中断不是同一个软中断。
此软中断可以和tasklet类比，相当于一系列的任务，由内核线程[ksoftirqd/%u]执行；当有softirq需要执行时，会唤醒该内核线程来执行。

原理

linux软件中断，借用硬件中断思想，一个中断号对应一个handle的思路。
kernel/softirq.c中

/* Table of softirq actions with NR_SOFTIRQS entries, indexed by softirq number. */
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;

每个cpu都有自己的ksoftirqd/%u内核线程来执行这些软中断。
关于softirq和smp：同一个softirq处理函数可以同时在smp的多个cpu上执行，也就是可以并行处理。所以，softirq处理函数最好只访问per-CPU变量，否则存在同步的问题。

代码走读

在这里插入图片描述
open_softirq注册软中断处理函数

/*
 * open_softirq - register the handler for one softirq.
 * @nr:     softirq number, used directly as the index into softirq_vec
 * @action: handler stored in that slot
 *
 * No range check is done on @nr here.
 */
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
	softirq_vec[nr].action = action;
}

raise_softirq触发softirq

/*
 * raise_softirq - raise softirq @nr.
 *
 * Wraps raise_softirq_irqoff() in a local_irq_save()/local_irq_restore()
 * pair, so the caller's interrupt state is put back afterwards.
 */
void raise_softirq(unsigned int nr)
{
	unsigned long flags;

	local_irq_save(flags);
	raise_softirq_irqoff(nr);
	local_irq_restore(flags);
}
/*
 * raise_softirq_irqoff - raise softirq @nr; called by raise_softirq()
 * between local_irq_save() and local_irq_restore().
 *
 * Raises @nr via __raise_softirq_irqoff(); when in_interrupt() is false it
 * also wakes the softirq thread through wakeup_softirqd().
 */
inline void raise_softirq_irqoff(unsigned int nr)
{
	__raise_softirq_irqoff(nr);
	/* Outside interrupt context the wakeup is issued explicitly. */
	if (!in_interrupt())
		wakeup_softirqd();
}

软中断处理线程ksoftirqd%d

/*
 * Descriptor of the softirq threads: run_ksoftirqd() is the thread function,
 * ksoftirqd_should_run() the run predicate, and the thread name is built
 * from the "ksoftirqd/%u" format.
 */
static struct smp_hotplug_thread softirq_threads = {
	.store			= &ksoftirqd,
	.thread_should_run	= ksoftirqd_should_run,
	.thread_fn		= run_ksoftirqd,
	.thread_comm		= "ksoftirqd/%u",
};

内核线程函数run_ksoftirqd
调用__do_softirq函数实际处理。

/*
 * run_ksoftirqd - thread function of the softirq thread (see softirq_threads).
 * @cpu: not used in this body
 *
 * With local interrupts disabled, checks for pending softirqs.  If any are
 * pending, __do_softirq() processes them, interrupts are re-enabled and
 * cond_resched() is called before returning.  Otherwise interrupts are
 * simply re-enabled.
 */
static void run_ksoftirqd(unsigned int cpu)
{
	local_irq_disable();
	if (local_softirq_pending()) {
		/*
		 * We can safely run softirq on inline stack, as we are not deep
		 * in the task stack here.
		 */
		__do_softirq();
		local_irq_enable();
		cond_resched();
		return;
	}
	local_irq_enable();
}

/*
 * __do_softirq - run the handlers of all pending softirqs.
 *
 * Takes a snapshot of the pending mask, clears the mask, enables interrupts
 * and calls the action of every set bit, lowest bit first.  Afterwards, with
 * interrupts disabled again, the mask is re-read: if new softirqs were raised
 * the loop restarts, as long as the time budget has not expired, no
 * reschedule is needed and the restart counter has not run out.  Otherwise
 * the rest is handed to the softirq thread through wakeup_softirqd().
 */
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
/* Time budget for softirq processing (about 2 ms according to the article - confirm MAX_SOFTIRQ_TIME); softirqs raised meanwhile are handled by restarting */
	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
	unsigned long old_flags = current->flags;
	int max_restart = MAX_SOFTIRQ_RESTART;
	struct softirq_action *h;
	bool in_hardirq;
	__u32 pending;
	int softirq_bit;

	/*
	 * Mask out PF_MEMALLOC as the current task context is borrowed for
	 * the softirq. A softirq handler, such as network RX, might set
	 * PF_MEMALLOC again if the socket is related to swap.
	 */
	current->flags &= ~PF_MEMALLOC;

	pending = local_softirq_pending();
	account_irq_enter_time(current);

	__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
	in_hardirq = lockdep_softirq_start();

restart:
	/* Reset the pending bitmask before enabling irqs */
	set_softirq_pending(0);

	local_irq_enable();

	h = softirq_vec;    /* start of the softirq handler array */

	/* ffs() yields the 1-based position of the lowest set bit, 0 when none is left. */
	while ((softirq_bit = ffs(pending))) {
		unsigned int vec_nr;
		int prev_count;

		/* Advance h to the entry that corresponds to that bit. */
		h += softirq_bit - 1;

		vec_nr = h - softirq_vec;   /* softirq number */
		prev_count = preempt_count();

		kstat_incr_softirqs_this_cpu(vec_nr);

		trace_softirq_entry(vec_nr);
		h->action(h);            /* run the softirq handler */
		trace_softirq_exit(vec_nr);
		/* A handler must leave preempt_count as it found it; report and repair otherwise. */
		if (unlikely(prev_count != preempt_count())) {
			pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
			       vec_nr, softirq_to_name[vec_nr], h->action,
			       prev_count, preempt_count());
			preempt_count_set(prev_count);
		}
		/* Step past the handled entry and drop its bit (and the lower ones) from the snapshot. */
		h++;
		pending >>= softirq_bit;
	}

	rcu_bh_qs();
	local_irq_disable();

	pending = local_softirq_pending();
	/* More softirqs arrived: keep going if there is time left, no reschedule is needed and restarts remain */
	if (pending) {
		if (time_before(jiffies, end) && !need_resched() &&
		    --max_restart)
			goto restart;

		/* Otherwise leave the remainder to the softirq thread. */
		wakeup_softirqd();
	}

	lockdep_softirq_end(in_hardirq);
	account_irq_exit_time(current);
	__local_bh_enable(SOFTIRQ_OFFSET);
	WARN_ON_ONCE(in_interrupt());
	/* Put back the PF_MEMALLOC state saved in old_flags at entry. */
	current_restore_flags(old_flags, PF_MEMALLOC);
}

内核线程的执行时机和优先级
唤醒：

raise_softirq
irq_exit()
如果执行一轮软中断后已超过2ms而又有新的软中断到来，则不再继续处理，而是唤醒ksoftirqd线程，等它下次被调度时再执行；如果软中断过多就会导致处理不过来。
优先级？
浏览代码，并没有设置该内核线程的优先级的地方，即使用的默认优先级，即nice值0，和我们应用线程没有什么区别。

软中断的使用场景

定时器时间到处理handle；
网络收发包
RCU
include/linux/interrupt.h中定义

/*
 * Softirq numbers.  The values start at 0 and count upwards, so the final
 * enumerator NR_SOFTIRQS equals the number of softirqs defined before it.
 */
enum
{
	HI_SOFTIRQ=0,
	TIMER_SOFTIRQ,
	NET_TX_SOFTIRQ,
	NET_RX_SOFTIRQ,
	BLOCK_SOFTIRQ,
	IRQ_POLL_SOFTIRQ,
	TASKLET_SOFTIRQ,
	SCHED_SOFTIRQ,
	HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the
			    numbering. Sigh! */
	RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */

	NR_SOFTIRQS
};