Linux进程与调度

进程创建

fork

fork实际通过调用kernel_clone实现

1
2
3
4
5
6
7
8
// linux/kernel/fork.c
/*
 * fork() system call entry, defined through the SYSCALL_DEFINE0 macro.
 *
 * Builds a struct kernel_clone_args on the stack in which only exit_signal
 * is set explicitly (to SIGCHLD); the designated initializer leaves every
 * other member zero. Returns whatever kernel_clone() returns.
 */
SYSCALL_DEFINE0(fork)
{
struct kernel_clone_args args = {
.exit_signal = SIGCHLD,
};
return kernel_clone(&args);
}

kernel_clone

进程创建主要操作在kernel_clone函数中

1
2
3
4
5
6
7
8
9
10
// linux/kernel/fork.c:2866
/*
* Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*
* args->exit_signal is expected to be checked for sanity by the caller.
*
* @args: pointer to the caller-built struct kernel_clone_args
*        (fork() passes one with exit_signal = SIGCHLD).
*
* Return: a pid_t. The body excerpt quoted later in this article returns
*         pid_vnr() of the new task's struct pid.
*/
pid_t kernel_clone(struct kernel_clone_args *args);

参数结构体kernel_clone_args定义如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
// linux/include/linux/sched/task.h:21
/*
 * Argument bundle handed by pointer to kernel_clone(), and from there
 * passed on to copy_process() and copy_thread().
 */
struct kernel_clone_args {
u64 flags;
int __user *pidfd;
int __user *child_tid;
int __user *parent_tid;
const char *name;
/* fork() sets this to SIGCHLD. */
int exit_signal;
/* One-bit flags packed into u32 bit-fields. */
u32 kthread:1;
u32 io_thread:1;
u32 user_worker:1;
u32 no_files:1;
unsigned long stack;
unsigned long stack_size;
unsigned long tls;
pid_t *set_tid;
/* Number of elements in *set_tid */
size_t set_tid_size;
int cgroup;
int idle;
/*
 * NOTE(review): presumably the fn/fn_arg pair that ret_from_fork() later
 * receives and invokes as fn(fn_arg) -- confirm against copy_thread().
 */
int (*fn)(void *);
void *fn_arg;
struct cgroup *cgrp;
struct css_set *cset;
};

kernel_clone的主要流程:调用copy_process复制当前进程;通过trace_sched_process_fork记录当前进程current创建新进程p的跟踪事件;获取新进程的pid;调用wake_up_new_task唤醒新进程;最后返回pid

1
2
3
4
5
6
7
8
9
10
11
12
p = copy_process(NULL, trace, NUMA_NO_NODE, args);
add_latent_entropy();

trace_sched_process_fork(current, p);

pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);

wake_up_new_task(p);

put_pid(pid);
return nr;

在kernel_clone的return之前添加调试代码

1
2
3
4
5
6
7
put_pid(pid);
if (nr > 1000) // pid > 1000避免在系统启动阶段输出太多日志导致无法启动
{
pid_t current_nr = *(&current->pid);
printk("[noxke dbg] kernel_clone return value: %d; current pid : %d\n", nr, current_nr);
}
return nr;

调试可以发现,主进程2060创建了子进程2061,kernel_clone的返回值就是父进程中fork得到的子进程pid。但是并没有观察到子进程从此处返回,因此这里无法解释子进程中fork返回值为0的现象

子进程返回

分析copy_process

1
2
3
4
// linux/kernel/fork.c:2519 copy_process
retval = copy_thread(p, args);
// linux/arch/x86/kernel/process.c:182 copy_thread
frame->ret_addr = (unsigned long) ret_from_fork_asm;

copy_thread并没有直接调用ret_from_fork_asm,而是把子进程内核栈帧的返回地址(frame->ret_addr)设置为ret_from_fork_asm,子进程第一次被调度运行时会从这里开始执行

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/*linux/arch/x86/entry/entry_64.S:225*/
/*
 * Assembly stub whose address copy_thread() stores in frame->ret_addr.
 * It forwards four register values to the C function ret_from_fork() and
 * then jumps to swapgs_restore_regs_and_return_to_usermode.
 */
SYM_CODE_START(ret_from_fork_asm)
/*
* This is the start of the kernel stack; even through there's a
* register set at the top, the regset isn't necessarily coherent
* (consider kthreads) and one cannot unwind further.
*
* This ensures stack unwinds of kernel threads terminate in a known
* good state.
*/
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // copy_thread
CALL_DEPTH_ACCOUNT

/*
 * Load the four values that ret_from_fork(prev, regs, fn, fn_arg) takes,
 * in its parameter order.
 */
movq %rax, %rdi /* prev */
movq %rsp, %rsi /* regs */
movq %rbx, %rdx /* fn */
movq %r12, %rcx /* fn_arg */
call ret_from_fork

/*
* Set the stack state to what is expected for the target function
* -- at this point the register set should be a valid user set
* and unwind should work normally.
*/
UNWIND_HINT_REGS
jmp swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(ret_from_fork_asm)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
// linux/arch/x86/kernel/process.c:140
/*
 * C half of the fork return path, called from ret_from_fork_asm.
 *
 * @prev:   forwarded to schedule_tail().
 * @regs:   register frame handed to syscall_exit_to_user_mode().
 * @fn:     optional function to run; when NULL the branch below is skipped.
 * @fn_arg: argument passed to @fn.
 *
 * Note: regs->ax is written only inside the fn != NULL branch. When fn is
 * NULL this function leaves regs->ax untouched and goes straight to
 * syscall_exit_to_user_mode().
 */
__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
int (*fn)(void *), void *fn_arg)
{
schedule_tail(prev);

/* Is this a kernel thread? */
if (unlikely(fn)) {
fn(fn_arg);
/*
* A kernel thread is allowed to return here after successfully
* calling kernel_execve(). Exit to userspace to complete the
* execve() syscall.
*/
regs->ax = 0;
}

syscall_exit_to_user_mode(regs);
}

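子进程首次被调度时经由ret_from_fork_asm进入ret_from_fork,最终通过syscall_exit_to_user_mode以系统调用返回的方式回到用户态,因此子进程和主进程的返回位置不同。注意上面代码中的regs->ax = 0只在fn非空(内核线程)的分支中执行;普通fork出的子进程,其ax寄存器是在copy_thread中被置为0的(childregs->ax = 0),所以子进程中fork的返回值为0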

使用printk调试进程创建过程

任务调度

任务调度代码位于linux/kernel/sched/中,主调度器为__schedule

1
2
// linux/kernel/sched/core.c:6568
/*
 * Main scheduler routine; declared static, so it is local to core.c.
 * @sched_mode: the body excerpt quoted below masks it with SM_MASK_PREEMPT
 *              when emitting the sched_switch trace event.
 */
static void __sched notrace __schedule(unsigned int sched_mode);
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
// linux/kernel/sched/core.c:6568 __schedule
struct task_struct *prev, *next;

// 获取当前任务
cpu = smp_processor_id();
rq = cpu_rq(cpu);
prev = rq->curr;
// 获取当前任务的切换次数
switch_count = &prev->nivcsw;
// 挂起当前任务
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
switch_count = &prev->nvcsw;

// 获取下一个任务
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);


if (likely(prev != next)) {
rq->nr_switches++;
RCU_INIT_POINTER(rq->curr, next);
// 上一个任务的切换次数+1
++*switch_count;
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));

trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
// 切换到新的任务
rq = context_switch(rq, prev, next, &rf);
}

Linux进程与调度
https://blog.noxke.icu/2023/11/27/linux/Linux进程与调度/
作者
noxke
发布于
2023年11月27日
许可协议