linux o(1) scheduling
DESCRIPTION
outdatedTRANSCRIPT
1
Linux Scheduling (Kernel 2.6)
Roy Lee, 21 Sep 2005
NCTU Computer Operating System Lab
2
TASK_RUNNING
TASK_INTERRUPTIBLE
TASK_UNINTERRUPTIBLE
TASK_STOPPED
EXIT_ZOMBIE
EXIT_DEAD
set_task_state(task, state);
task_state = state;
set_current_state( state);
Robert Love, “Linux Kernel Development,” 2nd Edition
[include/linux/sched.h]
Linux Scheduling
3
Runnable & Running
struct runqueue {
spinlock_t lock;
unsigned long nr_running;
unsigned long long nr_switches;
unsigned long nr_uninterruptible;
unsigned long expired_timestamp;
unsigned long long timestamp_last_tick;
task_t *curr, *idle;
struct mm_struct *prev_mm;
prio_array_t *active, *expired, arrays[2];
int best_expired_prio;
atomic_t nr_iowait;
4
Runnable & Running(cont.)
struct prio_array {
unsigned int nr_active;
unsigned long bitmap[BITMAP_SIZE];
struct list_head queue[MAX_PRIO];
};
struct prio_array
p2p1
queue
bitmap
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H
idx = sched_find_first_bit(array->bitmap);
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);
#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
5
p3p2
p1
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H
P4 has the highest priority, and is selected for its execution
H H
p4
6
p3p2
p1
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H
Later on, P4 runs out of its timeslice, and gets moved to the expired array
H H
p4
7
p3p2
p1
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
H
H
H
Bitmaps are also updated
H H
8
p3p2
p1
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H H
Now, P2 has the highest priority, and is selected for its execution
p2
9
p3p2
p1
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H H
Later on, P2 runs out of its timeslice, and gets moved to the expired array
p2
10
p3p2
p1
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H H
Bitmaps are also updated
p3
11
p1
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
HH H
p2
H
H
Now, P1 has the highest priority, and is selected for its execution
p3
12
p1
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
HH H
p2
H
H
During its execution, it forks a child process P5
p5
p3
p1
13
p5
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
HH H
p2
H
H
To avoid COW overhead, P1 yields the CPU to the P5
p5
14
p3
p1
p5
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H
p2
H
H
Later on, P5 runs out of its timeslice, and gets moved to the expired array
p5
15
p3
p1
p5
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H
p2
H
H
Bitmaps are also updated
You may notice that its priority is changed here
We will explain this later
p3
p1
16
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H
p2
H
HH
p5
P1 resumes its execution, finishes its job and then exits
This is a typical fork()-then-exec() scenario
p3
17
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H
p2
H
HH
p5
Now, P3 has the highest priority, and is selected for its execution
p3
18
p3
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H
p2
H
HH
p5
Later on, P3 runs out of its timeslice, and gets moved to the expired array
p3
19
p3
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
... ...
100
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H
p2
H
HH
p5
Bitmaps are also updated
20
p3
*active *expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
100
... ...
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H
p2
H
H
Exchange!
Hp5
Now the active array is empty, so the scheduler exchanges it with the expired one
21
p3
*active*expired
struct prio_array array[2]
bitmap
queue
Runnable & Running(cont.)
process
list_head
... ...
100
... ...
0
139
100
139
0
bitmap
... ...
... ...
100
queue
0
139
100
139
0
H
p4
H
H
H H
p2
H
HH
p5
Another round begins!
22
What Policies Do We Have?
SCHED_NORMAL
Ranges from MAX_RT_PRIO to MAX_PRIO - 1 (100 ~ 139)
SCHED_FIFO & SCHED_RR
Ranges from 0 to MAX_RT_PRIO -1 (0 ~ 99)
Both are soft real-time scheduling.
A SCHED_FIFO process doesn’t have a timeslice.
A SCHED_RR process only round-robins with those which have
equal priority.
The real-time processes:
never expire.
work with static priority
0 MAX_RT_PRIO MAX_PRIO
p5
23
p2
p3
p1
*active*expired
struct prio_array array[2]
bitmap
queue
Realtime Scheduling
Normal process
list_head
... ...
100
... ...
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H H
H
RR process
FIFO process
p7
p6
H
P4 has the highest priority, and is selected for its execution
p5
24
p4
p3
p1
*active*expired
struct prio_array array[2]
bitmap
queue
Realtime Scheduling
Normal process
list_head
... ...
100
... ...
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p2
H
H
H H H
H
RR process
FIFO process
p7
p6
H
P4 runs out of its timeslice, but since it’s an RR process, it does not expire
The scheduler reinserts it at the tail of its priority list
p5
25
p4
p3
p1
*active*expired
struct prio_array array[2]
bitmap
queue
Realtime Scheduling
Normal process
list_head
... ...
100
... ...
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p2
H
H
H H H
H
RR process
FIFO process
p7
p6
H
Now P2 has the highest priority, and is selected for its execution
p5
26
p3
p1
*active*expired
struct prio_array array[2]
bitmap
queue
Realtime Scheduling
Normal process
list_head
... ...
100
... ...
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H H
H
RR process
FIFO process
p7
p6
H
P2 finishes its job and exits
Now P4 has the highest priority, and is selected for its execution
p5
27
p3
p1
*active*expired
struct prio_array array[2]
bitmap
queue
Realtime Scheduling
Normal process
list_head
... ...
100
... ...
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H H
H
RR process
FIFO process
p7
p6
H
In this case, unless P4 exits or voluntarily relinquishes its execution,
or higher-priority processes are created/woken up, it monopolizes the CPU
p5
p1
28
p3
*active*expired
struct prio_array array[2]
bitmap
queue
Realtime Scheduling
Normal process
list_head
... ...
100
... ...
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
H
H
H H
H
RR process
FIFO process
p7
p6
H
Later on, P4 finishes its job and exits
H
P1 is selected for its execution
29
p3
*active*expired
struct prio_array array[2]
bitmap
queue
Realtime Scheduling
Normal process
list_head
... ...
100
... ...
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
H
H
H H
H
RR process
FIFO process
p7
p6
H
P1 runs out of its timeslice and is reinserted at the tail of its list
p1
p5
H
30
p1
p5
p3
*active*expired
struct prio_array array[2]
bitmap
queue
Realtime Scheduling
Normal process
list_head
... ...
100
... ...
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
H
H
H H H
H
RR process
FIFO process
p7
p6
H
P5 is FIFO realtime; it does not have a timeslice.
Unless higher-priority processes are created/woken up, it monopolizes the CPU
31
Static priority
Ranges from -20 to 19
Specified by the user (nice value).
Dynamic priority
A bonus or penalty from the range -5 to +5 based on the interactivity of the task.
mapping
0 MAX_RT_PRIO MAX_PRIO
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO
#define MAX_PRIO (MAX_RT_PRIO + 40)
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
The Priority of Processes
32
Struct task_struct
{
int state;
...
int prio, static_prio;
...
prio_array_t *array;
unsigned long sleep_avg;
unsigned long long timestamp, last_ran;
unsigned long long sched_time;
int activated;
unsigned long policy;
cpumask_t cpus_allowed;
unsigned int time_slice, first_time_slice;
...
Dynamic priority.
Specified by the user.(nice)
The Priority of Processes(cont.)
Ranges from 0 to MAX_SLEEP_AVG
timer_interrupt()
update_process_times()
scheduler_tick()
effective_prio()
recalc_task_prio()
if (!--p->time_slice)
33
p3
p1
*expired *active
struct prio_array array[2]
bitmap
queue
When A Process is Interactive Enough…
process
list_head
... ...
100
... ...
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H
p2
H
HH
Case 1
If P4 is interactive enough, after it runs out of its timeslice,
the scheduler would reinsert it at the end of its list
34
p3
p4
*expired *active
struct prio_array array[2]
bitmap
queue
When A Process is Interactive Enough…
process
list_head
... ...
100
... ...
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p1
H
H
H H
p2
H
HH
Case 1
The scheduler would reinsert it to the end of its list
instead of moving it to the expired array
35
p3
p1
*expired *active
struct prio_array array[2]
bitmap
queue
When A Process is Interactive Enough…
process
list_head
... ...
100
... ...
0
139
100
139
0
bitmap
queue
... ...
... ...
100
0
139
100
139
0
H
p4
H
H
H H
p2
H
HH
Case 2
However, if there are any processes that have been starved,
P4 still has to be expired to prevent further starvation
p4
36
Interactivity of Process
#define TASK_INTERACTIVE(p) \
((p)->prio <= (p)->static_prio - DELTA(p))
#define DELTA(p) \
(SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
#define EXPIRED_STARVING(rq) \
((STARVATION_LIMIT && ((rq)->expired_timestamp && \
(jiffies - (rq)->expired_timestamp >= \
STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
((rq)->curr->static_prio > (rq)->best_expired_prio))
if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
enqueue_task(p, rq->expired);
if (p->static_prio < rq->best_expired_prio)
rq->best_expired_prio = p->static_prio;
} else
enqueue_task(p, rq->active);
37
Timeslice
The calculation is a simple scaling of the static priority into a range of timeslices (5 ~ 800 ms).
The default (with a nice value of zero) is 100 ms.
#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE (100 * HZ / 1000)
#define SCALE_PRIO(x, prio) \
max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
static inline unsigned int task_timeslice(task_t *p)
{
if (p->static_prio < NICE_TO_PRIO(0))
return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
else
return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
}
38
Scheduling with Process Creation
do_fork()
copy_process()
sched_fork()
p->state = TASK_RUNNING;
INIT_LIST_HEAD(&p->run_list);
p->array = NULL;
spin_lock_init(&p->switch_lock);
...
local_irq_disable();
p->time_slice = (current->time_slice + 1) >> 1;
p->first_time_slice = 1;
current->time_slice >>= 1;
p->timestamp = sched_clock();
if (unlikely(!current->time_slice)) {
current->time_slice = 1;
preempt_disable();
scheduler_tick();
local_irq_enable();
preempt_enable();
} else
local_irq_enable();
wake_up_new()
p->prio = current->prio;
list_add_tail(...);
p->array = current->array;
p->array->nr_active++;
rq->nr_running++;
To avoid the COW overhead,
we let the child go first
39
Scheduling with Process Termination
rq = task_rq_lock(p->parent, &flags);
if (p->first_time_slice) {
p->parent->time_slice += p->time_slice;
if (unlikely(p->parent->time_slice >
task_timeslice(p)))
p->parent->time_slice = task_timeslice(p);
}
if (p->sleep_avg < p->parent->sleep_avg)
p->parent->sleep_avg = p->parent->sleep_avg /
(EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
(EXIT_WEIGHT + 1);
task_rq_unlock(rq, &flags);
do_exit()
sys_exit()
release_task()
exit_notify()
sched_exit()
schedule()
BUG();
sched_exit()
40
Control Flow of scheduler_tick()
scheduler_tick()
Realtime task? yes
no
Round robin? no (FIFO)
Timeslice remained?
Set need_reschedule flag
Continue execution
yes Timeslice remained?
Remove from active
Recalculate priority
and timeslice
Interactive enough? Reinsert to active
Reinsert to expired
no
Is there any task in
expired starving?
yes
yes
no
no
yesyes
no
Set need_reschedule flag
41
Charge Ticks to the Current Process
timer_interrupt()
update_process_times(user_mode(regs))
account_user_time()
account_system_time()
#define user_mode(regs) (!!((regs)->cs & 3))
User mode?
p->stime = cputime_add(p->stime, cputime);
p->utime = cputime_add(p->utime, cputime);
YesNo
jiffies_to_cputime(1)