linux o(1) scheduling

41
1 Linux Scheduling (Kernel 2.6) Roy Lee, 21 Sep 2005 NCTU Computer Operating System Lab

Upload: roy-lee

Post on 11-May-2015

3.280 views

Category:

Documents


0 download

DESCRIPTION

outdated

TRANSCRIPT

Page 1: Linux O(1) Scheduling

1

Linux Scheduling (Kernel 2.6)

Roy Lee, 21 Sep 2005

NCTU Computer Operating System Lab

Page 2: Linux O(1) Scheduling

2

TASK_RUNNING

TASK_INTERRUPTIBLE

TASK_UNINTERRUPTIBLE

TASK_STOPPED

EXIT_ZOMBIE

EXIT_DEAD

set_task_state(task, state);

task_state = state;

set_current_state( state);

Robert Love, “Linux Kernel Development,” 2nd Edition

[include/linux/sched.h]

Linux Scheduling

Page 3: Linux O(1) Scheduling

3

Runnable & Running

struct runqueue {

spinlock_t lock;

unsigned long nr_running;

unsigned long long nr_switches;

unsigned long nr_uninterruptible;

unsigned long expired_timestamp;

unsigned long long timestamp_last_tick;

task_t *curr, *idle;

struct mm_struct *prev_mm;

prio_array_t *active, *expired, arrays[2];

int best_expired_prio;

atomic_t nr_iowait;

Page 4: Linux O(1) Scheduling

4

Runnable & Running(cont.)

struct prio_array {

unsigned int nr_active;

unsigned long bitmap[BITMAP_SIZE];

struct list_head queue[MAX_PRIO];

};

struct prio_array

p2p1

queue

bitmap

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H

idx = sched_find_first_bit(array->bitmap);

queue = array->queue + idx;

next = list_entry(queue->next, task_t, run_list);

#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))

Page 5: Linux O(1) Scheduling

5

p3p2

p1

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H

P4 has the highest priority, and is selected for its execution

H H

Page 6: Linux O(1) Scheduling

p4

6

p3p2

p1

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H

Later on, P4 runs out of its timeslice and gets moved to the expired array

H H

Page 7: Linux O(1) Scheduling

p4

7

p3p2

p1

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

H

H

H

Bitmaps are also updated

H H

Page 8: Linux O(1) Scheduling

8

p3p2

p1

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H H

Now, P2 has the highest priority, and is selected for its execution

Page 9: Linux O(1) Scheduling

p2

9

p3p2

p1

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H H

Later on, P2 runs out of its timeslice and gets moved to the expired array

Page 10: Linux O(1) Scheduling

p2

10

p3p2

p1

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H H

Bitmaps are also updated

Page 11: Linux O(1) Scheduling

p3

11

p1

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

HH H

p2

H

H

Now, P1 has the highest priority, and is selected for its execution

Page 12: Linux O(1) Scheduling

p3

12

p1

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

HH H

p2

H

H

During its execution, it forks a child process P5

p5

Page 13: Linux O(1) Scheduling

p3

p1

13

p5

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

HH H

p2

H

H

To avoid COW overhead, P1 yields the CPU to P5

Page 14: Linux O(1) Scheduling

p5

14

p3

p1

p5

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H

p2

H

H

Later on, P5 runs out of its timeslice and gets moved to the expired array

Page 15: Linux O(1) Scheduling

p5

15

p3

p1

p5

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H

p2

H

H

Bitmaps are also updated

You may notice that its priority is changed here

We will explain this later

Page 16: Linux O(1) Scheduling

p3

p1

16

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H

p2

H

HH

p5

P1 resumes its execution, finishes its job and then exits

This is a typical fork()-then-exec() scenario

Page 17: Linux O(1) Scheduling

p3

17

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H

p2

H

HH

p5

Now, P3 has the highest priority, and is selected for its execution

Page 18: Linux O(1) Scheduling

p3

18

p3

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H

p2

H

HH

p5

Later on, P3 runs out of its timeslice and gets moved to the expired array

Page 19: Linux O(1) Scheduling

p3

19

p3

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

... ...

100

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H

p2

H

HH

p5

Bitmaps are also updated

Page 20: Linux O(1) Scheduling

20

p3

*active *expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

100

... ...

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H

p2

H

H

Exchange!

Hp5

Now the active array is empty, scheduler exchanges it with the expired one

Page 21: Linux O(1) Scheduling

21

p3

*active*expired

struct prio_array array[2]

bitmap

queue

Runnable & Running(cont.)

process

list_head

... ...

100

... ...

0

139

100

139

0

bitmap

... ...

... ...

100

queue

0

139

100

139

0

H

p4

H

H

H H

p2

H

HH

p5

Another round begins!

Page 22: Linux O(1) Scheduling

22

What Policies Do We Have?

SCHED_NORMAL

Ranges from MAX_RT_PRIO to MAX_PRIO - 1 (100 ~ 139)

SCHED_FIFO & SCHED_RR

Ranges from 0 to MAX_RT_PRIO -1 (0 ~ 99)

Both are soft real-time scheduling.

A SCHED_FIFO process doesn’t have a timeslice.

A SCHED_RR process round-robins only with processes that have

equal priority.

The real-time processes:

never expire.

work with static priority

0MAX_RT_PRIO MAX_PRIO

Page 23: Linux O(1) Scheduling

p5

23

p2

p3

p1

*active*expired

struct prio_array array[2]

bitmap

queue

Realtime Scheduling

Normal process

list_head

... ...

100

... ...

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H H

H

RR process

FIFO process

p7

p6

H

P4 has the highest priority, and is selected for its execution

Page 24: Linux O(1) Scheduling

p5

24

p4

p3

p1

*active*expired

struct prio_array array[2]

bitmap

queue

Realtime Scheduling

Normal process

list_head

... ...

100

... ...

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p2

H

H

H H H

H

RR process

FIFO process

p7

p6

H

P4 runs out of its timeslice, but since it’s an RR process, it does not expire

Scheduler reinserts it to the tail of its priority list

Page 25: Linux O(1) Scheduling

p5

25

p4

p3

p1

*active*expired

struct prio_array array[2]

bitmap

queue

Realtime Scheduling

Normal process

list_head

... ...

100

... ...

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p2

H

H

H H H

H

RR process

FIFO process

p7

p6

H

Now P2 has the highest priority, and is selected for its execution

Page 26: Linux O(1) Scheduling

p5

26

p3

p1

*active*expired

struct prio_array array[2]

bitmap

queue

Realtime Scheduling

Normal process

list_head

... ...

100

... ...

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H H

H

RR process

FIFO process

p7

p6

H

P2 finishes its job and exits

Now P4 has the highest priority, and is selected for its execution

Page 27: Linux O(1) Scheduling

p5

27

p3

p1

*active*expired

struct prio_array array[2]

bitmap

queue

Realtime Scheduling

Normal process

list_head

... ...

100

... ...

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H H

H

RR process

FIFO process

p7

p6

H

In this case, unless P4 exits or voluntarily relinquishes its execution,

or higher-priority processes are created or woken up, it monopolizes the CPU

Page 28: Linux O(1) Scheduling

p5

p1

28

p3

*active*expired

struct prio_array array[2]

bitmap

queue

Realtime Scheduling

Normal process

list_head

... ...

100

... ...

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

H

H

H H

H

RR process

FIFO process

p7

p6

H

Later on, P4 finishes its job and exits

H

P1 is selected for its execution

Page 29: Linux O(1) Scheduling

29

p3

*active*expired

struct prio_array array[2]

bitmap

queue

Realtime Scheduling

Normal process

list_head

... ...

100

... ...

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

H

H

H H

H

RR process

FIFO process

p7

p6

H

P1 runs out of its timeslice and is reinserted at the tail of its list

p1

p5

H

Page 30: Linux O(1) Scheduling

30

p1

p5

p3

*active*expired

struct prio_array array[2]

bitmap

queue

Realtime Scheduling

Normal process

list_head

... ...

100

... ...

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

H

H

H H H

H

RR process

FIFO process

p7

p6

H

P5 is a FIFO real-time process; it does not have a timeslice.

Unless higher-priority processes are created or woken up, it monopolizes the CPU

Page 31: Linux O(1) Scheduling

31

Static priority

Ranges from -20 to 19

Specified by the user (nice value).

Dynamic priority

A bonus or penalty in the range -5 to +5, based on the interactivity of the task.

mapping

0MAX_RT_PRIO MAX_PRIO

#define MAX_USER_RT_PRIO 100

#define MAX_RT_PRIO MAX_USER_RT_PRIO

#define MAX_PRIO (MAX_RT_PRIO + 40)

#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)

#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)

#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)

The Priority of Processes

Page 32: Linux O(1) Scheduling

32

Struct task_struct

{

int state;

...

int prio, static_prio;

...

prio_array_t *array;

unsigned long sleep_avg;

unsigned long long timestamp, last_ran;

unsigned long long sched_time;

int activated;

unsigned long policy;

cpumask_t cpus_allowed;

unsigned int time_slice, first_time_slice;

...

Dynamic priority.

Specified by the user.(nice)

The Priority of Processes(cont.)

Ranges from 0 to MAX_SLEEP_AVG

timer_interrupt()

update_process_times()

scheduler_tick()

effective_prio()

recalc_task_prio()

if (!--p->time_slice)

Page 33: Linux O(1) Scheduling

33

p3

p1

*expired *active

struct prio_array array[2]

bitmap

queue

When A Process is Interactive Enough…

process

list_head

... ...

100

... ...

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H

p2

H

HH

Case 1

If P4 is interactive enough, then after it runs out of its timeslice,

the scheduler would reinsert it to the end of its list

Page 34: Linux O(1) Scheduling

34

p3

p4

*expired *active

struct prio_array array[2]

bitmap

queue

When A Process is Interactive Enough…

process

list_head

... ...

100

... ...

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p1

H

H

H H

p2

H

HH

Case 1

The scheduler would reinsert it to the end of its list

instead of moving it to the expired array

Page 35: Linux O(1) Scheduling

35

p3

p1

*expired *active

struct prio_array array[2]

bitmap

queue

When A Process is Interactive Enough…

process

list_head

... ...

100

... ...

0

139

100

139

0

bitmap

queue

... ...

... ...

100

0

139

100

139

0

H

p4

H

H

H H

p2

H

HH

Case 2

However, if any processes in the expired array have been starved,

P4 still has to be expired to prevent further starvation

p4

Page 36: Linux O(1) Scheduling

36

Interactivity of Process

#define TASK_INTERACTIVE(p) \

((p)->prio <= (p)->static_prio - DELTA(p))

#define DELTA(p) \

(SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)

#define EXPIRED_STARVING(rq) \

((STARVATION_LIMIT && ((rq)->expired_timestamp && \

(jiffies - (rq)->expired_timestamp >= \

STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \

((rq)->curr->static_prio > (rq)->best_expired_prio))

if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {

enqueue_task(p, rq->expired);

if (p->static_prio < rq->best_expired_prio)

rq->best_expired_prio = p->static_prio;

} else

enqueue_task(p, rq->active);

Page 37: Linux O(1) Scheduling

37

Timeslice

The calculation is a simple scaling of the static priority into a range of timeslices (5 ~ 800 ms).

The default timeslice (with a nice value of zero) is 100 ms.

#define MIN_TIMESLICE max(5 * HZ / 1000, 1)

#define DEF_TIMESLICE (100 * HZ / 1000)

#define SCALE_PRIO(x, prio) \

max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)

static inline unsigned int task_timeslice(task_t *p)

{

if (p->static_prio < NICE_TO_PRIO(0))

return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);

else

return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);

}

Page 38: Linux O(1) Scheduling

38

Scheduling with Process Creation

do_fork()

copy_process()

sched_fork()

p->state = TASK_RUNNING;

INIT_LIST_HEAD(&p->run_list);

p->array = NULL;

spin_lock_init(&p->switch_lock);

...

local_irq_disable();

p->time_slice = (current->time_slice + 1) >> 1;

p->first_time_slice = 1;

current->time_slice >>= 1;

p->timestamp = sched_clock();

if (unlikely(!current->time_slice)) {

current->time_slice = 1;

preempt_disable();

scheduler_tick();

local_irq_enable();

preempt_enable();

} else

local_irq_enable();

wake_up_new()

p->prio = current->prio;

list_add_tail(...);

p->array = current->array;

p->array->nr_active++;

rq->nr_running++;

To avoid the COW overhead,

we let the child go first

Page 39: Linux O(1) Scheduling

39

Scheduling with Process Termination

rq = task_rq_lock(p->parent, &flags);

if (p->first_time_slice) {

p->parent->time_slice += p->time_slice;

if (unlikely(p->parent->time_slice >

task_timeslice(p)))

p->parent->time_slice = task_timeslice(p);

}

if (p->sleep_avg < p->parent->sleep_avg)

p->parent->sleep_avg = p->parent->sleep_avg /

(EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /

(EXIT_WEIGHT + 1);

task_rq_unlock(rq, &flags);

do_exit()

sys_exit()

release_task()

exit_notify()

sched_exit()

schedule()

BUG();

sched_exit()

Page 40: Linux O(1) Scheduling

40

Control Flow of scheduler_tick()

scheduler_tick()

Realtime task?yes

no

Round robin? no (FIFO)

Timeslice remained?

Set need_reschedule flag

Continue execution

yesTimeslice remained?

Remove from active

Recalculate priority

and timeslice

Interactive enough? Reinsert to active

Reinsert to expired

no

Is there any task in

expired starving?

yes

yes

no

no

yesyes

no

Set need_reschedule flag

Page 41: Linux O(1) Scheduling

41

Charge Ticks to the Current Process

timer_interrupt()

update_process_times(user_mode(regs))

account_user_time()

account_system_time()

#define user_mode(regs) (!!((regs)->cs & 3))

User mode?

p->stime = cputime_add(p->stime, cputime);

p->utime = cputime_add(p->utime, cputime);

YesNo

jiffies_to_cputime(1)