diff -Naur linux-2.4.14/arch/i386/kernel/apm.c linux-2.4.14-mq/arch/i386/kernel/apm.c --- linux-2.4.14/arch/i386/kernel/apm.c Fri Oct 19 15:32:28 2001 +++ linux-2.4.14-mq/arch/i386/kernel/apm.c Tue Nov 6 22:19:27 2001 @@ -1342,7 +1342,7 @@ * decide if we should just power down. * */ -#define system_idle() (nr_running == 1) +#define system_idle() (nr_running() == 1) static void apm_mainloop(void) { diff -Naur linux-2.4.14/arch/i386/kernel/smpboot.c linux-2.4.14-mq/arch/i386/kernel/smpboot.c --- linux-2.4.14/arch/i386/kernel/smpboot.c Fri Oct 5 01:42:54 2001 +++ linux-2.4.14-mq/arch/i386/kernel/smpboot.c Tue Nov 6 22:19:27 2001 @@ -799,14 +799,19 @@ if (!idle) panic("No idle process for CPU %d", cpu); - idle->processor = cpu; map_cpu_to_boot_apicid(cpu, apicid); - idle->has_cpu = 1; /* we schedule the first task manually */ idle->thread.eip = (unsigned long) start_secondary; del_from_runqueue(idle); + /* + * Don't change processor/runqueue of task while it is + * still on runqueue. + */ + idle->processor = cpu; + idle->has_cpu = 1; /* we schedule the first task manually */ + unhash_process(idle); init_tasks[cpu] = idle; diff -Naur linux-2.4.14/fs/proc/proc_misc.c linux-2.4.14-mq/fs/proc/proc_misc.c --- linux-2.4.14/fs/proc/proc_misc.c Thu Oct 11 17:46:57 2001 +++ linux-2.4.14-mq/fs/proc/proc_misc.c Tue Nov 6 22:19:27 2001 @@ -96,7 +96,7 @@ LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running, nr_threads, last_pid); + nr_running(), nr_threads, last_pid); return proc_calc_metrics(page, start, off, count, eof, len); } diff -Naur linux-2.4.14/include/linux/sched.h linux-2.4.14-mq/include/linux/sched.h --- linux-2.4.14/include/linux/sched.h Mon Nov 5 20:42:14 2001 +++ linux-2.4.14-mq/include/linux/sched.h Tue Nov 6 22:46:17 2001 @@ -72,7 +72,7 @@ #define CT_TO_SECS(x) ((x) / HZ) #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) -extern int nr_running, nr_threads; +extern int nr_threads; extern int last_pid; #include <linux/fs.h> @@ -132,14 +132,8 @@ #include <linux/spinlock.h> -/* - * This serializes "schedule()" and also protects - * the run-queue from deletions/modifications (but - * _adding_ to the beginning of the run-queue has - * a separate lock). - */ + extern rwlock_t tasklist_lock; -extern spinlock_t runqueue_lock; extern spinlock_t mmlist_lock; extern void sched_init(void); @@ -437,6 +431,7 @@ #define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ #define MAX_COUNTER (20*HZ/100) #define DEF_NICE (0) +#define ALL_CPUS_ALLOWED (-1) /* @@ -461,7 +456,7 @@ policy: SCHED_OTHER, \ mm: NULL, \ active_mm: &init_mm, \ - cpus_allowed: -1, \ + cpus_allowed: ALL_CPUS_ALLOWED, \ run_list: LIST_HEAD_INIT(tsk.run_list), \ next_task: &tsk, \ prev_task: &tsk, \ @@ -846,18 +841,497 @@ #define next_thread(p) \ list_entry((p)->thread_group.next, struct task_struct, thread_group) -static inline void del_from_runqueue(struct task_struct * p) +static inline int task_on_runqueue(struct task_struct *p) +{ + return (p->run_list.next != NULL); +} + +/* + * runqueue_data + * One runqueue per CPU in the system, plus one additional runqueue for + * realtime tasks. Size should be a multiple of cache line size, and array + * of items should start on a cache line boundary. 
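 *
 * [Editor's note: illustrative sketch, not part of the original patch.]
 * The union-with-pad idiom below is what enforces the sizing rule just
 * described: provided struct rq_data fits within one cache line, every
 * array element occupies exactly SMP_CACHE_BYTES, so two CPUs never
 * touch the same line.  Schematically (names invented for illustration):
 *
 *	union padded {
 *		struct { int counter; spinlock_t lock; } data;
 *		char __pad[SMP_CACHE_BYTES];
 *	};
 *	union padded per_cpu[NR_CPUS] __cacheline_aligned;
 *
 * Here sizeof(union padded) == SMP_CACHE_BYTES, so per_cpu[0].data and
 * per_cpu[1].data can never share a cache line (no false sharing of the
 * per-CPU locks and counters).
 *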
+ */ +typedef union runqueue_data { + struct rq_data { + int nt_running; /* # of tasks on runqueue */ +#ifdef CONFIG_SMP + int max_na_goodness; /* maximum non-affinity */ + /* goodness value of */ + /* 'schedulable' task */ + /* on this runqueue */ + struct task_struct * max_na_ptr; /* pointer to task which */ + /* has max_na_goodness */ + unsigned long max_na_cpus_allowed; /* copy of cpus_allowed */ + /* field from task with */ + /* max_na_goodness */ +#endif + struct list_head runqueue; /* list of tasks on runqueue */ +#ifdef CONFIG_SMP + int running_non_idle; /* flag to indicate this cpu */ + /* is running something */ + /* besides the idle thread */ +#endif + spinlock_t runqueue_lock; /* lock for this runqueue */ + } rq_data; + char __pad [SMP_CACHE_BYTES]; +} runqueue_data_t; +#define nt_running(cpu) runqueue_data[(cpu)].rq_data.nt_running +#ifdef CONFIG_SMP +#define max_na_goodness(cpu) runqueue_data[(cpu)].rq_data.max_na_goodness +#define max_na_ptr(cpu) runqueue_data[(cpu)].rq_data.max_na_ptr +#define max_na_cpus_allowed(cpu) \ + runqueue_data[(cpu)].rq_data.max_na_cpus_allowed +#endif +#define runqueue(cpu) runqueue_data[(cpu)].rq_data.runqueue +#define runqueue_lock(cpu) runqueue_data[(cpu)].rq_data.runqueue_lock +#ifdef CONFIG_SMP +#define running_non_idle(cpu) runqueue_data[(cpu)].rq_data.running_non_idle +#endif +extern runqueue_data_t runqueue_data[]; + +#ifdef CONFIG_SMP +#define INIT_RUNQUEUE_DATA_SMP(n) { \ + max_na_goodness((n)) = MIN_GOODNESS; \ + max_na_ptr((n)) = NULL; \ + /* max_na_cpus_allowed need not be initialized */ \ + running_non_idle((n)) = 0; \ +} +#else +#define INIT_RUNQUEUE_DATA_SMP(n) /* NOOP */ +#endif +#define INIT_RUNQUEUE_DATA(n) { \ + nt_running((n)) = 0; \ + INIT_LIST_HEAD(&runqueue((n))); \ + runqueue_lock((n)) = SPIN_LOCK_UNLOCKED; \ + INIT_RUNQUEUE_DATA_SMP((n)); \ +} +#define N_RUNQUEUES (NR_CPUS + 1) +#define N_ALIGNED_DATA NR_CPUS +#define REALTIME_RQ_ID NR_CPUS +#define MIN_GOODNESS -1000 +#ifndef CONFIG_SMP +#define UP_RQ_LOCK_ID 0 +#endif + +/* + * aligned_data + * CPU specific scheduling data. One data item for each CPU + * in the system. Size should be a multiple of cache line size, + * and array of items should start on a cache line boundary. 
+ */ +typedef union aligned_data { + struct schedule_data { + struct task_struct * curr; /* current task on this CPU */ + cycles_t last_schedule; /* time of last scheduling */ + /* decision */ +#ifdef CONFIG_SMP + int curr_na_goodness; /* non-affinity goodness */ + /* value of current task */ +#endif + } schedule_data; + char __pad [SMP_CACHE_BYTES]; +} aligned_data_t; +#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr +#ifdef CONFIG_SMP +#define curr_na_goodness(cpu) aligned_data[(cpu)].schedule_data.curr_na_goodness +#endif +#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule +extern aligned_data_t aligned_data[]; + +#ifdef CONFIG_SMP +#define INIT_ALIGNED_DATA_SMP(n) { \ + curr_na_goodness((n)) = MIN_GOODNESS; \ +} +#else +#define INIT_ALIGNED_DATA_SMP(n) /* NOOP */ +#endif +#define INIT_ALIGNED_DATA(n) { \ + cpu_curr((n)) = &init_task; \ + last_schedule((n)) = 0; \ + INIT_ALIGNED_DATA_SMP((n)); \ +} + +/* + * Determine runqueue associated with task + */ +static inline int task_to_runqueue(struct task_struct *t) +{ + int rq; + + if ((t->policy & ~SCHED_YIELD) != SCHED_OTHER) { + rq = REALTIME_RQ_ID; + } else { + rq = t->processor; + } + + return(rq); +} +#define TASK_RQ(t) runqueue(task_to_runqueue((t))) + +/* + * Sum CPU specific nt_running fields to determine how many + * runnable tasks there are in the system. + */ +static inline int nr_running(void) +{ + int i; + int tot=nt_running(REALTIME_RQ_ID); + + for(i=0; i<smp_num_cpus; i++) { + tot += nt_running(cpu_logical_map(i)); + } + + return(tot); +} + +/* + * The following macros and the base_goodness() routine contain common + * code for the 'exported' goodness routines which follow. + */ +#define RT_GOODNESS_MIN 1000 +#define RT_GOODNESS(t) (RT_GOODNESS_MIN + (t)->rt_priority) +#define MM_GOODNESS(t, this_mm) ((t)->mm == (this_mm) ? 1 : 0) +#define CPU_GOODNESS(t, this_cpu) ((t)->processor == (this_cpu) ? \ + PROC_CHANGE_PENALTY : 0) +static inline int base_goodness(struct task_struct * t) +{ + int weight; + + weight = -1; + if (t->policy & SCHED_YIELD) + goto out; + + /* + * base_goodness is based on the nuber of ticks left. + * Don't do any other calculations if the time slice is + * over.. + */ + weight = t->counter; + if (!weight) + goto out; + + /* + * Factor in the nice value + */ + weight += 20 - t->nice; + +out: + return weight; +} + +/* + * non-affinity goodness value of a task. MM and CPU affinity are not + * taken into account. + */ +static inline int na_goodness(struct task_struct * t) +{ + /* + * Normal tasks first + */ + if ((t->policy & ~SCHED_YIELD) == SCHED_OTHER) { + return (base_goodness(t)); + } + + /* + * Realtime task + */ + return (RT_GOODNESS(t)); +} + +/* + * Stripped down version of goodness routine to be used on a CPU + * specific (local) runqueue. This routine does not need to be + * concerned with realtime tasks, and does not need to take CPU + * affinity into account. + */ +static inline int local_goodness(struct task_struct * t, + struct mm_struct *this_mm) +{ + int weight = base_goodness(t); + + if (weight > 0) { + weight += MM_GOODNESS(t, this_mm); + } + + return(weight); +} + +/* + * Full Blown goodness function, this is the function that decides how + * desirable a process is. You can weigh different processes against + * each other depending on what CPU they've run on lately etc to try to + * handle cache and TLB miss penalties. 
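 *
 * [Editor's note: worked example, not part of the original patch.]
 * For a SCHED_OTHER task with counter == 6 and nice == 0 that shares
 * the current mm and last ran on this CPU, the SMP value works out as:
 *
 *	base_goodness():   6 + (20 - 0)             = 26
 *	+ MM_GOODNESS:     26 + 1                   = 27
 *	+ CPU_GOODNESS:    27 + PROC_CHANGE_PENALTY
 *
 * where PROC_CHANGE_PENALTY is the arch-defined affinity bonus
 * (e.g. 15 on i386).
 *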
+ * + * Return values: + * <0: don't select this one + * 0: out of time, recalculate counters (but it might still be + * selected) + * +ve: "goodness" value (the larger, the better) + * +RT_GOODNESS_MIN: realtime process, select this. + */ +static inline int goodness(struct task_struct * t, int this_cpu, + struct mm_struct *this_mm) { - nr_running--; + int weight; + + /* + * Normal tasks first + */ + if ((t->policy & ~SCHED_YIELD) == SCHED_OTHER) { + weight = base_goodness(t); + if (weight > 0) { + weight += MM_GOODNESS(t, this_mm); +#ifdef CONFIG_SMP + weight += CPU_GOODNESS(t, this_cpu); +#endif + } + return(weight); + } + + /* + * Realtime task + */ + return (RT_GOODNESS(t)); +} + +/* + * Common code for add to runqueue. In SMP update max_na_* values + * for runqueue if appropriate. + */ +static inline void add_to_runqueue_common(struct task_struct * p, int upd) +{ + int rq = task_to_runqueue(p); +#ifdef CONFIG_SMP + int tsk_na_goodness = na_goodness(p); + + if (upd && + !p->has_cpu && (tsk_na_goodness > max_na_goodness(rq))) { + max_na_goodness(rq) = tsk_na_goodness; + max_na_cpus_allowed(rq) = p->cpus_allowed; + max_na_ptr(rq) = p; + } +#endif + list_add(&p->run_list, &runqueue(rq)); + nt_running(rq)++; +} +static inline void add_to_runqueue(struct task_struct * p) +{ + add_to_runqueue_common(p, 1); +} +static inline void add_to_runqueue_noupd(struct task_struct * p) +{ + add_to_runqueue_common(p, 0); +} + +/* + * Common routine for both flavors of del_from_runqueue. Expensive scan + * of runqueue only happens in SMP if explicitly requested. + */ +static inline void del_from_runqueue_common(struct task_struct * p, int upd) +{ + int rq = task_to_runqueue(p); + + nt_running(rq)--; p->sleep_time = jiffies; list_del(&p->run_list); p->run_list.next = NULL; + +#ifdef CONFIG_SMP + if (max_na_ptr(rq) == p) { + if (upd) { + /* + * If we want to update max_na_* values for the + * runqueue, then we scan the queue and look for + * the FIRST schedulable task. This is a 'good + * enough' approximation. + */ + struct list_head *tmp; + struct task_struct *t, *tmp_task = NULL; + int weight, tmp_weight = 0; + + list_for_each(tmp, &runqueue(rq)) { + t = list_entry(tmp, struct task_struct, + run_list); + if (!t->has_cpu) { + weight = na_goodness(t); + if (weight > tmp_weight) { + tmp_weight = weight; + tmp_task = t; + goto found_one; + } + } + } +found_one: + if (tmp_weight) { + max_na_goodness(rq) = tmp_weight; + max_na_cpus_allowed(rq) = + tmp_task->cpus_allowed; + max_na_ptr(rq) = tmp_task; + } else { + max_na_goodness(rq) = MIN_GOODNESS; + max_na_ptr(rq) = NULL; + } + } else { + max_na_goodness(rq) = MIN_GOODNESS; + max_na_ptr(rq) = NULL; + } + } +#endif +} +/* + * del_from_runqueue without updating max_na_* values. Used in + * places where we know we will be updating these values before + * dropping the runqueue lock. + */ +static inline void del_from_runqueue(struct task_struct * p) +{ + del_from_runqueue_common(p, 0); +} +/* + * del_from_runqueue_update will update the max_na_* values + * if necessary.
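 *
 * [Editor's note: usage sketch, not part of the original patch.]
 * The two flavors are meant to be used like this:
 *
 *	del_from_runqueue(p);		caller recomputes the max_na_*
 *					values itself before dropping
 *					the runqueue lock
 *
 *	del_from_runqueue_update(p);	caller drops the lock without
 *					touching max_na_*, so the rescan
 *					is paid for here
 *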
+ */ +static inline void del_from_runqueue_update(struct task_struct * p) +{ + del_from_runqueue_common(p, 1); } -static inline int task_on_runqueue(struct task_struct *p) +/* + * Macros to call runqueue locking routines + */ +#ifdef CONFIG_SMP +#define LOCK_REALTIME_RQ() \ + spin_lock(&runqueue_lock(REALTIME_RQ_ID)); +#define UNLOCK_REALTIME_RQ() \ + spin_unlock(&runqueue_lock(REALTIME_RQ_ID)); +#else +#define LOCK_REALTIME_RQ() /* NOOP */ +#define UNLOCK_REALTIME_RQ() /* NOOP */ +#endif +#define LOCK_TASK_CPU_RQ_IRQ(t) \ + lock_task_cpu_rq_irq(t) +#define UNLOCK_TASK_CPU_RQ_IRQ(t) \ + unlock_task_cpu_rq_irq(t) +#define LOCK_TASK_RQ_VERIFY(t) \ + lock_task_rq_verify(t) +#ifdef CONFIG_SMP +#define UNLOCK_TASK_RQ(t) \ + spin_unlock(&runqueue_lock(task_to_runqueue((t)))); +#else +#define UNLOCK_TASK_RQ(t) \ + spin_unlock(&runqueue_lock(UP_RQ_LOCK_ID)); +#endif +#define LOCK_TASK_CPU_RQ_IRQSAVE_VERIFY(t, flags) \ + lock_task_cpu_rq_irqsave_verify(t, flags) +#define UNLOCK_TASK_CPU_RQ_IRQSAVE(cpu_rq, t, flags) \ + unlock_task_cpu_rq_irqsave(cpu_rq, t, flags) +#define LOCK_TASK_CPU_RQ_IRQ_VERIFY(t) \ + lock_task_cpu_rq_irq_verify(t) + +static inline void lock_task_cpu_rq_irq(struct task_struct *t) { - return (p->run_list.next != NULL); +#ifdef CONFIG_SMP + spin_lock_irq(&runqueue_lock(t->processor)); + if (task_to_runqueue(t) == REALTIME_RQ_ID) { + LOCK_REALTIME_RQ(); + } +#else + spin_lock_irq(&runqueue_lock(UP_RQ_LOCK_ID)); +#endif +} + +static inline void unlock_task_cpu_rq_irq(struct task_struct *t) +{ +#ifdef CONFIG_SMP + if (task_to_runqueue(t) == REALTIME_RQ_ID) { + UNLOCK_REALTIME_RQ(); + } + spin_unlock_irq(&runqueue_lock(t->processor)); +#else + spin_unlock_irq(&runqueue_lock(UP_RQ_LOCK_ID)); +#endif +} + +static inline void lock_task_rq_verify(struct task_struct *t) +{ +#ifdef CONFIG_SMP + int rq = task_to_runqueue(t); + + spin_lock(&runqueue_lock(rq)); + while (rq != task_to_runqueue(t)) { + spin_unlock(&runqueue_lock(rq)); + rq = task_to_runqueue(t); + spin_lock(&runqueue_lock(rq)); + } +#else + spin_lock(&runqueue_lock(UP_RQ_LOCK_ID)); +#endif } + +static inline int lock_task_cpu_rq_irqsave_verify(struct task_struct *t, + unsigned long *flags) +{ +#ifdef CONFIG_SMP + int rq = t->processor; + + spin_lock_irqsave(&runqueue_lock(rq), *flags); + while (t->processor != rq) { + spin_unlock_irqrestore(&runqueue_lock(rq), *flags); + rq = t->processor; + spin_lock_irqsave(&runqueue_lock(rq), *flags); + } + if (task_to_runqueue(t) == REALTIME_RQ_ID) { + LOCK_REALTIME_RQ(); + } + return(rq); +#else + spin_lock_irqsave(&runqueue_lock(UP_RQ_LOCK_ID), *flags); + return(UP_RQ_LOCK_ID); +#endif +} + +static inline void unlock_task_cpu_rq_irqsave(int cpu_rq, struct task_struct *t, + unsigned long flags) +{ +#ifdef CONFIG_SMP + int rq = task_to_runqueue(t); + + if (rq == REALTIME_RQ_ID) { + UNLOCK_REALTIME_RQ(); + } + spin_unlock_irqrestore(&runqueue_lock(cpu_rq), flags); +#else + spin_unlock_irqrestore(&runqueue_lock(UP_RQ_LOCK_ID), flags); +#endif +} + +static inline void lock_task_cpu_rq_irq_verify(struct task_struct *t) +{ +#ifdef CONFIG_SMP + int rq = t->processor; + + spin_lock_irq(&runqueue_lock(rq)); + while (t->processor != rq) { + spin_unlock_irq(&runqueue_lock(rq)); + rq = t->processor; + spin_lock_irq(&runqueue_lock(rq)); + } + if (task_to_runqueue(t) == REALTIME_RQ_ID) { + LOCK_REALTIME_RQ(); + } +#else + spin_lock_irq(&runqueue_lock(UP_RQ_LOCK_ID)); +#endif +} + +#ifdef CONFIG_SMP +#define UPDATE_SCHED_DATA(tc, next) update_sched_data(tc, next) +#define EXAMINE_RMT_RQS(tc, c, p, n) 
examine_rmt_rqs(tc, c, p, n) +#else +#define UPDATE_SCHED_DATA(tc, next) /* NOOP */ +#define EXAMINE_RMT_RQS(tc, c, p, n) (n) +#endif static inline void unhash_process(struct task_struct *p) { diff -Naur linux-2.4.14/kernel/fork.c linux-2.4.14-mq/kernel/fork.c --- linux-2.4.14/kernel/fork.c Wed Oct 24 00:44:15 2001 +++ linux-2.4.14-mq/kernel/fork.c Tue Nov 6 22:19:27 2001 @@ -28,7 +28,6 @@ /* The idle threads do not count.. */ int nr_threads; -int nr_running; int max_threads; unsigned long total_forks; /* Handle normal Linux uptimes. */ diff -Naur linux-2.4.14/kernel/sched.c linux-2.4.14-mq/kernel/sched.c --- linux-2.4.14/kernel/sched.c Wed Oct 17 21:14:37 2001 +++ linux-2.4.14-mq/kernel/sched.c Tue Nov 6 22:19:27 2001 @@ -80,33 +80,20 @@ /* * The tasklist_lock protects the linked list of processes. * - * The runqueue_lock locks the parts that actually access - * and change the run-queues, and have to be interrupt-safe. - * * If both locks are to be concurrently held, the runqueue_lock * nests inside the tasklist_lock. * * task->alloc_lock nests inside tasklist_lock. */ -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ -static LIST_HEAD(runqueue_head); - /* - * We align per-CPU scheduling data on cacheline boundaries, - * to prevent cacheline ping-pong. + * runqueue_data and aligned_data contain CPU specific scheduling data. + * There is one runqueue per CPU in the system, and SMP kernels also + * contain a realtime runquue. Initialization is performed in sched_init(). */ -static union { - struct schedule_data { - struct task_struct * curr; - cycles_t last_schedule; - } schedule_data; - char __pad [SMP_CACHE_BYTES]; -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; - -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule +runqueue_data_t runqueue_data [N_RUNQUEUES] __cacheline_aligned; +aligned_data_t aligned_data [N_ALIGNED_DATA] __cacheline_aligned; struct kernel_stat kstat; extern struct task_struct *child_reaper; @@ -116,6 +103,8 @@ #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) #define can_schedule(p,cpu) ((!(p)->has_cpu) && \ ((p)->cpus_allowed & (1 << cpu))) +#define local_can_schedule(p) (!(p)->has_cpu) +#define this_cpu_allowed(ca, tcpu) ((ca) & (1 << tcpu)) #else @@ -127,72 +116,6 @@ void scheduling_functions_start_here(void) { } /* - * This is the function that decides how desirable a process is.. - * You can weigh different processes against each other depending - * on what CPU they've run on lately etc to try to handle cache - * and TLB miss penalties. - * - * Return values: - * -1000: never select this - * 0: out of time, recalculate counters (but it might still be - * selected) - * +ve: "goodness" value (the larger, the better) - * +1000: realtime process, select this. - */ - -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) -{ - int weight; - - /* - * select the current process after every other - * runnable process, but before the idle thread. - * Also, dont trigger a counter recalculation. - */ - weight = -1; - if (p->policy & SCHED_YIELD) - goto out; - - /* - * Non-RT process - normal case first. - */ - if (p->policy == SCHED_OTHER) { - /* - * Give the process a first-approximation goodness value - * according to the number of clock-ticks it has left. 
- * - * Don't do any other calculations if the time slice is - * over.. - */ - weight = p->counter; - if (!weight) - goto out; - -#ifdef CONFIG_SMP - /* Give a largish advantage to the same processor... */ - /* (this is equivalent to penalizing other processors) */ - if (p->processor == this_cpu) - weight += PROC_CHANGE_PENALTY; -#endif - - /* .. and a slight advantage to the current MM */ - if (p->mm == this_mm || !p->mm) - weight += 1; - weight += 20 - p->nice; - goto out; - } - - /* - * Realtime process, select the first one on the - * runqueue (taking priorities within processes - * into account). - */ - weight = 1000 + p->rt_priority; -out: - return weight; -} - -/* * the 'goodness value' of replacing a process on a given CPU. * positive value means 'replace', zero or negative means 'dont'. */ @@ -202,126 +125,212 @@ } /* - * This is ugly, but reschedule_idle() is very timing-critical. - * We are called with the runqueue spinlock held and we must - * not claim the tasklist_lock. + * reschedule_idle - Determine which CPU the specified task should + * should run on. The runqueue lock must be held upon entry to this + * routine. */ static FASTCALL(void reschedule_idle(struct task_struct * p)); static void reschedule_idle(struct task_struct * p) { #ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - struct task_struct *tsk, *target_tsk; - int cpu, best_cpu, i, max_prio; - cycles_t oldest_idle; + struct task_struct *tsk; + int target_cpu, this_cpu, tsk_cpu; + int i, cpu; + int need_resched; + cycles_t curr_cycles, tmp_cycles; + int stack_list[NR_CPUS]; + int saved_na_goodness, tmp_min_na_goodness; + + tsk_cpu = p->processor; + this_cpu = smp_processor_id(); + /* + * First check if the task's previous CPU is idle, use it if it is. + */ + if (can_schedule(p, tsk_cpu) && + (cpu_curr(tsk_cpu) == idle_task(tsk_cpu))) { + if (!task_on_runqueue(p)) { + add_to_runqueue(p); + } + tsk = cpu_curr(tsk_cpu); + need_resched = tsk->need_resched; + tsk->need_resched = 1; + if ((tsk_cpu != this_cpu) && !need_resched) { + smp_send_reschedule(tsk_cpu); + } + return; + } /* - * shortcut if the woken up task's last CPU is - * idle now. - */ - best_cpu = p->processor; - if (can_schedule(p, best_cpu)) { - tsk = idle_task(best_cpu); - if (cpu_curr(best_cpu) == tsk) { - int need_resched; -send_now_idle: + * Create a list of current na_goodness values on our stack. + * Only values less than the non-affinity goodness value of + * p should be considered for preemption. + */ + saved_na_goodness = na_goodness(p); /* preemption_goodness() > 0 */ + tmp_min_na_goodness = saved_na_goodness; + curr_cycles = get_cycles(); + target_cpu = -1; + for (i = 0; i < smp_num_cpus; i++) { + cpu = cpu_logical_map(i); + + if (!can_schedule(p, cpu)) { + stack_list[cpu] = saved_na_goodness; + continue; + } + + if (curr_na_goodness(cpu) == MIN_GOODNESS) { /* - * If need_resched == -1 then we can skip sending - * the IPI altogether, tsk->need_resched is - * actively watched by the idle thread. + * Indicates an idle task. For idle tasks, determine + * the amount of time they have been idle. Use the + * negative of this value in the list. Hence, we + * first choose the CPU that has been idle the longest. 
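 *
 * [Editor's note: illustrative example, not part of the original patch.]
 * Because the code below picks the *lowest* stack_list[] entry, this
 * encoding gives the intended preference order.  For instance:
 *
 *	cpu0: idle for 5M cycles        -> stack_list entry -5000000
 *	cpu1: idle for 1M cycles        -> stack_list entry -1000000
 *	cpu2: busy, curr_na_goodness 8  -> 8 (+ PROC_CHANGE_PENALTY
 *					      if it is a remote CPU)
 *
 * cpu0 (idle the longest) is tried first, then cpu1; a busy CPU is only
 * considered for preemption after every eligible idle CPU.
 *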
*/ - need_resched = tsk->need_resched; - tsk->need_resched = 1; - if ((best_cpu != this_cpu) && !need_resched) - smp_send_reschedule(best_cpu); - return; + tmp_cycles = curr_cycles - last_schedule(cpu); + if (tmp_cycles > INT_MAX) { + stack_list[cpu] = INT_MIN; + } else { + stack_list[cpu] = (int)-tmp_cycles; + } + } else { + stack_list[cpu] = curr_na_goodness(cpu); + /* + * Add in PROC_CHANGE_PENALTY for remote CPUs + */ + if (cpu != tsk_cpu) { + stack_list[cpu] += PROC_CHANGE_PENALTY; + } + } + + /* + * Look for the lowest value + */ + if (stack_list[cpu] < tmp_min_na_goodness) { + target_cpu = cpu; + tmp_min_na_goodness = stack_list[cpu]; } } /* - * We know that the preferred CPU has a cache-affine current - * process, lets try to find a new idle CPU for the woken-up - * process. Select the least recently active idle CPU. (that - * one will have the least active cache context.) Also find - * the executing process which has the least priority. - */ - oldest_idle = (cycles_t) -1; - target_tsk = NULL; - max_prio = 0; + * We try to add the task to a runqueue starting with the one + * that has the lowest na_goodness value. + */ + while (target_cpu != -1) { + if (target_cpu == tsk_cpu && + preemption_goodness((tsk = cpu_curr(target_cpu)), + p, target_cpu) > 0) { + /* + * If target_cpu is tsk_cpu, then no additional + * locking is required (we already have the CPU + * specific runqueue locked). We also know that + * this CPU can not be idle, otherwise the fast + * path at the beginning of this routine would + * have been executed. Therefore, simply send + * the IPI if required. + */ + if (!task_on_runqueue(p)) { + add_to_runqueue(p); + } + tsk = cpu_curr(target_cpu); + tsk->need_resched = 1; + if (target_cpu != this_cpu) { + smp_send_reschedule(target_cpu); + } + return; + } - for (i = 0; i < smp_num_cpus; i++) { - cpu = cpu_logical_map(i); - if (!can_schedule(p, cpu)) - continue; - tsk = cpu_curr(cpu); /* - * We use the first available idle CPU. This creates - * a priority list between idle CPUs, but this is not - * a problem. + * Try to lock runqueue and verify na_goodness value. */ - if (tsk == idle_task(cpu)) { - if (last_schedule(cpu) < oldest_idle) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - } - } else { - if (oldest_idle == -1ULL) { - int prio = preemption_goodness(tsk, p, cpu); + else if (spin_trylock(&runqueue_lock(target_cpu))) { + tsk = cpu_curr(target_cpu); + if ((tsk == idle_task(target_cpu)) || + (preemption_goodness(tsk, p, target_cpu) > 0)) { + /* + * Target CPU is idle, or it is running + * a task with lower priority than p. + * Therefore, move p to target runqueue. + */ + if (task_on_runqueue(p)) { + del_from_runqueue_update(p); + } + p->processor = target_cpu; + add_to_runqueue(p); - if (prio > max_prio) { - max_prio = prio; - target_tsk = tsk; + /* + * Send an IPI to target CPU, unless the + * CPU is idle and the need_resched flag + * has already been set. + */ + need_resched = tsk->need_resched; + tsk->need_resched = 1; + if ((target_cpu != this_cpu) && + ((tsk != idle_task(target_cpu)) || + !need_resched)){ + smp_send_reschedule(target_cpu); } + + spin_unlock(&runqueue_lock(target_cpu)); + + return; } + spin_unlock(&runqueue_lock(target_cpu)); } - } - tsk = target_tsk; - if (tsk) { - if (oldest_idle != -1ULL) { - best_cpu = tsk->processor; - goto send_now_idle; + + /* + * Update list value so we don't check this CPU again. + */ + stack_list[target_cpu] = saved_na_goodness; + + /* + * Find the 'next lowest' cur_na_goodness value. 
+ */ + target_cpu = -1; + tmp_min_na_goodness = saved_na_goodness; + for (i = 0; i < smp_num_cpus; i++) { + cpu = cpu_logical_map(i); + if (stack_list[cpu] < tmp_min_na_goodness) { + target_cpu = cpu; + tmp_min_na_goodness = stack_list[cpu]; + } } - tsk->need_resched = 1; - if (tsk->processor != this_cpu) - smp_send_reschedule(tsk->processor); + } + + /* + * If we get here, it means that the best place for the task is + * on its currently assigned runqueue. Also, we know that the + * task currently running on this task's runqueue has sufficuent + * priority to prevent preemption. Hence, we simply ensure the + * task is on the runqueue. + */ + if (!task_on_runqueue(p)) { + add_to_runqueue(p); } return; - #else /* UP */ int this_cpu = smp_processor_id(); struct task_struct *tsk; tsk = cpu_curr(this_cpu); - if (preemption_goodness(tsk, p, this_cpu) > 0) + if (preemption_goodness(tsk, p, this_cpu) > 0) { tsk->need_resched = 1; + } + if (!task_on_runqueue(p)) { + add_to_runqueue(p); + } #endif } -/* - * Careful! - * - * This has to add the process to the _beginning_ of the - * run-queue, not the end. See the comment about "This is - * subtle" in the scheduler proper.. - */ -static inline void add_to_runqueue(struct task_struct * p) -{ - list_add(&p->run_list, &runqueue_head); - nr_running++; -} - static inline void move_last_runqueue(struct task_struct * p) { list_del(&p->run_list); - list_add_tail(&p->run_list, &runqueue_head); + list_add_tail(&p->run_list, &TASK_RQ(p)); } static inline void move_first_runqueue(struct task_struct * p) { list_del(&p->run_list); - list_add(&p->run_list, &runqueue_head); + list_add(&p->run_list, &TASK_RQ(p)); } /* @@ -336,20 +345,24 @@ { unsigned long flags; int success = 0; + int cpu_rq; + + cpu_rq = LOCK_TASK_CPU_RQ_IRQSAVE_VERIFY(p, &flags); /* * We want the common case fall through straight, thus the goto. */ - spin_lock_irqsave(&runqueue_lock, flags); p->state = TASK_RUNNING; if (task_on_runqueue(p)) goto out; - add_to_runqueue(p); + if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) reschedule_idle(p); + else + add_to_runqueue(p); success = 1; out: - spin_unlock_irqrestore(&runqueue_lock, flags); + UNLOCK_TASK_CPU_RQ_IRQSAVE(cpu_rq, p, flags); return success; } @@ -495,6 +508,7 @@ needs_resched: { unsigned long flags; + int cpu_rq; /* * Avoid taking the runqueue lock in cases where @@ -504,15 +518,17 @@ (policy & SCHED_YIELD)) goto out_unlock; - spin_lock_irqsave(&runqueue_lock, flags); + cpu_rq = LOCK_TASK_CPU_RQ_IRQSAVE_VERIFY(prev, &flags); + if ((prev->state == TASK_RUNNING) && !prev->has_cpu) reschedule_idle(prev); - spin_unlock_irqrestore(&runqueue_lock, flags); + + UNLOCK_TASK_CPU_RQ_IRQSAVE(cpu_rq, prev, flags); goto out_unlock; } #else prev->policy &= ~SCHED_YIELD; -#endif /* CONFIG_SMP */ +#endif } void schedule_tail(struct task_struct *prev) @@ -520,11 +536,322 @@ __schedule_tail(prev); } +#ifdef CONFIG_SMP /* - * 'schedule()' is the scheduler function. It's a very simple and nice - * scheduler: it's not perfect, but certainly works for most things. - * - * The goto is "interesting". + * Examine remote runqueues and look for a task which is more desirable + * to run than 'next'. 
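 *
 * [Editor's note: worked example, not part of the original patch.]
 * Because *cg is boosted by PROC_CHANGE_PENALTY below, a remote task is
 * only stolen when it beats the local choice by more than the migration
 * penalty.  Assuming PROC_CHANGE_PENALTY == 15:
 *
 *	local best goodness 12                  ->  *cg becomes 27
 *	remote max_na_goodness 30, CPU allowed  ->  worth stealing
 *	remote max_na_goodness 25               ->  left where it is
 *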
+ */ +static FASTCALL(struct task_struct *examine_rmt_rqs(int this_cpu, int *cg, + struct task_struct *prev, + struct task_struct *next)); +static struct task_struct *examine_rmt_rqs(int this_cpu, int *cg, + struct task_struct *prev, + struct task_struct *next) +{ + int premature_idle; + int rrq; + int tmp_na_goodness; + int rq; + int rcpu; + int retry; + int stack_list[NR_CPUS]; + + /* + * cg (current goodness) does not contain a CPU affinity boost. + * We must add this boost before comparing to tasks on other + * runqueues. Only add PROC_CHANGE_PENALTY if c is a positive + * goodness value. + */ + if (*cg > 0) { + *cg += PROC_CHANGE_PENALTY; + } + + /* + * Copy max_na_goodness values from CPU specific runqueue + * structures to the list on our stack. + */ +scan_again: + premature_idle = 0; + rrq = -1; + tmp_na_goodness = *cg; + for (rq = 0; rq < smp_num_cpus; rq++) { + rcpu = cpu_logical_map(rq); + if (rcpu == this_cpu) { + stack_list[rcpu] = MIN_GOODNESS; + continue; + } + if (!this_cpu_allowed(max_na_cpus_allowed(rcpu), this_cpu)) { + stack_list[rcpu] = MIN_GOODNESS; + continue; + } + if (max_na_goodness(rcpu) <= *cg) { + stack_list[rcpu] = MIN_GOODNESS; + continue; + } + + stack_list[rcpu] = max_na_goodness(rcpu); + if (stack_list[rcpu] > tmp_na_goodness) { + rrq = rcpu; + tmp_na_goodness = stack_list[rcpu]; + } + } + + /* + * Now use the values from the stack list to search for a + * task to steal. + */ + while (rrq != -1) { + /* + * First try to lock the remote runqueue and verify + * the max_na_goodness value. + */ + if (spin_trylock(&runqueue_lock(rrq))) { + if (max_na_goodness(rrq) > *cg && + this_cpu_allowed(max_na_cpus_allowed(rrq), + this_cpu)) { + /* + * Move a remote task to our runqueue, + * don't forget to update max_na_values + * for our queue. + */ + if (!next->has_cpu && + next != idle_task(this_cpu)) { + max_na_goodness(this_cpu) = + na_goodness(next); + max_na_cpus_allowed(this_cpu) = + next->cpus_allowed; + max_na_ptr(this_cpu) = next; + } + next = max_na_ptr(rrq);; + *cg = max_na_goodness(rrq); + del_from_runqueue_update(next); + next->processor = this_cpu; + add_to_runqueue_noupd(next); + + /* + * We have stolen a task from another + * runqueue, quit looking. + */ + spin_unlock(&runqueue_lock(rrq)); + + break; + } + spin_unlock(&runqueue_lock(rrq)); + } else { + premature_idle++; + } + + /* + * We were either unable to get the remote runqueue lock, + * or the remote runqueue has changed such that it is no + * longer desirable to steal a task from the queue. + * + * Go to the runqueue with the 'next highest' max_na_goodness + * value. + */ + stack_list[rrq] = MIN_GOODNESS; + tmp_na_goodness = *cg; + rrq = -1; + for (rq = 0; rq < smp_num_cpus; rq++) { + rcpu = cpu_logical_map(rq); + if (stack_list[rcpu] > tmp_na_goodness) { + rrq = rcpu; + tmp_na_goodness = stack_list[rcpu]; + } + } + } + + /* + * Check for going idle prematurely. If this is the case, there + * is a good chance there are schedulable tasks on other runquueues. + */ + retry = (next == idle_task(this_cpu)) && + (task_to_runqueue(prev) != REALTIME_RQ_ID); + if (retry && premature_idle) { + /* + * Be sure to clear max_na_goodness, otherwise there + * is the potential for deadlock. + */ + max_na_goodness(this_cpu) = MIN_GOODNESS; + goto scan_again; + } + + return(next); +} + +/* + * Find next best schedulable task on runqueue. In addition, while scanning + * the queue keep track of the 'second best' schedulable task and update + * runquue max_na_* values accordingly. 
+ */ +static FASTCALL(struct task_struct *scan_runqueue(struct task_struct *prev, + int *cg, struct task_struct *next)); +static struct task_struct *scan_runqueue(struct task_struct *prev, int *cg, + struct task_struct *next) +{ + struct task_struct *p; + struct list_head *tmp; + int prev_next_weight; + struct task_struct *prev_next; + int rq = prev->processor; + + prev_next = idle_task(rq); + prev_next_weight = MIN_GOODNESS; + /* + * Search local (CPU specific) runqueue + */ + list_for_each(tmp, &runqueue(rq)) { + p = list_entry(tmp, struct task_struct, run_list); + if (local_can_schedule(p)) { + int weight = local_goodness(p, prev->active_mm); + if (weight > *cg) { + if (!next->has_cpu) { + /* + * prev_next must point to a + * schedulable task. + */ + prev_next_weight = *cg; + prev_next = next; + } + *cg = weight; + next = p; + } else if (weight > prev_next_weight) { + prev_next_weight = weight; + prev_next = p; + } + } + } + + /* + * Update max_na_* values for this runqueue + */ + if (prev_next != idle_task(rq)) { + max_na_goodness(rq) = na_goodness(prev_next); + max_na_cpus_allowed(rq) = prev_next->cpus_allowed; + max_na_ptr(rq) = prev_next; + } else { + max_na_goodness(rq) = MIN_GOODNESS; + /* max_na_cpus_allowed need not be set */ + max_na_ptr(rq) = NULL; + } + + return(next); +} + +static FASTCALL(struct task_struct * scan_rt_runqueue(struct task_struct *prev, + int *cg, struct task_struct *next)); +static struct task_struct * scan_rt_runqueue(struct task_struct *prev, + int *cg, struct task_struct *next) +{ + struct list_head *tmp; + struct task_struct *p; + int this_cpu = prev->processor; + + if (task_to_runqueue(prev) != REALTIME_RQ_ID) { + LOCK_REALTIME_RQ(); + } + + /* + * Scan the queue. Note that there is no need to + * keep track of max_na_goodness values for the + * realtime runqueue, as they are never used in + * scheduling decisions. + */ + list_for_each(tmp, &runqueue(REALTIME_RQ_ID)) { + p = list_entry(tmp, struct task_struct, run_list); + if (can_schedule(p, this_cpu)) { + int weight = RT_GOODNESS(p); + if (weight > *cg) + *cg = weight, next = p; + } + } + + /* + * If we found a realtime task, make it non-schedulable (set + * has_cpu) before potentially releasing the queue's lock. + * This prevents other CPUs from trying to steal the task + * before we can switch to it. + * Note that we still hold the CPU specific runqueue lock. 
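 *
 * [Editor's note: not part of the original patch.]  The locking relied
 * on here follows the same order as lock_task_cpu_rq_irq(): the per-CPU
 * runqueue lock is taken first and the global realtime runqueue lock is
 * nested inside it:
 *
 *	spin_lock_irq(&runqueue_lock(cpu));		per-CPU, outer
 *	spin_lock(&runqueue_lock(REALTIME_RQ_ID));	realtime, inner
 *	...
 *	spin_unlock(&runqueue_lock(REALTIME_RQ_ID));
 *	spin_unlock_irq(&runqueue_lock(cpu));
 *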
+ */ + if (task_to_runqueue(next) == REALTIME_RQ_ID) { + next->has_cpu = 1; + } + if (task_to_runqueue(prev) != REALTIME_RQ_ID) { + UNLOCK_REALTIME_RQ(); + } + + return(next); +} + +static inline void update_sched_data(int this_cpu, struct task_struct *next) +{ + /* + * Update values in aligned_data + */ + if (next != idle_task(this_cpu)) { + curr_na_goodness(this_cpu) = na_goodness(next); + running_non_idle(this_cpu) = 1; + } else { + curr_na_goodness(this_cpu) = MIN_GOODNESS; + running_non_idle(this_cpu) = 0; + } + +} +#else + +/* + * Scan local runqueue UP version, unnecessary searches/updates removed + */ +static inline struct task_struct *scan_runqueue(struct task_struct *prev, + int *cg, struct task_struct *next) +{ + struct task_struct *p; + struct list_head *tmp; + int this_cpu = prev->processor; + int weight; + + /* + * Search local (CPU specific) runqueue + */ + list_for_each(tmp, &runqueue(this_cpu)) { + p = list_entry(tmp, struct task_struct, run_list); + weight = local_goodness(p, prev->active_mm); + if (weight > *cg) + *cg = weight, next = p; + } + + return(next); +} + +/* + * Scan RT runqueue UP version with unnecessary locking removed + */ +static inline struct task_struct * scan_rt_runqueue(struct task_struct *prev, + int *cg, struct task_struct *next) +{ + struct list_head *tmp; + struct task_struct *p; + int weight; + + /* + * Scan the queue. Note that there is no need to + * keep track of max_na_goodness values for the + * realtime runqueue, as they are never used in + * scheduling decisions. + */ + list_for_each(tmp, &runqueue(REALTIME_RQ_ID)) { + p = list_entry(tmp, struct task_struct, run_list); + weight = RT_GOODNESS(p); + if (weight > *cg) + *cg = weight, next = p; + } + + return(next); +} +#endif + +/* + * 'schedule()' is the scheduler function. * * NOTE!! Task 0 is the 'idle' task, which gets called when no other * tasks can run. It can not be killed, and it cannot sleep. The 'state' @@ -532,13 +859,11 @@ */ asmlinkage void schedule(void) { - struct schedule_data * sched_data; - struct task_struct *prev, *next, *p; - struct list_head *tmp; + struct task_struct *prev, *next; int this_cpu, c; + int update_prev_processor = 0; - - spin_lock_prefetch(&runqueue_lock); + spin_lock_prefetch(&runqueue_lock(current->processor)); if (!current->active_mm) BUG(); need_resched_back: @@ -551,12 +876,9 @@ release_kernel_lock(prev, this_cpu); /* - * 'sched_data' is protected by the fact that we can run - * only one process per CPU. + * Lock runqueue(s) associated with task */ - sched_data = & aligned_data[this_cpu].schedule_data; - - spin_lock_irq(&runqueue_lock); + LOCK_TASK_CPU_RQ_IRQ(prev); /* move an exhausted RR process to be last.. */ if (prev->policy == SCHED_RR) @@ -584,34 +906,69 @@ * Default process to select.. */ next = idle_task(this_cpu); - c = -1000; + c = MIN_GOODNESS; if (prev->state == TASK_RUNNING) goto still_running; still_running_back: - list_for_each(tmp, &runqueue_head) { - p = list_entry(tmp, struct task_struct, run_list); - if (can_schedule(p, this_cpu)) { - int weight = goodness(p, this_cpu, prev->active_mm); - if (weight > c) - c = weight, next = p; + /* + * First check the realtime runqueue + */ + if (nt_running(REALTIME_RQ_ID)) { + next = scan_rt_runqueue(prev, &c, next); + if (task_to_runqueue(next) == REALTIME_RQ_ID) { + /* Found a RT task, no need to look elsewhere */ + goto set_sched_data; } } - /* Do we need to re-calculate counters? 
*/ + /* + * Search CPU specific runqueue + */ + next = scan_runqueue(prev, &c, next); + + /* + * Take a look at the remote runqueues + */ + next = EXAMINE_RMT_RQS(this_cpu, &c, prev, next); + + /* Do we need to re-calculate counters for ALL tasks? */ if (!c) goto recalculate; + +set_sched_data: + /* + * Update scheduler data + */ + UPDATE_SCHED_DATA(this_cpu, next); + + /* + * Update scheduling fields in next task structure. + */ + next->has_cpu = 1; + next->processor = this_cpu; + /* * from this point on nothing can prevent us from * switching to the next task, save this fact in * sched_data. */ - sched_data->curr = next; -#ifdef CONFIG_SMP - next->has_cpu = 1; - next->processor = this_cpu; -#endif - spin_unlock_irq(&runqueue_lock); + cpu_curr(this_cpu) = next; + + UNLOCK_TASK_CPU_RQ_IRQ(prev); + + if (update_prev_processor) { + unsigned long allowed = prev->cpus_allowed; + /* + * current task (prev) has had its processor field + * updated and should no longer run on this CPU. + */ + prev->processor = 0; + while (!(allowed & 1UL)) { + prev->processor++; + allowed = allowed >> 1; + } + } if (prev == next) { /* We won't go through the normal tail, so do this by hand */ @@ -627,15 +984,14 @@ * and it's approximate, so we do not have to maintain * it while holding the runqueue spinlock. */ - sched_data->last_schedule = get_cycles(); + last_schedule(this_cpu) = get_cycles(); /* * We drop the scheduler lock early (it's a global spinlock), * thus we have to lock the previous process from getting * rescheduled during switch_to(). */ - -#endif /* CONFIG_SMP */ +#endif kstat.context_swtch++; /* @@ -684,18 +1040,29 @@ recalculate: { struct task_struct *p; - spin_unlock_irq(&runqueue_lock); + UNLOCK_TASK_CPU_RQ_IRQ(prev); read_lock(&tasklist_lock); for_each_task(p) p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice); read_unlock(&tasklist_lock); - spin_lock_irq(&runqueue_lock); + LOCK_TASK_CPU_RQ_IRQ(prev); } goto repeat_schedule; still_running: - if (!(prev->cpus_allowed & (1UL << this_cpu))) + if (!(prev->cpus_allowed & (1UL << this_cpu))) { + /* + * task is no longer runnable on this CPU. We remove it + * from the runqueue to ensure that it will not be considered + * for scheduling here. Let schedule_tail take care of + * sending it off to an appropriate CPU. + */ + if (!update_prev_processor) { + del_from_runqueue(prev); + update_prev_processor++; + } goto still_running_back; + } c = goodness(prev, this_cpu, prev->active_mm); next = prev; goto still_running_back; @@ -913,6 +1280,7 @@ struct sched_param lp; struct task_struct *p; int retval; + int was_on_rq = 0; retval = -EINVAL; if (!param || pid < 0) @@ -922,18 +1290,20 @@ if (copy_from_user(&lp, param, sizeof(struct sched_param))) goto out_nounlock; + retval = -ESRCH; /* - * We play safe to avoid deadlocks. + * Note that here we get the tasklist lock so that we can + * find the task struct. Not until we have access to the + * task struct can we determine what runqueue to lock. */ - read_lock_irq(&tasklist_lock); - spin_lock(&runqueue_lock); - + read_lock(&tasklist_lock); p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + goto out_nounlock; + } + LOCK_TASK_CPU_RQ_IRQ_VERIFY(p); - retval = -ESRCH; - if (!p) - goto out_unlock; - if (policy < 0) policy = p->policy; else { @@ -962,16 +1332,35 @@ goto out_unlock; retval = 0; + + if ((p->policy == SCHED_OTHER) && (policy != SCHED_OTHER)) { + /* + * If changing to a realtime policy, we lock the + * realtime runqueue so that we can (potentially) + * move p to it. 
+ * + * Note: If we lock the realtime runqueue here, it + * means that p is becoming a realtime task, and + * UNLOCK_TASK_CPU_RQ_IRQ() will take care of unlocking + * the realtime runqueue. + */ + LOCK_REALTIME_RQ(); + } + + if (task_on_runqueue(p)) { + del_from_runqueue_update(p); + was_on_rq = 1; + } p->policy = policy; p->rt_priority = lp.sched_priority; - if (task_on_runqueue(p)) - move_first_runqueue(p); - + if (was_on_rq) { + add_to_runqueue(p); + } current->need_resched = 1; out_unlock: - spin_unlock(&runqueue_lock); - read_unlock_irq(&tasklist_lock); + UNLOCK_TASK_CPU_RQ_IRQ(p); + read_unlock(&tasklist_lock); out_nounlock: return retval; @@ -1048,19 +1437,18 @@ * to be atomic.) In threaded applications this optimization * gets triggered quite often. */ - - int nr_pending = nr_running; - #if CONFIG_SMP + int nr_pending = nt_running(REALTIME_RQ_ID); int i; - // Subtract non-idle processes running on other CPUs. + // Substract non-idle processes running on other CPUs. for (i = 0; i < smp_num_cpus; i++) { int cpu = cpu_logical_map(i); - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) - nr_pending--; + nr_pending += (nt_running(cpu) - running_non_idle(cpu)); } #else + int nr_pending = nr_running(); + // on UP this process is on the runqueue as well nr_pending--; #endif @@ -1240,6 +1628,7 @@ void reparent_to_init(void) { struct task_struct *this_task = current; + unsigned long old_policy; write_lock_irq(&tasklist_lock); @@ -1259,11 +1648,23 @@ /* We also take the runqueue_lock while altering task fields * which affect scheduling decisions */ - spin_lock(&runqueue_lock); + LOCK_TASK_RQ_VERIFY(this_task); this_task->ptrace = 0; + /* + * If this is a RT task, we must move it off the + * RT runqueue (and onto a CPU specific queue). + */ + old_policy = this_task->policy; + if (old_policy != SCHED_OTHER) { + del_from_runqueue(this_task); + } this_task->nice = DEF_NICE; this_task->policy = SCHED_OTHER; + if (old_policy != SCHED_OTHER) { + add_to_runqueue(this_task); + } + /* cpus_allowed? */ /* rt_priority? */ /* signals? */ @@ -1274,7 +1675,7 @@ memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim))); this_task->user = INIT_USER; - spin_unlock(&runqueue_lock); + UNLOCK_TASK_RQ(this_task); write_unlock_irq(&tasklist_lock); } @@ -1337,6 +1738,14 @@ */ int cpu = smp_processor_id(); int nr; + + /* + * Initialize the scheduling data structures + */ + for (nr = 0; nr < N_RUNQUEUES; nr++) + INIT_RUNQUEUE_DATA(nr); + for (nr = 0; nr < N_ALIGNED_DATA; nr++) + INIT_ALIGNED_DATA(nr); init_task.processor = cpu; diff -Naur linux-2.4.14/kernel/signal.c linux-2.4.14-mq/kernel/signal.c --- linux-2.4.14/kernel/signal.c Mon Sep 17 23:40:01 2001 +++ linux-2.4.14-mq/kernel/signal.c Tue Nov 6 22:19:27 2001 @@ -478,10 +478,10 @@ * process of changing - but no harm is done by that * other than doing an extra (lightweight) IPI interrupt. 
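 *
 * [Editor's note: not part of the original patch.]  With per-CPU
 * runqueue locks the task can migrate between the moment its runqueue
 * is looked up and the moment that runqueue's lock is acquired, which
 * is why the *_VERIFY helpers used below re-check after locking:
 *
 *	rq = task_to_runqueue(t);
 *	spin_lock(&runqueue_lock(rq));
 *	while (rq != task_to_runqueue(t)) {
 *		spin_unlock(&runqueue_lock(rq));
 *		rq = task_to_runqueue(t);
 *		spin_lock(&runqueue_lock(rq));
 *	}
 *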
*/ - spin_lock(&runqueue_lock); + LOCK_TASK_RQ_VERIFY(t); if (t->has_cpu && t->processor != smp_processor_id()) smp_send_reschedule(t->processor); - spin_unlock(&runqueue_lock); + UNLOCK_TASK_RQ(t); #endif /* CONFIG_SMP */ if (t->state & TASK_INTERRUPTIBLE) { diff -Naur linux-2.4.14/kernel/timer.c linux-2.4.14-mq/kernel/timer.c --- linux-2.4.14/kernel/timer.c Mon Oct 8 17:41:41 2001 +++ linux-2.4.14-mq/kernel/timer.c Tue Nov 6 22:19:27 2001 @@ -587,6 +587,11 @@ p->counter = 0; p->need_resched = 1; } +#ifdef CONFIG_SMP + if (curr_na_goodness(cpu) != MIN_GOODNESS) { + curr_na_goodness(cpu) = na_goodness(p); + } +#endif if (p->nice > 0) kstat.per_cpu_nice[cpu] += user_tick; else