diff -Naur linux-2.4.7/arch/i386/kernel/apm.c linux-2.4.7-mq/arch/i386/kernel/apm.c --- linux-2.4.7/arch/i386/kernel/apm.c Fri Apr 6 17:42:47 2001 +++ linux-2.4.7-mq/arch/i386/kernel/apm.c Mon Aug 6 15:59:32 2001 @@ -1092,7 +1092,11 @@ * decide if we should just power down. * */ +#ifdef CONFIG_SMP +#define system_idle() (nr_running() == 1) +#else #define system_idle() (nr_running == 1) +#endif static void apm_mainloop(void) { diff -Naur linux-2.4.7/arch/i386/kernel/smpboot.c linux-2.4.7-mq/arch/i386/kernel/smpboot.c --- linux-2.4.7/arch/i386/kernel/smpboot.c Tue Feb 13 22:13:43 2001 +++ linux-2.4.7-mq/arch/i386/kernel/smpboot.c Mon Aug 6 15:59:32 2001 @@ -562,13 +562,18 @@ if (!idle) panic("No idle process for CPU %d", cpu); - idle->processor = cpu; x86_cpu_to_apicid[cpu] = apicid; x86_apicid_to_cpu[apicid] = cpu; - idle->has_cpu = 1; /* we schedule the first task manually */ idle->thread.eip = (unsigned long) start_secondary; del_from_runqueue(idle); + /* + * Don't change processor/runqueue of task while it is + * still on runqueue. + */ + idle->processor = cpu; + idle->has_cpu = 1; /* we schedule the first task manually */ + unhash_process(idle); init_tasks[cpu] = idle; diff -Naur linux-2.4.7/fs/proc/proc_misc.c linux-2.4.7-mq/fs/proc/proc_misc.c --- linux-2.4.7/fs/proc/proc_misc.c Sat Jul 7 18:43:24 2001 +++ linux-2.4.7-mq/fs/proc/proc_misc.c Mon Aug 6 15:59:32 2001 @@ -96,7 +96,11 @@ LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), +#ifdef CONFIG_SMP + nr_running(), nr_threads, last_pid); +#else nr_running, nr_threads, last_pid); +#endif return proc_calc_metrics(page, start, off, count, eof, len); } diff -Naur linux-2.4.7/include/linux/sched.h linux-2.4.7-mq/include/linux/sched.h --- linux-2.4.7/include/linux/sched.h Fri Jul 20 19:52:18 2001 +++ linux-2.4.7-mq/include/linux/sched.h Mon Aug 6 22:31:00 2001 @@ -70,9 +70,15 @@ #define CT_TO_SECS(x) ((x) / HZ) #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) -extern int nr_running, nr_threads; +extern int nr_threads; extern int last_pid; +#ifdef CONFIG_SMP +extern int nr_running(void); +#else +extern int nr_running; +#endif + #include <linux/fs.h> #include <linux/time.h> #include <linux/param.h> @@ -131,13 +137,81 @@ #include <linux/spinlock.h> /* + * aligned_data + * CPU specific scheduling data. One data item for each CPU + * in the system. Size should be a multiple of cache line size, + * and array of items should start on a cache line boundary. + */ +typedef union aligned_data { + struct schedule_data { + struct task_struct * curr; /* current task on this CPU */ + cycles_t last_schedule; /* time of last scheduling */ + /* decision */ +#ifdef CONFIG_SMP + int curr_na_goodness; /* non-affinity goodness */ + /* value of current task */ +#endif + } schedule_data; + char __pad [SMP_CACHE_BYTES]; +} aligned_data_t; +extern aligned_data_t aligned_data[]; +#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr +#ifdef CONFIG_SMP +#define curr_na_goodness(cpu) aligned_data[(cpu)].schedule_data.curr_na_goodness +#endif +#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule + +#ifdef CONFIG_SMP +/* + * runqueue_data + * One runqueue per CPU in the system, plus one additional runqueue for + * realtime tasks. Size should be a multiple of cache line size, and array + * of items should start on a cache line boundary. 
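
Both per-CPU arrays introduced here (aligned_data above, and the runqueue_data union whose definition continues below) use the same layout trick: each element is a union padded out to SMP_CACHE_BYTES so that one CPU's slot never shares a cache line with another's. The following stand-alone sketch illustrates the idea in user-space C; the 64-byte line size, the plain pointer standing in for struct task_struct *, and _Alignas standing in for the kernel's __cacheline_aligned are assumptions for illustration only, not part of the patch.

#include <stdio.h>

#define SMP_CACHE_BYTES 64                 /* assumed cache line size */

struct schedule_data {
        void *curr;                        /* stand-in for struct task_struct * */
        unsigned long long last_schedule;  /* stand-in for cycles_t */
        int curr_na_goodness;
};

typedef union aligned_data {
        struct schedule_data schedule_data;
        char __pad[SMP_CACHE_BYTES];       /* pad each slot to a full line */
} aligned_data_t;

/* _Alignas plays the role of __cacheline_aligned on the array itself. */
static _Alignas(SMP_CACHE_BYTES) aligned_data_t aligned_data[4];

int main(void)
{
        /* Every slot is one cache line wide and line-aligned, so CPU i
         * updating aligned_data[i] never invalidates another CPU's slot. */
        printf("slot size   = %zu bytes\n", sizeof(aligned_data_t));
        printf("slot stride = %zu bytes\n",
               (size_t)((char *)&aligned_data[1] - (char *)&aligned_data[0]));
        return 0;
}
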
+ */ +typedef union runqueue_data { + struct rq_data { + int nt_running; /* # of tasks on runqueue */ + int max_na_goodness; /* maximum non-affinity */ + /* goodness value of */ + /* 'schedulable' task */ + /* on this runqueue */ + struct task_struct * max_na_ptr; /* pointer to task which */ + /* has max_na_goodness */ + unsigned long max_na_cpus_allowed; /* copy of cpus_allowed */ + /* field from task with */ + /* max_na_goodness */ + struct list_head runqueue; /* list of tasks on runqueue */ + int running_non_idle; /* flag to indicate this cpu */ + /* is running something */ + /* besides the idle thread */ + spinlock_t runqueue_lock; /* lock for this runqueue */ + } rq_data; + char __pad [SMP_CACHE_BYTES]; +} runqueue_data_t; +extern runqueue_data_t runqueue_data[]; +#define runqueue_lock(cpu) runqueue_data[(cpu)].rq_data.runqueue_lock +#define runqueue(cpu) runqueue_data[(cpu)].rq_data.runqueue +#define max_na_goodness(cpu) runqueue_data[(cpu)].rq_data.max_na_goodness +#define max_na_cpus_allowed(cpu) \ + runqueue_data[(cpu)].rq_data.max_na_cpus_allowed +#define max_na_ptr(cpu) runqueue_data[(cpu)].rq_data.max_na_ptr +#define nt_running(cpu) runqueue_data[(cpu)].rq_data.nt_running +#define running_non_idle(cpu) runqueue_data[(cpu)].rq_data.running_non_idle + +#define REALTIME_RQ NR_CPUS +#define NA_GOODNESS_INIT -10000 +#endif + +/* * This serializes "schedule()" and also protects * the run-queue from deletions/modifications (but * _adding_ to the beginning of the run-queue has * a separate lock). */ extern rwlock_t tasklist_lock; +#ifndef CONFIG_SMP extern spinlock_t runqueue_lock; +#endif extern spinlock_t mmlist_lock; extern void sched_init(void); @@ -436,6 +510,7 @@ #define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ #define MAX_COUNTER (20*HZ/100) #define DEF_NICE (0) +#define ALL_CPUS_ALLOWED (-1) /* * INIT_TASK is used to set up the first task table, touch at @@ -454,7 +529,7 @@ policy: SCHED_OTHER, \ mm: NULL, \ active_mm: &init_mm, \ - cpus_allowed: -1, \ + cpus_allowed: ALL_CPUS_ALLOWED, \ run_list: LIST_HEAD_INIT(tsk.run_list), \ next_task: &tsk, \ prev_task: &tsk, \ @@ -838,18 +913,198 @@ #define next_thread(p) \ list_entry((p)->thread_group.next, struct task_struct, thread_group) -static inline void del_from_runqueue(struct task_struct * p) +static inline int task_on_runqueue(struct task_struct *p) { - nr_running--; + return (p->run_list.next != NULL); +} + +#ifdef CONFIG_SMP +/* + * This function calculates the non-affinity goodness (na_goodness) + * of a task. This value is stored in the task struct while a task + * is on a runqueue. + */ +static inline int na_goodness(struct task_struct * p) +{ + int weight; + + /* + * select the current process after every other + * runnable process, but before the idle thread. + * Also, don't trigger a counter recalculation. + */ + weight = -1; + if (p->policy & SCHED_YIELD) + goto out; + + /* + * Be sure to give sufficiently high boost to realtime tasks + */ + if ((p->policy & ~SCHED_YIELD) != SCHED_OTHER) { + weight = 1000 + p->rt_priority; + goto out; + } + + /* + * na_goodness is based on the number of ticks left. + * Don't do any other calculations if the time slice is + * over..
+ */ + weight = p->counter; + if (!weight) + goto out; + + /* + * Factor in the nice value + */ + weight += 20 - p->nice; + +out: + return weight; +} + +/* + * Determine runqueue associated with task + */ +static inline int task_to_runqueue(struct task_struct *t) +{ + int rq; + + if ((t->policy & ~SCHED_YIELD) != SCHED_OTHER) { + rq = REALTIME_RQ; + } else { + rq = t->processor; + } + + return(rq); } + +/* + * Lock runqueue associated with task + */ +static inline void lock_task_rq(struct task_struct *t) +{ + int rq = task_to_runqueue(t); + + spin_lock(&runqueue_lock(rq)); + while (task_to_runqueue(t) != rq) { + spin_unlock(&runqueue_lock(rq)); + rq = task_to_runqueue(t); + spin_lock(&runqueue_lock(rq)); + } +} + +/* + * Unlock runqueue associated with task + */ +static inline void unlock_task_rq(struct task_struct *t) +{ + spin_unlock(&runqueue_lock(task_to_runqueue(t))); +} + +/* + * Lock runqueue associated with task and disable interrupts + */ +static inline void lock_task_rq_irq(struct task_struct *t) +{ + int rq = t->processor; + + spin_lock_irq(&runqueue_lock(rq)); + while (t->processor != rq) { + spin_unlock_irq(&runqueue_lock(rq)); + rq = t->processor; + spin_lock_irq(&runqueue_lock(rq)); + } +} + +/* + * Unlock runqueue associated with task and enable interrupts + */ +static inline void unlock_task_rq_irq(struct task_struct *t) +{ + spin_unlock_irq(&runqueue_lock(t->processor)); +} + +/* + * Common routine for both flavors of del_from_runqueue. + */ +static inline void del_from_runqueue_common(struct task_struct * p, int upd) +{ + int rq = task_to_runqueue(p); + + nt_running(rq)--; p->sleep_time = jiffies; list_del(&p->run_list); p->run_list.next = NULL; + + if (max_na_ptr(rq) == p) { + if (upd) { + /* + * If we want to update max_na_* values for the + * runqueue, then we scan the queue and look for + * the FIRST schedulable task. This is a 'good + * enough' approximation. + */ + struct list_head *tmp; + struct task_struct *t, *tmp_task = NULL; + int weight, tmp_weight = 0; + + list_for_each(tmp, &runqueue(rq)) { + t = list_entry(tmp, struct task_struct, + run_list); + if (!t->has_cpu) { + weight = na_goodness(t); + if (weight > tmp_weight) { + tmp_weight = weight; + tmp_task = t; + goto found_one; + } + } + } +found_one: + if (tmp_weight) { + max_na_goodness(rq) = tmp_weight; + max_na_cpus_allowed(rq) = + tmp_task->cpus_allowed; + max_na_ptr(rq) = tmp_task; + } else { + max_na_goodness(rq) = NA_GOODNESS_INIT; + max_na_ptr(rq) = NULL; + } + } else { + max_na_goodness(rq) = NA_GOODNESS_INIT; + max_na_ptr(rq) = NULL; + } + } } -static inline int task_on_runqueue(struct task_struct *p) +/* + * del_from_runqueue without updating max_na_* values. Used in + * places where we know we will be updating these values before + * dropping the runqueue lock. + */ +static inline void del_from_runqueue(struct task_struct * p) { - return (p->run_list.next != NULL); + del_from_runqueue_common(p, 0); } + +/* + * del_from_runqueue_update will update the max_na_* values + * if necessary.
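
The lock_task_rq()/lock_task_rq_irq() helpers above all follow the same lock-and-revalidate idiom: a task can migrate to a different runqueue between the moment its queue is read and the moment that queue's lock is acquired, so the queue is re-checked after locking and the lock is retried until both agree. Here is a stand-alone user-space sketch of that idiom, using pthread mutexes in place of spinlocks; the item/queue types are hypothetical, and the unlocked read of it->queue mirrors the kernel's reliance on atomic word-sized reads.

#include <pthread.h>

#define NQUEUES 4

static pthread_mutex_t queue_lock[NQUEUES] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

struct item {
        int queue;                      /* queue that currently owns us */
};

/* Lock the queue owning 'it', chasing it if the owner changes while
 * we are waiting for the lock (same idiom as lock_task_rq()). */
static void lock_owning_queue(struct item *it)
{
        int q = it->queue;

        pthread_mutex_lock(&queue_lock[q]);
        while (it->queue != q) {        /* item migrated meanwhile */
                pthread_mutex_unlock(&queue_lock[q]);
                q = it->queue;
                pthread_mutex_lock(&queue_lock[q]);
        }
}

static void unlock_owning_queue(struct item *it)
{
        pthread_mutex_unlock(&queue_lock[it->queue]);
}

int main(void)
{
        struct item it = { .queue = 2 };

        lock_owning_queue(&it);
        /* ... operate on the queue that owns 'it' ... */
        unlock_owning_queue(&it);
        return 0;
}
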
+ */ +static inline void del_from_runqueue_update(struct task_struct * p) +{ + del_from_runqueue_common(p, 1); +} +#else +static inline void del_from_runqueue(struct task_struct * p) +{ + nr_running--; + p->sleep_time = jiffies; + list_del(&p->run_list); + p->run_list.next = NULL; +} +#endif static inline void unhash_process(struct task_struct *p) { diff -Naur linux-2.4.7/kernel/fork.c linux-2.4.7-mq/kernel/fork.c --- linux-2.4.7/kernel/fork.c Wed Jul 18 01:23:28 2001 +++ linux-2.4.7-mq/kernel/fork.c Mon Aug 6 15:59:32 2001 @@ -27,7 +27,9 @@ /* The idle threads do not count.. */ int nr_threads; +#ifndef CONFIG_SMP int nr_running; +#endif int max_threads; unsigned long total_forks; /* Handle normal Linux uptimes. */ diff -Naur linux-2.4.7/kernel/sched.c linux-2.4.7-mq/kernel/sched.c --- linux-2.4.7/kernel/sched.c Wed Jul 18 01:30:50 2001 +++ linux-2.4.7-mq/kernel/sched.c Mon Aug 6 17:10:18 2001 @@ -78,33 +78,42 @@ /* * The tasklist_lock protects the linked list of processes. * - * The runqueue_lock locks the parts that actually access - * and change the run-queues, and have to be interrupt-safe. - * * If both locks are to be concurrently held, the runqueue_lock * nests inside the tasklist_lock. * * task->alloc_lock nests inside tasklist_lock. */ -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ +#ifdef CONFIG_SMP +/* + * There are NR_CPUS + 1 run queues. In addition to CPU specific + * runqueues there is a realtime runqueue so that any CPU can easily + * schedule realtime tasks. All CPUs can schedule tasks from the + * realtime runqueue (with appropriate locking of course). + * Initialization is performed in sched_init(). + */ +runqueue_data_t runqueue_data [NR_CPUS+1] __cacheline_aligned; +#else +/* + * The run-queue lock locks the parts that actually access + * and change the run-queues, and have to be interrupt-safe. + */ +spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ static LIST_HEAD(runqueue_head); +#endif /* * We align per-CPU scheduling data on cacheline boundaries, * to prevent cacheline ping-pong. */ -static union { - struct schedule_data { - struct task_struct * curr; - cycles_t last_schedule; - } schedule_data; - char __pad [SMP_CACHE_BYTES]; -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; - -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule +#ifdef CONFIG_SMP +aligned_data_t aligned_data [NR_CPUS] __cacheline_aligned = + { {{&init_task,NA_GOODNESS_INIT,0}}}; +#else +aligned_data_t aligned_data [NR_CPUS] __cacheline_aligned = + { {{&init_task,0}}}; +#endif struct kernel_stat kstat; @@ -113,6 +122,8 @@ #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) #define can_schedule(p,cpu) ((!(p)->has_cpu) && \ ((p)->cpus_allowed & (1 << cpu))) +#define local_can_schedule(p) (!(p)->has_cpu) +#define this_cpu_allowed(ca, tcpu) ((ca) & (1 << tcpu)) #else @@ -123,6 +134,50 @@ void scheduling_functions_start_here(void) { } +#ifdef CONFIG_SMP +/* + * Sum CPU specific nt_running fields to determine how many + * runnable tasks there are in the system. Replacement for + * the global nr_running variable.
+ */ +int +nr_running(void) { + int i; + int tot=nt_running(REALTIME_RQ); + + for(i=0; i<smp_num_cpus; i++) { + tot += nt_running(cpu_logical_map(i)); + } + + return(tot); +} + +/* + * A stripped down version of the goodness routine for use on a CPU + * specific runqueue. This routine does not need to be concerned + * with realtime tasks, and does not need to take CPU affinity into + * account. + */ +static inline int local_goodness(struct task_struct * p, struct mm_struct *this_mm) +{ + int weight; + + weight = -1; + if (p->policy & SCHED_YIELD) + goto local_out; + + weight = p->counter; + if (!weight) + goto local_out; + + if (p->mm == this_mm || !p->mm) + weight += 1; + weight += 20 - p->nice; +local_out: + return weight; +} +#endif + /* * This is the function that decides how desirable a process is.. * You can weigh different processes against each other depending @@ -199,92 +254,213 @@ } /* - * This is ugly, but reschedule_idle() is very timing-critical. - * We are called with the runqueue spinlock held and we must - * not claim the tasklist_lock. + * Careful! + * + * This has to add the process to the _beginning_ of the + * run-queue, not the end. See the comment about "This is + * subtle" in the scheduler proper.. + */ +static inline void add_to_runqueue(struct task_struct * p) +{ +#ifdef CONFIG_SMP + int rq = task_to_runqueue(p); + int tsk_na_goodness = na_goodness(p); + + list_add(&p->run_list, &runqueue(rq)); + + if (!p->has_cpu && (tsk_na_goodness > max_na_goodness(rq))) { + max_na_goodness(rq) = tsk_na_goodness; + max_na_cpus_allowed(rq) = p->cpus_allowed; + max_na_ptr(rq) = p; + } + + nt_running(rq)++; +#else + list_add(&p->run_list, &runqueue_head); + nr_running++; +#endif +} + +/* + * The runqueue lock must be held upon entry to this routine. */ static FASTCALL(void reschedule_idle(struct task_struct * p)); static void reschedule_idle(struct task_struct * p) { #ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - struct task_struct *tsk, *target_tsk; - int cpu, best_cpu, i, max_prio; - cycles_t oldest_idle; + struct task_struct *tsk; + int target_cpu, this_cpu, tsk_cpu; + int i, cpu; + int need_resched; + cycles_t curr_cycles, tmp_cycles; + int stack_list[NR_CPUS]; + int saved_na_goodness, tmp_min_na_goodness; + + tsk_cpu = p->processor; + this_cpu = smp_processor_id(); + /* + * First check if the task's previous CPU is idle, use it if it is. + */ + if (can_schedule(p, tsk_cpu) && + (cpu_curr(tsk_cpu) == idle_task(tsk_cpu))) { + if (!task_on_runqueue(p)) { + add_to_runqueue(p); + } + tsk = cpu_curr(tsk_cpu); + need_resched = tsk->need_resched; + tsk->need_resched = 1; + if ((tsk_cpu != this_cpu) && !need_resched) { + smp_send_reschedule(tsk_cpu); + } + return; + } /* - * shortcut if the woken up task's last CPU is - * idle now. - */ - best_cpu = p->processor; - if (can_schedule(p, best_cpu)) { - tsk = idle_task(best_cpu); - if (cpu_curr(best_cpu) == tsk) { - int need_resched; -send_now_idle: + * Create a list of current na_goodness values on our stack. + * Only values less than the non-affinity goodness value of + * p should be considered for preemption. 
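
The weights used above (na_goodness(), local_goodness()) and in the CPU selection that follows are simple arithmetic: a runnable SCHED_OTHER task is worth its remaining tick count plus a nice bonus, local_goodness() adds one more point for sharing the current mm, and a task evaluated for a CPU other than its own is effectively handicapped by PROC_CHANGE_PENALTY. A toy calculation, purely illustrative (the penalty value of 15 is assumed here, matching the i386 definition of that era):

#include <stdio.h>

#define PROC_CHANGE_PENALTY 15          /* assumed i386 value */

/* Simplified na_goodness() for a SCHED_OTHER task. */
static int na_goodness(int counter, int nice)
{
        if (counter == 0)
                return 0;               /* time slice used up */
        return counter + (20 - nice);
}

int main(void)
{
        int a = na_goodness(6, 0);      /* task A: 6 ticks left, nice 0   */
        int b = na_goodness(2, -10);    /* task B: 2 ticks left, nice -10 */

        /* Seen from task A's CPU, migrating B costs it the cache-affinity
         * penalty, so A still looks better: 26 vs 32 - 15 = 17. */
        printf("A locally: %d, B after migration penalty: %d\n",
               a, b - PROC_CHANGE_PENALTY);
        return 0;
}
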
+ */ + saved_na_goodness = na_goodness(p) - 1; /* preemption_goodness() > 1 */ + tmp_min_na_goodness = saved_na_goodness; + curr_cycles = get_cycles(); + target_cpu = -1; + for (i = 0; i < smp_num_cpus; i++) { + cpu = cpu_logical_map(i); + + if (!can_schedule(p, cpu)) { + stack_list[cpu] = saved_na_goodness; + continue; + } + + if (curr_na_goodness(cpu) == NA_GOODNESS_INIT) { /* - * If need_resched == -1 then we can skip sending - * the IPI altogether, tsk->need_resched is - * actively watched by the idle thread. + * Indicates an idle task. For idle tasks, determine + * the amount of time they have been idle. Use the + * negative of this value in the list. Hence, we + * first choose the CPU that has been idle the longest. */ - need_resched = tsk->need_resched; - tsk->need_resched = 1; - if ((best_cpu != this_cpu) && !need_resched) - smp_send_reschedule(best_cpu); - return; + tmp_cycles = curr_cycles - last_schedule(cpu); + if (tmp_cycles > INT_MAX) { + stack_list[cpu] = INT_MIN; + } else { + stack_list[cpu] = (int)-tmp_cycles; + } + } else { + stack_list[cpu] = curr_na_goodness(cpu); + /* + * Add in PROC_CHANGE_PENALTY for remote CPUs + */ + if (cpu != tsk_cpu) { + stack_list[cpu] += PROC_CHANGE_PENALTY; + } + } + + /* + * Look for the lowest value + */ + if (stack_list[cpu] < tmp_min_na_goodness) { + target_cpu = cpu; + tmp_min_na_goodness = stack_list[cpu]; } } /* - * We know that the preferred CPU has a cache-affine current - * process, lets try to find a new idle CPU for the woken-up - * process. Select the least recently active idle CPU. (that - * one will have the least active cache context.) Also find - * the executing process which has the least priority. - */ - oldest_idle = (cycles_t) -1; - target_tsk = NULL; - max_prio = 1; + * We try to add the task to a runqueue starting with the one + * that has the lowest na_goodness value. + */ + while (target_cpu != -1) { + if (target_cpu == tsk_cpu && + preemption_goodness((tsk = cpu_curr(target_cpu)), + p, target_cpu) > 1) { + /* + * If target_cpu is tsk_cpu, then no additional + * locking is required (we already have the CPU + * specific runqueue locked). We also know that + * this CPU can not be idle, otherwise the fast + * path at the beginning of this routine would + * have been executed. Therefore, simply send + * the IPI if required. + */ + if (!task_on_runqueue(p)) { + add_to_runqueue(p); + } + tsk = cpu_curr(target_cpu); + tsk->need_resched = 1; + if (target_cpu != this_cpu) { + smp_send_reschedule(target_cpu); + } + return; + } - for (i = 0; i < smp_num_cpus; i++) { - cpu = cpu_logical_map(i); - if (!can_schedule(p, cpu)) - continue; - tsk = cpu_curr(cpu); /* - * We use the first available idle CPU. This creates - * a priority list between idle CPUs, but this is not - * a problem. + * Try to lock runqueue and verify na_goodness value. */ - if (tsk == idle_task(cpu)) { - if (last_schedule(cpu) < oldest_idle) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - } - } else { - if (oldest_idle == -1ULL) { - int prio = preemption_goodness(tsk, p, cpu); + else if (spin_trylock(&runqueue_lock(target_cpu))) { + tsk = cpu_curr(target_cpu); + if ((tsk == idle_task(target_cpu)) || + (preemption_goodness(tsk, p, target_cpu) > 1)) { + /* + * Target CPU is idle, or it is running + * a task with lower priority than p. + * Therefore, move p to target runqueue. 
+ */ + if (task_on_runqueue(p)) { + del_from_runqueue_update(p); + } + p->processor = target_cpu; + add_to_runqueue(p); - if (prio > max_prio) { - max_prio = prio; - target_tsk = tsk; + /* + * Send an IPI to target CPU, unless the + * CPU is idle and the need_resched flag + * has already been set. + */ + need_resched = tsk->need_resched; + tsk->need_resched = 1; + if ((target_cpu != this_cpu) && + ((tsk != idle_task(target_cpu)) || + !need_resched)){ + smp_send_reschedule(target_cpu); } + + spin_unlock(&runqueue_lock(target_cpu)); + + return; } + spin_unlock(&runqueue_lock(target_cpu)); } - } - tsk = target_tsk; - if (tsk) { - if (oldest_idle != -1ULL) { - best_cpu = tsk->processor; - goto send_now_idle; + + /* + * Update list value so we don't check this CPU again. + */ + stack_list[target_cpu] = saved_na_goodness; + + /* + * Find the 'next lowest' cur_na_goodness value. + */ + target_cpu = -1; + tmp_min_na_goodness = saved_na_goodness; + for (i = 0; i < smp_num_cpus; i++) { + cpu = cpu_logical_map(i); + if (stack_list[cpu] < tmp_min_na_goodness) { + target_cpu = cpu; + tmp_min_na_goodness = stack_list[cpu]; + } } - tsk->need_resched = 1; - if (tsk->processor != this_cpu) - smp_send_reschedule(tsk->processor); + } + + /* + * If we get here, it means that the best place for the task is + * on its currently assigned runqueue. Also, we know that the + * task currently running on this task's runqueue has sufficient + * priority to prevent preemption. Hence, we simply ensure the + * task is on the runqueue. + */ + if (!task_on_runqueue(p)) { + add_to_runqueue(p); } return; - #else /* UP */ int this_cpu = smp_processor_id(); @@ -296,29 +472,46 @@ #endif } +#ifdef CONFIG_SMP /* - * Careful! - * - * This has to add the process to the _beginning_ of the - * run-queue, not the end. See the comment about "This is - * subtle" in the scheduler proper.. + * Lock runqueue associated with task's CPU and save irq state + * NOTE that this is different than locking the runqueue associated + * with the task (specifically for realtime tasks). */ -static inline void add_to_runqueue(struct task_struct * p) +static inline int lock_task_cpu_rq_irqsave(struct task_struct *t, + unsigned long *flags) { - list_add(&p->run_list, &runqueue_head); - nr_running++; + int rq = t->processor; + + spin_lock_irqsave(&runqueue_lock(rq), *flags); + while (t->processor != rq) { + spin_unlock_irqrestore(&runqueue_lock(rq), *flags); + rq = t->processor; + spin_lock_irqsave(&runqueue_lock(rq), *flags); + } + + return(rq); } +#endif static inline void move_last_runqueue(struct task_struct * p) { list_del(&p->run_list); +#ifdef CONFIG_SMP + list_add_tail(&p->run_list, &runqueue(task_to_runqueue(p))); +#else list_add_tail(&p->run_list, &runqueue_head); +#endif } static inline void move_first_runqueue(struct task_struct * p) { list_del(&p->run_list); +#ifdef CONFIG_SMP + list_add(&p->run_list, &runqueue(task_to_runqueue(p))); +#else list_add(&p->run_list, &runqueue_head); +#endif } /* @@ -333,20 +526,47 @@ { unsigned long flags; int success = 0; +#ifdef CONFIG_SMP + int rq; + + rq = lock_task_cpu_rq_irqsave(p, &flags); + if (task_to_runqueue(p) == REALTIME_RQ) { + /* + * Indicates p is a realtime task. We must + * also lock the realtime runqueue. + */ + spin_lock(&runqueue_lock(REALTIME_RQ)); + } +#else + spin_lock_irqsave(&runqueue_lock, flags); +#endif /* * We want the common case fall through straight, thus the goto.
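
Stripped of the kernel details, the placement loop in reschedule_idle() above works like this: snapshot a score for every CPU the woken task may use (idle CPUs score lowest, busy CPUs score by the non-affinity goodness of what they are running), then walk the CPUs from lowest score upward, taking each runqueue lock with a trylock and re-checking under the lock before committing, since the snapshot may already be stale. A self-contained toy model of that loop follows (pthread trylock in place of spin_trylock, made-up scores; illustrative only):

#include <pthread.h>
#include <stdio.h>

#define NCPUS     4
#define SCORE_MIN (-10000)              /* plays the role of NA_GOODNESS_INIT */

static pthread_mutex_t cpu_lock[NCPUS] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};
/* how hard it is to preempt what each CPU is running (SCORE_MIN = idle) */
static int cpu_score[NCPUS] = { 25, SCORE_MIN, 18, 40 };

/* Place a woken task of goodness 'g'; returns the chosen CPU or -1. */
static int place_woken_task(int g)
{
        int snap[NCPUS], cpu, target;

        for (cpu = 0; cpu < NCPUS; cpu++)
                snap[cpu] = cpu_score[cpu];     /* unlocked snapshot */

        for (;;) {
                target = -1;                    /* best remaining candidate */
                for (cpu = 0; cpu < NCPUS; cpu++)
                        if (snap[cpu] < g &&
                            (target < 0 || snap[cpu] < snap[target]))
                                target = cpu;
                if (target < 0)
                        return -1;              /* nothing worth preempting */

                if (pthread_mutex_trylock(&cpu_lock[target]) == 0) {
                        if (cpu_score[target] < g) {    /* still true under lock */
                                cpu_score[target] = g;  /* "queue task, send IPI" */
                                pthread_mutex_unlock(&cpu_lock[target]);
                                return target;
                        }
                        pthread_mutex_unlock(&cpu_lock[target]);
                }
                snap[target] = g;               /* never retry this CPU */
        }
}

int main(void)
{
        printf("task of goodness 30 placed on CPU %d\n", place_woken_task(30));
        return 0;
}
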
*/ - spin_lock_irqsave(&runqueue_lock, flags); p->state = TASK_RUNNING; if (task_on_runqueue(p)) goto out; +#ifndef CONFIG_SMP add_to_runqueue(p); if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) reschedule_idle(p); +#else + if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) + reschedule_idle(p); + else + add_to_runqueue(p); +#endif success = 1; out: +#ifdef CONFIG_SMP + if (task_to_runqueue(p) == REALTIME_RQ) { + spin_unlock(&runqueue_lock(REALTIME_RQ)); + } + spin_unlock_irqrestore(&runqueue_lock(rq), flags); +#else spin_unlock_irqrestore(&runqueue_lock, flags); +#endif return success; } @@ -449,6 +669,7 @@ { #ifdef CONFIG_SMP int policy; + int rq; /* * prev->policy can be written from here only before `prev' @@ -501,15 +722,27 @@ (policy & SCHED_YIELD)) goto out_unlock; - spin_lock_irqsave(&runqueue_lock, flags); + rq = lock_task_cpu_rq_irqsave(prev, &flags); + if (task_to_runqueue(prev) == REALTIME_RQ) { + /* + * Indicates prev is a realtime task. We must + * also lock the realtime runqueue. + */ + spin_lock(&runqueue_lock(REALTIME_RQ)); + } + if ((prev->state == TASK_RUNNING) && !prev->has_cpu) reschedule_idle(prev); - spin_unlock_irqrestore(&runqueue_lock, flags); + + if (task_to_runqueue(prev) == REALTIME_RQ) { + spin_unlock(&runqueue_lock(REALTIME_RQ)); + } + spin_unlock_irqrestore(&runqueue_lock(rq), flags); goto out_unlock; } #else prev->policy &= ~SCHED_YIELD; -#endif /* CONFIG_SMP */ +#endif } void schedule_tail(struct task_struct *prev) @@ -518,10 +751,7 @@ } /* - * 'schedule()' is the scheduler function. It's a very simple and nice - * scheduler: it's not perfect, but certainly works for most things. - * - * The goto is "interesting". + * 'schedule()' is the scheduler function. * * NOTE!! Task 0 is the 'idle' task, which gets called when no other * tasks can run. It can not be killed, and it cannot sleep. The 'state' @@ -529,28 +759,42 @@ */ asmlinkage void schedule(void) { - struct schedule_data * sched_data; struct task_struct *prev, *next, *p; struct list_head *tmp; int this_cpu, c; +#ifdef CONFIG_SMP + struct task_struct *prev_next; + int tmp_na_goodness, prev_next_weight; + int prev_rq, rrq, rq, rcpu; + int stack_list[NR_CPUS]; + int premature_idle; +#endif if (!current->active_mm) BUG(); need_resched_back: prev = current; this_cpu = prev->processor; +#ifdef CONFIG_SMP + prev_rq = task_to_runqueue(prev); +#endif if (in_interrupt()) goto scheduling_in_interrupt; release_kernel_lock(prev, this_cpu); - /* - * 'sched_data' is protected by the fact that we can run - * only one process per CPU. - */ - sched_data = & aligned_data[this_cpu].schedule_data; - +#ifdef CONFIG_SMP + spin_lock_irq(&runqueue_lock(this_cpu)); + if (prev_rq != this_cpu) { + /* + * Indicates current is a realtime task. We must also + * lock the realtime runqueue. + */ + spin_lock(&runqueue_lock(REALTIME_RQ)); + } +#else spin_lock_irq(&runqueue_lock); +#endif /* move an exhausted RR process to be last.. */ if (prev->policy == SCHED_RR) @@ -579,10 +823,15 @@ */ next = idle_task(this_cpu); c = -1000; +#ifdef CONFIG_SMP + prev_next = next; + prev_next_weight = c; +#endif if (prev->state == TASK_RUNNING) goto still_running; still_running_back: +#ifndef CONFIG_SMP list_for_each(tmp, &runqueue_head) { p = list_entry(tmp, struct task_struct, run_list); if (can_schedule(p, this_cpu)) { @@ -591,21 +840,212 @@ c = weight, next = p; } } +#else + /* + * There exists a separate realtime runqueue for all realtime + * tasks. Any CPU can schedule a task from the realtime runqueue.
+ * + * We scan the realtime runqueue if either, + * - The current task is a realtime task. Hence, the realtime + * run-queue is already locked. + * - There is a task on the realtime runqueue. In this case we + * must acquire the realtime runqueue lock here. + * Otherwise, we skip the scan of the realtime runqueue. + */ + if (prev_rq == REALTIME_RQ || nt_running(REALTIME_RQ)) { + goto scan_rt_queue; + } +scan_rt_queue_back: + + /* + * Search CPU specific runqueue + */ + list_for_each(tmp, &runqueue(this_cpu)) { + p = list_entry(tmp, struct task_struct, run_list); + if (local_can_schedule(p)) { + int weight = local_goodness(p, prev->active_mm); + if (weight > c) { + if (!next->has_cpu) { + /* + * prev_next must point to a + * schedulable task. + */ + prev_next_weight = c; + prev_next = next; + } + c = weight; + next = p; + } else if (weight > prev_next_weight) { + prev_next_weight = weight; + prev_next = p; + } + } + } + + /* + * If next task is realtime, there is no need to look at other + * runqueues. Simply set max_na_goodness values appropriately. + */ + if (task_to_runqueue(next) == REALTIME_RQ) { + goto set_max_na_goodness; + } + + /* + * As calculated above, c does not contain a CPU affinity boost. + * We must add this boost before comparing to tasks on other + * runqueues. Only add PROC_CHANGE_PENALTY if c is a positive + * goodness value. + */ + if (c > 0) { + c += PROC_CHANGE_PENALTY; + } + + /* + * Copy max_na_goodness values from CPU specific runqueue + * structures to the list on our stack. + */ +scan_rmt_queues: + premature_idle = 0; + rrq = -1; + tmp_na_goodness = c; + for (rq = 0; rq < smp_num_cpus; rq++) { + rcpu = cpu_logical_map(rq); + if (rcpu == this_cpu) { + stack_list[rcpu] = NA_GOODNESS_INIT; + continue; + } + if (!this_cpu_allowed(max_na_cpus_allowed(rcpu), this_cpu)) { + stack_list[rcpu] = NA_GOODNESS_INIT; + continue; + } + if (max_na_goodness(rcpu) <= c) { + stack_list[rcpu] = NA_GOODNESS_INIT; + continue; + } + + stack_list[rcpu] = max_na_goodness(rcpu); + if (stack_list[rcpu] > tmp_na_goodness) { + rrq = rcpu; + tmp_na_goodness = stack_list[rcpu]; + } + } + + /* + * Now use the values from the stack list to search for a + * task to steal. + */ + while (rrq != -1) { + /* + * First try to lock the remote runqueue and verify + * the max_na_goodness value. + */ + if (spin_trylock(&runqueue_lock(rrq))) { + if (max_na_goodness(rrq) > c && + this_cpu_allowed(max_na_cpus_allowed(rrq), + this_cpu)) { + /* + * Move a remote task to our runqueue + */ + if (!next->has_cpu) { + prev_next = next; + } + next = max_na_ptr(rrq); + c = max_na_goodness(rrq); + del_from_runqueue_update(next); + next->processor = this_cpu; + add_to_runqueue(next); + + /* + * We have stolen a task from another + * runqueue, quit looking. + */ + spin_unlock(&runqueue_lock(rrq)); + break; + } + spin_unlock(&runqueue_lock(rrq)); + } else { + premature_idle++; + } + + /* + * We were either unable to get the remote runqueue lock, + * or the remote runqueue has changed such that it is no + * longer desirable to steal a task from the queue. + * + * Go to the runqueue with the 'next highest' max_na_goodness + * value. + */ + stack_list[rrq] = NA_GOODNESS_INIT; + tmp_na_goodness = c; + rrq = -1; + for (rq = 0; rq < smp_num_cpus; rq++) { + rcpu = cpu_logical_map(rq); + if (stack_list[rcpu] > tmp_na_goodness) { + rrq = rcpu; + tmp_na_goodness = stack_list[rcpu]; + } + } + } + + /* + * Check for going idle prematurely. 
If this is the case, there is a good chance there are schedulable tasks on other runqueues. + */ + if ((next == idle_task(this_cpu)) && premature_idle) { + /* + * Be sure to clear max_na_goodness, otherwise there + * is the potential for deadlock. + */ + max_na_goodness(this_cpu) = NA_GOODNESS_INIT; + goto scan_rmt_queues; + } +#endif /* Do we need to re-calculate counters? */ if (!c) goto recalculate; + +#ifdef CONFIG_SMP +set_max_na_goodness: + /* + * Make sure max_na_* values for runqueue are updated + */ + if (prev_next != idle_task(this_cpu)) { + max_na_goodness(this_cpu) = na_goodness(prev_next); + max_na_cpus_allowed(this_cpu) = prev_next->cpus_allowed; + max_na_ptr(this_cpu) = prev_next; + } else { + max_na_goodness(this_cpu) = NA_GOODNESS_INIT; + /* max_na_cpus_allowed need not be set */ + max_na_ptr(this_cpu) = NULL; + } + + /* + * Update scheduling fields in next task structure. + */ + next->has_cpu = 1; +found_next: + next->processor = this_cpu; + +#endif /* * from this point on nothing can prevent us from * switching to the next task, save this fact in * sched_data. */ - sched_data->curr = next; + cpu_curr(this_cpu) = next; #ifdef CONFIG_SMP - next->has_cpu = 1; - next->processor = this_cpu; -#endif + if (next != idle_task(this_cpu)) { + curr_na_goodness(this_cpu) = na_goodness(next); + running_non_idle(this_cpu) = 1; + } else { + curr_na_goodness(this_cpu) = NA_GOODNESS_INIT; + running_non_idle(this_cpu) = 0; + } + spin_unlock_irq(&runqueue_lock(this_cpu)); +#else spin_unlock_irq(&runqueue_lock); +#endif if (prev == next) { /* We won't go through the normal tail, so do this by hand */ @@ -621,7 +1061,7 @@ * and it's approximate, so we do not have to maintain * it while holding the runqueue spinlock. */ - sched_data->last_schedule = get_cycles(); + last_schedule(this_cpu) = get_cycles(); /* * We drop the scheduler lock early (it's a global spinlock), @@ -629,7 +1069,7 @@ * rescheduled during switch_to(). */ -#endif /* CONFIG_SMP */ +#endif kstat.context_swtch++; /* @@ -678,18 +1118,42 @@ recalculate: { struct task_struct *p; +#ifdef CONFIG_SMP + spin_unlock_irq(&runqueue_lock(this_cpu)); +#else spin_unlock_irq(&runqueue_lock); +#endif read_lock(&tasklist_lock); for_each_task(p) p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice); read_unlock(&tasklist_lock); +#ifdef CONFIG_SMP + spin_lock_irq(&runqueue_lock(this_cpu)); +#else + spin_lock_irq(&runqueue_lock); +#endif } goto repeat_schedule; still_running: - if (!(prev->cpus_allowed & (1UL << this_cpu))) +#ifdef CONFIG_SMP + if (!(prev->cpus_allowed & (1UL << this_cpu))) { + unsigned long allowed = prev->cpus_allowed; + /* + * task is no longer runnable on this CPU. We remove it + * from the runqueue to ensure that it will not be considered + * for scheduling here. Let schedule_tail take care of + * sending it off to an appropriate CPU. + */ + del_from_runqueue(prev); + prev->processor = 0; + while (!(allowed & 1UL)) { + prev->processor++; + allowed = allowed >> 1; + } goto still_running_back; + } +#endif c = goodness(prev, this_cpu, prev->active_mm); next = prev; goto still_running_back; @@ -701,6 +1165,54 @@ } goto move_rr_back; +#ifdef CONFIG_SMP +scan_rt_queue: + if (prev_rq != REALTIME_RQ) { + spin_lock(&runqueue_lock(REALTIME_RQ)); + } + /* + * Scan the queue. Note that there is no need to + * keep track of max_na_goodness values for the + * realtime runqueue, as they are never used in + * scheduling decisions.
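
The cpus_allowed handling in the code above is plain bit arithmetic: this_cpu_allowed() tests a single bit of the affinity mask, and the still_running path finds the first permitted CPU by shifting the mask right until bit 0 is set. A small illustration (the mask value is made up):

#include <stdio.h>

#define this_cpu_allowed(mask, cpu)     ((mask) & (1UL << (cpu)))

/* First CPU permitted by a non-zero affinity mask; the same shift loop
 * the patch uses when a task's CPU is no longer in its cpus_allowed. */
static int first_allowed_cpu(unsigned long mask)
{
        int cpu = 0;

        while (!(mask & 1UL)) {
                cpu++;
                mask >>= 1;
        }
        return cpu;
}

int main(void)
{
        unsigned long allowed = 0x0cUL; /* CPUs 2 and 3 only */

        printf("CPU 0 allowed: %s\n", this_cpu_allowed(allowed, 0) ? "yes" : "no");
        printf("first allowed CPU: %d\n", first_allowed_cpu(allowed));
        return 0;
}
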
+ */ + list_for_each(tmp, &runqueue(REALTIME_RQ)) { + p = list_entry(tmp, struct task_struct, run_list); + if (can_schedule(p, this_cpu)) { + int weight = 1000 + p->rt_priority; + if (weight > c) + c = weight, next = p; + } + } + + /* + * Unlock the realtime runqueue before continuing. If we + * selected a realtime task to execute, be sure to make it + * non-schedulable (set has_cpu) before releasing the lock. + * Note that we still hold the CPU specific runqueue lock. + */ + if (next != idle_task(this_cpu)) { + next->has_cpu = 1; + } + spin_unlock(&runqueue_lock(REALTIME_RQ)); + + /* + * If we find a realtime task to schedule we are mostly done. + * However, the max_na_goodness value of this CPU's runqueue + * may need to be set to an appropriate value (so that other + * CPUs can steal tasks while this CPU runs the realtime task). + * If necessary, make a pass through the CPU specific runqueue + * to set the value. + */ + if ((task_to_runqueue(next) == REALTIME_RQ) && + ((max_na_goodness(this_cpu) != NA_GOODNESS_INIT) || + !nt_running(this_cpu)) ) { + goto found_next; + } + + goto scan_rt_queue_back; +#endif + scheduling_in_interrupt: printk("Scheduling in interrupt\n"); BUG(); @@ -909,6 +1421,8 @@ struct sched_param lp; struct task_struct *p; int retval; + int lock_rt = 0; + int was_on_rq = 0; retval = -EINVAL; if (!param || pid < 0) @@ -918,18 +1432,31 @@ if (copy_from_user(&lp, param, sizeof(struct sched_param))) goto out_nounlock; + retval = -ESRCH; +#ifdef CONFIG_SMP + /* + * Note that here we get the tasklist lock so that we can + * find the task struct. Not until we have access to the + * task struct can we determine what runqueue to lock. + */ + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + goto out_nounlock; + } + lock_task_rq_irq(p); +#else /* * We play safe to avoid deadlocks. */ read_lock_irq(&tasklist_lock); spin_lock(&runqueue_lock); - p = find_process_by_pid(pid); - - retval = -ESRCH; if (!p) goto out_unlock; - +#endif + if (policy < 0) policy = p->policy; else { @@ -958,16 +1485,49 @@ goto out_unlock; retval = 0; + +#ifdef CONFIG_SMP + if ((p->policy != SCHED_OTHER) || (policy != SCHED_OTHER)) { + /* + * If changing to/from a realtime policy, OR simply + * changing the priority of a realtime task, we must + * lock the realtime runqueue. + */ + spin_lock(&runqueue_lock(REALTIME_RQ)); + lock_rt = 1; + } + + if (task_on_runqueue(p)) { + del_from_runqueue_update(p); + was_on_rq = 1; + } + p->policy = policy; + p->rt_priority = lp.sched_priority; + if (was_on_rq) { + add_to_runqueue(p); + } + current->need_resched = 1; + + if (lock_rt) { + spin_unlock(&runqueue_lock(REALTIME_RQ)); + } +#else p->policy = policy; p->rt_priority = lp.sched_priority; if (task_on_runqueue(p)) move_first_runqueue(p); current->need_resched = 1; +#endif out_unlock: +#ifdef CONFIG_SMP + unlock_task_rq_irq(p); + read_unlock(&tasklist_lock); +#else spin_unlock(&runqueue_lock); read_unlock_irq(&tasklist_lock); +#endif out_nounlock: return retval; @@ -1044,19 +1604,18 @@ * to be atomic.) In threaded applications this optimization * gets triggered quite often. */ - - int nr_pending = nr_running; - #if CONFIG_SMP + int nr_pending = nt_running(REALTIME_RQ); int i; // Substract non-idle processes running on other CPUs. 
for (i = 0; i < smp_num_cpus; i++) { int cpu = cpu_logical_map(i); - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) - nr_pending--; + nr_pending += (nt_running(cpu) - running_non_idle(cpu)); } #else + int nr_pending = nr_running; + // on UP this process is on the runqueue as well nr_pending--; #endif @@ -1270,6 +1829,21 @@ */ int cpu = smp_processor_id(); int nr; + +#ifdef CONFIG_SMP + /* + * Initialize the scheduling data structures + */ + for (nr = 0; nr < NR_CPUS+1; nr++) { + runqueue_lock(nr) = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&runqueue(nr)); + max_na_goodness(nr) = NA_GOODNESS_INIT; + /* max_na_cpus_allowed need not be initialized */ + max_na_ptr(nr) = NULL; + nt_running(nr) = 0; + running_non_idle(nr) = 0; + } +#endif init_task.processor = cpu; diff -Naur linux-2.4.7/kernel/signal.c linux-2.4.7-mq/kernel/signal.c --- linux-2.4.7/kernel/signal.c Thu Jan 4 04:45:26 2001 +++ linux-2.4.7-mq/kernel/signal.c Mon Aug 6 15:59:32 2001 @@ -483,10 +483,10 @@ * process of changing - but no harm is done by that * other than doing an extra (lightweight) IPI interrupt. */ - spin_lock(&runqueue_lock); + lock_task_rq(t); if (t->has_cpu && t->processor != smp_processor_id()) smp_send_reschedule(t->processor); - spin_unlock(&runqueue_lock); + unlock_task_rq(t); #endif /* CONFIG_SMP */ } diff -Naur linux-2.4.7/kernel/timer.c linux-2.4.7-mq/kernel/timer.c --- linux-2.4.7/kernel/timer.c Tue Jun 12 23:40:11 2001 +++ linux-2.4.7-mq/kernel/timer.c Mon Aug 6 15:59:32 2001 @@ -587,6 +587,11 @@ p->counter = 0; p->need_resched = 1; } +#ifdef CONFIG_SMP + if (curr_na_goodness(cpu) != NA_GOODNESS_INIT) { + curr_na_goodness(cpu) = na_goodness(p); + } +#endif if (p->nice > 0) kstat.per_cpu_nice[cpu] += user_tick; else
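
For reference, the nr_pending estimate that sys_sched_yield() computes above is just the realtime queue length plus, for every CPU, its queue length minus one when that CPU is running something other than its idle thread (the running task stays on its runqueue in this design). A quick worked example with made-up counter values:

#include <stdio.h>

#define NCPUS 4

int main(void)
{
        int nt_running[NCPUS]       = { 2, 0, 1, 3 };   /* tasks queued per CPU */
        int running_non_idle[NCPUS] = { 1, 0, 1, 1 };   /* 1 if not running idle */
        int rt_queue_len = 1;                           /* realtime runqueue */
        int nr_pending = rt_queue_len;
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++)
                nr_pending += nt_running[cpu] - running_non_idle[cpu];

        /* 1 + (2-1) + (0-0) + (1-1) + (3-1) = 4 tasks waiting for a CPU */
        printf("nr_pending = %d\n", nr_pending);
        return 0;
}
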