[PATCH] (2/3) Initial load balancing

author Martin J. Bligh <mbligh@aracnet.com>

Thu, 16 Jan 2003 03:46:10 +0000 (19:46 -0800)

committer Justin T. Gibbs <gibbs@overdrive.btc.adaptec.com>

Thu, 16 Jan 2003 03:46:10 +0000 (19:46 -0800)
author Martin J. Bligh <mbligh@aracnet.com>
Thu, 16 Jan 2003 03:46:10 +0000 (19:46 -0800)
committer Justin T. Gibbs <gibbs@overdrive.btc.adaptec.com>
Thu, 16 Jan 2003 03:46:10 +0000 (19:46 -0800)
diff --git a/fs/exec.c b/fs/exec.c

index 4bc2f4d75f25a7fd4348e3f0424f7958e393dc72..92716bc7c10b9119be881743a2ecd5c4a5776f17 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1031,6 +1031,8 @@ int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs
         int retval;
         int i;
  
+       sched_balance_exec();
+
         file = open_exec(filename);
  
         retval = PTR_ERR(file);
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 931cdf559eb29564eb1c23534a77c961e59d4011..15a951d2d27e7fc6e927ae55b0ca9fe8e87c77ed 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -447,6 +447,14 @@ extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
  # define set_cpus_allowed(p, new_mask) do { } while (0)
  #endif
  
+#ifdef CONFIG_NUMA
+extern void sched_balance_exec(void);          /* called from do_execve() */
+extern void node_nr_running_init(void);        /* called from do_pre_smp_initcalls() */
+#else /* !CONFIG_NUMA: no-op stubs, usable as a single statement in if/else */
+#define sched_balance_exec()   do { } while (0)
+#define node_nr_running_init() do { } while (0)
+#endif /* CONFIG_NUMA */
+
  extern void set_user_nice(task_t *p, long nice);
  extern int task_prio(task_t *p);
  extern int task_nice(task_t *p);
diff --git a/init/main.c b/init/main.c

index ec70f2fa704c4aaa81308cb151a8cd698487238a..493e3715830ba853c60d58e9d2f353e088bdda5b 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -495,6 +495,7 @@ static void do_pre_smp_initcalls(void)
  
         migration_init();
  #endif
+       node_nr_running_init();
         spawn_ksoftirqd();
  }
  
diff --git a/kernel/sched.c b/kernel/sched.c

index dd13ece4708916657c478d498b066253ef352278..5454656e15196d1dc24f15dcd6196e0560c9de5a 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -153,7 +153,9 @@ struct runqueue {
         task_t *curr, *idle;
         prio_array_t *active, *expired, arrays[2];
         int prev_nr_running[NR_CPUS];
-
+#ifdef CONFIG_NUMA
+       atomic_t *node_nr_running;
+#endif
         task_t *migration_thread;
         struct list_head migration_queue;
  
@@ -177,6 +179,48 @@ static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
  # define task_running(rq, p)           ((rq)->curr == (p))
  #endif
  
+#ifdef CONFIG_NUMA
+
+/*
+ * Keep track of the number of running tasks on each node.
+ */
+
+static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
+       {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
+
+static inline void nr_running_init(struct runqueue *rq)
+{
+       rq->node_nr_running = &node_nr_running[0];      /* node 0 for now; remapped by node_nr_running_init() */
+}
+
+static inline void nr_running_inc(runqueue_t *rq)
+{
+       atomic_inc(rq->node_nr_running);        /* counter shared by all CPUs of the node */
+       rq->nr_running++;                       /* this runqueue's own count */
+}
+
+static inline void nr_running_dec(runqueue_t *rq)
+{
+       atomic_dec(rq->node_nr_running);        /* counter shared by all CPUs of the node */
+       rq->nr_running--;                       /* this runqueue's own count */
+}
+
+__init void node_nr_running_init(void)
+{
+       int cpu;        /* walks every possible CPU slot */
+
+       for (cpu = 0; cpu < NR_CPUS; cpu++)     /* bind each runqueue to its node's counter */
+               cpu_rq(cpu)->node_nr_running = node_nr_running + __cpu_to_node(cpu);
+}
+
+#else /* !CONFIG_NUMA */
+
+# define nr_running_init(rq)   do { } while (0)
+# define nr_running_inc(rq)    do { (rq)->nr_running++; } while (0)
+# define nr_running_dec(rq)    do { (rq)->nr_running--; } while (0)
+
+#endif /* CONFIG_NUMA */
+
  /*
   * task_rq_lock - lock the runqueue a given task resides on and disable
   * interrupts.  Note the ordering: we can safely lookup the task_rq without
@@ -294,7 +338,7 @@ static inline void activate_task(task_t *p, runqueue_t *rq)
                 p->prio = effective_prio(p);
         }
         enqueue_task(p, array);
-       rq->nr_running++;
+       nr_running_inc(rq);
  }
  
  /*
@@ -302,7 +346,7 @@ static inline void activate_task(task_t *p, runqueue_t *rq)
   */
  static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
  {
-       rq->nr_running--;
+       nr_running_dec(rq);
         if (p->state == TASK_UNINTERRUPTIBLE)
                 rq->nr_uninterruptible++;
         dequeue_task(p, p->array);
@@ -624,7 +668,72 @@ static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
                 spin_unlock(&rq2->lock);
  }
  
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_NUMA
+/*
+ * If dest_cpu is allowed for this process, migrate the task to it.
+ * This is accomplished by forcing the cpus_allowed mask to only
+ * allow dest_cpu, which will force the task onto dest_cpu.  Then
+ * the cpus_allowed mask is restored.
+ */
+static void sched_migrate_task(task_t *p, int dest_cpu)
+{
+       unsigned long old_mask;
+
+       old_mask = p->cpus_allowed;
+       if (!(old_mask & (1UL << dest_cpu)))
+               return;         /* dest_cpu is not permitted: leave the task alone */
+       /* force the process onto the specified CPU */
+       set_cpus_allowed(p, 1UL << dest_cpu);
+
+       /* restore the cpus allowed mask */
+       set_cpus_allowed(p, old_mask);
+}
+
+/*
+ * Find the least loaded CPU on the least loaded node, ignoring nodes
+ * without CPUs.  Stay put if the current CPU has <= 2 running tasks.
+ */
+static int sched_best_cpu(struct task_struct *p)
+{
+       int i, minload, load, best_cpu, node = 0;
+       unsigned long cpumask;
+
+       best_cpu = task_cpu(p);
+       if (cpu_rq(best_cpu)->nr_running <= 2)
+               return best_cpu;
+
+       minload = 10000000;
+       for (i = 0; i < numnodes; i++) {
+               load = atomic_read(&node_nr_running[i]);
+               if (load < minload && __node_to_cpu_mask(i)) { /* skip CPU-less nodes */
+                       minload = load;
+                       node = i;
+               }
+       }
+
+       minload = 10000000;
+       cpumask = __node_to_cpu_mask(node);
+       for (i = 0; i < NR_CPUS; ++i) {
+               if (!(cpumask & (1UL << i)))
+                       continue;       /* CPU i does not belong to the chosen node */
+               if (cpu_rq(i)->nr_running < minload) {
+                       best_cpu = i;
+                       minload = cpu_rq(i)->nr_running;
+               }
+       }
+       return best_cpu;        /* still task_cpu(p) if no CPU was found */
+}
+
+void sched_balance_exec(void)
+{
+       int target;
+
+       if (numnodes <= 1)      /* a single node: nothing to balance across */
+               return;
+       target = sched_best_cpu(current);
+       if (target != smp_processor_id())       /* move only if another CPU was chosen */
+               sched_migrate_task(current, target);
+}
  
  static inline unsigned long cpus_to_balance(int this_cpu)
  {
@@ -752,9 +861,9 @@ out:
  static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
  {
         dequeue_task(p, src_array);
-       src_rq->nr_running--;
+       nr_running_dec(src_rq);
         set_task_cpu(p, this_cpu);
-       this_rq->nr_running++;
+       nr_running_inc(this_rq);
         enqueue_task(p, this_rq->active);
         /*
          * Note that idle threads have a prio of MAX_PRIO, for this test
@@ -2248,6 +2357,7 @@ void __init sched_init(void)
                 spin_lock_init(&rq->lock);
                 INIT_LIST_HEAD(&rq->migration_queue);
                 atomic_set(&rq->nr_iowait, 0);
+               nr_running_init(rq);
  
                 for (j = 0; j < 2; j++) {
                         array = rq->arrays + j;
author	Martin J. Bligh <mbligh@aracnet.com>
	Thu, 16 Jan 2003 03:46:10 +0000 (19:46 -0800)
committer	Justin T. Gibbs <gibbs@overdrive.btc.adaptec.com>
	Thu, 16 Jan 2003 03:46:10 +0000 (19:46 -0800)
fs/exec.c		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
init/main.c		patch \| blob \| history
kernel/sched.c		patch \| blob \| history