git.hungrycats.org Git - linux/commitdiff
v2.4.9.10 -> v2.4.9.11
author Linus Torvalds <torvalds@athlon.transmeta.com>
Tue, 5 Feb 2002 04:18:55 +0000 (20:18 -0800)
committer Linus Torvalds <torvalds@athlon.transmeta.com>
Tue, 5 Feb 2002 04:18:55 +0000 (20:18 -0800)
  - Neil Brown: md cleanups/fixes
  - Andrew Morton: console locking merge
  - Andrea Arcangeli: major VM merge
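
The console locking merge replaces the IRQ-protected console_lock spinlock with the sleeping console_sem semaphore. A minimal sketch of the new calling convention, based on the drivers/char/console.c changes further down in this diff (the wrapper function is illustrative, not part of the patch):

/*
 * Sketch: code that touches virtual-console state now takes console_sem
 * instead of spin_lock_irq(&console_lock), and may voluntarily reschedule
 * in the middle of long writes.  May sleep, so not usable from interrupt
 * context.
 */
static void example_console_update(void)
{
        acquire_console_sem();
        /* ... modify VC state, scroll, write characters ... */
        console_conditional_schedule();  /* yield during long console writes */
        release_console_sem();
}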

149 files changed:
Documentation/DocBook/Makefile
Makefile
arch/alpha/kernel/irq.c
arch/alpha/kernel/irq_alpha.c
arch/alpha/kernel/process.c
arch/alpha/kernel/traps.c
arch/alpha/mm/fault.c
arch/arm/kernel/init_task.c
arch/cris/kernel/process.c
arch/i386/kernel/init_task.c
arch/i386/kernel/io_apic.c
arch/i386/kernel/process.c
arch/i386/kernel/smp.c
arch/i386/kernel/smpboot.c
arch/i386/kernel/traps.c
arch/i386/mm/extable.c
arch/i386/mm/fault.c
arch/ia64/kernel/init_task.c
arch/m68k/kernel/process.c
arch/mips/kernel/init_task.c
arch/mips64/kernel/init_task.c
arch/mips64/mm/fault.c
arch/parisc/kernel/init_task.c
arch/parisc/kernel/pdc_cons.c
arch/ppc/Makefile
arch/ppc/amiga/config.c
arch/ppc/kernel/ppc_ksyms.c
arch/ppc/kernel/process.c
arch/s390/kernel/init_task.c
arch/s390x/kernel/init_task.c
arch/sh/kernel/init_task.c
arch/sparc/kernel/init_task.c
arch/sparc/lib/debuglocks.c
arch/sparc64/kernel/init_task.c
drivers/block/ll_rw_blk.c
drivers/block/loop.c
drivers/block/rd.c
drivers/char/console.c
drivers/char/keyboard.c
drivers/char/pc_keyb.c
drivers/char/raw.c
drivers/char/serial.c
drivers/char/vc_screen.c
drivers/char/vt.c
drivers/ide/ide.c
drivers/md/lvm.c
drivers/md/md.c
drivers/net/3c59x.c
drivers/net/eepro100.c
drivers/sbus/char/sunkbd.c
drivers/scsi/megaraid.c
drivers/scsi/qla1280.h
drivers/video/fbcon.c
fs/block_dev.c
fs/buffer.c
fs/dcache.c
fs/devices.c
fs/exec.c
fs/ext2/dir.c
fs/ext2/fsync.c
fs/ext2/inode.c
fs/fcntl.c
fs/file_table.c
fs/inode.c
fs/open.c
fs/proc/kmsg.c
fs/proc/proc_misc.c
fs/reiserfs/file.c
fs/reiserfs/inode.c
fs/super.c
include/asm-alpha/fcntl.h
include/asm-alpha/processor.h
include/asm-arm/processor.h
include/asm-cris/processor.h
include/asm-i386/fcntl.h
include/asm-i386/hw_irq.h
include/asm-i386/irq.h
include/asm-i386/kmap_types.h
include/asm-i386/page.h
include/asm-i386/pgalloc.h
include/asm-i386/processor.h
include/asm-ia64/processor.h
include/asm-m68k/processor.h
include/asm-mips/processor.h
include/asm-mips64/processor.h
include/asm-parisc/processor.h
include/asm-ppc/kmap_types.h
include/asm-ppc/processor.h
include/asm-s390/processor.h
include/asm-s390x/processor.h
include/asm-sh/processor.h
include/asm-sparc/fcntl.h
include/asm-sparc/kmap_types.h
include/asm-sparc/processor.h
include/asm-sparc64/fcntl.h
include/asm-sparc64/processor.h
include/linux/blkdev.h
include/linux/cache.h
include/linux/console.h
include/linux/ext2_fs_i.h
include/linux/fs.h
include/linux/highmem.h
include/linux/irq.h
include/linux/kbd_kern.h
include/linux/kernel.h
include/linux/list.h
include/linux/loop.h
include/linux/lvm.h
include/linux/mm.h
include/linux/mmzone.h
include/linux/pagemap.h
include/linux/rbtree.h [new file with mode: 0644]
include/linux/sched.h
include/linux/slab.h
include/linux/swap.h
include/linux/swapctl.h
include/linux/timer.h
kernel/Makefile
kernel/fork.c
kernel/ksyms.c
kernel/panic.c
kernel/printk.c
kernel/sched.c
kernel/signal.c
kernel/sysctl.c
lib/Makefile
lib/bust_spinlocks.c [new file with mode: 0644]
lib/rbtree.c [new file with mode: 0644]
mm/filemap.c
mm/highmem.c
mm/memory.c
mm/mlock.c
mm/mmap.c
mm/mmap_avl.c [deleted file]
mm/mprotect.c
mm/mremap.c
mm/numa.c
mm/oom_kill.c
mm/page_alloc.c
mm/shmem.c
mm/slab.c
mm/swap.c
mm/swap_state.c
mm/swapfile.c
mm/vmalloc.c
mm/vmscan.c
net/core/dev.c
net/ipv4/tcp.c
net/ipv6/ip6_output.c

index 5e1d047b109727d24a97c0ddcb9b64e31a18a8f7..1d5e0797ecab93051e53c797f22966f92e3b17d8 100644 (file)
@@ -88,6 +88,7 @@ APISOURCES := $(TOPDIR)/drivers/media/video/videodev.c \
                $(TOPDIR)/arch/i386/kernel/mca.c \
                $(TOPDIR)/arch/i386/kernel/mtrr.c \
                $(TOPDIR)/drivers/char/misc.c \
+               $(TOPDIR)/kernel/printk.c \
                $(TOPDIR)/drivers/net/net_init.c \
                $(TOPDIR)/drivers/net/8390.c \
                $(TOPDIR)/drivers/char/serial.c \
index 17482ca0940c963d92472b2dce072f72b1f6279d..b0dacf88eaf33f5f838f6b6a806fc0a47d264691 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 10
-EXTRAVERSION =-pre10
+EXTRAVERSION =-pre11
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
index 6ecca515c540bdcc74d25bd5b8b20e369a09c2f2..a84e686673d1103e6f4c9140dfff23238fa7c34c 100644 (file)
@@ -38,7 +38,7 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
 
 static void register_irq_proc(unsigned int irq);
 
-unsigned long irq_err_count;
+volatile unsigned long irq_err_count;
 
 /*
  * Special irq handlers.
index 6e8056ad7023800f8d02c754d3a0fb761522bf25..91e99f573436dd712f87f105ae3699b2b249ccbc 100644 (file)
@@ -18,8 +18,6 @@
 unsigned long __irq_attempt[NR_IRQS];
 #endif
 
-extern unsigned long irq_err_count;
-
 /* Hack minimum IPL during interrupt processing for broken hardware.  */
 #ifdef CONFIG_ALPHA_BROKEN_IRQ_MASK
 int __min_ipl;
index 12d32c7a527a691a0ea0df733492657b1528fa24..4b655b3430a295649a044c8f8a81f1b572833e3e 100644 (file)
@@ -50,7 +50,6 @@
  */
 
 unsigned long init_user_stack[1024] = { STACK_MAGIC, };
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
@@ -213,7 +212,9 @@ machine_power_off(void)
 void
 show_regs(struct pt_regs * regs)
 {
-       printk("\nps: %04lx pc: [<%016lx>]\n", regs->ps, regs->pc);
+       printk("\n");
+       printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
+       printk("ps: %04lx pc: [<%016lx>] CPU %d\n", regs->ps, regs->pc, smp_processor_id());
        printk("rp: [<%016lx>] sp: %p\n", regs->r26, regs+1);
        printk(" r0: %016lx  r1: %016lx  r2: %016lx  r3: %016lx\n",
               regs->r0, regs->r1, regs->r2, regs->r3);
index e678a205b5a24e86abb017eb6a63c1b39f5817c3..ca4d01f5cf0b49e9fd5bbfeb9b9f57ae1f2273b4 100644 (file)
@@ -128,6 +128,18 @@ dik_show_trace(unsigned long *sp)
        printk("\n");
 }
 
+void show_trace_task(struct task_struct * tsk)
+{
+       struct thread_struct * thread = &tsk->thread;
+       unsigned long fp, sp = thread->ksp, base = (unsigned long) thread;
+       if (sp > base && sp+6*8 < base + 16*1024) {
+               fp = ((unsigned long*)sp)[6];
+               if (fp > sp && fp < base + 16*1024)
+                       dik_show_trace((unsigned long *)fp);
+       }
+}
+
 int kstack_depth_to_print = 24;
 
 void show_stack(unsigned long *sp)
@@ -299,6 +311,7 @@ do_entIF(unsigned long type, unsigned long a1,
              case 3: /* FEN fault */
              case 5: /* illoc */
              default: /* unexpected instruction-fault type */
+                     ;
        }
        send_sig(SIGILL, current, 1);
 }
index f27f0a9f2ac4f2032a30ca4481bdfe1ccdb6d7d4..ca766f2050c65d525aa54957bb08d34368941cb4 100644 (file)
@@ -140,6 +140,7 @@ good_area:
                        goto bad_area;
        }
 
+ survive:
        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
@@ -194,6 +195,12 @@ no_context:
  * us unable to handle the page fault gracefully.
  */
 out_of_memory:
+       if (current->pid == 1) {
+               current->policy |= SCHED_YIELD;
+               schedule();
+               down_read(&mm->mmap_sem);
+               goto survive;
+       }
        printk(KERN_ALERT "VM: killing process %s(%d)\n",
               current->comm, current->pid);
        if (!user_mode(regs))
index 480c025fba9b6e2b247a53bbd43201432a1b3f87..7245a45b0ab77dae863b8375ba0b935d3437396c 100644 (file)
@@ -9,7 +9,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index f0ab61b9cf17c028cb68faa70eb16edd9ea00c45..8658fe615ffd8e7ba80370969bc5052ee6657f9f 100644 (file)
@@ -65,7 +65,6 @@
  * setup.
  */
 
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index 84fba51064446cdd0b37542023e32381fde15bbf..7779809ef2ccdd8fdc4af2e06b14484213914b6c 100644 (file)
@@ -6,7 +6,6 @@
 #include <asm/pgtable.h>
 #include <asm/desc.h>
 
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index a24e58d47db1913e65a5d614d77aa73b8f0c344a..bdff768554f7b50445f0924c17790c591e5aa794 100644 (file)
@@ -154,14 +154,17 @@ static void unmask_IO_APIC_irq (unsigned int irq)
 void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
        struct IO_APIC_route_entry entry;
+       unsigned long flags;
 
        /*
         * Disable it in the IO-APIC irq-routing table:
         */
        memset(&entry, 0, sizeof(entry));
        entry.mask = 1;
+       spin_lock_irqsave(&ioapic_lock, flags);
        io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
        io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
+       spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
 static void clear_IO_APIC (void)
@@ -595,6 +598,7 @@ void __init setup_IO_APIC_irqs(void)
 {
        struct IO_APIC_route_entry entry;
        int apic, pin, idx, irq, first_notcon = 1, vector;
+       unsigned long flags;
 
        printk(KERN_DEBUG "init IO_APIC IRQs\n");
 
@@ -650,8 +654,10 @@ void __init setup_IO_APIC_irqs(void)
                        if (!apic && (irq < 16))
                                disable_8259A_irq(irq);
                }
+               spin_lock_irqsave(&ioapic_lock, flags);
                io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
                io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+               spin_unlock_irqrestore(&ioapic_lock, flags);
        }
        }
 
@@ -666,6 +672,7 @@ void __init setup_IO_APIC_irqs(void)
 void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
 {
        struct IO_APIC_route_entry entry;
+       unsigned long flags;
 
        memset(&entry,0,sizeof(entry));
 
@@ -695,8 +702,10 @@ void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
        /*
         * Add it to the IO-APIC irq-routing table:
         */
+       spin_lock_irqsave(&ioapic_lock, flags);
        io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
        io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+       spin_unlock_irqrestore(&ioapic_lock, flags);
 
        enable_8259A_irq(0);
 }
@@ -713,6 +722,7 @@ void __init print_IO_APIC(void)
        struct IO_APIC_reg_00 reg_00;
        struct IO_APIC_reg_01 reg_01;
        struct IO_APIC_reg_02 reg_02;
+       unsigned long flags;
 
        printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
        for (i = 0; i < nr_ioapics; i++)
@@ -727,10 +737,12 @@ void __init print_IO_APIC(void)
 
        for (apic = 0; apic < nr_ioapics; apic++) {
 
+       spin_lock_irqsave(&ioapic_lock, flags);
        *(int *)&reg_00 = io_apic_read(apic, 0);
        *(int *)&reg_01 = io_apic_read(apic, 1);
        if (reg_01.version >= 0x10)
                *(int *)&reg_02 = io_apic_read(apic, 2);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
 
        printk("\n");
        printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
@@ -778,8 +790,10 @@ void __init print_IO_APIC(void)
        for (i = 0; i <= reg_01.entries; i++) {
                struct IO_APIC_route_entry entry;
 
+               spin_lock_irqsave(&ioapic_lock, flags);
                *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
                *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
 
                printk(KERN_DEBUG " %02x %03X %02X  ",
                        i,
@@ -956,6 +970,7 @@ static void __init enable_IO_APIC(void)
 {
        struct IO_APIC_reg_01 reg_01;
        int i;
+       unsigned long flags;
 
        for (i = 0; i < PIN_MAP_SIZE; i++) {
                irq_2_pin[i].pin = -1;
@@ -969,7 +984,9 @@ static void __init enable_IO_APIC(void)
         * The number of IO-APIC IRQ registers (== #pins):
         */
        for (i = 0; i < nr_ioapics; i++) {
+               spin_lock_irqsave(&ioapic_lock, flags);
                *(int *)&reg_01 = io_apic_read(i, 1);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
                nr_ioapic_registers[i] = reg_01.entries+1;
        }
 
@@ -1006,6 +1023,7 @@ static void __init setup_ioapic_ids_from_mpc (void)
        int apic;
        int i;
        unsigned char old_id;
+       unsigned long flags;
 
        /*
         * Set the IOAPIC ID to the value stored in the MPC table.
@@ -1013,7 +1031,9 @@ static void __init setup_ioapic_ids_from_mpc (void)
        for (apic = 0; apic < nr_ioapics; apic++) {
 
                /* Read the register 0 value */
+               spin_lock_irqsave(&ioapic_lock, flags);
                *(int *)&reg_00 = io_apic_read(apic, 0);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
                
                old_id = mp_ioapics[apic].mpc_apicid;
 
@@ -1062,12 +1082,16 @@ static void __init setup_ioapic_ids_from_mpc (void)
                                        mp_ioapics[apic].mpc_apicid);
 
                reg_00.ID = mp_ioapics[apic].mpc_apicid;
+               spin_lock_irqsave(&ioapic_lock, flags);
                io_apic_write(apic, 0, *(int *)&reg_00);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
 
                /*
                 * Sanity check
                 */
+               spin_lock_irqsave(&ioapic_lock, flags);
                *(int *)&reg_00 = io_apic_read(apic, 0);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
                if (reg_00.ID != mp_ioapics[apic].mpc_apicid)
                        panic("could not set ID!\n");
                else
@@ -1416,13 +1440,16 @@ static inline void unlock_ExtINT_logic(void)
        int pin, i;
        struct IO_APIC_route_entry entry0, entry1;
        unsigned char save_control, save_freq_select;
+       unsigned long flags;
 
        pin = find_isa_irq_pin(8, mp_INT);
        if (pin == -1)
                return;
 
+       spin_lock_irqsave(&ioapic_lock, flags);
        *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin);
        *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
        clear_IO_APIC_pin(0, pin);
 
        memset(&entry1, 0, sizeof(entry1));
@@ -1435,8 +1462,10 @@ static inline void unlock_ExtINT_logic(void)
        entry1.trigger = 0;
        entry1.vector = 0;
 
+       spin_lock_irqsave(&ioapic_lock, flags);
        io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
        io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
+       spin_unlock_irqrestore(&ioapic_lock, flags);
 
        save_control = CMOS_READ(RTC_CONTROL);
        save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
@@ -1455,8 +1484,10 @@ static inline void unlock_ExtINT_logic(void)
        CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
        clear_IO_APIC_pin(0, pin);
 
+       spin_lock_irqsave(&ioapic_lock, flags);
        io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
        io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
+       spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
 /*
index d0ef4d2108ee7f0d01a011fcb49dea1442746307..5ef93e608306cbc3d264cad4f4b323c4db89c26b 100644 (file)
@@ -391,6 +391,7 @@ void show_regs(struct pt_regs * regs)
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
 
        printk("\n");
+       printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
        printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs,regs->eip, smp_processor_id());
        if (regs->xcs & 3)
                printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
index 5c824047a608481d3b7187421d5dc9410946b2a2..a738a3a415e5d95da537543f28a7d41486f126c6 100644 (file)
@@ -468,6 +468,7 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
 
        spin_lock_bh(&call_lock);
        call_data = &data;
+       wmb();
        /* Send a message to all other CPUs and wait for them to respond */
        send_IPI_allbutself(CALL_FUNCTION_VECTOR);
 
@@ -531,12 +532,15 @@ asmlinkage void smp_call_function_interrupt(void)
         * Notify initiating CPU that I've grabbed the data and am
         * about to execute the function
         */
+       mb();
        atomic_inc(&call_data->started);
        /*
         * At this point the info structure may be out of scope unless wait==1
         */
        (*func)(info);
-       if (wait)
+       if (wait) {
+               mb();
                atomic_inc(&call_data->finished);
+       }
 }
 
index a16965ac57e33b40d27cf73eb1eaeda1975b6b9d..9d235f0caa32d8242911b67b7115f82c9b4d0127 100644 (file)
@@ -66,7 +66,7 @@ static volatile unsigned long cpu_callin_map;
 static volatile unsigned long cpu_callout_map;
 
 /* Per CPU bogomips and other parameters */
-struct cpuinfo_x86 cpu_data[NR_CPUS];
+struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 
 /* Set when the idlers are all forked */
 int smp_threads_ready;
index dc0655ee23480f9db392e4c91803588fe42a89fb..58b8d59d4c61b2d29c1ae7a9afd2d37313d8057e 100644 (file)
@@ -64,8 +64,6 @@ struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
  */
 struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
 
-extern void bust_spinlocks(void);
-
 asmlinkage void divide_error(void);
 asmlinkage void debug(void);
 asmlinkage void nmi(void);
@@ -245,9 +243,10 @@ void die(const char * str, struct pt_regs * regs, long err)
 {
        console_verbose();
        spin_lock_irq(&die_lock);
+       bust_spinlocks(1);
        printk("%s: %04lx\n", str, err & 0xffff);
        show_registers(regs);
-
+       bust_spinlocks(0);
        spin_unlock_irq(&die_lock);
        do_exit(SIGSEGV);
 }
@@ -433,26 +432,50 @@ __setup("nmi_watchdog=", setup_nmi_watchdog);
 
 static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
 
-inline void nmi_watchdog_tick(struct pt_regs * regs)
+static unsigned int
+       last_irq_sums [NR_CPUS],
+       alert_counter [NR_CPUS];
+
+/*
+ * Sometimes, we know that we're disabling interrupts for too long.
+ * This happens during long writes to slow console devices, and may
+ * happen in other places.
+ *
+ * To prevent the NMI watchdog from firing when we're doing these things,
+ * touch_nmi_watchdog() may be used to reset the NMI watchdog timer
+ * back to its full interval (five seconds).
+ */
+void touch_nmi_watchdog (void)
 {
+       int i;
+
        /*
-        * the best way to detect wether a CPU has a 'hard lockup' problem
-        * is to check it's local APIC timer IRQ counts. If they are not
-        * changing then that CPU has some problem.
-        *
-        * as these watchdog NMI IRQs are broadcasted to every CPU, here
-        * we only have to check the current processor.
-        *
-        * since NMIs dont listen to _any_ locks, we have to be extremely
-        * careful not to rely on unsafe variables. The printk might lock
-        * up though, so we have to break up console_lock first ...
-        * [when there will be more tty-related locks, break them up
-        *  here too!]
+        * Just reset the alert counters, (other CPUs might be
+        * spinning on locks we hold):
         */
+       for (i = 0; i < smp_num_cpus; i++)
+               alert_counter[i] = 0;
+}
 
-       static unsigned int last_irq_sums [NR_CPUS],
-                               alert_counter [NR_CPUS];
+/*
+ * The best way to detect whether a CPU has a 'hard lockup' problem
+ * is to check it's local APIC timer IRQ counts. If they are not
+ * changing then that CPU has some problem.
+ *
+ * As these watchdog NMI IRQs are generated on every CPU, we only
+ * have to check the current processor.
+ *
+ * Since NMIs don't listen to _any_ locks, we have to be extremely
+ * careful not to rely on unsafe variables. The printk path might lock
+ * up though, so we use bust_spinlocks() to break up any console
+ * locks first.  There may be other tty-related locks which require
+ * breaking as well.  They can be broken in bust_spinlocks(), or the
+ * global variable `oops_in_progress' may be used to bypass the
+ * tty locking.
+ */
 
+inline void nmi_watchdog_tick(struct pt_regs * regs)
+{
        /*
         * Since current-> is always on the stack, and we always switch
         * the stack NMI-atomically, it's safe to use smp_processor_id().
@@ -473,12 +496,13 @@ inline void nmi_watchdog_tick(struct pt_regs * regs)
                         * We are in trouble anyway, lets at least try
                         * to get a message out.
                         */
-                       bust_spinlocks();
+                       bust_spinlocks(1);
                        printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu);
                        show_registers(regs);
                        printk("console shuts up ...\n");
                        console_silent();
                        spin_unlock(&nmi_print_lock);
+                       bust_spinlocks(0);
                        do_exit(SIGSEGV);
                }
        } else {
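
The hunk above adds a touch_nmi_watchdog() helper for code that deliberately keeps interrupts disabled for a long time (slow console writes, for example). A minimal sketch of how a caller might use it — the driver function and loop are hypothetical; only touch_nmi_watchdog() comes from this patch:

/*
 * Sketch: a long polled operation with interrupts off periodically resets
 * the per-CPU NMI alert counters so the watchdog does not declare a lockup.
 */
extern void touch_nmi_watchdog(void);   /* added in arch/i386/kernel/traps.c above */

static void slow_polled_write(const char *buf, int len)
{
        int i;

        for (i = 0; i < len; i++) {
                /* ... bang a slow device register with interrupts disabled ... */
                touch_nmi_watchdog();
        }
}
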
index 223d05af4e0dec69c720bd8c9fc2af2c24374038..4cd9f064c37be6211fe967b12b815dcde663f959 100644 (file)
@@ -37,13 +37,13 @@ unsigned long
 search_exception_table(unsigned long addr)
 {
        unsigned long ret = 0;
-       unsigned long flags;
        
 #ifndef CONFIG_MODULES
        /* There is only the kernel to search.  */
        ret = search_one_table(__start___ex_table, __stop___ex_table-1, addr);
        return ret;
 #else
+       unsigned long flags;
        /* The kernel is the last "module" -- no need to treat it special.  */
        struct module *mp;
 
index 0385a00d0086461af0fb74a3c74c30e07c48c0b2..c58d8b38419d76be39bc2969f42e836afb0a62ca 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/smp_lock.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
+#include <linux/vt_kern.h>             /* For unblank_screen() */
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -25,6 +26,8 @@
 
 extern void die(const char *,struct pt_regs *,long);
 
+extern int console_loglevel;
+
 /*
  * Ugly, ugly, but the goto's result in better assembly..
  */
@@ -51,8 +54,14 @@ good_area:
        start &= PAGE_MASK;
 
        for (;;) {
-               if (handle_mm_fault(current->mm, vma, start, 1) <= 0)
-                       goto bad_area;
+       survive:
+               {
+                       int fault = handle_mm_fault(current->mm, vma, start, 1);
+                       if (!fault)
+                               goto bad_area;
+                       if (fault < 0)
+                               goto out_of_memory;
+               }
                if (!size)
                        break;
                size--;
@@ -75,21 +84,57 @@ check_stack:
 
 bad_area:
        return 0;
+
+out_of_memory:
+       if (current->pid == 1) {
+               current->policy |= SCHED_YIELD;
+               schedule();
+               goto survive;
+       }
+       goto bad_area;
 }
 
-extern spinlock_t console_lock, timerlist_lock;
+extern spinlock_t timerlist_lock;
 
 /*
  * Unlock any spinlocks which will prevent us from getting the
  * message out (timerlist_lock is acquired through the
  * console unblank code)
  */
-void bust_spinlocks(void)
+void bust_spinlocks(int yes)
 {
-       spin_lock_init(&console_lock);
        spin_lock_init(&timerlist_lock);
+       if (yes) {
+               oops_in_progress = 1;
+#ifdef CONFIG_SMP
+               global_irq_lock = 0;    /* Many serial drivers do __global_cli() */
+#endif
+       } else {
+               int loglevel_save = console_loglevel;
+               unblank_screen();
+               oops_in_progress = 0;
+               /*
+                * OK, the message is on the console.  Now we call printk()
+                * without oops_in_progress set so that printk will give klogd
+                * a poke.  Hold onto your hats...
+                */
+               console_loglevel = 15;          /* NMI oopser may have shut the console up */
+               printk(" ");
+               console_loglevel = loglevel_save;
+       }
 }
 
+#if 0
+/*
+ * Verbose bug reporting: call do_BUG(__FILE__, __LINE__) in page.h:BUG() to enable this
+ */
+void do_BUG(const char *file, int line)
+{
+       bust_spinlocks(1);
+       printk("kernel BUG at %s:%d!\n", file, line);
+}
+#endif
+
 asmlinkage void do_invalid_op(struct pt_regs *, unsigned long);
 extern unsigned long idt;
 
@@ -196,6 +241,7 @@ good_area:
                                goto bad_area;
        }
 
+ survive:
        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
@@ -271,7 +317,7 @@ no_context:
  * terminate things with extreme prejudice.
  */
 
-       bust_spinlocks();
+       bust_spinlocks(1);
 
        if (address < PAGE_SIZE)
                printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
@@ -290,6 +336,7 @@ no_context:
                printk(KERN_ALERT "*pte = %08lx\n", page);
        }
        die("Oops", regs, error_code);
+       bust_spinlocks(0);
        do_exit(SIGKILL);
 
 /*
@@ -298,6 +345,12 @@ no_context:
  */
 out_of_memory:
        up_read(&mm->mmap_sem);
+       if (tsk->pid == 1) {
+               tsk->policy |= SCHED_YIELD;
+               schedule();
+               down_read(&mm->mmap_sem);
+               goto survive;
+       }
        printk("VM: killing process %s\n", tsk->comm);
        if (error_code & 4)
                do_exit(SIGKILL);
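
bust_spinlocks() now takes an argument: bust_spinlocks(1) marks the start of an oops (setting oops_in_progress and breaking console locks) and bust_spinlocks(0) ends it (unblanking the screen and poking klogd). A minimal sketch of the bracketing pattern, mirroring the die() and fault paths changed in this diff — the function name and register dump here are illustrative:

/*
 * Sketch: an architecture oops path using the new two-state protocol.
 */
void example_die(const char *str, struct pt_regs *regs, long err)
{
        bust_spinlocks(1);              /* oops_in_progress = 1, free console locks */
        printk(KERN_ALERT "%s: %04lx\n", str, err & 0xffff);
        show_registers(regs);           /* arch-specific register dump */
        bust_spinlocks(0);              /* unblank screen, let printk poke klogd */
        do_exit(SIGSEGV);
}
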
index 12265046165be6d521653ee06701553b34843f4a..341e32e14ba81d3100b5eaf87a5d65394a91350d 100644 (file)
@@ -13,7 +13,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index 7fd40aaa4a05158ffd2a60b13fd03dfc11deaee3..6052ecd225a6b13c1df40e421ed4d48104e803cc 100644 (file)
@@ -38,7 +38,6 @@
  * alignment requirements and potentially different initial
  * setup.
  */
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index fbf5bf9f09cb870ca1273e1461091a31017701ba..ccef3ba6be861e97266817f665c6944f8b89aac8 100644 (file)
@@ -4,7 +4,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index 1e68e96d29262ebd96c5a5edcfba0836b3ffe37b..879214ff545cbb6439e5de4f1efb0fb5aab8a9ec 100644 (file)
@@ -4,7 +4,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index 573e7042a3cd4acc57df9f281238dd73f3a1950c..fe7601bf38b7b8c1a2d6f694714689a8cc3ae0fa 100644 (file)
@@ -58,7 +58,7 @@ dodebug2(abi64_no_regargs, struct pt_regs regs)
        printk("Got exception 0x%lx at 0x%lx\n", retaddr, regs.cp0_epc);
 }
 
-extern spinlock_t console_lock, timerlist_lock;
+extern spinlock_t timerlist_lock;
 
 /*
  * Unlock any spinlocks which will prevent us from getting the
@@ -226,6 +226,7 @@ no_context:
                (unsigned int) regs->regs[31]);
        die("Oops", regs, write);
        do_exit(SIGKILL);
+       bust_spinlocks(0);
 
 /*
  * We ran out of memory, or some other thing happened to us that made
index a838548de02da8ea867a5af1515743f684232967..45ed70c9c5adb67419ea76a31d61582a6285d371 100644 (file)
@@ -6,7 +6,6 @@
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index f2d45862bfd41192a08f4e5ab5ce8b536145e162..87567e6a1307cf384b4d6cebc7dc460a6ee95ab9 100644 (file)
@@ -141,11 +141,7 @@ void pdc_console_die(void)
        --pdc_console_initialized;
        
 #ifdef CONFIG_VT_CONSOLE
-       {
-           /* fixme (needed?): Wait for console-tasklet to finish !*/
-           extern struct tasklet_struct console_tasklet;
-           tasklet_schedule(&console_tasklet);
-       }
+       schedule_console_callback();
 #endif
 
        unregister_console(&pdc_cons);
index f6e5baa45608ba80dc419b0db782e6329910f298..0ace879ab6332cfbb0d169f0829d7d3db34680de 100644 (file)
@@ -87,6 +87,9 @@ checks:
 
 BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd
 
+# All the instructions talk about "make bzImage".
+bzImage: zImage
+
 $(BOOT_TARGETS): $(CHECKS) vmlinux
        @$(MAKEBOOT) $@
 
index 9c389fec3795545f7aa1e9c1b4dfe9aaa131ba85..9e12f97eac875f492f769532ce3312ef7f9617a7 100644 (file)
@@ -762,20 +762,6 @@ static int amiga_wait_key (struct console *co)
     return 0;
 }
 
-void dbprintf(const char *fmt , ...)
-{
-       static char buf[1024];
-       va_list args;
-       extern void console_print (const char *str);
-       extern int vsprintf(char * buf, const char * fmt, va_list args);
-
-       va_start(args, fmt);
-       vsprintf(buf, fmt, args);
-       va_end(args);
-
-       console_print (buf);
-}
-
 static NORET_TYPE void amiga_reset( void )
     ATTRIB_NORET;
 
index d86e534abb1ccf5bb4135448e67021501b6020ec..79c15c83ce808fbb43252d938744debaccbc19f9 100644 (file)
@@ -314,7 +314,6 @@ EXPORT_SYMBOL(ppc_irq_dispatch_handler);
 EXPORT_SYMBOL(tb_ticks_per_jiffy);
 EXPORT_SYMBOL(get_wchan);
 EXPORT_SYMBOL(console_drivers);
-EXPORT_SYMBOL(console_lock);
 #ifdef CONFIG_XMON
 EXPORT_SYMBOL(xmon);
 #endif
index df427dfa227a915c508d13159dedc021cbab4a4c..1ada6a519cbf6d2e1fa7d11accd046ba366d103b 100644 (file)
@@ -48,7 +48,6 @@ extern unsigned long _get_SP(void);
 
 struct task_struct *last_task_used_math = NULL;
 struct task_struct *last_task_used_altivec = NULL;
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index 3e2600776631ffc10b2e24a19c03b28eb3ca1169..630264ab0e91a26f9934ac359ead37d12a1ab1dd 100644 (file)
@@ -12,7 +12,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index 74cf730b0fc887e82291d147e562d7ba92719630..39d3825fd6d2cf6ad253c562716c509a3506dfc1 100644 (file)
@@ -12,7 +12,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index aacd8f304f9e40e0af1557bc217da6e2ee1ecde8..6f92b9ef5993b4326000c47c55c9c358c4396b0e 100644 (file)
@@ -5,7 +5,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index daa07bb78f8339d9440df0238d1d6e9e3929e611..5633c882f3278becf895dfa3aa0b42c66ba953e8 100644 (file)
@@ -4,7 +4,6 @@
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index b60cee65d623ae4cec10d14a5183fed0a381b60c..a7382681b0fcd24259be83e86ce7ca181c4bb662 100644 (file)
 static inline void show(char *str, spinlock_t *lock, unsigned long caller)
 {
        int cpu = smp_processor_id();
-       extern spinlock_t console_lock;
 
-       if (lock != &console_lock)
-               printk("%s(%p) CPU#%d stuck at %08lx, owner PC(%08lx):CPU(%lx)\n",str,
-                       lock, cpu, caller, lock->owner_pc & ~3, lock->owner_pc & 3);
+       printk("%s(%p) CPU#%d stuck at %08lx, owner PC(%08lx):CPU(%lx)\n",str,
+               lock, cpu, caller, lock->owner_pc & ~3, lock->owner_pc & 3);
 }
 
 static inline void show_read(char *str, rwlock_t *lock, unsigned long caller)
index f7e9ddc697b12b4880a817ed14865014451f08bc..e8ddf9f3ee105b373f50ef7bcd588e2548f56535 100644 (file)
@@ -4,7 +4,6 @@
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 
-static struct vm_area_struct init_mmap = INIT_MMAP;
 static struct fs_struct init_fs = INIT_FS;
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS;
index 9f1c8140d5e368e8bb670d2c26307cf36222021a..d1eb67387251ef8e82970e79b9ea6eb9dcc03811 100644 (file)
@@ -974,12 +974,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
        major = MAJOR(bhs[0]->b_dev);
 
        /* Determine correct block size for this device. */
-       correct_size = BLOCK_SIZE;
-       if (blksize_size[major]) {
-               i = blksize_size[major][MINOR(bhs[0]->b_dev)];
-               if (i)
-                       correct_size = i;
-       }
+       correct_size = get_hardsect_size(bhs[0]->b_dev);
 
        /* Verify requested block sizes. */
        for (i = 0; i < nr; i++) {
index d2b6c8f44752c2ed892d81db8dfc2d50835a7124..e5accb20dce24770d58fcb7f9218703f886e6be5 100644 (file)
@@ -87,10 +87,12 @@ static devfs_handle_t devfs_handle;      /*  For the directory */
 static int transfer_none(struct loop_device *lo, int cmd, char *raw_buf,
                         char *loop_buf, int size, int real_block)
 {
-       if (cmd == READ)
-               memcpy(loop_buf, raw_buf, size);
-       else
-               memcpy(raw_buf, loop_buf, size);
+       if (raw_buf != loop_buf) {
+               if (cmd == READ)
+                       memcpy(loop_buf, raw_buf, size);
+               else
+                       memcpy(raw_buf, loop_buf, size);
+       }
 
        return 0;
 }
@@ -118,6 +120,7 @@ static int transfer_xor(struct loop_device *lo, int cmd, char *raw_buf,
 
 static int none_status(struct loop_device *lo, struct loop_info *info)
 {
+       lo->lo_flags |= LO_FLAGS_BH_REMAP;
        return 0;
 }
 
@@ -314,9 +317,13 @@ static int do_bh_filebacked(struct loop_device *lo, struct buffer_head *bh, int
        return ret;
 }
 
+static void loop_end_io_transfer(struct buffer_head *bh, int uptodate);
 static void loop_put_buffer(struct buffer_head *bh)
 {
-       if (bh) {
+       /*
+        * check b_end_io, may just be a remapped bh and not an allocated one
+        */
+       if (bh && bh->b_end_io == loop_end_io_transfer) {
                __free_page(bh->b_page);
                kmem_cache_free(bh_cachep, bh);
        }
@@ -386,6 +393,14 @@ static struct buffer_head *loop_get_buffer(struct loop_device *lo,
 {
        struct buffer_head *bh;
 
+       /*
+        * for xfer_funcs that can operate on the same bh, do that
+        */
+       if (lo->lo_flags & LO_FLAGS_BH_REMAP) {
+               bh = rbh;
+               goto out_bh;
+       }
+
        do {
                bh = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
                if (bh)
@@ -398,9 +413,6 @@ static struct buffer_head *loop_get_buffer(struct loop_device *lo,
 
        bh->b_size = rbh->b_size;
        bh->b_dev = rbh->b_rdev;
-       spin_lock_irq(&lo->lo_lock);
-       bh->b_rdev = lo->lo_device;
-       spin_unlock_irq(&lo->lo_lock);
        bh->b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock);
 
        /*
@@ -419,9 +431,15 @@ static struct buffer_head *loop_get_buffer(struct loop_device *lo,
 
        bh->b_data = page_address(bh->b_page);
        bh->b_end_io = loop_end_io_transfer;
-       bh->b_rsector = rbh->b_rsector + (lo->lo_offset >> 9);
+       bh->b_private = rbh;
        init_waitqueue_head(&bh->b_wait);
 
+out_bh:
+       bh->b_rsector = rbh->b_rsector + (lo->lo_offset >> 9);
+       spin_lock_irq(&lo->lo_lock);
+       bh->b_rdev = lo->lo_device;
+       spin_unlock_irq(&lo->lo_lock);
+
        return bh;
 }
 
@@ -476,8 +494,7 @@ static int loop_make_request(request_queue_t *q, int rw, struct buffer_head *rbh
         * piggy old buffer on original, and submit for I/O
         */
        bh = loop_get_buffer(lo, rbh);
-       bh->b_private = rbh;
-       IV = loop_get_iv(lo, bh->b_rsector);
+       IV = loop_get_iv(lo, rbh->b_rsector);
        if (rw == WRITE) {
                set_bit(BH_Dirty, &bh->b_state);
                if (lo_do_transfer(lo, WRITE, bh->b_data, rbh->b_data,
@@ -601,7 +618,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file, kdev_t dev,
        error = -EBUSY;
        if (lo->lo_state != Lo_unbound)
                goto out;
-        
+
        error = -EBADF;
        file = fget(arg);
        if (!file)
@@ -621,7 +638,6 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file, kdev_t dev,
                 * If we can't read - sorry. If we only can't write - well,
                 * it's going to be read-only.
                 */
-               error = -EINVAL;
                if (!aops->readpage)
                        goto out_putf;
 
index 73d31e6b62ef835ffdb07914aa92eb80427c2813..633a9c9df4b1f49b8362459d61cfe9c175260516 100644 (file)
@@ -100,7 +100,7 @@ static int rd_hardsec[NUM_RAMDISKS];                /* Size of real blocks in bytes */
 static int rd_blocksizes[NUM_RAMDISKS];                /* Size of 1024 byte blocks :)  */
 static int rd_kbsize[NUM_RAMDISKS];            /* Size in blocks of 1024 bytes */
 static devfs_handle_t devfs_handle;
-static struct block_device *rd_bdev[NUM_RAMDISKS];/* Protected device data */
+static struct inode *rd_inode[NUM_RAMDISKS];   /* Protected device inodes */
 
 /*
  * Parameters for the boot-loading of the RAM disk.  These are set by
@@ -186,6 +186,79 @@ __setup("ramdisk_blocksize=", ramdisk_blocksize);
 
 #endif
 
+static int rd_blkdev_pagecache_IO(int rw, struct buffer_head * sbh, int minor)
+{
+       struct address_space * mapping = rd_inode[minor]->i_mapping;
+       unsigned long index;
+       int offset, size, err = 0;
+
+       if (sbh->b_page->mapping == mapping) {
+               if (rw != READ)
+                       SetPageDirty(sbh->b_page);
+               goto out;
+       }
+
+       index = sbh->b_rsector >> (PAGE_CACHE_SHIFT - 9);
+       offset = (sbh->b_rsector << 9) & ~PAGE_CACHE_MASK;
+       size = sbh->b_size;
+
+       do {
+               int count;
+               struct page ** hash;
+               struct page * page;
+               const char * src;
+               char * dst;
+               int unlock = 0;
+
+               count = PAGE_CACHE_SIZE - offset;
+               if (count > size)
+                       count = size;
+               size -= count;
+
+               hash = page_hash(mapping, index);
+               page = __find_get_page(mapping, index, hash);
+               if (!page && rw != READ) {
+                       page = grab_cache_page(mapping, index);
+                       err = -ENOMEM;
+                       if (!page)
+                               goto out;
+                       err = 0;
+                       unlock = 1;
+               }
+
+               index++;
+               if (!page) {
+                       offset = 0;
+                       continue;
+               }
+
+               if (rw == READ) {
+                       src = kmap(page);
+                       src += offset;
+                       dst = bh_kmap(sbh);
+               } else {
+                       dst = kmap(page);
+                       dst += offset;
+                       src = bh_kmap(sbh);
+               }
+               offset = 0;
+
+               memcpy(dst, src, count);
+
+               kunmap(page);
+               bh_kunmap(sbh);
+
+               if (rw != READ)
+                       SetPageDirty(page);
+               if (unlock)
+                       UnlockPage(page);
+               __free_page(page);
+       } while (size);
+
+ out:
+       return err;
+}
+
 /*
  *  Basically, my strategy here is to set up a buffer-head which can't be
  *  deleted, and make that my Ramdisk.  If the request is outside of the
@@ -198,10 +271,7 @@ static int rd_make_request(request_queue_t * q, int rw, struct buffer_head *sbh)
 {
        unsigned int minor;
        unsigned long offset, len;
-       struct buffer_head *rbh;
-       char *bdata;
 
-       
        minor = MINOR(sbh->b_rdev);
 
        if (minor >= NUM_RAMDISKS)
@@ -221,20 +291,8 @@ static int rd_make_request(request_queue_t * q, int rw, struct buffer_head *sbh)
                goto fail;
        }
 
-       rbh = getblk(sbh->b_rdev, sbh->b_rsector/(sbh->b_size>>9), sbh->b_size);
-       /* I think that it is safe to assume that rbh is not in HighMem, though
-        * sbh might be - NeilBrown
-        */
-       bdata = bh_kmap(sbh);
-       if (rw == READ) {
-               if (sbh != rbh)
-                       memcpy(bdata, rbh->b_data, rbh->b_size);
-       } else
-               if (sbh != rbh)
-                       memcpy(rbh->b_data, bdata, rbh->b_size);
-       bh_kunmap(sbh);
-       mark_buffer_protected(rbh);
-       brelse(rbh);
+       if (rd_blkdev_pagecache_IO(rw, sbh, minor))
+               goto fail;
 
        sbh->b_end_io(sbh,1);
        return 0;
@@ -259,10 +317,21 @@ static int rd_ioctl(struct inode *inode, struct file *file, unsigned int cmd, un
                        /* special: we want to release the ramdisk memory,
                           it's not like with the other blockdevices where
                           this ioctl only flushes away the buffer cache. */
-                       if ((atomic_read(&rd_bdev[minor]->bd_openers) > 2))
-                               return -EBUSY;
-                       destroy_buffers(inode->i_rdev);
-                       rd_blocksizes[minor] = 0;
+                       {
+                               struct block_device * bdev = inode->i_bdev;
+
+                               down(&bdev->bd_sem);
+                               if (bdev->bd_openers > 2) {
+                                       up(&bdev->bd_sem);
+                                       return -EBUSY;
+                               }
+                               bdev->bd_openers--;
+                               bdev->bd_cache_openers--;
+                               iput(rd_inode[minor]);
+                               rd_inode[minor] = NULL;
+                               rd_blocksizes[minor] = rd_blocksize;
+                               up(&bdev->bd_sem);
+                       }
                        break;
 
                case BLKGETSIZE:   /* Return device size */
@@ -305,20 +374,16 @@ static int initrd_release(struct inode *inode,struct file *file)
 {
        extern void free_initrd_mem(unsigned long, unsigned long);
 
-       lock_kernel();
        if (!--initrd_users) {
-               blkdev_put(inode->i_bdev, BDEV_FILE);
                free_initrd_mem(initrd_start, initrd_end);
                initrd_start = 0;
        }
-       unlock_kernel();
        return 0;
 }
 
 
 static struct file_operations initrd_fops = {
        read:           initrd_read,
-       release:        initrd_release,
 };
 
 #endif
@@ -326,26 +391,37 @@ static struct file_operations initrd_fops = {
 
 static int rd_open(struct inode * inode, struct file * filp)
 {
-       int unit = DEVICE_NR(inode->i_rdev);
-
 #ifdef CONFIG_BLK_DEV_INITRD
-       if (unit == INITRD_MINOR) {
+       if (DEVICE_NR(inode->i_rdev) == INITRD_MINOR) {
+               static struct block_device_operations initrd_bd_op = {
+                       open:           rd_open,
+                       release:        initrd_release,
+               };
+
                if (!initrd_start) return -ENODEV;
                initrd_users++;
                filp->f_op = &initrd_fops;
+               inode->i_bdev->bd_op = &initrd_bd_op;
                return 0;
        }
 #endif
 
-       if (unit >= NUM_RAMDISKS)
+       if (DEVICE_NR(inode->i_rdev) >= NUM_RAMDISKS)
                return -ENXIO;
 
        /*
         * Immunize device against invalidate_buffers() and prune_icache().
         */
-       if (rd_bdev[unit] == NULL) {
-               rd_bdev[unit] = bdget(kdev_t_to_nr(inode->i_rdev));
-               atomic_inc(&rd_bdev[unit]->bd_openers);
+       if (rd_inode[DEVICE_NR(inode->i_rdev)] == NULL) {
+               if (!inode->i_bdev) return -ENXIO;
+               if ((rd_inode[DEVICE_NR(inode->i_rdev)] = igrab(inode)) != NULL) {
+                       struct block_device *bdev = inode->i_bdev;
+
+                       /* bdev->bd_sem is held by caller */
+                       bdev->bd_openers++;
+                       bdev->bd_cache_openers++;
+                       bdev->bd_inode = inode;
+               }
        }
 
        MOD_INC_USE_COUNT;
@@ -359,7 +435,7 @@ static int rd_release(struct inode * inode, struct file * filp)
        return 0;
 }
 
-static struct block_device_operations fd_fops = {
+static struct block_device_operations rd_bd_op = {
        open:           rd_open,
        release:        rd_release,
        ioctl:          rd_ioctl,
@@ -372,11 +448,18 @@ static void __exit rd_cleanup (void)
        int i;
 
        for (i = 0 ; i < NUM_RAMDISKS; i++) {
-               struct block_device *bdev = rd_bdev[i];
-               rd_bdev[i] = NULL;
-               if (bdev) {
-                       blkdev_put(bdev, BDEV_FILE);
-                       bdput(bdev);
+               if (rd_inode[i]) {
+                       /* withdraw invalidate_buffers() and prune_icache() immunity */
+                       struct block_device *bdev = rd_inode[i]->i_bdev;
+
+                       down(&bdev->bd_sem);
+                       bdev->bd_openers--;
+                       bdev->bd_cache_openers--;
+                       up(&bdev->bd_sem);
+
+                       /* remove stale pointer to module address space */
+                       rd_inode[i]->i_bdev->bd_op = NULL;
+                       iput(rd_inode[i]);
                }
                destroy_buffers(MKDEV(MAJOR_NR, i));
        }
@@ -402,7 +485,7 @@ int __init rd_init (void)
                rd_blocksize = BLOCK_SIZE;
        }
 
-       if (register_blkdev(MAJOR_NR, "ramdisk", &fd_fops)) {
+       if (register_blkdev(MAJOR_NR, "ramdisk", &rd_bd_op)) {
                printk("RAMDISK: Could not get major %d", MAJOR_NR);
                return -EIO;
        }
@@ -420,14 +503,14 @@ int __init rd_init (void)
        devfs_register_series (devfs_handle, "%u", NUM_RAMDISKS,
                               DEVFS_FL_DEFAULT, MAJOR_NR, 0,
                               S_IFBLK | S_IRUSR | S_IWUSR,
-                              &fd_fops, NULL);
+                              &rd_bd_op, NULL);
 
        for (i = 0; i < NUM_RAMDISKS; i++)
-               register_disk(NULL, MKDEV(MAJOR_NR,i), 1, &fd_fops, rd_size<<1);
+               register_disk(NULL, MKDEV(MAJOR_NR,i), 1, &rd_bd_op, rd_size<<1);
 
 #ifdef CONFIG_BLK_DEV_INITRD
        /* We ought to separate initrd operations here */
-       register_disk(NULL, MKDEV(MAJOR_NR,INITRD_MINOR), 1, &fd_fops, rd_size<<1);
+       register_disk(NULL, MKDEV(MAJOR_NR,INITRD_MINOR), 1, &rd_bd_op, rd_size<<1);
 #endif
 
        hardsect_size[MAJOR_NR] = rd_hardsec;           /* Size of the RAM disk blocks */
@@ -597,8 +680,10 @@ static void __init rd_load_image(kdev_t device, int offset, int unit)
        outfile.f_op = &def_blk_fops;
        init_special_inode(out_inode, S_IFBLK | S_IRUSR | S_IWUSR, kdev_t_to_nr(ram_device));
 
-       if (blkdev_open(inode, &infile) != 0)
+       if (blkdev_open(inode, &infile) != 0) {
+               iput(out_inode);
                goto free_inode;
+       }
        if (blkdev_open(out_inode, &outfile) != 0)
                goto free_inodes;
 
@@ -661,14 +746,15 @@ static void __init rd_load_image(kdev_t device, int offset, int unit)
                if (i && (i % devblocks == 0)) {
                        printk("done disk #%d.\n", i/devblocks);
                        rotate = 0;
-                       invalidate_buffers(device);
-                       if (infile.f_op->release)
-                               infile.f_op->release(inode, &infile);
+                       if (blkdev_close(inode, &infile) != 0) {
+                               printk("Error closing the disk.\n");
+                               goto noclose_input;
+                       }
                        printk("Please insert disk #%d and press ENTER\n", i/devblocks+1);
                        wait_for_keypress();
                        if (blkdev_open(inode, &infile) != 0)  {
                                printk("Error opening disk.\n");
-                               goto done;
+                               goto noclose_input;
                        }
                        infile.f_pos = 0;
                        printk("Loading disk #%d... ", i/devblocks+1);
@@ -686,18 +772,20 @@ static void __init rd_load_image(kdev_t device, int offset, int unit)
        kfree(buf);
 
 successful_load:
-       invalidate_buffers(device);
        ROOT_DEV = MKDEV(MAJOR_NR, unit);
        if (ROOT_DEVICE_NAME != NULL) strcpy (ROOT_DEVICE_NAME, "rd/0");
 
 done:
-       if (infile.f_op->release)
-               infile.f_op->release(inode, &infile);
+       blkdev_close(inode, &infile);
+noclose_input:
+       blkdev_close(out_inode, &outfile);
+       iput(inode);
+       iput(out_inode);
        set_fs(fs);
        return;
 free_inodes: /* free inodes on error */ 
        iput(out_inode);
-       blkdev_put(inode->i_bdev, BDEV_FILE);
+       blkdev_close(inode, &infile);
 free_inode:
        iput(inode);
 }
index 947273154ec8f09d65503bc8ef615db96637dc0d..f0a1255599beafa035bbc9e774c82d8244004f64 100644 (file)
@@ -69,6 +69,9 @@
  *
  * Removed old-style timers, introduced console_timer, made timer
  * deletion SMP-safe.  17Jun00, Andrew Morton <andrewm@uow.edu.au>
+ *
+ * Removed console_lock, enabled interrupts across all console operations
+ * 13 March 2001, Andrew Morton
  */
 
 #include <linux/module.h>
@@ -149,6 +152,7 @@ static void set_vesa_blanking(unsigned long arg);
 static void set_cursor(int currcons);
 static void hide_cursor(int currcons);
 static void unblank_screen_t(unsigned long dummy);
+static void console_callback(void *ignored);
 
 static int printable;          /* Is console ready for printing? */
 
@@ -159,6 +163,10 @@ static int vesa_blank_mode; /* 0:none 1:suspendV 2:suspendH 3:powerdown */
 static int blankinterval = 10*60*HZ;
 static int vesa_off_interval;
 
+static struct tq_struct console_callback_tq = {
+       routine: console_callback,
+};
+
 /*
  * fg_console is the current virtual console,
  * last_console is the last used one,
@@ -180,15 +188,13 @@ static struct vc_data *master_display_fg;
 
 /*
  * Unfortunately, we need to delay tty echo when we're currently writing to the
- * console since the code is (and always was) not re-entrant, so we insert
- * all filp requests to con_task_queue instead of tq_timer and run it from
- * the console_tasklet.  The console_tasklet is protected by the IRQ
- * protected console_lock.
+ * console since the code is (and always was) not re-entrant, so we schedule
+ * all flip requests to process context with schedule-task() and run it from
+ * console_callback().
  */
-DECLARE_TASK_QUEUE(con_task_queue);
 
 /*
- * For the same reason, we defer scrollback to the console tasklet.
+ * For the same reason, we defer scrollback to the console callback.
  */
 static int scrollback_delta;
 
@@ -232,7 +238,12 @@ static inline unsigned short *screenpos(int currcons, int offset, int viewed)
 static inline void scrolldelta(int lines)
 {
        scrollback_delta += lines;
-       tasklet_schedule(&console_tasklet);
+       schedule_console_callback();
+}
+
+void schedule_console_callback(void)
+{
+       schedule_task(&console_callback_tq);
 }
 
 static void scrup(int currcons, unsigned int t, unsigned int b, int nr)
@@ -780,6 +791,7 @@ int vc_resize(unsigned int lines, unsigned int cols,
 
 void vc_disallocate(unsigned int currcons)
 {
+       acquire_console_sem();
        if (vc_cons_allocated(currcons)) {
            sw->con_deinit(vc_cons[currcons].d);
            if (kmalloced)
@@ -788,6 +800,7 @@ void vc_disallocate(unsigned int currcons)
                kfree(vc_cons[currcons].d);
            vc_cons[currcons].d = NULL;
        }
+       release_console_sem();
 }
 
 /*
@@ -1026,6 +1039,7 @@ static void default_attr(int currcons)
        color = def_color;
 }
 
+/* console_sem is held */
 static void csi_m(int currcons)
 {
        int i;
@@ -1165,6 +1179,7 @@ int mouse_reporting(void)
        return report_mouse;
 }
 
+/* console_sem is held */
 static void set_mode(int currcons, int on_off)
 {
        int i;
@@ -1230,6 +1245,7 @@ static void set_mode(int currcons, int on_off)
                }
 }
 
+/* console_sem is held */
 static void setterm_command(int currcons)
 {
        switch(par[0]) {
@@ -1284,19 +1300,7 @@ static void setterm_command(int currcons)
        }
 }
 
-static void insert_line(int currcons, unsigned int nr)
-{
-       scrdown(currcons,y,bottom,nr);
-       need_wrap = 0;
-}
-
-
-static void delete_line(int currcons, unsigned int nr)
-{
-       scrup(currcons,y,bottom,nr);
-       need_wrap = 0;
-}
-
+/* console_sem is held */
 static void csi_at(int currcons, unsigned int nr)
 {
        if (nr > video_num_columns - x)
@@ -1306,15 +1310,18 @@ static void csi_at(int currcons, unsigned int nr)
        insert_char(currcons, nr);
 }
 
+/* console_sem is held */
 static void csi_L(int currcons, unsigned int nr)
 {
        if (nr > video_num_lines - y)
                nr = video_num_lines - y;
        else if (!nr)
                nr = 1;
-       insert_line(currcons, nr);
+       scrdown(currcons,y,bottom,nr);
+       need_wrap = 0;
 }
 
+/* console_sem is held */
 static void csi_P(int currcons, unsigned int nr)
 {
        if (nr > video_num_columns - x)
@@ -1324,15 +1331,18 @@ static void csi_P(int currcons, unsigned int nr)
        delete_char(currcons, nr);
 }
 
+/* console_sem is held */
 static void csi_M(int currcons, unsigned int nr)
 {
        if (nr > video_num_lines - y)
                nr = video_num_lines - y;
        else if (!nr)
                nr=1;
-       delete_line(currcons, nr);
+       scrup(currcons,y,bottom,nr);
+       need_wrap = 0;
 }
 
+/* console_sem is held (except via vc_init->reset_terminal */
 static void save_cur(int currcons)
 {
        saved_x         = x;
@@ -1347,6 +1357,7 @@ static void save_cur(int currcons)
        saved_G1        = G1_charset;
 }
 
+/* console_sem is held */
 static void restore_cur(int currcons)
 {
        gotoxy(currcons,saved_x,saved_y);
@@ -1367,6 +1378,7 @@ enum { ESnormal, ESesc, ESsquare, ESgetpars, ESgotpars, ESfunckey,
        EShash, ESsetG0, ESsetG1, ESpercent, ESignore, ESnonstd,
        ESpalette };
 
+/* console_sem is held (except via vc_init()) */
 static void reset_terminal(int currcons, int do_clear)
 {
        top             = 0;
@@ -1422,6 +1434,7 @@ static void reset_terminal(int currcons, int do_clear)
            csi_J(currcons,2);
 }
 
+/* console_sem is held */
 static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c)
 {
        /*
@@ -1802,6 +1815,7 @@ char con_buf[PAGE_SIZE];
 #define CON_BUF_SIZE   PAGE_SIZE
 DECLARE_MUTEX(con_buf_sem);
 
+/* acquires console_sem */
 static int do_con_write(struct tty_struct * tty, int from_user,
                        const unsigned char *buf, int count)
 {
@@ -1822,6 +1836,9 @@ static int do_con_write(struct tty_struct * tty, int from_user,
        const unsigned char *orig_buf = NULL;
        int orig_count;
 
+       if (in_interrupt())
+               return count;
+               
        currcons = vt->vc_num;
        if (!vc_cons_allocated(currcons)) {
            /* could this happen? */
@@ -1842,6 +1859,7 @@ static int do_con_write(struct tty_struct * tty, int from_user,
 again:
                if (count > CON_BUF_SIZE)
                        count = CON_BUF_SIZE;
+               console_conditional_schedule();
                if (copy_from_user(con_buf, buf, count)) {
                        n = 0; /* ?? are error codes legal here ?? */
                        goto out;
@@ -1857,7 +1875,7 @@ again:
         * the console spinlock during the entire write.
         */
 
-       spin_lock_irq(&console_lock);
+       acquire_console_sem();
 
        himask = hi_font_mask;
        charmask = himask ? 0x1ff : 0xff;
@@ -1975,7 +1993,8 @@ again:
                do_con_trol(tty, currcons, c);
        }
        FLUSH
-       spin_unlock_irq(&console_lock);
+       console_conditional_schedule();
+       release_console_sem();
 
 out:
        if (from_user) {
@@ -1999,23 +2018,17 @@ out:
 }
 
 /*
- * This is the console switching tasklet.
+ * This is the console switching callback.
  *
- * Doing console switching in a tasklet allows
+ * Doing console switching in a process context allows
  * us to do the switches asynchronously (needed when we want
  * to switch due to a keyboard interrupt).  Synchronization
  * with other console code and prevention of re-entrancy is
- * ensured with console_lock.
+ * ensured with console_sem.
  */
-static void console_softint(unsigned long ignored)
+static void console_callback(void *ignored)
 {
-       /* Runs the task queue outside of the console lock.  These
-        * callbacks can come back into the console code and thus
-        * will perform their own locking.
-        */
-       run_task_queue(&con_task_queue);
-
-       spin_lock_irq(&console_lock);
+       acquire_console_sem();
 
        if (want_console >= 0) {
                if (want_console != fg_console && vc_cons_allocated(want_console)) {
@@ -2039,7 +2052,13 @@ static void console_softint(unsigned long ignored)
                scrollback_delta = 0;
        }
 
-       spin_unlock_irq(&console_lock);
+       release_console_sem();
+}
+
+void set_console(int nr)
+{
+       want_console = nr;
+       schedule_console_callback();
 }
 
 #ifdef CONFIG_VT_CONSOLE
@@ -2047,7 +2066,7 @@ static void console_softint(unsigned long ignored)
 /*
  *     Console on virtual terminal
  *
- * The console_lock must be held when we get here.
+ * The console must be locked when we get here.
  */
 
 void vt_console_print(struct console *co, const char * b, unsigned count)
@@ -2134,6 +2153,9 @@ void vt_console_print(struct console *co, const char * b, unsigned count)
        }
        set_cursor(currcons);
 
+       if (!oops_in_progress)
+               poke_blanked_console();
+
 quit:
        clear_bit(0, &printing);
 }
@@ -2158,27 +2180,45 @@ struct console vt_console_driver = {
  *     Handling of Linux-specific VC ioctls
  */
 
+/*
+ * Generally a bit racy with respect to console_sem.
+ *
+ * There are some functions which don't need it.
+ *
+ * There are some functions which can sleep for arbitrary periods (paste_selection)
+ * but we don't need the lock there anyway.
+ *
+ * set_selection has locking, and definitely needs it
+ */
+
 int tioclinux(struct tty_struct *tty, unsigned long arg)
 {
        char type, data;
+       int ret;
 
        if (tty->driver.type != TTY_DRIVER_TYPE_CONSOLE)
                return -EINVAL;
-       if (current->tty != tty && !suser())
+       if (current->tty != tty && !capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (get_user(type, (char *)arg))
                return -EFAULT;
+       ret = 0;
        switch (type)
        {
                case 2:
-                       return set_selection(arg, tty, 1);
+                       acquire_console_sem();
+                       ret = set_selection(arg, tty, 1);
+                       release_console_sem();
+                       break;
                case 3:
-                       return paste_selection(tty);
+                       ret = paste_selection(tty);
+                       break;
                case 4:
                        unblank_screen();
-                       return 0;
+                       break;
                case 5:
-                       return sel_loadlut(arg);
+                       ret = sel_loadlut(arg);
+                       break;
                case 6:
                        
        /*
@@ -2188,24 +2228,33 @@ int tioclinux(struct tty_struct *tty, unsigned long arg)
         * related to the kernel should not use this.
         */
                        data = shift_state;
-                       return __put_user(data, (char *) arg);
+                       ret = __put_user(data, (char *) arg);
+                       break;
                case 7:
                        data = mouse_reporting();
-                       return __put_user(data, (char *) arg);
+                       ret = __put_user(data, (char *) arg);
+                       break;
                case 10:
                        set_vesa_blanking(arg);
-                       return 0;
+                       break;
                case 11:        /* set kmsg redirect */
-                       if (!suser())
-                               return -EPERM;
-                       if (get_user(data, (char *)arg+1))
-                                       return -EFAULT;
-                       kmsg_redirect = data;
-                       return 0;
+                       if (!capable(CAP_SYS_ADMIN)) {
+                               ret = -EPERM;
+                       } else {
+                               if (get_user(data, (char *)arg+1))
+                                       ret = -EFAULT;
+                               else
+                                       kmsg_redirect = data;
+                       }
+                       break;
                case 12:        /* get fg_console */
-                       return fg_console;
+                       ret = fg_console;
+                       break;
+               default:
+                       ret = -EINVAL;
+                       break;
        }
-       return -EINVAL;
+       return ret;
 }
 
 /*
@@ -2226,6 +2275,8 @@ static int con_write(struct tty_struct * tty, int from_user,
 
 static void con_put_char(struct tty_struct *tty, unsigned char ch)
 {
+       if (in_interrupt())
+               return;         /* n_r3964 calls put_char() from interrupt context */
        pm_access(pm_con);
        do_con_write(tty, 0, &ch, 1);
 }
@@ -2290,13 +2341,15 @@ static void con_start(struct tty_struct *tty)
 
 static void con_flush_chars(struct tty_struct *tty)
 {
-       unsigned long flags;
        struct vt_struct *vt = (struct vt_struct *)tty->driver_data;
 
+       if (in_interrupt())     /* from flush_to_ldisc */
+               return;
+
        pm_access(pm_con);
-       spin_lock_irqsave(&console_lock, flags);
+       acquire_console_sem();
        set_cursor(vt->vc_num);
-       spin_unlock_irqrestore(&console_lock, flags);
+       release_console_sem();
 }
 
 /*
@@ -2367,8 +2420,6 @@ static void vc_init(unsigned int currcons, unsigned int rows, unsigned int cols,
 struct tty_driver console_driver;
 static int console_refcount;
 
-DECLARE_TASKLET_DISABLED(console_tasklet, console_softint, 0);
-
 void __init con_init(void)
 {
        const char *display_desc = NULL;
@@ -2453,9 +2504,6 @@ void __init con_init(void)
 #ifdef CONFIG_VT_CONSOLE
        register_console(&vt_console_driver);
 #endif
-
-       tasklet_enable(&console_tasklet);
-       tasklet_schedule(&console_tasklet);
 }
 
 #ifndef VT_SINGLE_DRIVER
@@ -2561,6 +2609,9 @@ void __init con_init_devfs (void)
                                    console_driver.minor_start + i);
 }
 
+/*
+ * This is called by a timer handler
+ */
 static void vesa_powerdown(void)
 {
     struct vc_data *c = vc_cons[fg_console].d;
@@ -2581,9 +2632,12 @@ static void vesa_powerdown(void)
     }
 }
 
+/*
+ * This is a timer handler
+ */
 static void vesa_powerdown_screen(unsigned long dummy)
 {
-       console_timer.function = unblank_screen_t;      /* I don't have a clue why this is necessary */
+       console_timer.function = unblank_screen_t;
 
        vesa_powerdown();
 }
@@ -2642,11 +2696,17 @@ void do_blank_screen(int entering_gfx)
        timer_do_blank_screen(entering_gfx, 0);
 }
 
+/*
+ * This is a timer handler
+ */
 static void unblank_screen_t(unsigned long dummy)
 {
        unblank_screen();
 }
 
+/*
+ * Called by timer as well as from vt_console_driver
+ */
 void unblank_screen(void)
 {
        int currcons;
@@ -2677,6 +2737,9 @@ void unblank_screen(void)
        set_cursor(fg_console);
 }
 
+/*
+ * This is both a user-level callable and a timer handler
+ */
 static void blank_screen(unsigned long dummy)
 {
        timer_do_blank_screen(0, 1);
@@ -2684,7 +2747,7 @@ static void blank_screen(unsigned long dummy)
 
 void poke_blanked_console(void)
 {
-       del_timer(&console_timer);      /* Can't use _sync here: called from tasklet */
+       del_timer(&console_timer);
        if (!vt_cons[fg_console] || vt_cons[fg_console]->vc_mode == KD_GRAPHICS)
                return;
        if (console_blanked) {
@@ -2832,9 +2895,9 @@ int con_font_op(int currcons, struct console_font_op *op)
                op->data = temp;
        }
 
-       spin_lock_irq(&console_lock);
+       acquire_console_sem();
        rc = sw->con_font_op(vc_cons[currcons].d, op);
-       spin_unlock_irq(&console_lock);
+       release_console_sem();
 
        op->data = old_op.data;
        if (!rc && !set) {
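
The console.c hunks above all apply one conversion: code that used to take the console_lock spinlock (or defer work to the console tasklet) now brackets its work with acquire_console_sem()/release_console_sem(), may sleep, refuses to run from interrupt context, and can voluntarily reschedule inside long loops. A minimal sketch of that usage pattern; everything except the four helpers named in the patch is illustrative:

    /* Sketch only: the locking pattern this patch introduces. */
    static void example_console_op(int currcons)
    {
            if (in_interrupt())
                    return;                         /* we may sleep below */

            acquire_console_sem();                  /* was: spin_lock_irq(&console_lock) */
            /* ... touch vc_cons[currcons] state here ... */
            console_conditional_schedule();         /* long loops may yield the CPU */
            release_console_sem();                  /* was: spin_unlock_irq(&console_lock) */
    }
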
index 996045c0259c939b73a4f05aa71072c86aa7bfed..a50cf0c236292d97e26a0f76299a934aa6dcee9a 100644 (file)
@@ -205,15 +205,12 @@ void handle_scancode(unsigned char scancode, int down)
        char raw_mode;
 
        pm_access(pm_kbd);
-
-       do_poke_blanked_console = 1;
-       tasklet_schedule(&console_tasklet);
        add_keyboard_randomness(scancode | up_flag);
 
        tty = ttytab? ttytab[fg_console]: NULL;
        if (tty && (!tty->driver_data)) {
                /*
-                * We touch the tty structure via the the ttytab array
+                * We touch the tty structure via the ttytab array
                 * without knowing whether or not tty is open, which
                 * is inherently dangerous.  We currently rely on that
                 * fact that console_open sets tty->driver_data when
@@ -233,7 +230,7 @@ void handle_scancode(unsigned char scancode, int down)
         *  Convert scancode to keycode
         */
        if (!kbd_translate(scancode, &keycode, raw_mode))
-           return;
+               goto out;
 
        /*
         * At this point the variable `keycode' contains the keycode.
@@ -252,11 +249,11 @@ void handle_scancode(unsigned char scancode, int down)
 #ifdef CONFIG_MAGIC_SYSRQ              /* Handle the SysRq Hack */
        if (keycode == SYSRQ_KEY) {
                sysrq_pressed = !up_flag;
-               return;
+               goto out;
        } else if (sysrq_pressed) {
                if (!up_flag) {
                        handle_sysrq(kbd_sysrq_xlate[keycode], kbd_pt_regs, kbd, tty);
-                       return;
+                       goto out;
                }
        }
 #endif
@@ -298,7 +295,7 @@ void handle_scancode(unsigned char scancode, int down)
                        if (type >= 0xf0) {
                            type -= 0xf0;
                            if (raw_mode && ! (TYPES_ALLOWED_IN_RAW_MODE & (1 << type)))
-                               return;
+                               goto out;
                            if (type == KT_LETTER) {
                                type = KT_LATIN;
                                if (vc_kbd_led(kbd, VC_CAPSLOCK)) {
@@ -322,13 +319,16 @@ void handle_scancode(unsigned char scancode, int down)
                        compute_shiftstate();
                        kbd->slockstate = 0; /* play it safe */
 #else
-                       keysym = U(plain_map[keycode]);
+                       keysym =  U(plain_map[keycode]);
                        type = KTYP(keysym);
                        if (type == KT_SHIFT)
                          (*key_handler[type])(keysym & 0xff, up_flag);
 #endif
                }
        }
+out:
+       do_poke_blanked_console = 1;
+       schedule_console_callback();
 }
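
The keyboard.c hunk converts every early return in handle_scancode() into a goto out, so the blank-timer poke and schedule_console_callback() run on all paths now that there is no per-keystroke tasklet. A hedged sketch of the resulting single-exit shape; translate_key() and deliver_key() are made-up placeholders for the real processing:

    void example_handle_key(unsigned char scancode, int down)
    {
            char keycode;

            if (!translate_key(scancode, &keycode))
                    goto out;               /* untranslatable: still poke the console */
            deliver_key(keycode, down);
    out:
            do_poke_blanked_console = 1;
            schedule_console_callback();    /* console work deferred to process context */
    }
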
 
 
index 9cf4f038d790fcb2506f8e1c5c79644fb605e442..69f0590ec34dd5a30b5daa435f34533094c038a7 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/kbd_kern.h>
 #include <linux/vt_kern.h>
 #include <linux/smp_lock.h>
+#include <linux/kd.h>
 
 #include <asm/keyboard.h>
 #include <asm/bitops.h>
index 9c9e00e7c30b6576833b3c88284c0b952d9cdafc..c9c5b6e4ff7c40ed3f896bbe5f6746bb66c63b03 100644 (file)
@@ -20,8 +20,6 @@
 #define dprintk(x...) 
 
 typedef struct raw_device_data_s {
-       struct kiobuf * iobuf;
-       long iobuf_lock;
        struct block_device *binding;
        int inuse, sector_size, sector_bits;
        struct semaphore mutex;
@@ -87,6 +85,12 @@ int raw_open(struct inode *inode, struct file *filp)
                return 0;
        }
        
+       if (!filp->f_iobuf) {
+               err = alloc_kiovec(1, &filp->f_iobuf);
+               if (err)
+                       return err;
+       }
+
        down(&raw_devices[minor].mutex);
        /*
         * No, it is a normal raw device.  All we need to do on open is
@@ -112,19 +116,6 @@ int raw_open(struct inode *inode, struct file *filp)
        if (raw_devices[minor].inuse++)
                goto out;
 
-       /* 
-        * We'll just use one kiobuf
-        */
-
-       err = alloc_kiovec(1, &raw_devices[minor].iobuf);
-       if (err) {
-               raw_devices[minor].inuse--;
-               up(&raw_devices[minor].mutex);
-               blkdev_put(bdev, BDEV_RAW);
-               return err;
-       }
-
-       
        /* 
         * Don't interfere with mounted devices: we cannot safely set
         * the blocksize on a device which is already mounted.  
@@ -160,8 +151,7 @@ int raw_release(struct inode *inode, struct file *filp)
        minor = MINOR(inode->i_rdev);
        down(&raw_devices[minor].mutex);
        bdev = raw_devices[minor].binding;
-       if (!--raw_devices[minor].inuse)
-               free_kiovec(1, &raw_devices[minor].iobuf);
+       raw_devices[minor].inuse--;
        up(&raw_devices[minor].mutex);
        blkdev_put(bdev, BDEV_RAW);
        return 0;
@@ -300,8 +290,8 @@ ssize_t     rw_raw_dev(int rw, struct file *filp, char *buf,
        minor = MINOR(filp->f_dentry->d_inode->i_rdev);
 
        new_iobuf = 0;
-       iobuf = raw_devices[minor].iobuf;
-       if (test_and_set_bit(0, &raw_devices[minor].iobuf_lock)) {
+       iobuf = filp->f_iobuf;
+       if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
                /*
                 * A parallel read/write is using the preallocated iobuf
                 * so just run slow and allocate a new one.
@@ -384,7 +374,7 @@ ssize_t     rw_raw_dev(int rw, struct file *filp, char *buf,
 
  out_free:
        if (!new_iobuf)
-               clear_bit(0, &raw_devices[minor].iobuf_lock);
+               clear_bit(0, &filp->f_iobuf_lock);
        else
                free_kiovec(1, &iobuf);
  out:  
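
The raw.c hunks move the preallocated kiobuf from the per-minor raw_device_data_s into the struct file (filp->f_iobuf, allocated lazily in raw_open()), and rw_raw_dev() claims it with a test_and_set_bit() trylock, falling back to a freshly allocated kiobuf when a concurrent request already holds it. A condensed sketch of that fallback, using only the calls that appear in the hunk (error handling simplified):

    struct kiobuf *iobuf = filp->f_iobuf;
    int new_iobuf = 0;

    if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
            /* the per-file buffer is busy: take the slow path */
            if (alloc_kiovec(1, &iobuf))
                    return -ENOMEM;
            new_iobuf = 1;
    }

    /* ... perform the transfer through iobuf ... */

    if (new_iobuf)
            free_kiovec(1, &iobuf);
    else
            clear_bit(0, &filp->f_iobuf_lock);
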
index 8671a4519209b0e927a19f227ed459aaebdbacad..55b642532fa8b27c3df8a834b564c37aa63b6313 100644 (file)
@@ -1766,11 +1766,13 @@ static void change_speed(struct async_struct *info,
                if (I_IGNPAR(info->tty))
                        info->ignore_status_mask |= UART_LSR_OE;
        }
+#if 0 /* breaks serial console during boot stage */
        /*
         * !!! ignore all characters if CREAD is not set
         */
        if ((cflag & CREAD) == 0)
                info->ignore_status_mask |= UART_LSR_DR;
+#endif
        save_flags(flags); cli();
        if (uart_config[info->state->type].flags & UART_STARTECH) {
                serial_outp(info, UART_LCR, 0xBF);
@@ -5763,7 +5765,7 @@ static inline void wait_for_xmitr(struct async_struct *info)
  *     Print a string to the serial port trying not to disturb
  *     any possible real use of the port...
  *
- *     The console_lock must be held when we get here.
+ *     The console must be locked when we get here.
  */
 static void serial_console_write(struct console *co, const char *s,
                                unsigned count)
index e6032c24d980b26a929ee1672f919af73612ce15..a7e1f2d679c0bd1449dfd235c87c03c1184d7939 100644 (file)
@@ -109,7 +109,7 @@ vcs_read(struct file *file, char *buf, size_t count, loff_t *ppos)
        /* Select the proper current console and verify
         * sanity of the situation under the console lock.
         */
-       spin_lock_irq(&console_lock);
+       acquire_console_sem();
 
        attr = (currcons & 128);
        currcons = (currcons & 127);
@@ -232,13 +232,16 @@ vcs_read(struct file *file, char *buf, size_t count, loff_t *ppos)
                        }
                }
 
-               /* Finally, temporarily drop the console lock and push
+               /* Finally, release the console semaphore while we push
                 * all the data to userspace from our temporary buffer.
+                *
+                * AKPM: Even though it's a semaphore, we should drop it because
+                * the pagefault handling code may want to call printk().
                 */
 
-               spin_unlock_irq(&console_lock);
+               release_console_sem();
                ret = copy_to_user(buf, con_buf_start, orig_count);
-               spin_lock_irq(&console_lock);
+               acquire_console_sem();
 
                if (ret) {
                        read += (orig_count - ret);
@@ -254,7 +257,7 @@ vcs_read(struct file *file, char *buf, size_t count, loff_t *ppos)
        if (read)
                ret = read;
 unlock_out:
-       spin_unlock_irq(&console_lock);
+       release_console_sem();
        up(&con_buf_sem);
        return ret;
 }
@@ -276,7 +279,7 @@ vcs_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
        /* Select the proper current console and verify
         * sanity of the situation under the console lock.
         */
-       spin_lock_irq(&console_lock);
+       acquire_console_sem();
 
        attr = (currcons & 128);
        currcons = (currcons & 127);
@@ -310,9 +313,9 @@ vcs_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
                /* Temporarily drop the console lock so that we can read
                 * in the write data from userspace safely.
                 */
-               spin_unlock_irq(&console_lock);
+               release_console_sem();
                ret = copy_from_user(con_buf, buf, this_round);
-               spin_lock_irq(&console_lock);
+               acquire_console_sem();
 
                if (ret) {
                        this_round -= ret;
@@ -436,7 +439,7 @@ vcs_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
        ret = written;
 
 unlock_out:
-       spin_unlock_irq(&console_lock);
+       release_console_sem();
 
        up(&con_buf_sem);
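
As the AKPM comment above notes, vcs_read() and vcs_write() now drop the console semaphore around the user-space copy: a page fault taken inside copy_to_user()/copy_from_user() may end up calling printk(), which in turn wants the console, so holding console_sem across the copy could deadlock. Reduced to its essentials, the pattern is:

    acquire_console_sem();
    /* ... snapshot screen contents into con_buf ... */
    release_console_sem();          /* the copy below may fault and printk() */
    ret = copy_to_user(buf, con_buf_start, orig_count);
    acquire_console_sem();
    /* ... revalidate console state, it may have changed meanwhile ... */
    release_console_sem();
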
 
index fed34c943fdfa1c2386b45fdbdce687413b03bb7..e15c3505c8241f382dc353669e5f89923c1e6ceb 100644 (file)
@@ -835,9 +835,9 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
                                 * make sure we are atomic with respect to
                                 * other console switches..
                                 */
-                               spin_lock_irq(&console_lock);
+                               acquire_console_sem();
                                complete_change_console(newvt);
-                               spin_unlock_irq(&console_lock);
+                               release_console_sem();
                        }
                }
 
@@ -1161,7 +1161,8 @@ void reset_vc(unsigned int new_console)
        vt_cons[new_console]->vt_mode.frsig = 0;
        vt_cons[new_console]->vt_pid = -1;
        vt_cons[new_console]->vt_newvt = -1;
-       reset_palette (new_console) ;
+       if (!in_interrupt())    /* Via keyboard.c:SAK() - akpm */
+               reset_palette(new_console) ;
 }
 
 /*
index 9723d0c5bcd445689f7c6f6eae944859bb8a0852..0639ebee9e0ba21723265662684f6d3edc3b8380 100644 (file)
@@ -931,7 +931,7 @@ byte ide_dump_status (ide_drive_t *drive, const char *msg, byte stat)
                                          cur & 0xf,
                                          IN_BYTE(IDE_SECTOR_REG));
                                }
-                               if (HWGROUP(drive)->rq)
+                               if (HWGROUP(drive) && HWGROUP(drive)->rq)
                                        printk(", sector=%ld", HWGROUP(drive)->rq->sector);
                        }
                }
index 3b758a70c40eb4217dbaf7066bcc1c02d03cb423..ab36ea410ded7c86c38a4672634a2c8301ffd334 100644 (file)
@@ -197,9 +197,13 @@ static char *lvm_short_version = "version 0.9.1_beta2 (18/01/2001)";
 
 #include "lvm-snap.h"
 
-#define        LVM_CORRECT_READ_AHEAD( a) \
-   if      ( a < LVM_MIN_READ_AHEAD || \
-             a > LVM_MAX_READ_AHEAD) a = LVM_MAX_READ_AHEAD;
+#define        LVM_CORRECT_READ_AHEAD(a)               \
+do {                                           \
+       if ((a) < LVM_MIN_READ_AHEAD ||         \
+           (a) > LVM_MAX_READ_AHEAD)           \
+               (a) = LVM_DEFAULT_READ_AHEAD;   \
+       read_ahead[MAJOR_NR] = (a);             \
+} while(0)
 
 #ifndef WRITEA
 #  define WRITEA WRITE
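
Rewriting LVM_CORRECT_READ_AHEAD() as a do { ... } while(0) block is the standard idiom once a macro grows a second statement (here, the read_ahead[] update): the expansion behaves as a single statement, so it is safe under an un-braced if/else, avoids the dangling-else hazard of the old bare `if`, and still wants a terminating semicolon. A small stand-alone illustration; the *_RA constants are made up for the example:

    #define MIN_RA 2
    #define MAX_RA 120
    #define DEF_RA 8

    /* Without the do/while(0), a second statement in the macro would
     * escape an un-braced `if' and always execute. */
    #define CORRECT_RA(a)                           \
    do {                                            \
            if ((a) < MIN_RA || (a) > MAX_RA)       \
                    (a) = DEF_RA;                   \
    } while (0)

    void set_ra(int requested, int *out)
    {
            if (requested)
                    CORRECT_RA(requested);          /* expands to one statement */
            else
                    requested = DEF_RA;
            *out = requested;
    }
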
@@ -905,6 +909,7 @@ static int lvm_blk_ioctl(struct inode *inode, struct file *file,
                    (long) arg > LVM_MAX_READ_AHEAD)
                        return -EINVAL;
                lv_ptr->lv_read_ahead = (long) arg;
+               read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead;
                break;
 
 
index 52d88f7668fbc8df5e24fa5557b860a612c5e281..8372fe457ae7433d9e1476f6931746375f66f931 100644 (file)
@@ -1635,7 +1635,7 @@ static int do_md_run (mddev_t * mddev)
        mddev->param.chunk_size = chunk_size;
        mddev->param.personality = pnum;
 
-       if ((pnum != MULTIPATH) && (pnum != RAID1) && (pnum != LINEAR)) {
+       if ((pnum != MULTIPATH) && (pnum != RAID1)) {
                if (!chunk_size) {
                        /*
                         * 'default chunksize' in the old md code used to
@@ -1663,7 +1663,7 @@ static int do_md_run (mddev_t * mddev)
                }
        } else
                if (chunk_size)
-                       printk(KERN_INFO "RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level);
+                       printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level);
 
        if (pnum >= MAX_PERSONALITY) {
                MD_BUG();
@@ -1679,7 +1679,7 @@ static int do_md_run (mddev_t * mddev)
                if (!pers[pnum])
 #endif
                {
-                       printk(KERN_ERR "md.c: personality %d is not loaded!\n",
+                       printk(KERN_ERR "md: personality %d is not loaded!\n",
                                pnum);
                        return -EINVAL;
                }
@@ -1762,7 +1762,7 @@ static int restart_array (mddev_t *mddev)
                if (mddev->pers->restart_resync)
                        mddev->pers->restart_resync(mddev);
        } else {
-               printk (KERN_ERR "md.c: md%d has no personality assigned.\n",
+               printk (KERN_ERR "md: md%d has no personality assigned.\n",
                        mdidx(mddev));
                err = -EINVAL;
        }
@@ -2262,7 +2262,7 @@ static int hot_generate_error (mddev_t * mddev, kdev_t dev)
        if (!mddev->pers)
                return -ENODEV;
  
-       printk("trying to generate %s error in md%d ... \n",
+       printk("md: trying to generate %s error in md%d ... \n",
                partition_name(dev), mdidx(mddev));
  
        rdev = find_rdev(mddev, dev);
@@ -2284,7 +2284,7 @@ static int hot_generate_error (mddev_t * mddev, kdev_t dev)
                MD_BUG();
                return -ENODEV;
        }
-       printk("okay, generating error!\n");
+       printk("md: okay, generating error!\n");
 //     q->oneshot_error = 1; // disabled for now
  
        return 0;
@@ -3080,7 +3080,7 @@ static int status_resync (char * page, mddev_t * mddev)
        unsigned long max_blocks, resync, res, dt, db, rt;
 
        resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
-       max_blocks = mddev->sb->size << 1;
+       max_blocks = mddev->sb->size;
 
        /*
         * Should not happen.
index f644f44441b45f8058e8ef814967aede98428cb9..578f530ee25b53ba098a04dd2c748fd49163b488 100644 (file)
@@ -1299,14 +1299,9 @@ static int __devinit vortex_probe1(struct pci_dev *pdev,
        /* The 3c59x-specific entries in the device structure. */
        dev->open = vortex_open;
        if (vp->full_bus_master_tx) {
-               struct sysinfo sysinfo;
-
                dev->hard_start_xmit = boomerang_start_xmit;
-               si_meminfo(&sysinfo);
-               if (sysinfo.totalhigh == 0) {
-                       /* Actually, it still should work with iommu. */
-                       dev->features |= NETIF_F_SG;
-               }
+               /* Actually, it still should work with iommu. */
+               dev->features |= NETIF_F_SG;
                if (((hw_checksums[card_idx] == -1) && (vp->drv_flags & HAS_HWCKSM)) ||
                                        (hw_checksums[card_idx] == 1)) {
                                dev->features |= NETIF_F_IP_CSUM;
index 6cb131ecc4380effb0dd76dc69ba618ab47e767f..18f60b762cd00c5a689638071fabf9c01d27dd41 100644 (file)
@@ -349,14 +349,17 @@ enum commands {
 /* Clear CmdSuspend (1<<30) avoiding interference with the card access to the
    status bits.  Previous driver versions used separate 16 bit fields for
    commands and statuses.  --SAW
-   FIXME: it may not work on non-IA32 architectures.
  */
-#if defined(__LITTLE_ENDIAN)
-#define clear_suspend(cmd)  ((__u16 *)&(cmd)->cmd_status)[1] &= ~0x4000
-#elif defined(__BIG_ENDIAN)
-#define clear_suspend(cmd)  ((__u16 *)&(cmd)->cmd_status)[1] &= ~0x0040
+#if defined(__alpha__)
+# define clear_suspend(cmd)  clear_bit(30, &(cmd)->cmd_status);
 #else
-#error Unsupported byteorder
+# if defined(__LITTLE_ENDIAN)
+#  define clear_suspend(cmd)  ((__u16 *)&(cmd)->cmd_status)[1] &= ~0x4000
+# elif defined(__BIG_ENDIAN)
+#  define clear_suspend(cmd)  ((__u16 *)&(cmd)->cmd_status)[1] &= ~0x0040
+# else
+#  error Unsupported byteorder
+# endif
 #endif
 
 enum SCBCmdBits {
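
The little- and big-endian clear_suspend() variants clear the same descriptor bit (CmdSuspend, bit 30 of the 32-bit cmd_status word) through a 16-bit store into the appropriate halfword; the new Alpha case uses an atomic clear_bit() on the whole word instead. A quick user-space check of where the 0x4000 mask comes from, illustrative only and valid on little-endian hosts only:

    #include <assert.h>

    int main(void)
    {
            unsigned int status = 0xffffffffu;

            /* On a little-endian host the halfword at index 1 holds bits
             * 16..31, so clearing 0x4000 there clears bit 30 of the word. */
            ((unsigned short *)&status)[1] &= ~0x4000;
            assert(status == (0xffffffffu & ~(1u << 30)));
            return 0;
    }
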
index dbd12740a8278a583e31b363f9220bed04380599..3a3bb4b1f758c26d3e5b494352b1fbd72645bc9a 100644 (file)
@@ -514,7 +514,7 @@ static void __sunkbd_inchar(unsigned char ch, struct pt_regs *regs)
        }
        
        do_poke_blanked_console = 1;
-       tasklet_schedule(&console_tasklet);
+       schedule_console_callback();
        add_keyboard_randomness(keycode);
 
        tty = ttytab? ttytab[fg_console]: NULL;
index 802f1f625878b376b4114b79512302990ae2461c..3b68638de95105123c7977ebfe2123535d6c4d62 100644 (file)
@@ -2857,6 +2857,7 @@ static int mega_findCard (Scsi_Host_Template * pHostTmpl,
                if (!host)
                        goto err_unmap;
 
+#if 0
                /*
                 * Comment the following initialization if you know 'max_sectors' is
                 * not defined for this kernel.
@@ -2864,6 +2865,7 @@ static int mega_findCard (Scsi_Host_Template * pHostTmpl,
                 * greatly increases the IO performance - AM
                 */
                host->max_sectors = 1024;
+#endif
 
                scsi_set_pci_device(host, pdev);
                megaCfg = (mega_host_config *) host->hostdata;
index 27a16ad81979cd91849d8c8053e6aae2d75af501..9dc200174bd57bffa2d54623bb7bb404adbd2da8 100644 (file)
@@ -371,8 +371,6 @@ typedef long  int32_t;
 #define SG_SEGMENTS     32             /* Cmd entry + 6 continuations */
 
 
-typedef struct timer_list   timer_t;         /* timer */
-
 /*
  * SCSI Request Block structure
  */
index 4ccd3bb4fd0aa46389360b95d3bd811e2308f9f0..f9858b93f164ba62659b71dab112a5b60bc86224 100644 (file)
@@ -1150,11 +1150,13 @@ static void fbcon_redraw(struct vc_data *conp, struct display *p,
                }
            }
            scr_writew(c, d);
+           console_conditional_schedule();
            s++;
            d++;
        } while (s < le);
        if (s > start)
            p->dispsw->putcs(conp, p, start, s - start, real_y(p, line), x);
+       console_conditional_schedule();
        if (offset > 0)
                line++;
        else {
index f8951c81c5139f256b360e15da36961e6903266c..e515186e2811d83bb4c37b64722d1e8620543475 100644 (file)
@@ -2,6 +2,7 @@
  *  linux/fs/block_dev.c
  *
  *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
  */
 
 #include <linux/config.h>
 #include <linux/major.h>
 #include <linux/devfs_fs_kernel.h>
 #include <linux/smp_lock.h>
+#include <linux/iobuf.h>
+#include <linux/highmem.h>
+#include <linux/blkdev.h>
 
 #include <asm/uaccess.h>
 
-extern int *blk_size[];
-extern int *blksize_size[];
+static inline int blkdev_get_block(struct inode * inode, long iblock, struct buffer_head * bh_result)
+{
+       int err;
+
+       err = -EIO;
+       if (iblock >= buffered_blk_size(inode->i_rdev) >> (BUFFERED_BLOCKSIZE_BITS - BLOCK_SIZE_BITS))
+               goto out;
 
-#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
-#define NBUF 64
+       bh_result->b_blocknr = iblock;
+       bh_result->b_state |= 1UL << BH_Mapped;
+       err = 0;
+
+ out:
+       return err;
+}
 
-ssize_t block_write(struct file * filp, const char * buf,
-                   size_t count, loff_t *ppos)
+static int blkdev_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize)
 {
-       struct inode * inode = filp->f_dentry->d_inode;
-       ssize_t blocksize, blocksize_bits, i, buffercount, write_error;
-       ssize_t block, blocks;
-       loff_t offset;
-       ssize_t chars;
-       ssize_t written, retval;
-       struct buffer_head * bhlist[NBUF];
-       size_t size;
-       kdev_t dev = inode->i_rdev;
-       struct buffer_head * bh, *bufferlist[NBUF];
-       register char * p;
-
-       if (is_read_only(dev))
-               return -EPERM;
-
-       retval = written = write_error = buffercount = 0;
-       blocksize = BLOCK_SIZE;
-       if (blksize_size[MAJOR(dev)] && blksize_size[MAJOR(dev)][MINOR(dev)])
-               blocksize = blksize_size[MAJOR(dev)][MINOR(dev)];
-
-       i = blocksize;
-       blocksize_bits = 0;
-       while(i != 1) {
-               blocksize_bits++;
-               i >>= 1;
-       }
+       int i, nr_blocks, retval, dev = inode->i_rdev;
+       unsigned long * blocks = iobuf->blocks;
 
-       block = *ppos >> blocksize_bits;
-       offset = *ppos & (blocksize-1);
-
-       if (blk_size[MAJOR(dev)])
-               size = ((loff_t) blk_size[MAJOR(dev)][MINOR(dev)] << BLOCK_SIZE_BITS) >> blocksize_bits;
-       else
-               size = INT_MAX;
-       while (count>0) {
-               if (block >= size) {
-                       retval = -ENOSPC;
-                       goto cleanup;
-               }
-               chars = blocksize - offset;
-               if (chars > count)
-                       chars=count;
-
-#if 0
-               /* get the buffer head */
-               {
-                       struct buffer_head * (*fn)(kdev_t, int, int) = getblk;
-                       if (chars != blocksize)
-                               fn = bread;
-                       bh = fn(dev, block, blocksize);
-                       if (!bh) {
-                               retval = -EIO;
-                               goto cleanup;
-                       }
-                       if (!buffer_uptodate(bh))
-                               wait_on_buffer(bh);
-               }
-#else
-               bh = getblk(dev, block, blocksize);
-               if (!bh) {
-                       retval = -EIO;
-                       goto cleanup;
-               }
+       if (blocksize != BUFFERED_BLOCKSIZE)
+               BUG();
 
-               if (!buffer_uptodate(bh))
-               {
-                 if (chars == blocksize)
-                   wait_on_buffer(bh);
-                 else
-                 {
-                   bhlist[0] = bh;
-                   if (!filp->f_reada || !read_ahead[MAJOR(dev)]) {
-                     /* We do this to force the read of a single buffer */
-                     blocks = 1;
-                   } else {
-                     /* Read-ahead before write */
-                     blocks = read_ahead[MAJOR(dev)] / (blocksize >> 9) / 2;
-                     if (block + blocks > size) blocks = size - block;
-                     if (blocks > NBUF) blocks=NBUF;
-                     if (!blocks) blocks = 1;
-                     for(i=1; i<blocks; i++)
-                     {
-                       bhlist[i] = getblk (dev, block+i, blocksize);
-                       if (!bhlist[i])
-                       {
-                         while(i >= 0) brelse(bhlist[i--]);
-                         retval = -EIO;
-                         goto cleanup;
-                       }
-                     }
-                   }
-                   ll_rw_block(READ, blocks, bhlist);
-                   for(i=1; i<blocks; i++) brelse(bhlist[i]);
-                   wait_on_buffer(bh);
-                   if (!buffer_uptodate(bh)) {
-                         brelse(bh);
-                         retval = -EIO;
-                         goto cleanup;
-                   }
-                 };
-               };
-#endif
-               block++;
-               p = offset + bh->b_data;
-               offset = 0;
-               *ppos += chars;
-               written += chars;
-               count -= chars;
-               copy_from_user(p,buf,chars);
-               p += chars;
-               buf += chars;
-               mark_buffer_uptodate(bh, 1);
-               mark_buffer_dirty(bh);
-               if (filp->f_flags & O_SYNC)
-                       bufferlist[buffercount++] = bh;
-               else
-                       brelse(bh);
-               if (buffercount == NBUF){
-                       ll_rw_block(WRITE, buffercount, bufferlist);
-                       for(i=0; i<buffercount; i++){
-                               wait_on_buffer(bufferlist[i]);
-                               if (!buffer_uptodate(bufferlist[i]))
-                                       write_error=1;
-                               brelse(bufferlist[i]);
-                       }
-                       buffercount=0;
-               }
-               balance_dirty();
-               if (write_error)
-                       break;
+       nr_blocks = iobuf->length >> BUFFERED_BLOCKSIZE_BITS;
+       /* build the blocklist */
+       for (i = 0; i < nr_blocks; i++, blocknr++) {
+               struct buffer_head bh;
+
+               retval = blkdev_get_block(inode, blocknr, &bh);
+               if (retval)
+                       goto out;
+
+               blocks[i] = bh.b_blocknr;
        }
-       cleanup:
-       if ( buffercount ){
-               ll_rw_block(WRITE, buffercount, bufferlist);
-               for(i=0; i<buffercount; i++){
-                       wait_on_buffer(bufferlist[i]);
-                       if (!buffer_uptodate(bufferlist[i]))
-                               write_error=1;
-                       brelse(bufferlist[i]);
+
+       retval = brw_kiovec(rw, 1, &iobuf, dev, iobuf->blocks, blocksize);
+
+ out:
+       return retval;
+}
+
+static int blkdev_writepage(struct page * page)
+{
+       int err, i;
+       unsigned long block;
+       struct buffer_head *bh, *head;
+       struct inode *inode = page->mapping->host;
+
+       if (!PageLocked(page))
+               BUG();
+
+       if (!page->buffers)
+               create_empty_buffers(page, inode->i_rdev, BUFFERED_BLOCKSIZE);
+       head = page->buffers;
+
+       block = page->index << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS);
+
+       bh = head;
+       i = 0;
+
+       /* Stage 1: make sure we have all the buffers mapped! */
+       do {
+               /*
+                * If the buffer isn't up-to-date, we can't be sure
+                * that the buffer has been initialized with the proper
+                * block number information etc..
+                *
+                * Leave it to the low-level FS to make all those
+                * decisions (block #0 may actually be a valid block)
+                */
+               if (!buffer_mapped(bh)) {
+                       err = blkdev_get_block(inode, block, bh);
+                       if (err)
+                               goto out;
                }
-       }               
-       if(!retval)
-               filp->f_reada = 1;
-       if(write_error)
-               return -EIO;
-       return written ? written : retval;
+               bh = bh->b_this_page;
+               block++;
+       } while (bh != head);
+
+       /* Stage 2: lock the buffers, mark them clean */
+       do {
+               lock_buffer(bh);
+               set_buffer_async_io(bh);
+               set_bit(BH_Uptodate, &bh->b_state);
+               clear_bit(BH_Dirty, &bh->b_state);
+               bh = bh->b_this_page;
+       } while (bh != head);
+
+       /* Stage 3: submit the IO */
+       do {
+               submit_bh(WRITE, bh);
+               bh = bh->b_this_page;
+       } while (bh != head);
+
+       /* Done - end_buffer_io_async will unlock */
+       SetPageUptodate(page);
+       return 0;
+
+out:
+       ClearPageUptodate(page);
+       UnlockPage(page);
+       return err;
 }
 
-ssize_t block_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+static int blkdev_readpage(struct file * file, struct page * page)
 {
-       struct inode * inode = filp->f_dentry->d_inode;
-       size_t block;
-       loff_t offset;
-       ssize_t blocksize;
-       ssize_t blocksize_bits, i;
-       size_t blocks, rblocks, left;
-       int bhrequest, uptodate;
-       struct buffer_head ** bhb, ** bhe;
-       struct buffer_head * buflist[NBUF];
-       struct buffer_head * bhreq[NBUF];
-       unsigned int chars;
-       loff_t size;
-       kdev_t dev;
-       ssize_t read;
-
-       dev = inode->i_rdev;
-       blocksize = BLOCK_SIZE;
-       if (blksize_size[MAJOR(dev)] && blksize_size[MAJOR(dev)][MINOR(dev)])
-               blocksize = blksize_size[MAJOR(dev)][MINOR(dev)];
-       i = blocksize;
-       blocksize_bits = 0;
-       while (i != 1) {
-               blocksize_bits++;
-               i >>= 1;
-       }
+       struct inode *inode = page->mapping->host;
+       kdev_t dev = inode->i_rdev;
+       unsigned long iblock, lblock;
+       struct buffer_head *bh, *head, *arr[1 << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS)];
+       unsigned int blocks;
+       int nr, i;
+
+       if (!PageLocked(page))
+               PAGE_BUG(page);
+       if (!page->buffers)
+               create_empty_buffers(page, dev, BUFFERED_BLOCKSIZE);
+       head = page->buffers;
+
+       blocks = PAGE_CACHE_SIZE >> BUFFERED_BLOCKSIZE_BITS;
+       iblock = page->index << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS);
+       lblock = buffered_blk_size(dev) >> (BUFFERED_BLOCKSIZE_BITS - BLOCK_SIZE_BITS);
+       bh = head;
+       nr = 0;
+       i = 0;
+
+       do {
+               if (buffer_uptodate(bh))
+                       continue;
 
-       offset = *ppos;
-       if (blk_size[MAJOR(dev)])
-               size = (loff_t) blk_size[MAJOR(dev)][MINOR(dev)] << BLOCK_SIZE_BITS;
-       else
-               size = (loff_t) INT_MAX << BLOCK_SIZE_BITS;
-
-       if (offset > size)
-               left = 0;
-       /* size - offset might not fit into left, so check explicitly. */
-       else if (size - offset > INT_MAX)
-               left = INT_MAX;
-       else
-               left = size - offset;
-       if (left > count)
-               left = count;
-       if (left <= 0)
+               if (!buffer_mapped(bh)) {
+                       if (iblock <= lblock) {
+                               if (blkdev_get_block(inode, iblock, bh))
+                                       continue;
+                       }
+                       if (!buffer_mapped(bh)) {
+                               memset(kmap(page) + i * BUFFERED_BLOCKSIZE, 0, BUFFERED_BLOCKSIZE);
+                               flush_dcache_page(page);
+                               kunmap(page);
+                               set_bit(BH_Uptodate, &bh->b_state);
+                               continue;
+                       }
+                       /* get_block() might have updated the buffer synchronously */
+                       if (buffer_uptodate(bh))
+                               continue;
+               }
+
+               arr[nr] = bh;
+               nr++;
+       } while (i++, iblock++, (bh = bh->b_this_page) != head);
+
+       if (!nr) {
+               /*
+                * all buffers are uptodate - we can set the page
+                * uptodate as well.
+                */
+               SetPageUptodate(page);
+               UnlockPage(page);
                return 0;
-       read = 0;
-       block = offset >> blocksize_bits;
-       offset &= blocksize-1;
-       size >>= blocksize_bits;
-       rblocks = blocks = (left + offset + blocksize - 1) >> blocksize_bits;
-       bhb = bhe = buflist;
-       if (filp->f_reada) {
-               if (blocks < read_ahead[MAJOR(dev)] / (blocksize >> 9))
-                       blocks = read_ahead[MAJOR(dev)] / (blocksize >> 9);
-               if (rblocks > blocks)
-                       blocks = rblocks;
-               
        }
-       if (block + blocks > size) {
-               blocks = size - block;
-               if (blocks == 0)
-                       return 0;
+
+       /* Stage two: lock the buffers */
+       for (i = 0; i < nr; i++) {
+               struct buffer_head * bh = arr[i];
+               lock_buffer(bh);
+               set_buffer_async_io(bh);
        }
 
-       /* We do this in a two stage process.  We first try to request
-          as many blocks as we can, then we wait for the first one to
-          complete, and then we try to wrap up as many as are actually
-          done.  This routine is rather generic, in that it can be used
-          in a filesystem by substituting the appropriate function in
-          for getblk.
+       /* Stage 3: start the IO */
+       for (i = 0; i < nr; i++)
+               submit_bh(READ, arr[i]);
 
-          This routine is optimized to make maximum use of the various
-          buffers and caches. */
+       return 0;
+}
 
-       do {
-               bhrequest = 0;
-               uptodate = 1;
-               while (blocks) {
-                       --blocks;
-                       *bhb = getblk(dev, block++, blocksize);
-                       if (*bhb && !buffer_uptodate(*bhb)) {
-                               uptodate = 0;
-                               bhreq[bhrequest++] = *bhb;
-                       }
+static int __blkdev_prepare_write(struct inode *inode, struct page *page,
+                                 unsigned from, unsigned to)
+{
+       kdev_t dev = inode->i_rdev;
+       unsigned block_start, block_end;
+       unsigned long block;
+       int err = 0;
+       struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
+       kmap(page);
 
-                       if (++bhb == &buflist[NBUF])
-                               bhb = buflist;
+       if (!page->buffers)
+               create_empty_buffers(page, dev, BUFFERED_BLOCKSIZE);
+       head = page->buffers;
 
-                       /* If the block we have on hand is uptodate, go ahead
-                          and complete processing. */
-                       if (uptodate)
-                               break;
-                       if (bhb == bhe)
-                               break;
-               }
+       block = page->index << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS);
 
-               /* Now request them all */
-               if (bhrequest) {
-                       ll_rw_block(READ, bhrequest, bhreq);
+       for(bh = head, block_start = 0; bh != head || !block_start;
+           block++, block_start=block_end, bh = bh->b_this_page) {
+               if (!bh)
+                       BUG();
+               block_end = block_start + BUFFERED_BLOCKSIZE;
+               if (block_end <= from)
+                       continue;
+               if (block_start >= to)
+                       break;
+               if (!buffer_mapped(bh)) {
+                       err = blkdev_get_block(inode, block, bh);
+                       if (err)
+                               goto out;
+               }
+               if (Page_Uptodate(page)) {
+                       set_bit(BH_Uptodate, &bh->b_state);
+                       continue; 
                }
+               if (!buffer_uptodate(bh) &&
+                    (block_start < from || block_end > to)) {
+                       ll_rw_block(READ, 1, &bh);
+                       *wait_bh++=bh;
+               }
+       }
+       /*
+        * If we issued read requests - let them complete.
+        */
+       while(wait_bh > wait) {
+               wait_on_buffer(*--wait_bh);
+               err = -EIO;
+               if (!buffer_uptodate(*wait_bh))
+                       goto out;
+       }
+       return 0;
+out:
+       return err;
+}
 
-               do { /* Finish off all I/O that has actually completed */
-                       if (*bhe) {
-                               wait_on_buffer(*bhe);
-                               if (!buffer_uptodate(*bhe)) {   /* read error? */
-                                       brelse(*bhe);
-                                       if (++bhe == &buflist[NBUF])
-                                         bhe = buflist;
-                                       left = 0;
-                                       break;
-                               }
-                       }                       
-                       if (left < blocksize - offset)
-                               chars = left;
-                       else
-                               chars = blocksize - offset;
-                       *ppos += chars;
-                       left -= chars;
-                       read += chars;
-                       if (*bhe) {
-                               copy_to_user(buf,offset+(*bhe)->b_data,chars);
-                               brelse(*bhe);
-                               buf += chars;
-                       } else {
-                               while (chars-- > 0)
-                                       put_user(0,buf++);
+static int blkdev_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
+{
+       struct inode *inode = page->mapping->host;
+       int err = __blkdev_prepare_write(inode, page, from, to);
+       if (err) {
+               ClearPageUptodate(page);
+               kunmap(page);
+       }
+       return err;
+}
+
+static int __blkdev_commit_write(struct inode *inode, struct page *page,
+                                unsigned from, unsigned to)
+{
+       unsigned block_start, block_end;
+       int partial = 0, need_balance_dirty = 0;
+       struct buffer_head *bh, *head;
+
+       for(bh = head = page->buffers, block_start = 0;
+           bh != head || !block_start;
+           block_start=block_end, bh = bh->b_this_page) {
+               block_end = block_start + BUFFERED_BLOCKSIZE;
+               if (block_end <= from || block_start >= to) {
+                       if (!buffer_uptodate(bh))
+                               partial = 1;
+               } else {
+                       set_bit(BH_Uptodate, &bh->b_state);
+                       if (!atomic_set_buffer_dirty(bh)) {
+                               __mark_dirty(bh);
+                               buffer_insert_inode_data_queue(bh, inode);
+                               need_balance_dirty = 1;
                        }
-                       offset = 0;
-                       if (++bhe == &buflist[NBUF])
-                               bhe = buflist;
-               } while (left > 0 && bhe != bhb && (!*bhe || !buffer_locked(*bhe)));
-               if (bhe == bhb && !blocks)
-                       break;
-       } while (left > 0);
-
-/* Release the read-ahead blocks */
-       while (bhe != bhb) {
-               brelse(*bhe);
-               if (++bhe == &buflist[NBUF])
-                       bhe = buflist;
-       };
-       if (!read)
-               return -EIO;
-       filp->f_reada = 1;
-       return read;
+               }
+       }
+
+       if (need_balance_dirty)
+               balance_dirty();
+       /*
+        * If this partial write happened to make all buffers uptodate
+        * then we can optimize away a bogus readpage() for the next
+        * read(). Here we 'discover' whether the page went uptodate
+        * as a result of this (potentially partial) write.
+        */
+       if (!partial)
+               SetPageUptodate(page);
+       return 0;
+}
+
+static int blkdev_commit_write(struct file *file, struct page *page,
+                              unsigned from, unsigned to)
+{
+       struct inode *inode = page->mapping->host;
+       __blkdev_commit_write(inode,page,from,to);
+       kunmap(page);
+       return 0;
 }
 
 /*
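
With the device pagecache in place, blkdev_get_block() above is an identity mapping: logical block i of the device is physical block i, so the only real work is the end-of-device check in BUFFERED_BLOCKSIZE units. Judging by the shift, buffered_blk_size() reports the size in BLOCK_SIZE (1 KiB) units, and the right shift by (BUFFERED_BLOCKSIZE_BITS - BLOCK_SIZE_BITS) converts that to the larger unit. A worked sketch, assuming BUFFERED_BLOCKSIZE_BITS is 12 (4 KiB buffers), which this hunk does not state:

    /* Illustrative only: the range check in blkdev_get_block(), with the
     * assumed constants BLOCK_SIZE_BITS = 10 and BUFFERED_BLOCKSIZE_BITS = 12. */
    static int block_in_range(unsigned long iblock, unsigned long size_in_1k)
    {
            unsigned long nr_blocks = size_in_1k >> (12 - 10);

            /* A 1 GiB device: size_in_1k = 1048576, nr_blocks = 262144,
             * so valid iblock values are 0 .. 262143; 262144 gets -EIO. */
            return iblock < nr_blocks;
    }
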
@@ -354,6 +341,17 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
 }
        
 
+static int __block_fsync(struct inode * inode)
+{
+       int ret;
+
+       filemap_fdatasync(inode->i_mapping);
+       ret = sync_buffers(inode->i_rdev, 1);
+       filemap_fdatawait(inode->i_mapping);
+
+       return ret;
+}
+
 /*
  *     Filp may be NULL when we are called by an msync of a vma
  *     since the vma has no handle.
@@ -361,7 +359,9 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
  
 static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
 {
-       return fsync_dev(dentry->d_inode->i_rdev);
+       struct inode * inode = dentry->d_inode;
+
+       return __block_fsync(inode);
 }
 
 /*
@@ -452,6 +452,7 @@ struct block_device *bdget(dev_t dev)
        atomic_set(&new_bdev->bd_count,1);
        new_bdev->bd_dev = dev;
        new_bdev->bd_op = NULL;
+       new_bdev->bd_inode = NULL;
        spin_lock(&bdev_lock);
        bdev = bdfind(dev, head);
        if (!bdev) {
@@ -467,9 +468,11 @@ struct block_device *bdget(dev_t dev)
 void bdput(struct block_device *bdev)
 {
        if (atomic_dec_and_test(&bdev->bd_count)) {
-               spin_lock(&bdev_lock);
-               if (atomic_read(&bdev->bd_openers))
+               if (bdev->bd_openers)
+                       BUG();
+               if (bdev->bd_cache_openers)
                        BUG();
+               spin_lock(&bdev_lock);
                list_del(&bdev->bd_hash);
                spin_unlock(&bdev_lock);
                destroy_bdev(bdev);
@@ -616,6 +619,7 @@ int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags, int kind)
        int ret = -ENODEV;
        kdev_t rdev = to_kdev_t(bdev->bd_dev); /* this should become bdev */
        down(&bdev->bd_sem);
+       lock_kernel();
        if (!bdev->bd_op)
                bdev->bd_op = get_blkfops(MAJOR(rdev));
        if (bdev->bd_op) {
@@ -638,13 +642,15 @@ int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags, int kind)
                        ret = 0;
                        if (bdev->bd_op->open)
                                ret = bdev->bd_op->open(fake_inode, &fake_file);
-                       if (!ret)
-                               atomic_inc(&bdev->bd_openers);
-                       else if (!atomic_read(&bdev->bd_openers))
+                       if (!ret) {
+                               bdev->bd_openers++;
+                               atomic_inc(&bdev->bd_count);
+                       } else if (!bdev->bd_openers)
                                bdev->bd_op = NULL;
                        iput(fake_inode);
                }
        }
+       unlock_kernel();
        up(&bdev->bd_sem);
        return ret;
 }
@@ -653,6 +659,15 @@ int blkdev_open(struct inode * inode, struct file * filp)
 {
        int ret = -ENXIO;
        struct block_device *bdev = inode->i_bdev;
+
+       /*
+        * Preserve backwards compatibility and allow large file access
+        * even if userspace doesn't ask for it explicitly. Some mkfs
+        * binary needs it. We might want to drop this workaround
+        * during an unstable branch.
+        */
+       filp->f_flags |= O_LARGEFILE;
+
        down(&bdev->bd_sem);
        lock_kernel();
        if (!bdev->bd_op)
@@ -661,9 +676,21 @@ int blkdev_open(struct inode * inode, struct file * filp)
                ret = 0;
                if (bdev->bd_op->open)
                        ret = bdev->bd_op->open(inode,filp);
-               if (!ret)
-                       atomic_inc(&bdev->bd_openers);
-               else if (!atomic_read(&bdev->bd_openers))
+               if (!ret) {
+                       bdev->bd_openers++;
+                       if (!bdev->bd_cache_openers && bdev->bd_inode)
+                               BUG();
+                       if (bdev->bd_cache_openers && !bdev->bd_inode)
+                               BUG();
+                       if (!bdev->bd_cache_openers++)
+                               bdev->bd_inode = inode;
+                       else {
+                               if (bdev->bd_inode != inode && !inode->i_mapping_overload++) {
+                                       inode->i_mapping = bdev->bd_inode->i_mapping;
+                                       atomic_inc(&bdev->bd_inode->i_count);
+                               }
+                       }
+               } else if (!bdev->bd_openers)
                        bdev->bd_op = NULL;
        }       
        unlock_kernel();
@@ -676,16 +703,14 @@ int blkdev_put(struct block_device *bdev, int kind)
        int ret = 0;
        kdev_t rdev = to_kdev_t(bdev->bd_dev); /* this should become bdev */
        down(&bdev->bd_sem);
-       /* syncing will go here */
        lock_kernel();
        if (kind == BDEV_FILE)
                fsync_dev(rdev);
        else if (kind == BDEV_FS)
                fsync_no_super(rdev);
-       if (atomic_dec_and_test(&bdev->bd_openers)) {
-               /* invalidating buffers will go here */
+       /* only filesystems use the buffer cache for metadata these days */
+       if (kind == BDEV_FS)
                invalidate_buffers(rdev);
-       }
        if (bdev->bd_op->release) {
                struct inode * fake_inode = get_empty_inode();
                ret = -ENOMEM;
@@ -693,19 +718,84 @@ int blkdev_put(struct block_device *bdev, int kind)
                        fake_inode->i_rdev = rdev;
                        ret = bdev->bd_op->release(fake_inode, NULL);
                        iput(fake_inode);
-               }
+               } else
+                       printk(KERN_WARNING "blkdev_put: ->release couldn't be run due to -ENOMEM\n");
        }
-       if (!atomic_read(&bdev->bd_openers))
+       if (!--bdev->bd_openers)
                bdev->bd_op = NULL;     /* we can't rely on driver being */
                                        /* kind to stay around. */
        unlock_kernel();
        up(&bdev->bd_sem);
+       bdput(bdev);
        return ret;
 }
 
-static int blkdev_close(struct inode * inode, struct file * filp)
+int blkdev_close(struct inode * inode, struct file * filp)
 {
-       return blkdev_put(inode->i_bdev, BDEV_FILE);
+       struct block_device *bdev = inode->i_bdev;
+       int ret = 0;
+       struct inode * bd_inode = bdev->bd_inode;
+
+       if (bd_inode->i_mapping != inode->i_mapping)
+               BUG();
+       down(&bdev->bd_sem);
+       lock_kernel();
+       /* cache coherency protocol */
+       if (!--bdev->bd_cache_openers) {
+               struct super_block * sb;
+
+               /* flush the pagecache to disk */
+               __block_fsync(inode);
+               /* drop the pagecache, uptodate info is on disk by now */
+               truncate_inode_pages(inode->i_mapping, 0);
+               /* forget the bdev pagecache address space */
+               bdev->bd_inode = NULL;
+
+               /* if the fs was mounted ro just throw away most of its caches */
+               sb = get_super(inode->i_rdev);
+               if (sb) {
+                       if (sb->s_flags & MS_RDONLY) {
+                               /*
+                                * This call is not destructive in terms of
+                                * dirty cache, so it is safe to run it
+                                * even if the fs gets mounted read write
+                                * under us.
+                                */
+                               invalidate_device(inode->i_rdev, 0);
+                       }
+
+                       /*
+                        * Now, only if the underlying fs is mounted read-only
+                        * do we try to refill its pinned buffer cache from
+                        * disk. The fs cannot go away under us because we
+                        * hold the read semaphore of the superblock, but we
+                        * must also serialize against the ->remount_fs and
+                        * ->read_super callbacks so that MS_RDONLY cannot
+                        * change under us.
+                        */
+                       lock_super(sb);
+                       if (sb->s_flags & MS_RDONLY)
+                               /* now refill the obsolete pinned buffers from disk */
+                               update_buffers(inode->i_rdev);
+                       unlock_super(sb);
+
+                       drop_super(sb);
+               }
+       }
+       if (inode != bd_inode && !--inode->i_mapping_overload) {
+               inode->i_mapping = &inode->i_data;
+               iput(bd_inode);
+       }
+
+       /* release the device driver */
+       if (bdev->bd_op->release)
+               ret = bdev->bd_op->release(inode, NULL);
+       if (!--bdev->bd_openers)
+               bdev->bd_op = NULL;
+       unlock_kernel();
+       up(&bdev->bd_sem);
+
+       return ret;
 }
 
 static int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
@@ -716,12 +806,22 @@ static int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
        return -EINVAL;
 }
 
+struct address_space_operations def_blk_aops = {
+       readpage: blkdev_readpage,
+       writepage: blkdev_writepage,
+       sync_page: block_sync_page,
+       prepare_write: blkdev_prepare_write,
+       commit_write: blkdev_commit_write,
+       direct_IO: blkdev_direct_IO,
+};
+
 struct file_operations def_blk_fops = {
        open:           blkdev_open,
        release:        blkdev_close,
        llseek:         block_llseek,
-       read:           block_read,
-       write:          block_write,
+       read:           generic_file_read,
+       write:          generic_file_write,
+       mmap:           generic_file_mmap,
        fsync:          block_fsync,
        ioctl:          blkdev_ioctl,
 };
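The def_blk_fops switch above routes block device reads and writes through the page cache (generic_file_read/generic_file_write over the new def_blk_aops), and the added mmap entry means a device node can now be mapped like a regular file. A minimal userspace sketch, not part of the patch; the device path, the 4096-byte mapping size and read permission on the node are assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/hda", O_RDONLY);	/* example device node */
	unsigned char *p;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* the mapping is backed by the blkdev pagecache set up above */
	p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("first bytes: %02x %02x %02x %02x\n", p[0], p[1], p[2], p[3]);
	munmap(p, 4096);
	close(fd);
	return 0;
}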
index 03c1907d45f2a99cf76d82bda6e9c41e66b07521..c0909b2424d1de6999d56af9a5ca22cd95300075 100644 (file)
@@ -131,10 +131,11 @@ union bdflush_param {
 
 /* These are the min and max parameter values that we will allow to be assigned */
 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   0, 0, 0};
-int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 100, 0, 0};
+int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 6000*HZ, 100, 0, 0};
 
 inline void unlock_buffer(struct buffer_head *bh)
 {
+       clear_bit(BH_Wait_IO, &bh->b_state);
        clear_bit(BH_Lock, &bh->b_state);
        smp_mb__after_clear_bit();
        if (waitqueue_active(&bh->b_wait))
@@ -208,7 +209,7 @@ static int write_some_buffers(kdev_t dev)
        int nr;
 
        next = lru_list[BUF_DIRTY];
-       nr = nr_buffers_type[BUF_DIRTY] * 2;
+       nr = nr_buffers_type[BUF_DIRTY];
        count = 0;
        while (next && --nr >= 0) {
                struct buffer_head * bh = next;
@@ -262,7 +263,7 @@ static int wait_for_buffers(kdev_t dev, int index, int refile)
        int nr;
 
        next = lru_list[index];
-       nr = nr_buffers_type[index] * 2;
+       nr = nr_buffers_type[index];
        while (next && --nr >= 0) {
                struct buffer_head *bh = next;
                next = bh->b_next_free;
@@ -309,7 +310,7 @@ static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
  * We will ultimately want to put these in a separate list, but for
  * now we search all of the lists for dirty buffers.
  */
-static int sync_buffers(kdev_t dev, int wait)
+int sync_buffers(kdev_t dev, int wait)
 {
        int err = 0;
 
@@ -624,6 +625,16 @@ void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
        spin_unlock(&lru_list_lock);
 }
 
+void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
+{
+       spin_lock(&lru_list_lock);
+       if (bh->b_inode)
+               list_del(&bh->b_inode_buffers);
+       bh->b_inode = inode;
+       list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers);
+       spin_unlock(&lru_list_lock);
+}
+
 /* The caller must have the lru_list lock before calling the 
    remove_inode_queue functions.  */
 static void __remove_inode_queue(struct buffer_head *bh)
@@ -643,13 +654,12 @@ int inode_has_buffers(struct inode *inode)
        int ret;
        
        spin_lock(&lru_list_lock);
-       ret = !list_empty(&inode->i_dirty_buffers);
+       ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
        spin_unlock(&lru_list_lock);
        
        return ret;
 }
 
-
 /* If invalidate_buffers() will trash dirty buffers, it means some kind
    of fs corruption is going on. Trashing dirty data always implies losing
    information that was supposed to be just stored on the physical layer
@@ -669,8 +679,16 @@ int inode_has_buffers(struct inode *inode)
 
    These are two special cases. Normal usage imply the device driver
    to issue a sync on the device (without waiting I/O completion) and
-   then an invalidate_buffers call that doesn't trash dirty buffers. */
-void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
+   then an invalidate_buffers call that doesn't trash dirty buffers.
+
+   To handle cache coherency with the blkdev pagecache, the 'update' case
+   has been introduced. It is needed to re-read from disk any pinned
+   buffer. NOTE: re-reading from disk is destructive so we can do it only
+   when we assume nobody is changing the buffercache under our I/O and when
+   we think the disk contains more recent information than the buffercache.
+   The update == 1 pass marks the buffers we need to update; the update == 2
+   pass does the actual I/O. */
+void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers, int update)
 {
        int i, nlist, slept;
        struct buffer_head * bh, * bh_next;
@@ -701,13 +719,36 @@ void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
                        }
 
                        write_lock(&hash_table_lock);
-                       if (!atomic_read(&bh->b_count) &&
-                           (destroy_dirty_buffers || !buffer_dirty(bh))) {
-                               remove_inode_queue(bh);
-                               __remove_from_queues(bh);
-                               put_last_free(bh);
+                       /* All buffers in the lru lists are mapped */
+                       if (!buffer_mapped(bh))
+                               BUG();
+                       if (!atomic_read(&bh->b_count)) {
+                               if (destroy_dirty_buffers || !buffer_dirty(bh)) {
+                                       remove_inode_queue(bh);
+                                       __remove_from_queues(bh);
+                                       put_last_free(bh);
+                               }
+                       } else if (update) {
+                               if ((update == 2) ^ buffer_uptodate(bh)  &&
+                                   (update == 2) ^ buffer_req(bh)) {
+                                       write_unlock(&hash_table_lock);
+                                       atomic_inc(&bh->b_count);
+                                       spin_unlock(&lru_list_lock);
+
+                                       if (update == 2) {
+                                               ll_rw_block(READ, 1, &bh);
+                                               wait_on_buffer(bh);
+                                       } else {
+                                               lock_buffer(bh);
+                                               clear_bit(BH_Uptodate, &bh->b_state);
+                                               clear_bit(BH_Req, &bh->b_state);
+                                               unlock_buffer(bh);
+                                       }                                               
+
+                                       atomic_dec(&bh->b_count);
+                                       goto retry;
+                               }
                        }
-                       /* else complain loudly? */
 
                        write_unlock(&hash_table_lock);
                        if (slept)
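The two 'update' passes above are driven by wrappers defined in include/linux/fs.h, which this excerpt does not show. A hedged reconstruction of what those wrappers are expected to look like, matching the callers in block_dev.c (invalidate_buffers, update_buffers):

/* plausible wrapper definitions; the real ones are not visible in this hunk */
#define invalidate_buffers(dev)	__invalidate_buffers((dev), 0, 0)
#define destroy_buffers(dev)	__invalidate_buffers((dev), 1, 0)

static inline void update_buffers(kdev_t dev)
{
	__invalidate_buffers(dev, 0, 1);	/* pass 1: mark pinned buffers !uptodate, !req */
	__invalidate_buffers(dev, 0, 2);	/* pass 2: re-read the marked buffers from disk */
}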
@@ -798,9 +839,7 @@ void set_blocksize(kdev_t dev, int size)
 static void free_more_memory(void)
 {
        balance_dirty();
-       page_launder(GFP_NOFS, 0);              
        wakeup_bdflush();
-       wakeup_kswapd();
        current->policy |= SCHED_YIELD;
        __set_current_state(TASK_RUNNING);
        schedule();
@@ -853,17 +892,17 @@ static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
         * that unlock the page..
         */
        spin_lock_irqsave(&page_uptodate_lock, flags);
+       mark_buffer_async(bh, 0);
        unlock_buffer(bh);
        tmp = bh->b_this_page;
        while (tmp != bh) {
-               if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
+               if (buffer_async(tmp) && buffer_locked(tmp))
                        goto still_busy;
                tmp = tmp->b_this_page;
        }
 
        /* OK, the async IO on this page is complete. */
        spin_unlock_irqrestore(&page_uptodate_lock, flags);
-       put_bh(bh);
 
        /*
         * if none of the buffers had errors then we can set the
@@ -883,13 +922,13 @@ static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
        return;
 
 still_busy:
-       put_bh(bh);
        spin_unlock_irqrestore(&page_uptodate_lock, flags);
        return;
 }
 
-void set_buffer_async_io(struct buffer_head *bh) {
+inline void set_buffer_async_io(struct buffer_head *bh) {
     bh->b_end_io = end_buffer_io_async ;
+    mark_buffer_async(bh, 1);
 }
 
 /*
@@ -961,6 +1000,54 @@ int fsync_inode_buffers(struct inode *inode)
                return err2;
 }
 
+int fsync_inode_data_buffers(struct inode *inode)
+{
+       struct buffer_head *bh;
+       struct inode tmp;
+       int err = 0, err2;
+       
+       INIT_LIST_HEAD(&tmp.i_dirty_data_buffers);
+       
+       spin_lock(&lru_list_lock);
+
+       while (!list_empty(&inode->i_dirty_data_buffers)) {
+               bh = BH_ENTRY(inode->i_dirty_data_buffers.next);
+               list_del(&bh->b_inode_buffers);
+               if (!buffer_dirty(bh) && !buffer_locked(bh))
+                       bh->b_inode = NULL;
+               else {
+                       bh->b_inode = &tmp;
+                       list_add(&bh->b_inode_buffers, &tmp.i_dirty_data_buffers);
+                       if (buffer_dirty(bh)) {
+                               get_bh(bh);
+                               spin_unlock(&lru_list_lock);
+                               ll_rw_block(WRITE, 1, &bh);
+                               brelse(bh);
+                               spin_lock(&lru_list_lock);
+                       }
+               }
+       }
+
+       while (!list_empty(&tmp.i_dirty_data_buffers)) {
+               bh = BH_ENTRY(tmp.i_dirty_data_buffers.prev);
+               remove_inode_queue(bh);
+               get_bh(bh);
+               spin_unlock(&lru_list_lock);
+               wait_on_buffer(bh);
+               if (!buffer_uptodate(bh))
+                       err = -EIO;
+               brelse(bh);
+               spin_lock(&lru_list_lock);
+       }
+       
+       spin_unlock(&lru_list_lock);
+       err2 = osync_inode_data_buffers(inode);
+
+       if (err)
+               return err;
+       else
+               return err2;
+}
 
 /*
  * osync is designed to support O_SYNC io.  It waits synchronously for
@@ -1002,6 +1089,35 @@ int osync_inode_buffers(struct inode *inode)
        return err;
 }
 
+int osync_inode_data_buffers(struct inode *inode)
+{
+       struct buffer_head *bh;
+       struct list_head *list;
+       int err = 0;
+
+       spin_lock(&lru_list_lock);
+       
+ repeat:
+
+       for (list = inode->i_dirty_data_buffers.prev; 
+            bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers;
+            list = bh->b_inode_buffers.prev) {
+               if (buffer_locked(bh)) {
+                       get_bh(bh);
+                       spin_unlock(&lru_list_lock);
+                       wait_on_buffer(bh);
+                       if (!buffer_uptodate(bh))
+                               err = -EIO;
+                       brelse(bh);
+                       spin_lock(&lru_list_lock);
+                       goto repeat;
+               }
+       }
+
+       spin_unlock(&lru_list_lock);
+       return err;
+}
+
 
 /*
  * Invalidate any and all dirty buffers on a given inode.  We are
@@ -1010,15 +1126,13 @@ int osync_inode_buffers(struct inode *inode)
  */
 void invalidate_inode_buffers(struct inode *inode)
 {
-       struct list_head *list, *next;
+       struct list_head * entry;
        
        spin_lock(&lru_list_lock);
-       list = inode->i_dirty_buffers.next; 
-       while (list != &inode->i_dirty_buffers) {
-               next = list->next;
-               remove_inode_queue(BH_ENTRY(list));
-               list = next;
-       }
+       while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
+               remove_inode_queue(BH_ENTRY(entry));
+       while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
+               remove_inode_queue(BH_ENTRY(entry));
        spin_unlock(&lru_list_lock);
 }
 
@@ -1070,6 +1184,7 @@ repeat:
        out:
                write_unlock(&hash_table_lock);
                spin_unlock(&lru_list_lock);
+               touch_buffer(bh);
                return bh;
        }
 
@@ -1138,7 +1253,7 @@ void balance_dirty(void)
        }
 }
 
-static __inline__ void __mark_dirty(struct buffer_head *bh)
+inline void __mark_dirty(struct buffer_head *bh)
 {
        bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
        refile_buffer(bh);
@@ -1171,8 +1286,6 @@ static void __refile_buffer(struct buffer_head *bh)
                dispose = BUF_LOCKED;
        if (buffer_dirty(bh))
                dispose = BUF_DIRTY;
-       if (buffer_protected(bh))
-               dispose = BUF_PROTECTED;
        if (dispose != bh->b_list) {
                __remove_from_lru_list(bh, bh->b_list);
                bh->b_list = dispose;
@@ -1212,11 +1325,11 @@ void __bforget(struct buffer_head * buf)
        /* grab the lru lock here to block bdflush. */
        spin_lock(&lru_list_lock);
        write_lock(&hash_table_lock);
-       if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf) || buffer_protected(buf))
+       if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
                goto in_use;
        __hash_unlink(buf);
-       remove_inode_queue(buf);
        write_unlock(&hash_table_lock);
+       remove_inode_queue(buf);
        __remove_from_lru_list(buf, buf->b_list);
        spin_unlock(&lru_list_lock);
        put_last_free(buf);
@@ -1236,7 +1349,6 @@ struct buffer_head * bread(kdev_t dev, int block, int size)
        struct buffer_head * bh;
 
        bh = getblk(dev, block, size);
-       touch_buffer(bh);
        if (buffer_uptodate(bh))
                return bh;
        ll_rw_block(READ, 1, &bh);
@@ -1437,7 +1549,7 @@ static void unmap_buffer(struct buffer_head * bh)
  * we have truncated the file and are going to free the
  * blocks on-disk..
  */
-int block_flushpage(struct page *page, unsigned long offset)
+int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
 {
        struct buffer_head *head, *bh, *next;
        unsigned int curr_off = 0;
@@ -1474,7 +1586,8 @@ int block_flushpage(struct page *page, unsigned long offset)
         */
        if (!offset) {
                if (!try_to_free_buffers(page, 0)) {
-                       atomic_inc(&buffermem_pages);
+                       if (drop_pagecache)
+                               atomic_inc(&buffermem_pages);
                        return 0;
                }
        }
@@ -1482,7 +1595,7 @@ int block_flushpage(struct page *page, unsigned long offset)
        return 1;
 }
 
-static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
+void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
 {
        struct buffer_head *bh, *head, *tail;
 
@@ -1591,8 +1704,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, get_b
        /* Stage 2: lock the buffers, mark them clean */
        do {
                lock_buffer(bh);
-               bh->b_end_io = end_buffer_io_async;
-               get_bh(bh);
+               set_buffer_async_io(bh);
                set_bit(BH_Uptodate, &bh->b_state);
                clear_bit(BH_Dirty, &bh->b_state);
                bh = bh->b_this_page;
@@ -1706,7 +1818,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
                        set_bit(BH_Uptodate, &bh->b_state);
                        if (!atomic_set_buffer_dirty(bh)) {
                                __mark_dirty(bh);
-                               buffer_insert_inode_queue(bh, inode);
+                               buffer_insert_inode_data_queue(bh, inode);
                                need_balance_dirty = 1;
                        }
                }
@@ -1793,8 +1905,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
        for (i = 0; i < nr; i++) {
                struct buffer_head * bh = arr[i];
                lock_buffer(bh);
-               bh->b_end_io = end_buffer_io_async;
-               get_bh(bh);
+               set_buffer_async_io(bh);
        }
 
        /* Stage 3: start the IO */
@@ -2035,6 +2146,47 @@ int generic_block_bmap(struct address_space *mapping, long block, get_block_t *g
        return tmp.b_blocknr;
 }
 
+int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
+{
+       int i, nr_blocks, retval;
+       unsigned long * blocks = iobuf->blocks;
+
+       nr_blocks = iobuf->length / blocksize;
+       /* build the blocklist */
+       for (i = 0; i < nr_blocks; i++, blocknr++) {
+               struct buffer_head bh;
+
+               bh.b_state = 0;
+               bh.b_dev = inode->i_dev;
+               bh.b_size = blocksize;
+
+               retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1);
+               if (retval)
+                       goto out;
+
+               if (rw == READ) {
+                       if (buffer_new(&bh))
+                               BUG();
+                       if (!buffer_mapped(&bh)) {
+                               /* there was an hole in the filesystem */
+                               blocks[i] = -1UL;
+                               continue;
+                       }
+               } else {
+                       if (buffer_new(&bh))
+                               unmap_underlying_metadata(&bh);
+                       if (!buffer_mapped(&bh))
+                               BUG();
+               }
+               blocks[i] = bh.b_blocknr;
+       }
+
+       retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
+
+ out:
+       return retval;
+}
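generic_direct_IO() only builds the block list and hands it to brw_kiovec(); the caller that maps the user buffer into a kiobuf lives in mm/filemap.c and is not part of this excerpt. A hedged sketch of that calling convention, using the 2.4 kiobuf helpers (map_user_kiobuf/unmap_kiobuf) and the ->direct_IO signature introduced here; details of the real filemap.c path may differ:

#include <linux/fs.h>
#include <linux/iobuf.h>

static ssize_t o_direct_read_sketch(struct file *filp, char *buf,
				    size_t count, loff_t pos)
{
	struct inode *inode = filp->f_dentry->d_inode;
	struct kiobuf *iobuf = filp->f_iobuf;	/* preallocated at open/F_SETFL time */
	int blocksize = 1 << inode->i_sb->s_blocksize_bits;
	ssize_t transferred;
	int err;

	/* pin the user buffer's pages into the kiobuf */
	err = map_user_kiobuf(READ, iobuf, (unsigned long) buf, count);
	if (err)
		return err;
	/* hand the request to the fs; returns bytes moved or -errno */
	transferred = inode->i_mapping->a_ops->direct_IO(READ, inode, iobuf,
					pos >> inode->i_sb->s_blocksize_bits,
					blocksize);
	unmap_kiobuf(iobuf);
	return transferred;
}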
+
 /*
  * IO completion routine for a buffer_head being used for kiobuf IO: we
  * can't dispatch the kiobuf callback until io_count reaches 0.  
@@ -2150,6 +2302,18 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
                        
                        while (length > 0) {
                                blocknr = b[bufind++];
+                               if (blocknr == -1UL) {
+                                       if (rw == READ) {
+                                               /* there was a hole in the filesystem */
+                                               memset(kmap(map) + offset, 0, size);
+                                               flush_dcache_page(map);
+                                               kunmap(map);
+
+                                               transferred += size;
+                                               goto skip_block;
+                                       } else
+                                               BUG();
+                               }
                                tmp = bhs[bhind++];
 
                                tmp->b_dev = B_FREE;
@@ -2168,9 +2332,6 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
                                } else
                                        set_bit(BH_Uptodate, &tmp->b_state);
 
-                               length -= size;
-                               offset += size;
-
                                atomic_inc(&iobuf->io_count);
                                submit_bh(rw, tmp);
                                /* 
@@ -2185,7 +2346,11 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
                                                goto finished;
                                        bhind = 0;
                                }
-                               
+
+                       skip_block:
+                               length -= size;
+                               offset += size;
+
                                if (offset >= PAGE_SIZE) {
                                        offset = 0;
                                        break;
@@ -2238,8 +2403,7 @@ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
                lock_buffer(bh);
                bh->b_blocknr = *(b++);
                set_bit(BH_Mapped, &bh->b_state);
-               bh->b_end_io = end_buffer_io_async;
-               get_bh(bh);
+               set_buffer_async_io(bh);
                bh = bh->b_this_page;
        } while (bh != head);
 
@@ -2351,40 +2515,37 @@ out:
        return 0;
 }
 
-/*
- * Sync all the buffers on one page..
- *
- * If we have old buffers that are locked, we'll
- * wait on them, but we won't wait on the new ones
- * we're writing out now.
- *
- * This all is required so that we can free up memory
- * later.
- *
- * Wait:
- *     0 - no wait (this does not get called - see try_to_free_buffers below)
- *     1 - start IO for dirty buffers
- *     2 - wait for completion of locked buffers
- */
-static void sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask)
+static int sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask)
 {
-       struct buffer_head * tmp = bh;
+       struct buffer_head * p = bh;
+       int tryagain = 1;
 
        do {
-               struct buffer_head *p = tmp;
-               tmp = tmp->b_this_page;
-               if (buffer_locked(p)) {
-                       if (gfp_mask & __GFP_WAIT)
-                               __wait_on_buffer(p);
-               } else if (buffer_dirty(p))
-                       ll_rw_block(WRITE, 1, &p);
-       } while (tmp != bh);
+               if (buffer_dirty(p) || buffer_locked(p)) {
+                       if (test_and_set_bit(BH_Wait_IO, &p->b_state)) {
+                               if (buffer_dirty(p)) {
+                                       ll_rw_block(WRITE, 1, &p);
+                                       tryagain = 0;
+                               } else if (buffer_locked(p)) {
+                                       if (gfp_mask & __GFP_WAIT) {
+                                               wait_on_buffer(p);
+                                               tryagain = 1;
+                                       } else
+                                               tryagain = 0;
+                               }
+                       } else
+                               tryagain = 0;
+               }
+               p = p->b_this_page;
+       } while (p != bh);
+
+       return tryagain;
 }
 
 /*
  * Can the buffer be thrown out?
  */
-#define BUFFER_BUSY_BITS       ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
+#define BUFFER_BUSY_BITS       ((1<<BH_Dirty) | (1<<BH_Lock))
 #define buffer_busy(bh)                (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
 
 /*
@@ -2449,16 +2610,16 @@ busy_buffer_page:
        write_unlock(&hash_table_lock);
        spin_unlock(&lru_list_lock);
        if (gfp_mask & __GFP_IO) {
-               if (!(gfp_mask & __GFP_HIGHIO) && PageHighMem(page))
-                       return 0;
-               sync_page_buffers(bh, gfp_mask);
-               /* We waited synchronously, so we can free the buffers. */
-               if (gfp_mask & __GFP_WAIT) {
-                       gfp_mask = 0;   /* no IO or waiting this time around */
-                       goto cleaned_buffers_try_again;
+               if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
+                       if (sync_page_buffers(bh, gfp_mask)) {
+                               /* no IO or waiting next time */
+                               gfp_mask = 0;
+                               goto cleaned_buffers_try_again;
+                       }
                }
-               wakeup_bdflush();
        }
+       if (balance_dirty_state() >= 0)
+               wakeup_bdflush();
        return 0;
 }
 
@@ -2469,9 +2630,8 @@ void show_buffers(void)
 #ifdef CONFIG_SMP
        struct buffer_head * bh;
        int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
-       int protected = 0;
        int nlist;
-       static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
+       static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
 #endif
 
        printk("Buffer memory:   %6dkB\n",
@@ -2481,7 +2641,7 @@ void show_buffers(void)
        if (!spin_trylock(&lru_list_lock))
                return;
        for(nlist = 0; nlist < NR_LIST; nlist++) {
-               found = locked = dirty = used = lastused = protected = 0;
+               found = locked = dirty = used = lastused = 0;
                bh = lru_list[nlist];
                if(!bh) continue;
 
@@ -2489,8 +2649,6 @@ void show_buffers(void)
                        found++;
                        if (buffer_locked(bh))
                                locked++;
-                       if (buffer_protected(bh))
-                               protected++;
                        if (buffer_dirty(bh))
                                dirty++;
                        if (atomic_read(&bh->b_count))
@@ -2504,9 +2662,9 @@ void show_buffers(void)
                                       buf_types[nlist], found, tmp);
                }
                printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
-                      "%d locked, %d protected, %d dirty\n",
+                      "%d locked, %d dirty\n",
                       buf_types[nlist], found, size_buffers_type[nlist]>>10,
-                      used, lastused, locked, protected, dirty);
+                      used, lastused, locked, dirty);
        }
        spin_unlock(&lru_list_lock);
 #endif
index 59df6ce8fcb342a2ba1c0d233b3c442c3c46a3c3..313b4fe02f39ddfbe74668633916c2c47d058ef2 100644 (file)
@@ -565,7 +565,7 @@ int shrink_dcache_memory(int priority, unsigned int gfp_mask)
        if (!(gfp_mask & __GFP_FS))
                return 0;
 
-       count = dentry_stat.nr_unused >> priority;
+       count = dentry_stat.nr_unused / priority;
 
        prune_dcache(count);
        kmem_cache_shrink(dentry_cache);
index 875f0e9f364c4e1af414b19082654b6c988cc62f..5cdfa8cf5f0ef2be8a55d7478e6e955e78f2c1e5 100644 (file)
@@ -206,6 +206,7 @@ void init_special_inode(struct inode *inode, umode_t mode, int rdev)
                inode->i_cdev = cdget(rdev);
        } else if (S_ISBLK(mode)) {
                inode->i_fop = &def_blk_fops;
+               inode->i_mapping->a_ops = &def_blk_aops;
                inode->i_rdev = to_kdev_t(rdev);
                inode->i_bdev = bdget(rdev);
        } else if (S_ISFIFO(mode))
index 401571db831e6eeb92702c8296969a367d4917ca..e7ca71fa2b689a112bbf4388d101dc2447144858 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -34,6 +34,7 @@
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
+#include <linux/personality.h>
 #define __NO_VERSION__
 #include <linux/module.h>
 
@@ -186,7 +187,7 @@ int copy_strings(int argc,char ** argv, struct linux_binprm *bprm)
                int len;
                unsigned long pos;
 
-               if (get_user(str, argv+argc) || !str || !(len = strnlen_user(str, bprm->p))) 
+               if (get_user(str, argv+argc) || !(len = strnlen_user(str, bprm->p)))
                        return -EFAULT;
                if (bprm->p < len) 
                        return -E2BIG; 
index da504713770d8e8b762da353e3232096ecd8f43e..9fc9fd62b6035e0577a0cdf24044c82ce9aea642 100644 (file)
@@ -303,7 +303,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir,
        const char *name = dentry->d_name.name;
        int namelen = dentry->d_name.len;
        unsigned reclen = EXT2_DIR_REC_LEN(namelen);
-       unsigned long n;
+       unsigned long start, n;
        unsigned long npages = dir_pages(dir);
        struct page *page = NULL;
        ext2_dirent * de;
@@ -311,24 +311,32 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir,
        /* OFFSET_CACHE */
        *res_page = NULL;
 
-       for (n = 0; n < npages; n++) {
+       start = dir->u.ext2_i.i_dir_start_lookup;
+       if (start >= npages)
+               start = 0;
+       n = start;
+       do {
                char *kaddr;
                page = ext2_get_page(dir, n);
-               if (IS_ERR(page))
-                       continue;
-
-               kaddr = page_address(page);
-               de = (ext2_dirent *) kaddr;
-               kaddr += PAGE_CACHE_SIZE - reclen;
-               for ( ; (char *) de <= kaddr ; de = ext2_next_entry(de))
-                       if (ext2_match (namelen, name, de))
-                               goto found;
-               ext2_put_page(page);
-       }
+               if (!IS_ERR(page)) {
+                       kaddr = page_address(page);
+                       de = (ext2_dirent *) kaddr;
+                       kaddr += PAGE_CACHE_SIZE - reclen;
+                       while ((char *) de <= kaddr) {
+                               if (ext2_match (namelen, name, de))
+                                       goto found;
+                               de = ext2_next_entry(de);
+                       }
+                       ext2_put_page(page);
+               }
+               if (++n >= npages)
+                       n = 0;
+       } while (n != start);
        return NULL;
 
 found:
        *res_page = page;
+       dir->u.ext2_i.i_dir_start_lookup = n;
        return de;
 }
 
index 8b2ecab4181beb9dc604a09c6f2778cfb71d3a9b..211e52c7fc5b47832025f771d1a2c1641357736a 100644 (file)
@@ -44,6 +44,7 @@ int ext2_fsync_inode(struct inode *inode, int datasync)
        int err;
        
        err  = fsync_inode_buffers(inode);
+       err |= fsync_inode_data_buffers(inode);
        if (!(inode->i_state & I_DIRTY))
                return err;
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
index 4d179008c62f8f3ca37dd8d28189443befe94342..0d06dededeb8898af794c6b5475049f5acace06d 100644 (file)
@@ -586,13 +586,18 @@ static int ext2_bmap(struct address_space *mapping, long block)
 {
        return generic_block_bmap(mapping,block,ext2_get_block);
 }
+static int ext2_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize)
+{
+       return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, ext2_get_block);
+}
 struct address_space_operations ext2_aops = {
        readpage: ext2_readpage,
        writepage: ext2_writepage,
        sync_page: block_sync_page,
        prepare_write: ext2_prepare_write,
        commit_write: generic_commit_write,
-       bmap: ext2_bmap
+       bmap: ext2_bmap,
+       direct_IO: ext2_direct_IO,
 };
 
 /*
index ee546648635bca83ef7e360f74f41993d249ea2a..9f5171ecc9486fbd85403336aecca87913c88501 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/dnotify.h>
 #include <linux/smp_lock.h>
 #include <linux/slab.h>
+#include <linux/iobuf.h>
 
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -194,7 +195,7 @@ asmlinkage long sys_dup(unsigned int fildes)
        return ret;
 }
 
-#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC)
+#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT)
 
 static int setfl(int fd, struct file * filp, unsigned long arg)
 {
@@ -217,6 +218,25 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
                }
        }
 
+       if (arg & O_DIRECT) {
+               /*
+                * alloc_kiovec() can sleep and we are only serialized by
+                * the big kernel lock here, so abuse the i_sem to serialize
+                * this case too. We of course wouldn't need to go deep down
+                * to the inode layer, we could stay at the file layer, but
+                * we don't want to pay for the memory of a semaphore in each
+                * file structure too and we use the inode semaphore that we just
+                * pay for anyways.
+                */
+               error = 0;
+               down(&inode->i_sem);
+               if (!filp->f_iobuf)
+                       error = alloc_kiovec(1, &filp->f_iobuf);
+               up(&inode->i_sem);
+               if (error < 0)
+                       return error;
+       }
+
        /* required for strict SunOS emulation */
        if (O_NONBLOCK != O_NDELAY)
               if (arg & O_NDELAY)
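This F_SETFL path is what lets an application enable O_DIRECT on a descriptor that was opened without it; the kiobuf is preallocated here so the read/write fast path never has to sleep on allocation. A minimal userspace sketch (not part of the patch; the 4096-byte alignment and transfer size are assumed to satisfy the filesystem blocksize):

#define _GNU_SOURCE		/* O_DIRECT in the glibc headers */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd, flags;
	void *buf;
	ssize_t n;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* switch the already-open descriptor to direct I/O */
	flags = fcntl(fd, F_GETFL);
	if (fcntl(fd, F_SETFL, flags | O_DIRECT) < 0) {
		perror("fcntl(F_SETFL, O_DIRECT)");
		return 1;
	}
	/* O_DIRECT transfers must be blocksize-aligned; 4096 assumed here */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	n = read(fd, buf, 4096);
	printf("read %zd bytes\n", n);
	free(buf);
	close(fd);
	return 0;
}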
index b76e93acc19683729c5d4a63b5ca48b6a0067593..27713eb016fc6c112279c7bf34b58ef0cb563ad2 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/smp_lock.h>
+#include <linux/iobuf.h>
 
 /* sysctl tunables... */
 struct files_stat_struct files_stat = {0, 0, NR_FILE};
@@ -104,6 +105,10 @@ void fput(struct file * file)
 
        if (atomic_dec_and_test(&file->f_count)) {
                locks_remove_flock(file);
+
+               if (file->f_iobuf)
+                       free_kiovec(1, &file->f_iobuf);
+
                if (file->f_op && file->f_op->release)
                        file->f_op->release(inode, file);
                fops_put(file->f_op);
index 599a10c94b210440e9c5473d4faecafe336f10c9..e034073de7315b6fa9a1b38157730968913205d5 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/swap.h>
 #include <linux/swapctl.h>
 #include <linux/prefetch.h>
+#include <linux/locks.h>
 
 /*
  * New inode.c implementation.
@@ -78,7 +79,7 @@ static kmem_cache_t * inode_cachep;
         ((struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL))
 static void destroy_inode(struct inode *inode) 
 {
-       if (!list_empty(&inode->i_dirty_buffers))
+       if (inode_has_buffers(inode))
                BUG();
        kmem_cache_free(inode_cachep, (inode));
 }
@@ -104,6 +105,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
                INIT_LIST_HEAD(&inode->i_data.locked_pages);
                INIT_LIST_HEAD(&inode->i_dentry);
                INIT_LIST_HEAD(&inode->i_dirty_buffers);
+               INIT_LIST_HEAD(&inode->i_dirty_data_buffers);
                sema_init(&inode->i_sem, 1);
                sema_init(&inode->i_zombie, 1);
                spin_lock_init(&inode->i_data.i_shared_lock);
@@ -135,6 +137,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 {
        struct super_block * sb = inode->i_sb;
 
+       if (!sb)
+               return;
+
        /* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */
        if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
                if (sb->s_op && sb->s_op->dirty_inode)
@@ -273,27 +278,18 @@ static inline void wait_on_locked(struct list_head *head)
        }
 }
 
-static inline int try_to_sync_unused_list(struct list_head *head)
+static inline int try_to_sync_unused_list(struct list_head *head, int nr_inodes)
 {
        struct list_head *tmp = head;
        struct inode *inode;
 
-       while ((tmp = tmp->prev) != head) {
+       while (nr_inodes && (tmp = tmp->prev) != head) {
                inode = list_entry(tmp, struct inode, i_list);
 
                if (!atomic_read(&inode->i_count)) {
-                       /* 
-                        * We're under PF_MEMALLOC here, and syncing the 
-                        * inode may have to allocate memory. To avoid
-                        * running into a OOM deadlock, we write one 
-                        * inode synchronously and stop syncing in case 
-                        * we're under freepages.low
-                        */
+                       __sync_one(inode, 0);
+                       nr_inodes--;
 
-                       int sync = nr_free_pages() < freepages.low;
-                       __sync_one(inode, sync);
-                       if (sync) 
-                               return 0;
                        /* 
                         * __sync_one moved the inode to another list,
                         * so we have to start looking from the list head.
@@ -301,7 +297,8 @@ static inline int try_to_sync_unused_list(struct list_head *head)
                        tmp = head;
                }
        }
-       return 1;
+
+       return nr_inodes;
 }
 
 void sync_inodes_sb(struct super_block *sb)
@@ -397,24 +394,25 @@ void sync_inodes(kdev_t dev)
        }
 }
 
-/*
- * Called with the spinlock already held..
- */
-static void try_to_sync_unused_inodes(void)
+static void try_to_sync_unused_inodes(void * arg)
 {
        struct super_block * sb;
+       int nr_inodes = inodes_stat.nr_unused;
 
+       spin_lock(&inode_lock);
        spin_lock(&sb_lock);
        sb = sb_entry(super_blocks.next);
-       for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
+       for (; nr_inodes && sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
                spin_unlock(&sb_lock);
-               if (!try_to_sync_unused_list(&sb->s_dirty))
-                       return;
+               nr_inodes = try_to_sync_unused_list(&sb->s_dirty, nr_inodes);
                spin_lock(&sb_lock);
        }
        spin_unlock(&sb_lock);
+       spin_unlock(&inode_lock);
 }
 
+static struct tq_struct unused_inodes_flush_task;
+
 /**
  *     write_inode_now -       write an inode to disk
  *     @inode: inode to write to disk
@@ -433,6 +431,8 @@ void write_inode_now(struct inode *inode, int sync)
                while (inode->i_state & I_DIRTY)
                        sync_one(inode, sync);
                spin_unlock(&inode_lock);
+               if (sync)
+                       wait_on_inode(inode);
        }
        else
                printk(KERN_ERR "write_inode_now: no super block\n");
@@ -447,9 +447,9 @@ void write_inode_now(struct inode *inode, int sync)
  * O_SYNC flag set, to flush dirty writes to disk.  
  */
 
-int generic_osync_inode(struct inode *inode, int datasync)
+int generic_osync_inode(struct inode *inode, int what)
 {
-       int err;
+       int err = 0, err2 = 0, need_write_inode_now = 0;
        
        /* 
         * WARNING
@@ -472,23 +472,24 @@ int generic_osync_inode(struct inode *inode, int datasync)
         * every O_SYNC write, not just the synchronous I/Os.  --sct
         */
 
-#ifdef WRITERS_QUEUE_IO
-       err = osync_inode_buffers(inode);
-#else
-       err = fsync_inode_buffers(inode);
-#endif
+       if (what & OSYNC_METADATA)
+               err = fsync_inode_buffers(inode);
+       if (what & OSYNC_DATA)
+               err2 = fsync_inode_data_buffers(inode);
+       if (!err)
+               err = err2;
 
        spin_lock(&inode_lock);
-       if (!(inode->i_state & I_DIRTY))
-               goto out;
-       if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-               goto out;
+       if ((inode->i_state & I_DIRTY) &&
+           ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
+               need_write_inode_now = 1;
        spin_unlock(&inode_lock);
-       write_inode_now(inode, 1);
-       return err;
 
- out:
-       spin_unlock(&inode_lock);
+       if (need_write_inode_now)
+               write_inode_now(inode, 1);
+       else
+               wait_on_inode(inode);
+
        return err;
 }
 
@@ -503,8 +504,7 @@ int generic_osync_inode(struct inode *inode, int datasync)
  
 void clear_inode(struct inode *inode)
 {
-       if (!list_empty(&inode->i_dirty_buffers))
-               invalidate_inode_buffers(inode);
+       invalidate_inode_buffers(inode);
        
        if (inode->i_data.nrpages)
                BUG();
@@ -630,6 +630,13 @@ int invalidate_device(kdev_t dev, int do_sync)
        res = 0;
        sb = get_super(dev);
        if (sb) {
+               /*
+                * no need to lock the super, get_super holds the
+                * read semaphore so the filesystem cannot go away
+                * under us (->put_super runs with the write lock
+                * held).
+                */
+               shrink_dcache_sb(sb);
                res = invalidate_inodes(sb);
                drop_super(sb);
        }
@@ -658,12 +665,11 @@ void prune_icache(int goal)
 {
        LIST_HEAD(list);
        struct list_head *entry, *freeable = &list;
-       int count, synced = 0;
+       int count;
        struct inode * inode;
 
        spin_lock(&inode_lock);
 
-free_unused:
        count = 0;
        entry = inode_unused.prev;
        while (entry != &inode_unused)
@@ -693,18 +699,13 @@ free_unused:
        dispose_list(freeable);
 
        /* 
-        * If we freed enough clean inodes, avoid writing 
-        * dirty ones. Also giveup if we already tried to
-        * sync dirty inodes.
+        * If we didn't free enough clean inodes, schedule
+        * a sync of the dirty inodes; we cannot do it
+        * from here or we'd either be synchronously dogslow
+        * or deadlock with the OOM handler.
         */
-       if (!goal || synced)
-               return;
-       
-       synced = 1;
-
-       spin_lock(&inode_lock);
-       try_to_sync_unused_inodes();
-       goto free_unused;
+       if (goal)
+               schedule_task(&unused_inodes_flush_task);
 }
 
 int shrink_icache_memory(int priority, int gfp_mask)
@@ -721,7 +722,7 @@ int shrink_icache_memory(int priority, int gfp_mask)
        if (!(gfp_mask & __GFP_FS))
                return 0;
 
-       count = inodes_stat.nr_unused >> priority;
+       count = inodes_stat.nr_unused / priority;
 
        prune_icache(count);
        kmem_cache_shrink(inode_cachep);
@@ -776,6 +777,7 @@ static void clean_inode(struct inode *inode)
        inode->i_nlink = 1;
        atomic_set(&inode->i_writecount, 0);
        inode->i_size = 0;
+       inode->i_blocks = 0;
        inode->i_generation = 0;
        memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
        inode->i_pipe = NULL;
@@ -1028,6 +1030,9 @@ void iput(struct inode *inode)
        if (inode) {
                struct super_operations *op = NULL;
 
+               if (inode->i_state == I_CLEAR)
+                       BUG();
+
                if (inode->i_sb && inode->i_sb->s_op)
                        op = inode->i_sb->s_op;
                if (op && op->put_inode)
@@ -1164,6 +1169,8 @@ void __init inode_init(unsigned long mempages)
                                         NULL);
        if (!inode_cachep)
                panic("cannot create inode slab cache");
+
+       unused_inodes_flush_task.routine = try_to_sync_unused_inodes;
 }
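prune_icache() now defers the sync of dirty unused inodes to keventd via schedule_task() instead of doing it inline under inode_lock. A stand-alone sketch of that 2.4 task-queue pattern (hypothetical names except the ones taken from the hunk above):

#include <linux/tqueue.h>

/* does the slow work later, in keventd's process context, no spinlocks held */
static void flush_unused(void *unused)
{
	/* ... sync dirty unused inodes here ... */
}

static struct tq_struct flush_task;

static void init_example(void)
{
	flush_task.routine = flush_unused;	/* as inode_init() does above */
}

static void under_memory_pressure(void)
{
	/* queue the work instead of syncing synchronously and risking
	 * an OOM deadlock or dogslow reclaim */
	schedule_task(&flush_task);
}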
 
 /**
index ed70f76713f8a7634d8dcbe81316879f7cd517a6..30c4833d52f3f01f982c7f8bd9a626ece644f570 100644 (file)
--- a/fs/open.c
+++ b/fs/open.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/tty.h>
+#include <linux/iobuf.h>
 
 #include <asm/uaccess.h>
 
@@ -656,6 +657,16 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
        f->f_reada = 0;
        f->f_op = fops_get(inode->i_fop);
        file_move(f, &inode->i_sb->s_files);
+
+       /* preallocate kiobuf for O_DIRECT */
+       f->f_iobuf = NULL;
+       f->f_iobuf_lock = 0;
+       if (f->f_flags & O_DIRECT) {
+               error = alloc_kiovec(1, &f->f_iobuf);
+               if (error)
+                       goto cleanup_all;
+       }
+
        if (f->f_op && f->f_op->open) {
                error = f->f_op->open(inode,f);
                if (error)
@@ -666,6 +677,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
        return f;
 
 cleanup_all:
+       if (f->f_iobuf)
+               free_kiovec(1, &f->f_iobuf);
        fops_put(f->f_op);
        if (f->f_mode & FMODE_WRITE)
                put_write_access(inode);
index 149ad0aaa4b93b56120b895189b86e0041938019..69aeddd3f1667c2cd46e14290206581805be2a84 100644 (file)
@@ -14,7 +14,6 @@
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
-extern unsigned long log_size;
 extern wait_queue_head_t log_wait;
 
 extern int do_syslog(int type, char * bug, int count);
@@ -39,7 +38,7 @@ static ssize_t kmsg_read(struct file * file, char * buf,
 static unsigned int kmsg_poll(struct file *file, poll_table * wait)
 {
        poll_wait(file, &log_wait, wait);
-       if (log_size)
+       if (do_syslog(9, 0, 0))
                return POLLIN | POLLRDNORM;
        return 0;
 }
index 91994ff3b9faf687b4085b4a7c2184f4d46c7db0..f041736e515fca41f6e35b2597af2fca0cc38459 100644 (file)
@@ -145,12 +145,12 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
  * display in kilobytes.
  */
 #define K(x) ((x) << (PAGE_SHIFT - 10))
-#define B(x) ((x) << PAGE_SHIFT)
+#define B(x) ((unsigned long long)(x) << PAGE_SHIFT)
        si_meminfo(&i);
        si_swapinfo(&i);
        len = sprintf(page, "        total:    used:    free:  shared: buffers:  cached:\n"
-               "Mem:  %8lu %8lu %8lu %8lu %8lu %8u\n"
-               "Swap: %8lu %8lu %8lu\n",
+               "Mem:  %8Lu %8Lu %8Lu %8Lu %8Lu %8Lu\n"
+               "Swap: %8Lu %8Lu %8Lu\n",
                B(i.totalram), B(i.totalram-i.freeram), B(i.freeram),
                B(i.sharedram), B(i.bufferram),
                B(atomic_read(&page_cache_size)), B(i.totalswap),
@@ -168,9 +168,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
                "Cached:       %8lu kB\n"
                "SwapCached:   %8lu kB\n"
                "Active:       %8u kB\n"
-               "Inact_dirty:  %8u kB\n"
-               "Inact_clean:  %8u kB\n"
-               "Inact_target: %8lu kB\n"
+               "Inactive:     %8u kB\n"
                "HighTotal:    %8lu kB\n"
                "HighFree:     %8lu kB\n"
                "LowTotal:     %8lu kB\n"
@@ -184,9 +182,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
                K(atomic_read(&page_cache_size) - swapper_space.nrpages),
                K(swapper_space.nrpages),
                K(nr_active_pages),
-               K(nr_inactive_dirty_pages),
-               K(nr_inactive_clean_pages()),
-               K(inactive_target),
+               K(nr_inactive_pages),
                K(i.totalhigh),
                K(i.freehigh),
                K(i.totalram-i.totalhigh),
index 47c9598ec4e37b909d497a1c3f8bcd509c5d6666..59d17112ca7d5f34f318246c8ca6506fe22c6757 100644 (file)
@@ -76,7 +76,7 @@ static int reiserfs_sync_file(
                              ) {
   struct inode * p_s_inode = p_s_dentry->d_inode;
   struct reiserfs_transaction_handle th ;
-  int n_err = 0;
+  int n_err;
   int windex ;
   int jbegin_count = 1 ;
 
@@ -86,6 +86,7 @@ static int reiserfs_sync_file(
       BUG ();
 
   n_err = fsync_inode_buffers(p_s_inode) ;
+  n_err |= fsync_inode_data_buffers(p_s_inode);
   /* commit the current transaction to flush any metadata
   ** changes.  sys_fsync takes care of flushing the dirty pages for us
   */
index ed5ff5abc53ee2c22093c7c4fc5df69a36a207d5..64181320e85c52123e4b1359bfaf49494e7556c6 100644 (file)
@@ -1813,7 +1813,6 @@ static inline void submit_bh_for_writepage(struct buffer_head **bhp, int nr) {
     for(i = 0 ; i < nr ; i++) {
         bh = bhp[i] ;
        lock_buffer(bh) ;
-       get_bh(bh) ;               /* async end_io handler puts this */
        set_buffer_async_io(bh) ;
        /* submit_bh doesn't care if the buffer is dirty, but nobody
        ** later on in the call chain will be cleaning it.  So, we
index bc816a1581288e1adfa807113d81f686a17a4397..5541b10196fca34e7c2989f6917db3816155b05c 100644 (file)
@@ -1983,7 +1983,11 @@ int __init change_root(kdev_t new_root_dev,const char *put_old)
                        blivet = do_umount(old_rootmnt, 0);
                        mntput(old_rootmnt);
                        if (!blivet) {
-                               ioctl_by_bdev(ramdisk, BLKFLSBUF, 0);
+                               int ioctl_err;
+
+                               ioctl_err = ioctl_by_bdev(ramdisk, BLKFLSBUF, 0);
+                               if (ioctl_err)
+                                       printk("failed to release ramdisk %d...", ioctl_err);
                                printk("okay\n");
                                error = 0;
                        }
index 292765efa51aa687aa2bc284a9a9bb1a67610ea7..eca824335d72bc396418bc03179b9d55381567c7 100644 (file)
 #define O_NDELAY       O_NONBLOCK
 #define O_SYNC         040000
 #define FASYNC         020000  /* fcntl, for BSD compatibility */
-#define O_DIRECT       040000  /* direct disk access - should check with OSF/1 */
 #define O_DIRECTORY    0100000 /* must be a directory */
 #define O_NOFOLLOW     0200000 /* don't follow links */
 #define O_LARGEFILE    0400000 /* will be set by the kernel on every open */
+#define O_DIRECT       02000000 /* direct disk access - should check with OSF/1 */
 
 #define F_DUPFD                0       /* dup */
 #define F_GETFD                1       /* get close_on_exec */
index 61e83a4bcf133efb745cf0f5b16bf5ede18b64b9..005af29717b8483070cb8231087dae989174282d 100644 (file)
@@ -72,9 +72,6 @@ struct thread_struct {
        int bpt_nsaved;
 };
 
-#define INIT_MMAP { &init_mm, PAGE_OFFSET,  PAGE_OFFSET+0x10000000, \
-       NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
 #define INIT_THREAD  { \
        0, 0, 0, \
        0, 0, 0, \
@@ -151,4 +148,25 @@ unsigned long get_wchan(struct task_struct *p);
 #define init_task      (init_task_union.task)
 #define init_stack     (init_task_union.stack)
 
+#define ARCH_HAS_PREFETCH
+#define ARCH_HAS_PREFETCHW
+#define ARCH_HAS_SPINLOCK_PREFETCH
+
+extern inline void prefetch(const void *ptr)  
+{ 
+       __asm__ ("ldl $31,%0" : : "m"(*(char *)ptr)); 
+}
+
+extern inline void prefetchw(const void *ptr)  
+{
+       __asm__ ("ldl $31,%0" : : "m"(*(char *)ptr)); 
+}
+
+extern inline void spin_lock_prefetch(const void *ptr)  
+{
+       __asm__ ("ldl $31,%0" : : "m"(*(char *)ptr)); 
+}
+       
+
+
 #endif /* __ASM_ALPHA_PROCESSOR_H */
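prefetch(), prefetchw() and spin_lock_prefetch() are hints for pointer-chasing loops: start the memory fetch for the next node while the current one is still being processed. A hypothetical caller-side sketch (not part of the patch), assuming the header above (or <linux/prefetch.h>) is included:

struct node {
	struct node *next;
	int payload;
};

static int sum_list(struct node *p)
{
	int total = 0;

	while (p) {
		prefetch(p->next);	/* pull the next node into cache early */
		total += p->payload;
		p = p->next;
	}
	return total;
}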
index 5fe9f79c8ec78b6472c79f760732c17bc40a007f..be44c2d71910b32639668267c37b2f44cd4b706f 100644 (file)
@@ -68,13 +68,6 @@ struct thread_struct {
        EXTRA_THREAD_STRUCT
 };
 
-#define INIT_MMAP {                                    \
-       vm_mm:          &init_mm,                       \
-       vm_page_prot:   PAGE_SHARED,                    \
-       vm_flags:       VM_READ | VM_WRITE | VM_EXEC,   \
-       vm_avl_height:  1,                              \
-}
-
 #define INIT_THREAD  {                                 \
        refcount:       ATOMIC_INIT(1),                 \
        EXTRA_THREAD_STRUCT_INIT                        \
index 0291f21c27f2eb5d6cc4fb1bcf491b87657483a5..5bc95faebb088a6cffdec3d9dd43112b8b6f711d 100644 (file)
@@ -77,16 +77,6 @@ struct thread_struct {
 
 #define current_regs() user_regs(current)
 
-/* INIT_MMAP is the kernels map of memory, between KSEG_C and KSEG_D */
-
-#ifdef CONFIG_CRIS_LOW_MAP
-#define INIT_MMAP { &init_mm, KSEG_6, KSEG_7, NULL, PAGE_SHARED, \
-                            VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-#else
-#define INIT_MMAP { &init_mm, KSEG_C, KSEG_D, NULL, PAGE_SHARED, \
-                            VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-#endif
-
 #define INIT_THREAD  { \
    0, 0, 0x20 }  /* ccr = int enable, nothing else */
 
index 5637eca593e753409750a720ca25d4ae225263a4..41e3c4d9144efc721b972651e2384b2ee66e1a2e 100644 (file)
@@ -16,7 +16,7 @@
 #define O_NDELAY       O_NONBLOCK
 #define O_SYNC          010000
 #define FASYNC          020000 /* fcntl, for BSD compatibility */
-#define O_DIRECT        040000 /* direct disk access hint - currently ignored */
+#define O_DIRECT        040000 /* direct disk access hint */
 #define O_LARGEFILE    0100000
 #define O_DIRECTORY    0200000 /* must be a directory */
 #define O_NOFOLLOW     0400000 /* don't follow links */
index cdedca57e80cfb6ca5070bdc840e4e2dba76855d..fc8b17144adfde6fa9949cc666e2ab9f9ed104ca 100644 (file)
@@ -158,9 +158,9 @@ __asm__( \
        "\n" __ALIGN_STR"\n" \
        "common_interrupt:\n\t" \
        SAVE_ALL \
-       "pushl $ret_from_intr\n\t" \
        SYMBOL_NAME_STR(call_do_IRQ)":\n\t" \
-       "jmp "SYMBOL_NAME_STR(do_IRQ));
+       "call " SYMBOL_NAME_STR(do_IRQ) "\n\t" \
+       "jmp ret_from_intr\n");
 
 /* 
  * subtle. orig_eax is used by the signal code to distinguish between
index 9b8f64a3ccca0a4dd4d6878ab2b7b0e08e85d37e..08c96a276f04f2383e1db5f692a814d5584fd763 100644 (file)
@@ -32,4 +32,8 @@ extern void disable_irq(unsigned int);
 extern void disable_irq_nosync(unsigned int);
 extern void enable_irq(unsigned int);
 
+#ifdef CONFIG_X86_LOCAL_APIC
+#define ARCH_HAS_NMI_WATCHDOG          /* See include/linux/irq.h */
+#endif
+
 #endif /* _ASM_IRQ_H */
index 82af4b9e8379db475e0e9cab3d3257902cff89e8..5107c3db1647ae0963cdf80337b1e09939d18020 100644 (file)
@@ -3,7 +3,6 @@
 
 enum km_type {
        KM_BOUNCE_READ,
-       KM_BOUNCE_WRITE,
        KM_SKB_DATA,
        KM_SKB_DATA_SOFTIRQ,
        KM_USER0,
index 0af0a28ffcc1fdefd30937943a89b11bd6015b64..50a21adf4ba716aa96939f99cce75567c0508588 100644 (file)
@@ -86,8 +86,8 @@ typedef struct { unsigned long pgprot; } pgprot_t;
  * Tell the user there is some problem. Beep too, so we can
  * see^H^H^Hhear bugs in early bootup as well!
  */
-#define BUG() do { \
-       __asm__ __volatile__(".byte 0x0f,0x0b"); \
+#define BUG() do {                                     \
+       __asm__ __volatile__(".byte 0x0f,0x0b");        \
 } while (0)
 
 #define PAGE_BUG(page) do { \
index 232e18ab7ce027f8620c872051ffb596c959ae9e..11b5d8a75b4640845c68d331f908eba19dedc8ec 100644 (file)
@@ -128,9 +128,14 @@ static __inline__ void pte_free_slow(pte_t *pte)
        free_page((unsigned long)pte);
 }
 
-#define pte_free(pte)          pte_free_slow(pte)
+#define pte_free(pte)          pte_free_fast(pte)
+#ifdef CONFIG_X86_PAE
+#define pgd_alloc(mm)          get_pgd_slow()
 #define pgd_free(pgd)          free_pgd_slow(pgd)
+#else
 #define pgd_alloc(mm)          get_pgd_fast()
+#define pgd_free(pgd)          free_pgd_fast(pgd)
+#endif
 
 /*
  * allocating and freeing a pmd is trivial: the 1-entry pmd is
index 2949a9733e854ddf6ba90e7ea00844f171b6de65..4445df1150520e8582938d035c414c3d56a36f92 100644 (file)
@@ -14,6 +14,7 @@
 #include <asm/types.h>
 #include <asm/sigcontext.h>
 #include <asm/cpufeature.h>
+#include <linux/cache.h>
 #include <linux/config.h>
 #include <linux/threads.h>
 
@@ -52,7 +53,7 @@ struct cpuinfo_x86 {
        unsigned long *pmd_quick;
        unsigned long *pte_quick;
        unsigned long pgtable_cache_sz;
-};
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL 0
 #define X86_VENDOR_CYRIX 1
@@ -392,9 +393,6 @@ struct thread_struct {
        0,{~0,}                 /* io permissions */            \
 }
 
-#define INIT_MMAP \
-{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
 #define INIT_TSS  {                                            \
        0,0, /* back_link, __blh */                             \
        sizeof(init_stack) + (long) &init_stack, /* esp0 */     \
index bf8411d0fcb58129b7b70db6b9fe1f8bf5160865..bef8ac6a8e42183b1e30349e87d18bc22d5deb40 100644 (file)
@@ -364,11 +364,6 @@ struct thread_struct {
        struct ia64_fpreg fph[96];      /* saved/loaded on demand */
 };
 
-#define INIT_MMAP {                                                            \
-       &init_mm, PAGE_OFFSET, PAGE_OFFSET + 0x10000000, NULL, PAGE_SHARED,     \
-        VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL                            \
-}
-
 #define INIT_THREAD {                                  \
        0,                              /* ksp */       \
        0,                              /* flags */     \
@@ -974,6 +969,25 @@ ia64_thash (__u64 addr)
        return result;
 }
 
+
+#define ARCH_HAS_PREFETCH
+#define ARCH_HAS_PREFETCHW
+#define ARCH_HAS_SPINLOCK_PREFETCH
+#define PREFETCH_STRIDE 256
+
+extern inline void prefetch(const void *x)
+{
+         __asm__ __volatile__ ("lfetch [%0]" : : "r"(x));
+}
+         
+extern inline void prefetchw(const void *x)
+{
+       __asm__ __volatile__ ("lfetch.excl [%0]" : : "r"(x));
+}
+
+#define spin_lock_prefetch(x)   prefetchw(x)
+
+                  
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_IA64_PROCESSOR_H */
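The new prefetch(), prefetchw() and spin_lock_prefetch() hints let pointer-chasing code start the cache fill one step ahead of use. A minimal sketch, assuming a made-up singly linked structure; on architectures that do not define ARCH_HAS_PREFETCH the generic <linux/prefetch.h> versions compile to no-ops.

    /* Minimal sketch (made-up node type): prefetch the next element while
     * the current one is processed, overlapping the dependent load with
     * the loop body.  No-op on architectures without ARCH_HAS_PREFETCH. */
    #include <linux/prefetch.h>

    struct node { struct node *next; int payload; };

    static int sum_nodes(struct node *head)
    {
            struct node *n;
            int sum = 0;

            for (n = head; n; n = n->next) {
                    if (n->next)
                            prefetch(n->next);  /* read-only prefetch hint */
                    sum += n->payload;
            }
            return sum;
    }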
index 0ad99a7f8936d7f28a929d96275ec95fdd6ed808..cbe657e72652f80350abfe6a1376851abedc3d7d 100644 (file)
@@ -78,8 +78,6 @@ struct thread_struct {
        unsigned char  fpstate[FPSTATESIZE];  /* floating point state */
 };
 
-#define INIT_MMAP { &init_mm, 0, 0x40000000, NULL, __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED), VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
 #define INIT_THREAD  { \
        sizeof(init_stack) + (unsigned long) init_stack, 0, \
        PS_S, __KERNEL_DS, \
index 973d42d00f261fffffa94f99ea708ef74be4cc26..90c0b6c74b872db3df8ad1c33cc1ae64f1432a50 100644 (file)
@@ -170,9 +170,6 @@ struct thread_struct {
 
 #endif /* !defined (_LANGUAGE_ASSEMBLY) */
 
-#define INIT_MMAP { &init_mm, KSEG0, KSEG1, NULL, PAGE_SHARED, \
-                    VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
 #define INIT_THREAD  { \
         /* \
          * saved main processor registers \
index 8a8cb9116a619ad4bbb07c966b176d142b095614..879843188bfcbf2010506d0fca27b6c140cd030f 100644 (file)
@@ -198,9 +198,6 @@ struct thread_struct {
 
 #endif /* !defined (_LANGUAGE_ASSEMBLY) */
 
-#define INIT_MMAP { &init_mm, KSEG0, KSEG1, NULL, PAGE_SHARED, \
-                    VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
 #define INIT_THREAD  { \
         /* \
          * saved main processor registers \
index 12ae777a5296bdd0c3a93a48a0dcd99f44ff453d..c612fa8048383d9649315fbbcc346643df5ba2d0 100644 (file)
@@ -107,9 +107,6 @@ struct thread_struct {
 /* Thread struct flags. */
 #define PARISC_KERNEL_DEATH    (1UL << 31)     /* see die_if_kernel()... */
 
-#define INIT_MMAP { &init_mm, 0, 0, NULL, PAGE_SHARED, \
-                   VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
 #define INIT_THREAD { {                        \
        { 0, 0, 0, 0, 0, 0, 0, 0,       \
          0, 0, 0, 0, 0, 0, 0, 0,       \
index af043961090867ca5bac0329d57cd998f674aec0..427b2807724b18519a26680220bee68b6c3746f3 100644 (file)
@@ -7,7 +7,6 @@
 
 enum km_type {
        KM_BOUNCE_READ,
-       KM_BOUNCE_WRITE,
        KM_SKB_DATA,
        KM_SKB_DATA_SOFTIRQ,
        KM_USER0,
index 39c6f6483743e58e385d0bf6b463a9b78f8f74b3..633becd3efa0b0e7f494ff16cc113a78e61fccf7 100644 (file)
@@ -620,14 +620,6 @@ struct thread_struct {
        {0}, 0, 0 \
 }
 
-/*
- * Note: the vm_start and vm_end fields here should *not*
- * be in kernel space.  (Could vm_end == vm_start perhaps?)
- */
-#define INIT_MMAP { &init_mm, 0, 0x1000, NULL, \
-                   PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, \
-                   1, NULL, NULL }
-
 /*
  * Return saved PC of a blocked thread. For now, this is the "user" PC
  */
index c8ee18c9ec1bff90cfe15e7f23fae93795e19435..c1822dabf126b17145fe22fed6b0e0f24bf68b4e 100644 (file)
@@ -95,10 +95,6 @@ struct thread_struct
 
 typedef struct thread_struct thread_struct;
 
-#define INIT_MMAP \
-{ &init_mm, 0, 0, NULL, PAGE_SHARED, \
-VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
 #define INIT_THREAD { (struct pt_regs *) 0,                       \
                     { 0,{{0},{0},{0},{0},{0},{0},{0},{0},{0},{0}, \
                            {0},{0},{0},{0},{0},{0}}},            \
index 84bbf3db83131c46ed70ad527ea43ce83242f1e7..b33891a1129f2a7a82bdeb5a526aa41eee4dfaa2 100644 (file)
@@ -99,10 +99,6 @@ struct thread_struct
 
 typedef struct thread_struct thread_struct;
 
-#define INIT_MMAP \
-{ &init_mm, 0, 0, NULL, PAGE_SHARED, \
-VM_READ | VM_WRITE | VM_EXEC, 1, NULL,NULL }
-
 #define INIT_THREAD { (struct pt_regs *) 0,                       \
                     { 0,{{0},{0},{0},{0},{0},{0},{0},{0},{0},{0}, \
                            {0},{0},{0},{0},{0},{0}}},            \
index 0e16df80b644f7ec5e14b597cb72dbf07850c9e7..d6283095ec65a92ae008b993c38101bd067402c1 100644 (file)
@@ -110,9 +110,6 @@ struct thread_struct {
        union sh_fpu_union fpu;
 };
 
-#define INIT_MMAP \
-{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
 #define INIT_THREAD  {                                         \
        sizeof(init_stack) + (long) &init_stack, /* sp */       \
        0,                                       /* pc */       \
index 00e5e5b4cc6eecb04e211253c32d85ad17fa6b80..465c6e05fb78960e4f8b794bd80c0ea12668392f 100644 (file)
@@ -20,6 +20,7 @@
 #define O_DIRECTORY    0x10000 /* must be a directory */
 #define O_NOFOLLOW     0x20000 /* don't follow links */
 #define O_LARGEFILE    0x40000
+#define O_DIRECT        0x100000 /* direct disk access hint */
 
 #define F_DUPFD                0       /* dup */
 #define F_GETFD                1       /* get close_on_exec */
index 82af4b9e8379db475e0e9cab3d3257902cff89e8..5107c3db1647ae0963cdf80337b1e09939d18020 100644 (file)
@@ -3,7 +3,6 @@
 
 enum km_type {
        KM_BOUNCE_READ,
-       KM_BOUNCE_WRITE,
        KM_SKB_DATA,
        KM_SKB_DATA_SOFTIRQ,
        KM_USER0,
index ad35e1613e3c6d5d0d2e3966c223957651b90259..deccc745a12893d40ebf3dfb1414e15bc46dcb0c 100644 (file)
@@ -91,9 +91,6 @@ struct thread_struct {
 #define SPARC_FLAG_KTHREAD      0x1    /* task is a kernel thread */
 #define SPARC_FLAG_UNALIGNED    0x2    /* is allowed to do unaligned accesses */
 
-#define INIT_MMAP { &init_mm, (0), (0), \
-                   NULL, __pgprot(0x0) , VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
 #define INIT_THREAD  { \
 /* uwinmask, kregs, ksp, kpc, kpsr, kwim */ \
    0,        0,     0,   0,   0,    0, \
index 5b119bc78937517a1df7c9a0a9c9f8c925b94a5c..521efc96f44f47c78e88441c49736dc1786ffba1 100644 (file)
@@ -20,6 +20,8 @@
 #define O_DIRECTORY    0x10000 /* must be a directory */
 #define O_NOFOLLOW     0x20000 /* don't follow links */
 #define O_LARGEFILE    0x40000
+#define O_DIRECT        0x100000 /* direct disk access hint */
+
 
 #define F_DUPFD                0       /* dup */
 #define F_GETFD                1       /* get close_on_exec */
index e767e4a0d46584bf91683ffceb6ae030cc3bf7cb..f6431ca84d9ef486d1f2be4335118b8f7d65f9a1 100644 (file)
@@ -85,9 +85,6 @@ struct thread_struct {
 #define FAULT_CODE_ITLB                0x04    /* Miss happened in I-TLB               */
 #define FAULT_CODE_WINFIXUP    0x08    /* Miss happened during spill/fill      */
 
-#define INIT_MMAP { &init_mm, 0xfffff80000000000, 0xfffff80001000000, \
-                   NULL, PAGE_SHARED , VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
 #define INIT_THREAD  {                                 \
 /* ksp, wstate, cwp, flags, current_ds, */             \
    0,   0,      0,   0,     KERNEL_DS,                 \
index 86ea92ae5a94436edcbb908ef552f83b93e7a4d1..f266229c340ce0ece20a9132c2ef59e0a4fe86ec 100644 (file)
@@ -203,4 +203,15 @@ static inline int get_hardsect_size(kdev_t dev)
 #define blk_finished_io(nsects)        do { } while (0)
 #define blk_started_io(nsects) do { } while (0)
 
+static inline int buffered_blk_size(kdev_t dev)
+{
+       int ret = INT_MAX;
+       int major = MAJOR(dev);
+
+       if (blk_size[major])
+               ret = blk_size[major][MINOR(dev)] + ((BUFFERED_BLOCKSIZE-1) >> BLOCK_SIZE_BITS);
+
+       return ret;
+}
+
 #endif
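buffered_blk_size() reports the device size in BLOCK_SIZE (1 KB) units for the new block-device-through-pagecache path, falling back to INT_MAX when the driver does not register blk_size[]. A hedged sketch of the kind of bounds check a caller might perform; the helper name check_block is hypothetical.

    /* Hedged sketch: reject an out-of-range 1 KB block index before queueing
     * blockdev pagecache I/O.  check_block() is a hypothetical caller. */
    #include <linux/blkdev.h>
    #include <linux/errno.h>

    static int check_block(kdev_t dev, unsigned long block)
    {
            /* buffered_blk_size() returns the size in BLOCK_SIZE units, or
             * INT_MAX when the driver does not fill in blk_size[]. */
            if (block >= buffered_blk_size(dev))
                    return -EIO;
            return 0;
    }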
index 2030eb72bd368a5f2160fb8b3d6368417fff76db..086accecfce48ea83025191325e1b59ab97a6e61 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef __LINUX_CACHE_H
 #define __LINUX_CACHE_H
 
+#include <linux/config.h>
 #include <asm/cache.h>
 
 #ifndef L1_CACHE_ALIGN
 #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
 #endif
 
+#ifndef ____cacheline_aligned_in_smp
+#ifdef CONFIG_SMP
+#define ____cacheline_aligned_in_smp ____cacheline_aligned
+#else
+#define ____cacheline_aligned_in_smp
+#endif /* CONFIG_SMP */
+#endif
+
 #ifndef __cacheline_aligned
 #ifdef MODULE
 #define __cacheline_aligned ____cacheline_aligned
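The new ____cacheline_aligned_in_smp annotation pads only when CONFIG_SMP is set, so a hot, contended field can be given its own cache line on SMP builds without bloating uniprocessor kernels. A small example with a made-up structure:

    /* Example (made-up structure): isolate a frequently-taken lock from
     * read-mostly fields on SMP; expands to nothing on UP kernels. */
    #include <linux/cache.h>
    #include <linux/spinlock.h>

    struct request_stats {
            unsigned long   submitted;      /* read-mostly counters */
            unsigned long   completed;

            spinlock_t      lock ____cacheline_aligned_in_smp;
    };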
index 288f83b96f99c0f2417584279a12c76c9263f2e8..d2ae967af36c11744a99b183418f60e4ad425f1c 100644 (file)
@@ -91,8 +91,6 @@ extern struct console_cmdline console_list[MAX_CMDLINECONSOLES];
 #define CON_CONSDEV    (2) /* Last on the command line */
 #define CON_ENABLED    (4)
 
-extern spinlock_t console_lock;
-
 struct console
 {
        char    name[8];
@@ -111,6 +109,9 @@ struct console
 extern void register_console(struct console *);
 extern int unregister_console(struct console *);
 extern struct console *console_drivers;
+extern void acquire_console_sem(void);
+extern void release_console_sem(void);
+extern void console_conditional_schedule(void);
 
 /* VESA Blanking Levels */
 #define VESA_NO_BLANKING        0
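The exported console_lock spinlock is gone; callers now bracket access to the console driver list with acquire_console_sem()/release_console_sem(), which also means the protected section may sleep. A minimal sketch of the expected calling pattern; the function name and loop body are hypothetical, the bracketing calls are the point.

    /* Minimal sketch of the new console locking pattern.  my_fb_set_mode()
     * is hypothetical; console_drivers may only be walked or changed while
     * holding the console semaphore. */
    #include <linux/console.h>

    static void my_fb_set_mode(void)
    {
            struct console *con;

            acquire_console_sem();          /* may sleep; replaces console_lock */
            for (con = console_drivers; con; con = con->next) {
                    /* inspect or reconfigure each registered console */
            }
            release_console_sem();
    }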
index 3f027f9d7ac8a014dc18f474040e8332a758becb..7f02e7537ba3b6db6caf82a9c5e485d3ba2fddfb 100644 (file)
@@ -34,6 +34,7 @@ struct ext2_inode_info {
        __u32   i_next_alloc_goal;
        __u32   i_prealloc_block;
        __u32   i_prealloc_count;
+       __u32   i_dir_start_lookup;
        int     i_new_inode:1;  /* Is a freshly allocated inode */
 };
 
index 549459db59f9131a1062d02e9dd0f108415b19c7..31a2167afac85e1657780130ac6fa2c3452213b5 100644 (file)
@@ -46,6 +46,10 @@ struct poll_table_struct;
 #define BLOCK_SIZE_BITS 10
 #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
 
+/* buffer header fixed size for the blkdev I/O through pagecache */
+#define BUFFERED_BLOCKSIZE_BITS 10
+#define BUFFERED_BLOCKSIZE (1 << BUFFERED_BLOCKSIZE_BITS)
+
 /* And dynamically-tunable limits and defaults: */
 struct files_stat_struct {
        int nr_files;           /* read only */
@@ -211,7 +215,8 @@ enum bh_state_bits {
        BH_Req,         /* 0 if the buffer has been invalidated */
        BH_Mapped,      /* 1 if the buffer has a disk mapping */
        BH_New,         /* 1 if the buffer is new and not yet written out */
-       BH_Protected,   /* 1 if the buffer is protected */
+       BH_Async,       /* 1 if the buffer is under end_buffer_io_async I/O */
+       BH_Wait_IO,     /* 1 if we should throttle on this buffer */
 
        BH_PrivateStart,/* not a state bit, but the first bit available
                         * for private allocation by other entities
@@ -271,7 +276,7 @@ void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
 #define buffer_req(bh)         __buffer_state(bh,Req)
 #define buffer_mapped(bh)      __buffer_state(bh,Mapped)
 #define buffer_new(bh)         __buffer_state(bh,New)
-#define buffer_protected(bh)   __buffer_state(bh,Protected)
+#define buffer_async(bh)       __buffer_state(bh,Async)
 
 #define bh_offset(bh)          ((unsigned long)(bh)->b_data & ~PAGE_MASK)
 
@@ -366,6 +371,7 @@ struct iattr {
  */
 struct page;
 struct address_space;
+struct kiobuf;
 
 struct address_space_operations {
        int (*writepage)(struct page *);
@@ -375,6 +381,8 @@ struct address_space_operations {
        int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
        /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
        int (*bmap)(struct address_space *, long);
+#define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
+       int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
 };
 
 struct address_space {
@@ -401,9 +409,10 @@ struct char_device {
 struct block_device {
        struct list_head        bd_hash;
        atomic_t                bd_count;
-/*     struct address_space    bd_data; */
+       struct inode *          bd_inode;
        dev_t                   bd_dev;  /* not a kdev_t - it's a search key */
-       atomic_t                bd_openers;
+       int                     bd_openers;
+       int                     bd_cache_openers;
        const struct block_device_operations *bd_op;
        struct semaphore        bd_sem; /* open/close mutex */
 };
@@ -414,6 +423,7 @@ struct inode {
        struct list_head        i_dentry;
        
        struct list_head        i_dirty_buffers;
+       struct list_head        i_dirty_data_buffers;
 
        unsigned long           i_ino;
        atomic_t                i_count;
@@ -438,7 +448,8 @@ struct inode {
        wait_queue_head_t       i_wait;
        struct file_lock        *i_flock;
        struct address_space    *i_mapping;
-       struct address_space    i_data; 
+       struct address_space    i_data;
+       int                     i_mapping_overload;
        struct dquot            *i_dquot[MAXQUOTAS];
        /* These three should probably be a union */
        struct pipe_inode_info  *i_pipe;
@@ -512,6 +523,10 @@ struct file {
 
        /* needed for tty driver, and maybe others */
        void                    *private_data;
+
+       /* preallocated helper kiobuf to speedup O_DIRECT */
+       struct kiobuf           *f_iobuf;
+       long                    f_iobuf_lock;
 };
 extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);
@@ -1035,7 +1050,9 @@ extern void bdput(struct block_device *);
 extern struct char_device *cdget(dev_t);
 extern void cdput(struct char_device *);
 extern int blkdev_open(struct inode *, struct file *);
+extern int blkdev_close(struct inode *, struct file *);
 extern struct file_operations def_blk_fops;
+extern struct address_space_operations def_blk_aops;
 extern struct file_operations def_fifo_fops;
 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
 extern int blkdev_get(struct block_device *, mode_t, unsigned, int);
@@ -1074,8 +1091,7 @@ extern void set_buffer_async_io(struct buffer_head *bh) ;
 #define BUF_CLEAN      0
 #define BUF_LOCKED     1       /* Buffers scheduled for write */
 #define BUF_DIRTY      2       /* Dirty buffers, not yet scheduled for write */
-#define BUF_PROTECTED  3       /* Ramdisk persistent storage */
-#define NR_LIST                4
+#define NR_LIST                3
 
 static inline void get_bh(struct buffer_head * bh)
 {
@@ -1112,24 +1128,21 @@ static inline void mark_buffer_clean(struct buffer_head * bh)
                __mark_buffer_clean(bh);
 }
 
-#define atomic_set_buffer_protected(bh) test_and_set_bit(BH_Protected, &(bh)->b_state)
-
-static inline void __mark_buffer_protected(struct buffer_head *bh)
-{
-       refile_buffer(bh);
-}
-
-static inline void mark_buffer_protected(struct buffer_head * bh)
-{
-       if (!atomic_set_buffer_protected(bh))
-               __mark_buffer_protected(bh);
-}
-
+extern void FASTCALL(__mark_dirty(struct buffer_head *bh));
 extern void FASTCALL(__mark_buffer_dirty(struct buffer_head *bh));
 extern void FASTCALL(mark_buffer_dirty(struct buffer_head *bh));
+extern void FASTCALL(buffer_insert_inode_data_queue(struct buffer_head *, struct inode *));
 
 #define atomic_set_buffer_dirty(bh) test_and_set_bit(BH_Dirty, &(bh)->b_state)
 
+static inline void mark_buffer_async(struct buffer_head * bh, int on)
+{
+       if (on)
+               set_bit(BH_Async, &bh->b_state);
+       else
+               clear_bit(BH_Async, &bh->b_state);
+}
+
 /*
  * If an error happens during the make_request, this function
  * has to be recalled. It marks the buffer as clean and not
@@ -1157,20 +1170,29 @@ extern int check_disk_change(kdev_t);
 extern int invalidate_inodes(struct super_block *);
 extern int invalidate_device(kdev_t, int);
 extern void invalidate_inode_pages(struct inode *);
+extern void invalidate_inode_pages2(struct address_space *);
 extern void invalidate_inode_buffers(struct inode *);
-#define invalidate_buffers(dev)        __invalidate_buffers((dev), 0)
-#define destroy_buffers(dev)   __invalidate_buffers((dev), 1)
-extern void __invalidate_buffers(kdev_t dev, int);
+#define invalidate_buffers(dev)        __invalidate_buffers((dev), 0, 0)
+#define destroy_buffers(dev)   __invalidate_buffers((dev), 1, 0)
+#define update_buffers(dev)                    \
+do {                                           \
+       __invalidate_buffers((dev), 0, 1);      \
+       __invalidate_buffers((dev), 0, 2);      \
+} while (0)
+extern void __invalidate_buffers(kdev_t dev, int, int);
 extern void sync_inodes(kdev_t);
 extern void sync_unlocked_inodes(void);
 extern void write_inode_now(struct inode *, int);
+extern int sync_buffers(kdev_t, int);
 extern void sync_dev(kdev_t);
 extern int fsync_dev(kdev_t);
 extern int fsync_super(struct super_block *);
 extern int fsync_no_super(kdev_t);
 extern void sync_inodes_sb(struct super_block *);
-extern int fsync_inode_buffers(struct inode *);
 extern int osync_inode_buffers(struct inode *);
+extern int osync_inode_data_buffers(struct inode *);
+extern int fsync_inode_buffers(struct inode *);
+extern int fsync_inode_data_buffers(struct inode *);
 extern int inode_has_buffers(struct inode *);
 extern void filemap_fdatasync(struct address_space *);
 extern void filemap_fdatawait(struct address_space *);
@@ -1329,7 +1351,9 @@ extern int brw_page(int, struct page *, kdev_t, int [], int);
 typedef int (get_block_t)(struct inode*,long,struct buffer_head*,int);
 
 /* Generic buffer handling for block filesystems.. */
-extern int block_flushpage(struct page *, unsigned long);
+extern int discard_bh_page(struct page *, unsigned long, int);
+#define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
+#define block_invalidate_page(page) discard_bh_page(page, 0, 0)
 extern int block_symlink(struct inode *, const char *, int);
 extern int block_write_full_page(struct page*, get_block_t*);
 extern int block_read_full_page(struct page*, get_block_t*);
@@ -1341,6 +1365,8 @@ extern int block_sync_page(struct page *);
 int generic_block_bmap(struct address_space *, long, get_block_t *);
 int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
+extern int generic_direct_IO(int, struct inode *, struct kiobuf *, unsigned long, int, get_block_t *);
+extern void create_empty_buffers(struct page *, kdev_t, unsigned long);
 
 extern int waitfor_one_page(struct page*);
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
@@ -1400,6 +1426,9 @@ extern ssize_t block_write(struct file *, const char *, size_t, loff_t *);
 extern int file_fsync(struct file *, struct dentry *, int);
 extern int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx);
 extern int generic_osync_inode(struct inode *, int);
+#define OSYNC_METADATA (1<<0)
+#define OSYNC_DATA (1<<1)
+#define OSYNC_INODE (1<<2)
 
 extern int inode_change_ok(struct inode *, struct iattr *);
 extern int inode_setattr(struct inode *, struct iattr *);
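Two of the more visible fs.h additions are the direct_IO address_space operation (with generic_direct_IO() as the library helper) and the OSYNC_* flag bits taken by generic_osync_inode(). A hedged sketch of how a block-mapped filesystem might wire these up, modeled loosely on the ext2 changes in this patch; the myfs_* names and the exact flag combination are assumptions.

    /* Hedged sketch: hooking up O_DIRECT in a block-mapped filesystem.
     * The myfs_* names are hypothetical; generic_direct_IO() does the real
     * work given the filesystem's get_block routine. */
    #include <linux/fs.h>

    static int myfs_get_block(struct inode *inode, long block,
                              struct buffer_head *bh_result, int create);
                              /* definition omitted */

    static int myfs_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
                              unsigned long blocknr, int blocksize)
    {
            return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize,
                                     myfs_get_block);
    }

    /* ...and in the filesystem's address_space_operations:
     *         direct_IO:  myfs_direct_IO,
     * An O_SYNC-style writer could then flush with the new flag bits, e.g.
     *         generic_osync_inode(inode, OSYNC_METADATA | OSYNC_DATA);
     */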
index abc5e29d95b9e55b318b2713ceb06d5a916e10c2..1e2b8b1f4ae650f4a6d683efdbf9e3dc102ca6ed 100644 (file)
@@ -11,7 +11,7 @@ extern struct page *highmem_start_page;
 #include <asm/highmem.h>
 
 /* declarations for linux/mm/highmem.c */
-FASTCALL(unsigned int nr_free_highpages(void));
+unsigned int nr_free_highpages(void);
 
 extern struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig);
 
index 3b6af49b3071ee76a646fbb2cfb0c808fe9dc106..fca74da7d54ba73baa7ec5d1acdaa2e554d2e76f 100644 (file)
@@ -56,6 +56,19 @@ extern irq_desc_t irq_desc [NR_IRQS];
 
 #include <asm/hw_irq.h> /* the arch dependent stuff */
 
+/**
+ * touch_nmi_watchdog - restart NMI watchdog timeout.
+ * 
+ * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
+ * may be used to reset the timeout - for code which intentionally
+ * disables interrupts for a long time. This call is stateless.
+ */
+#ifdef ARCH_HAS_NMI_WATCHDOG
+extern void touch_nmi_watchdog(void);
+#else
+# define touch_nmi_watchdog() do { } while(0)
+#endif
+
 extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
 extern int setup_irq(unsigned int , struct irqaction * );
 
index cc442a6b865b63f6a378b81a0c7928aff5c98acb..7d0395582eb767e1a65abc081c811d6fc69b136b 100644 (file)
@@ -69,23 +69,12 @@ extern int kbd_init(void);
 extern unsigned char getledstate(void);
 extern void setledstate(struct kbd_struct *kbd, unsigned int led);
 
-extern struct tasklet_struct console_tasklet;
-
 extern int do_poke_blanked_console;
 
 extern void (*kbd_ledfunc)(unsigned int led);
 
-static inline void show_console(void)
-{
-       do_poke_blanked_console = 1;
-       tasklet_schedule(&console_tasklet);
-}
-
-static inline void set_console(int nr)
-{
-       want_console = nr;
-       tasklet_schedule(&console_tasklet);
-}
+extern void set_console(int nr);
+extern void schedule_console_callback(void);
 
 static inline void set_leds(void)
 {
@@ -159,12 +148,9 @@ extern unsigned int keymap_count;
 
 /* console.c */
 
-extern task_queue con_task_queue;
-
 static inline void con_schedule_flip(struct tty_struct *t)
 {
-       queue_task(&t->flip.tqueue, &con_task_queue);
-       tasklet_schedule(&console_tasklet);
+       schedule_task(&t->flip.tqueue);
 }
 
 #endif
index 4d0143310aca6ff7e4b4f52c35238d6d4450b01b..1ee66f6e10c95bf2b4f83b00ecdda11cb2b32735 100644 (file)
@@ -93,6 +93,9 @@ static inline void console_verbose(void)
                console_loglevel = 15;
 }
 
+extern void bust_spinlocks(int yes);
+extern int oops_in_progress;           /* If set, an oops, panic(), BUG() or die() is in progress */
+
 #if DEBUG
 #define pr_debug(fmt,arg...) \
        printk(KERN_DEBUG fmt,##arg)
index 48fea84b49ed07c32a2e135a4381331358603603..4a88363eeca4650139bc9eb7097efc3efbbf179f 100644 (file)
@@ -3,6 +3,8 @@
 
 #if defined(__KERNEL__) || defined(_LVM_H_INCLUDE)
 
+#include <linux/prefetch.h>
+
 /*
  * Simple doubly linked list implementation.
  *
@@ -90,6 +92,7 @@ static __inline__ void __list_del(struct list_head * prev,
 static __inline__ void list_del(struct list_head *entry)
 {
        __list_del(entry->prev, entry->next);
+       entry->next = entry->prev = 0;
 }
 
 /**
@@ -147,8 +150,9 @@ static __inline__ void list_splice(struct list_head *list, struct list_head *hea
  * @head:      the head for your list.
  */
 #define list_for_each(pos, head) \
-       for (pos = (head)->next; pos != (head); pos = pos->next)
-
+       for (pos = (head)->next, prefetch(pos->next); pos != (head); \
+               pos = pos->next, prefetch(pos->next))
+               
 #endif /* __KERNEL__ || _LVM_H_INCLUDE */
 
 #endif
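list_for_each() now prefetches the next node one iteration ahead, and list_del() poisons the unlinked entry's pointers, so callers must re-initialise an entry before reusing it. The iteration idiom itself is unchanged, as the small example below shows; the item structure is made up.

    /* Example (made-up item type): callers of list_for_each() need no
     * changes for the prefetch; note that list_del() now clears the
     * entry's next/prev pointers. */
    #include <linux/list.h>

    struct item {
            struct list_head list;
            int value;
    };

    static int count_positive(struct list_head *head)
    {
            struct list_head *pos;
            int n = 0;

            list_for_each(pos, head) {
                    struct item *it = list_entry(pos, struct item, list);
                    if (it->value > 0)
                            n++;
            }
            return n;
    }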
index 37eefbb13f9b916c43b8fa5358aa04772483d1a5..11622345f2f9e3efd4c0b9fdf17f97e9a5b4d564 100644 (file)
@@ -77,6 +77,7 @@ static inline int lo_do_transfer(struct loop_device *lo, int cmd, char *rbuf,
  */
 #define LO_FLAGS_DO_BMAP       1
 #define LO_FLAGS_READ_ONLY     2
+#define LO_FLAGS_BH_REMAP      4
 
 /* 
  * Note that this structure gets the wrong offsets when directly used
index e4f6c1f44a3b7c171d3e310ded016ffc5cc4a263..b1bc98c045a9dfcc9664a4b5ca51ec505257c750 100644 (file)
 #define        LVM_MAX_STRIPES         128     /* max # of stripes */
 #define        LVM_MAX_SIZE            ( 1024LU * 1024 / SECTOR_SIZE * 1024 * 1024)    /* 1TB[sectors] */
 #define        LVM_MAX_MIRRORS         2       /* future use */
-#define        LVM_MIN_READ_AHEAD      2       /* minimum read ahead sectors */
-#define        LVM_MAX_READ_AHEAD      120     /* maximum read ahead sectors */
+#define        LVM_MIN_READ_AHEAD      0       /* minimum read ahead sectors */
+#define        LVM_DEFAULT_READ_AHEAD  1024    /* default read ahead sectors for 512k scsi segments */
+#define        LVM_MAX_READ_AHEAD      10000   /* maximum read ahead sectors */
 #define        LVM_MAX_LV_IO_TIMEOUT   60      /* seconds I/O timeout (future use) */
 #define        LVM_PARTITION           0xfe    /* LVM partition id */
 #define        LVM_NEW_PARTITION       0x8e    /* new LVM partition id (10/09/1999) */
index 949bb22391f7f2527b91af4e1ab79df5549abee7..1efd8c8b1fd28bd8036dbe48b0284a2c9b18f94a 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/list.h>
 #include <linux/mmzone.h>
 #include <linux/swap.h>
+#include <linux/rbtree.h>
 
 extern unsigned long max_mapnr;
 extern unsigned long num_physpages;
@@ -18,7 +19,7 @@ extern void * high_memory;
 extern int page_cluster;
 /* The inactive_clean lists are per zone. */
 extern struct list_head active_list;
-extern struct list_head inactive_dirty_list;
+extern struct list_head inactive_list;
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -50,10 +51,7 @@ struct vm_area_struct {
        pgprot_t vm_page_prot;          /* Access permissions of this VMA. */
        unsigned long vm_flags;         /* Flags, listed below. */
 
-       /* AVL tree of VM areas per task, sorted by address */
-       short vm_avl_height;
-       struct vm_area_struct * vm_avl_left;
-       struct vm_area_struct * vm_avl_right;
+       rb_node_t vm_rb;
 
        /*
         * For areas with an address space and backing store,
@@ -156,7 +154,6 @@ typedef struct page {
                                           updated asynchronously */
        struct list_head lru;           /* Pageout list, eg. active_list;
                                           protected by pagemap_lru_lock !! */
-       unsigned long age;              /* Page aging counter. */
        wait_queue_head_t wait;         /* Page locked?  Stand in line... */
        struct page **pprev_hash;       /* Complement to *next_hash. */
        struct buffer_head * buffers;   /* Buffer maps us to a disk block. */
@@ -275,16 +272,14 @@ typedef struct page {
 #define PG_dirty                4
 #define PG_decr_after           5
 #define PG_active               6
-#define PG_inactive_dirty       7
+#define PG_inactive             7
 #define PG_slab                         8
 #define PG_swap_cache           9
 #define PG_skip                        10
-#define PG_inactive_clean      11
-#define PG_highmem             12
-#define PG_checked             13      /* kill me in 2.5.<early>. */
-                               /* bits 21-29 unused */
-#define PG_arch_1              30
-#define PG_reserved            31
+#define PG_highmem             11
+#define PG_checked             12      /* kill me in 2.5.<early>. */
+#define PG_arch_1              13
+#define PG_reserved            14
 
 /* Make it prettier to test the above... */
 #define Page_Uptodate(page)    test_bit(PG_uptodate, &(page)->flags)
@@ -347,14 +342,14 @@ static inline void set_page_dirty(struct page * page)
 #define PageActive(page)       test_bit(PG_active, &(page)->flags)
 #define SetPageActive(page)    set_bit(PG_active, &(page)->flags)
 #define ClearPageActive(page)  clear_bit(PG_active, &(page)->flags)
+#define TestandSetPageActive(page)     test_and_set_bit(PG_active, &(page)->flags)
+#define TestandClearPageActive(page)   test_and_clear_bit(PG_active, &(page)->flags)
 
-#define PageInactiveDirty(page)        test_bit(PG_inactive_dirty, &(page)->flags)
-#define SetPageInactiveDirty(page)     set_bit(PG_inactive_dirty, &(page)->flags)
-#define ClearPageInactiveDirty(page)   clear_bit(PG_inactive_dirty, &(page)->flags)
-
-#define PageInactiveClean(page)        test_bit(PG_inactive_clean, &(page)->flags)
-#define SetPageInactiveClean(page)     set_bit(PG_inactive_clean, &(page)->flags)
-#define ClearPageInactiveClean(page)   clear_bit(PG_inactive_clean, &(page)->flags)
+#define PageInactive(page)     test_bit(PG_inactive, &(page)->flags)
+#define SetPageInactive(page)  set_bit(PG_inactive, &(page)->flags)
+#define ClearPageInactive(page)        clear_bit(PG_inactive, &(page)->flags)
+#define TestandSetPageInactive(page)   test_and_set_bit(PG_inactive, &(page)->flags)
+#define TestandClearPageInactive(page) test_and_clear_bit(PG_inactive, &(page)->flags)
 
 #ifdef CONFIG_HIGHMEM
 #define PageHighMem(page)              test_bit(PG_highmem, &(page)->flags)
@@ -380,11 +375,11 @@ extern mem_map_t * mem_map;
  * can allocate highmem pages, the *get*page*() variants return
  * virtual kernel addresses to the allocated page(s).
  */
-extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned long order));
-extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned long order, zonelist_t *zonelist));
-extern struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order);
+extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order));
+extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist));
+extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order);
 
-static inline struct page * alloc_pages(int gfp_mask, unsigned long order)
+static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order)
 {
        /*
         * Gets optimized away by the compiler.
@@ -396,8 +391,8 @@ static inline struct page * alloc_pages(int gfp_mask, unsigned long order)
 
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 
-extern unsigned long FASTCALL(__get_free_pages(int gfp_mask, unsigned long order));
-extern unsigned long FASTCALL(get_zeroed_page(int gfp_mask));
+extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order));
+extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask));
 
 #define __get_free_page(gfp_mask) \
                __get_free_pages((gfp_mask),0)
@@ -413,8 +408,8 @@ extern unsigned long FASTCALL(get_zeroed_page(int gfp_mask));
 /*
  * There is only one 'core' page-freeing function.
  */
-extern void FASTCALL(__free_pages(struct page *page, unsigned long order));
-extern void FASTCALL(free_pages(unsigned long addr, unsigned long order));
+extern void FASTCALL(__free_pages(struct page *page, unsigned int order));
+extern void FASTCALL(free_pages(unsigned long addr, unsigned int order));
 
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr),0)
@@ -451,7 +446,7 @@ extern int ptrace_attach(struct task_struct *tsk);
  */
 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
-       if (!pgd_present(*pgd))
+       if (pgd_none(*pgd))
                return __pmd_alloc(mm, pgd, address);
        return pmd_offset(pgd, address);
 }
@@ -468,6 +463,11 @@ extern void show_mem(void);
 extern void si_meminfo(struct sysinfo * val);
 extern void swapin_readahead(swp_entry_t);
 
+static inline int is_page_cache_freeable(struct page * page)
+{
+       return page_count(page) - !!page->buffers == 1;
+}
+
 /*
  * Work out if there are any other processes sharing this
  * swap cache page. Never mind the buffers.
@@ -490,7 +490,7 @@ extern void lock_vma_mappings(struct vm_area_struct *);
 extern void unlock_vma_mappings(struct vm_area_struct *);
 extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
-extern void build_mmap_avl(struct mm_struct *);
+extern void build_mmap_rb(struct mm_struct *);
 extern void exit_mmap(struct mm_struct *);
 
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
@@ -516,6 +516,22 @@ extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
 extern unsigned long do_brk(unsigned long, unsigned long);
 
+static inline void __vma_unlink(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev)
+{
+       prev->vm_next = vma->vm_next;
+       rb_erase(&vma->vm_rb, &mm->mm_rb);
+       if (mm->mmap_cache == vma)
+               mm->mmap_cache = prev;
+}
+
+static inline int can_vma_merge(struct vm_area_struct * vma, unsigned long vm_flags)
+{
+       if (!vma->vm_file && vma->vm_flags == vm_flags)
+               return 1;
+       else
+               return 0;
+}
+
 struct zone_t;
 /* filemap.c */
 extern void remove_inode_page(struct page *);
@@ -562,6 +578,11 @@ static inline int expand_stack(struct vm_area_struct * vma, unsigned long addres
 {
        unsigned long grow;
 
+       /*
+        * vma->vm_start/vm_end cannot change under us because the caller is required
+        * to hold the mmap_sem in write mode. We need to get the spinlock only
+        * before relocating the vma range ourself.
+        */
        address &= PAGE_MASK;
        grow = (vma->vm_start - address) >> PAGE_SHIFT;
        if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur ||
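Alongside the switch from the AVL tree to the rbtree (vm_rb / mm_rb), mm.h gains small helpers for the VMA fast paths; can_vma_merge() only permits merging anonymous mappings with identical flags. A hedged, simplified sketch of the kind of check do_brk()-style code can now make; locking and the surrounding bookkeeping are omitted.

    /* Hedged, simplified sketch: extend the previous anonymous VMA instead
     * of allocating a new one when the flags match and the ranges abut.
     * Real callers also hold mmap_sem and update accounting. */
    #include <linux/mm.h>

    static int try_extend_prev(struct vm_area_struct *prev,
                               unsigned long addr, unsigned long end,
                               unsigned long vm_flags)
    {
            if (prev && prev->vm_end == addr &&
                can_vma_merge(prev, vm_flags)) {
                    prev->vm_end = end;     /* no new vm_area_struct needed */
                    return 1;
            }
            return 0;
    }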
index ab122514d5089c6df44d9b69ea3f7517fe532128..ea14bd835c6879798895df8500b654ad5483893e 100644 (file)
@@ -39,14 +39,12 @@ typedef struct zone_struct {
         */
        spinlock_t              lock;
        unsigned long           free_pages;
-       unsigned long           inactive_clean_pages;
-       unsigned long           inactive_dirty_pages;
        unsigned long           pages_min, pages_low, pages_high;
+       int                     need_balance;
 
        /*
         * free areas of different sizes
         */
-       struct list_head        inactive_clean_list;
        free_area_t             free_area[MAX_ORDER];
 
        /*
@@ -101,6 +99,7 @@ struct bootmem_data;
 typedef struct pglist_data {
        zone_t node_zones[MAX_NR_ZONES];
        zonelist_t node_zonelists[GFP_ZONEMASK+1];
+       int nr_zones;
        struct page *node_mem_map;
        unsigned long *valid_addr_bitmap;
        struct bootmem_data *bdata;
@@ -114,8 +113,8 @@ typedef struct pglist_data {
 extern int numnodes;
 extern pg_data_t *pgdat_list;
 
-#define memclass(pgzone, tzone)        (((pgzone)->zone_pgdat == (tzone)->zone_pgdat) \
-                       && ((pgzone) <= (tzone)))
+#define memclass(pgzone, classzone)    (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \
+                       && ((pgzone) <= (classzone)))
 
 /*
  * The following two are not meant for general usage. They are here as
index fa422a86f02a63b3eddaf3a4ef3d33a60b095dee..88366342a2c9c14be96f86328ad3ec08fbd321b6 100644 (file)
@@ -29,7 +29,6 @@
 #define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK)
 
 #define page_cache_get(x)      get_page(x)
-#define page_cache_free(x)     __free_page(x)
 #define page_cache_release(x)  __free_page(x)
 
 static inline struct page *page_cache_alloc(struct address_space *x)
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
new file mode 100644 (file)
index 0000000..96f20e1
--- /dev/null
@@ -0,0 +1,133 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/include/linux/rbtree.h
+
+  To use rbtrees you'll have to implement your own insert and search cores.
+  This will avoid us to use callbacks and to drop drammatically performances.
+  I know it's not the cleaner way,  but in C (not in C++) to get
+  performances and genericity...
+
+  Some example of insert and search follows here. The search is a plain
+  normal search over an ordered tree. The insert instead must be implemented
+  int two steps: as first thing the code must insert the element in
+  order as a red leaf in the tree, then the support library function
+  rb_insert_color() must be called. Such function will do the
+  not trivial work to rebalance the rbtree if necessary.
+
+-----------------------------------------------------------------------
+static inline struct page * rb_search_page_cache(struct inode * inode,
+                                                unsigned long offset)
+{
+       rb_node_t * n = inode->i_rb_page_cache.rb_node;
+       struct page * page;
+
+       while (n)
+       {
+               page = rb_entry(n, struct page, rb_page_cache);
+
+               if (offset < page->offset)
+                       n = n->rb_left;
+               else if (offset > page->offset)
+                       n = n->rb_right;
+               else
+                       return page;
+       }
+       return NULL;
+}
+
+static inline struct page * __rb_insert_page_cache(struct inode * inode,
+                                                  unsigned long offset,
+                                                  rb_node_t * node)
+{
+       rb_node_t ** p = &inode->i_rb_page_cache.rb_node;
+       rb_node_t * parent = NULL;
+       struct page * page;
+
+       while (*p)
+       {
+               parent = *p;
+               page = rb_entry(parent, struct page, rb_page_cache);
+
+               if (offset < page->offset)
+                       p = &(*p)->rb_left;
+               else if (offset > page->offset)
+                       p = &(*p)->rb_right;
+               else
+                       return page;
+       }
+
+       rb_link_node(node, parent, p);
+
+       return NULL;
+}
+
+static inline struct page * rb_insert_page_cache(struct inode * inode,
+                                                unsigned long offset,
+                                                rb_node_t * node)
+{
+       struct page * ret;
+       if ((ret = __rb_insert_page_cache(inode, offset, node)))
+               goto out;
+       rb_insert_color(node, &inode->i_rb_page_cache);
+ out:
+       return ret;
+}
+-----------------------------------------------------------------------
+*/
+
+#ifndef        _LINUX_RBTREE_H
+#define        _LINUX_RBTREE_H
+
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+
+typedef struct rb_node_s
+{
+       struct rb_node_s * rb_parent;
+       int rb_color;
+#define        RB_RED          0
+#define        RB_BLACK        1
+       struct rb_node_s * rb_right;
+       struct rb_node_s * rb_left;
+}
+rb_node_t;
+
+typedef struct rb_root_s
+{
+       struct rb_node_s * rb_node;
+}
+rb_root_t;
+
+#define RB_ROOT        (rb_root_t) { NULL, }
+#define        rb_entry(ptr, type, member)                                     \
+       ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+extern void rb_insert_color(rb_node_t *, rb_root_t *);
+extern void rb_erase(rb_node_t *, rb_root_t *);
+
+static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link)
+{
+       node->rb_parent = parent;
+       node->rb_color = RB_RED;
+       node->rb_left = node->rb_right = NULL;
+
+       *rb_link = node;
+}
+
+#endif /* _LINUX_RBTREE_H */
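The header's own comment shows search and insert; removal is the remaining operation, used for example when a VMA is unlinked. A one-function sketch using the same hypothetical page-cache fields as the example in the header comment.

    /* Removal, reusing the hypothetical rb_page_cache/i_rb_page_cache
     * fields from the example above: unlink the node and let the library
     * rebalance the tree. */
    #include <linux/rbtree.h>

    static inline void rb_remove_page_cache(struct inode *inode, struct page *page)
    {
            rb_erase(&page->rb_page_cache, &inode->i_rb_page_cache);
    }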
index 594acbc48a3415f6016c0e2cc284ee9677bcca77..b964abb8c5408c9812c9ba9c2e0ed26e3c0f6c89 100644 (file)
@@ -12,6 +12,7 @@ extern unsigned long event;
 #include <linux/types.h>
 #include <linux/times.h>
 #include <linux/timex.h>
+#include <linux/rbtree.h>
 
 #include <asm/system.h>
 #include <asm/semaphore.h>
@@ -200,12 +201,9 @@ struct files_struct {
 /* Maximum number of active map areas.. This is a random (large) number */
 #define MAX_MAP_COUNT  (65536)
 
-/* Number of map areas at which the AVL tree is activated. This is arbitrary. */
-#define AVL_MIN_MAP_COUNT      32
-
 struct mm_struct {
        struct vm_area_struct * mmap;           /* list of VMAs */
-       struct vm_area_struct * mmap_avl;       /* tree of VMAs */
+       rb_root_t mm_rb;
        struct vm_area_struct * mmap_cache;     /* last find_vma result */
        pgd_t * pgd;
        atomic_t mm_users;                      /* How many users with user space? */
@@ -237,13 +235,10 @@ extern int mmlist_nr;
 
 #define INIT_MM(name) \
 {                                                      \
-       mmap:           &init_mmap,                     \
-       mmap_avl:       NULL,                           \
-       mmap_cache:     NULL,                           \
+       mm_rb:          RB_ROOT,                        \
        pgd:            swapper_pg_dir,                 \
        mm_users:       ATOMIC_INIT(2),                 \
        mm_count:       ATOMIC_INIT(1),                 \
-       map_count:      1,                              \
        mmap_sem:       __RWSEM_INITIALIZER(name.mmap_sem), \
        page_table_lock: SPIN_LOCK_UNLOCKED,            \
        mmlist:         LIST_HEAD_INIT(name.mmlist),    \
@@ -320,6 +315,8 @@ struct task_struct {
 
        struct task_struct *next_task, *prev_task;
        struct mm_struct *active_mm;
+       struct list_head local_pages;
+       unsigned int allocation_order, nr_local_pages;
 
 /* task state */
        struct linux_binfmt *binfmt;
@@ -416,6 +413,7 @@ struct task_struct {
 #define PF_DUMPCORE    0x00000200      /* dumped core */
 #define PF_SIGNALED    0x00000400      /* killed by a signal */
 #define PF_MEMALLOC    0x00000800      /* Allocating memory */
+#define PF_FREE_PAGES  0x00002000      /* per process page freeing */
 
 #define PF_USEDFPU     0x00100000      /* task used FPU this quantum (SMP) */
 
index d5ec05fef0033a627a149e8fc791415b857dbdbd..efa8638d612b8fc42b4cc1ce1b9feca14b54c51a 100644 (file)
@@ -60,7 +60,7 @@ extern void kmem_cache_free(kmem_cache_t *, void *);
 extern void *kmalloc(size_t, int);
 extern void kfree(const void *);
 
-extern void kmem_cache_reap(int);
+extern int FASTCALL(kmem_cache_reap(int));
 extern int slabinfo_read_proc(char *page, char **start, off_t off,
                                 int count, int *eof, void *data);
 extern int slabinfo_write_proc(struct file *file, const char *buffer,
index d10b6277b2c38f7a97f3e043c898483717d8e571..a6d28bee7d71c808247c50253e85191b7ff4632e 100644 (file)
@@ -8,7 +8,7 @@
 #define SWAP_FLAG_PRIO_MASK    0x7fff
 #define SWAP_FLAG_PRIO_SHIFT   0
 
-#define MAX_SWAPFILES 8
+#define MAX_SWAPFILES 32
 
 /*
  * Magic header for a swap area. The first part of the union is
@@ -80,10 +80,9 @@ struct swap_info_struct {
 
 extern int nr_swap_pages;
 extern unsigned int nr_free_pages(void);
-extern unsigned int nr_inactive_clean_pages(void);
 extern unsigned int nr_free_buffer_pages(void);
 extern int nr_active_pages;
-extern int nr_inactive_dirty_pages;
+extern int nr_inactive_pages;
 extern atomic_t nr_async_pages;
 extern struct address_space swapper_space;
 extern atomic_t page_cache_size;
@@ -99,26 +98,20 @@ struct sysinfo;
 struct zone_t;
 
 /* linux/mm/swap.c */
-extern int memory_pressure;
-extern void deactivate_page(struct page *);
-extern void deactivate_page_nolock(struct page *);
-extern void activate_page(struct page *);
-extern void activate_page_nolock(struct page *);
-extern void lru_cache_add(struct page *);
-extern void __lru_cache_del(struct page *);
-extern void lru_cache_del(struct page *);
-extern void recalculate_vm_stats(void);
+extern void FASTCALL(lru_cache_add(struct page *));
+extern void FASTCALL(__lru_cache_del(struct page *));
+extern void FASTCALL(lru_cache_del(struct page *));
+
+extern void FASTCALL(deactivate_page(struct page *));
+extern void FASTCALL(deactivate_page_nolock(struct page *));
+extern void FASTCALL(activate_page(struct page *));
+extern void FASTCALL(activate_page_nolock(struct page *));
+
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
-extern struct page * reclaim_page(zone_t *);
 extern wait_queue_head_t kswapd_wait;
-extern wait_queue_head_t kreclaimd_wait;
-extern int page_launder(int, int);
-extern int free_shortage(void);
-extern int inactive_shortage(void);
-extern void wakeup_kswapd(void);
-extern int try_to_free_pages(unsigned int gfp_mask);
+extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int));
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, struct page *);
@@ -134,7 +127,6 @@ extern struct page * lookup_swap_cache(swp_entry_t);
 extern struct page * read_swap_cache_async(swp_entry_t);
 
 /* linux/mm/oom_kill.c */
-extern int out_of_memory(void);
 extern void oom_kill(void);
 
 /*
@@ -146,7 +138,6 @@ extern void delete_from_swap_cache_nolock(struct page *page);
 extern void free_page_and_swap_cache(struct page *page);
 
 /* linux/mm/swapfile.c */
-extern int vm_swap_full(void);
 extern unsigned int nr_swapfiles;
 extern struct swap_info_struct swap_info[];
 extern int is_swap_partition(kdev_t);
@@ -179,90 +170,51 @@ extern unsigned long swap_cache_find_success;
 
 extern spinlock_t pagemap_lru_lock;
 
-extern void FASTCALL(mark_page_accessed(struct page *));
-
-/*
- * Page aging defines.
- * Since we do exponential decay of the page age, we
- * can chose a fairly large maximum.
- */
-#define PAGE_AGE_START 2
-#define PAGE_AGE_ADV 3
-#define PAGE_AGE_MAX 64
-
 /*
  * List add/del helper macros. These must be called
  * with the pagemap_lru_lock held!
  */
-#define DEBUG_ADD_PAGE \
-       if (PageActive(page) || PageInactiveDirty(page) || \
-                                       PageInactiveClean(page)) BUG();
-
-#define ZERO_PAGE_BUG \
-       if (page_count(page) == 0) BUG();
-
-#define add_page_to_active_list(page) { \
-       DEBUG_ADD_PAGE \
-       ZERO_PAGE_BUG \
-       page->age = 0; \
-       ClearPageReferenced(page); \
-       SetPageActive(page); \
-       list_add(&(page)->lru, &active_list); \
-       nr_active_pages++; \
-}
-
-#define add_page_to_inactive_dirty_list(page) { \
-       DEBUG_ADD_PAGE \
-       ZERO_PAGE_BUG \
-       SetPageInactiveDirty(page); \
-       list_add(&(page)->lru, &inactive_dirty_list); \
-       nr_inactive_dirty_pages++; \
-       page->zone->inactive_dirty_pages++; \
-}
-
-#define add_page_to_inactive_clean_list(page) { \
-       DEBUG_ADD_PAGE \
-       ZERO_PAGE_BUG \
-       SetPageInactiveClean(page); \
-       list_add(&(page)->lru, &page->zone->inactive_clean_list); \
-       page->zone->inactive_clean_pages++; \
-}
-
-#define del_page_from_active_list(page) { \
-       list_del(&(page)->lru); \
-       ClearPageActive(page); \
-       nr_active_pages--; \
-       DEBUG_ADD_PAGE \
-       ZERO_PAGE_BUG \
-}
-
-#define del_page_from_inactive_dirty_list(page) { \
-       list_del(&(page)->lru); \
-       ClearPageInactiveDirty(page); \
-       nr_inactive_dirty_pages--; \
-       page->zone->inactive_dirty_pages--; \
-       DEBUG_ADD_PAGE \
-       ZERO_PAGE_BUG \
-}
-
-#define del_page_from_inactive_clean_list(page) { \
-       list_del(&(page)->lru); \
-       ClearPageInactiveClean(page); \
-       page->zone->inactive_clean_pages--; \
-       DEBUG_ADD_PAGE \
-       ZERO_PAGE_BUG \
-}
-
-/*
- * In mm/swap.c::recalculate_vm_stats(), we substract
- * inactive_target from memory_pressure every second.
- * This means that memory_pressure is smoothed over
- * 64 (1 << INACTIVE_SHIFT) seconds.
- */
-#define INACTIVE_SHIFT 6
-#define inactive_target min_t(unsigned long, \
-                           (memory_pressure >> INACTIVE_SHIFT), \
-                           (num_physpages / 4))
+#define DEBUG_LRU_PAGE(page)                   \
+do {                                           \
+       if (PageActive(page))                   \
+               BUG();                          \
+       if (PageInactive(page))                 \
+               BUG();                          \
+       if (page_count(page) == 0)              \
+               BUG();                          \
+} while (0)
+
+#define add_page_to_active_list(page)          \
+do {                                           \
+       DEBUG_LRU_PAGE(page);                   \
+       SetPageActive(page);                    \
+       list_add(&(page)->lru, &active_list);   \
+       nr_active_pages++;                      \
+} while (0)
+
+#define add_page_to_inactive_list(page)                \
+do {                                           \
+       DEBUG_LRU_PAGE(page);                   \
+       SetPageInactive(page);          \
+       list_add(&(page)->lru, &inactive_list); \
+       nr_inactive_pages++;                    \
+} while (0)
+
+#define del_page_from_active_list(page)                \
+do {                                           \
+       list_del(&(page)->lru);                 \
+       ClearPageActive(page);                  \
+       nr_active_pages--;                      \
+       DEBUG_LRU_PAGE(page);                   \
+} while (0)
+
+#define del_page_from_inactive_list(page)      \
+do {                                           \
+       list_del(&(page)->lru);                 \
+       ClearPageInactive(page);                \
+       nr_inactive_pages--;                    \
+       DEBUG_LRU_PAGE(page);                   \
+} while (0)
 
 /*
  * Ugly ugly ugly HACK to make sure the inactive lists
@@ -278,9 +230,6 @@ extern void FASTCALL(mark_page_accessed(struct page *));
 #include <linux/major.h>
 #endif
 
-#define page_ramdisk(page) \
-       (page->buffers && (MAJOR(page->buffers->b_dev) == RAMDISK_MAJOR))
-
 extern spinlock_t swaplock;
 
 #define swap_list_lock()       spin_lock(&swaplock)
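The old three-list scheme (active / inactive_dirty / inactive_clean) collapses into plain active and inactive lists, and the add/del macros must be called with pagemap_lru_lock held. A hedged, simplified sketch of an inactive-to-active promotion; the real activate_page() in mm/swap.c adds further checks.

    /* Hedged, simplified sketch of promoting a page with the new two-list
     * macros; must run under pagemap_lru_lock. */
    #include <linux/mm.h>
    #include <linux/swap.h>
    #include <linux/spinlock.h>

    static void promote_page(struct page *page)
    {
            spin_lock(&pagemap_lru_lock);
            if (PageInactive(page)) {
                    del_page_from_inactive_list(page);
                    add_page_to_active_list(page);
            }
            spin_unlock(&pagemap_lru_lock);
    }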
index f9f2d2acbf91196807aa6ae2697334e2417f27ef..de22bff4b78852ecd749a6c98c69a03c347c3e8c 100644 (file)
@@ -1,28 +1,6 @@
 #ifndef _LINUX_SWAPCTL_H
 #define _LINUX_SWAPCTL_H
 
-#include <asm/page.h>
-#include <linux/fs.h>
-
-typedef struct buffer_mem_v1
-{
-       unsigned int    min_percent;
-       unsigned int    borrow_percent;
-       unsigned int    max_percent;
-} buffer_mem_v1;
-typedef buffer_mem_v1 buffer_mem_t;
-extern buffer_mem_t buffer_mem;
-extern buffer_mem_t page_cache;
-
-typedef struct freepages_v1
-{
-       unsigned int    min;
-       unsigned int    low;
-       unsigned int    high;
-} freepages_v1;
-typedef freepages_v1 freepages_t;
-extern freepages_t freepages;
-
 typedef struct pager_daemon_v1
 {
        unsigned int    tries_base;
index e3249bf933a03ad17ea6f64cc08141d3b29feec2..803d268f12e3ff73e99e2eae7ac3829cd9e043ca 100644 (file)
@@ -19,6 +19,7 @@ struct timer_list {
        unsigned long data;
        void (*function)(unsigned long);
 };
+typedef struct timer_list timer_t;
 
 extern void add_timer(struct timer_list * timer);
 extern int del_timer(struct timer_list * timer);
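The new timer_t typedef is only an alias for struct timer_list; the API is unchanged. A minimal refresher of the setup pattern, with a made-up callback and interval.

    /* Minimal refresher (made-up callback and interval): timer_t is just
     * an alias for struct timer_list, so usage is unchanged. */
    #include <linux/timer.h>
    #include <linux/sched.h>        /* jiffies, HZ */

    static void my_timeout(unsigned long data)
    {
            /* runs in timer (bottom-half) context */
    }

    static timer_t my_timer;

    static void arm_my_timer(void)
    {
            init_timer(&my_timer);
            my_timer.function = my_timeout;
            my_timer.data     = 0;
            my_timer.expires  = jiffies + HZ;   /* one second from now */
            add_timer(&my_timer);
    }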
index 81238b94e91ee2a2623d76c407a2d9b5203ff381..02c4d3a5c51469ea699de1186d86742678db2db2 100644 (file)
@@ -9,7 +9,7 @@
 
 O_TARGET := kernel.o
 
-export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o exec_domain.o
+export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o exec_domain.o printk.o
 
 obj-y     = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
            module.o exit.o itimer.o info.o time.o softirq.o resource.o \
index ebfbf2b693b8a18f3b2b36e31853a9de2147dbc5..9179e235dfe5d2018f2c5a5e65e37ada9cf2284a 100644 (file)
@@ -40,8 +40,8 @@ void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
 {
        unsigned long flags;
 
-       wq_write_lock_irqsave(&q->lock, flags);
        wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+       wq_write_lock_irqsave(&q->lock, flags);
        __add_wait_queue(q, wait);
        wq_write_unlock_irqrestore(&q->lock, flags);
 }
@@ -50,8 +50,8 @@ void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)
 {
        unsigned long flags;
 
-       wq_write_lock_irqsave(&q->lock, flags);
        wait->flags |= WQ_FLAG_EXCLUSIVE;
+       wq_write_lock_irqsave(&q->lock, flags);
        __add_wait_queue_tail(q, wait);
        wq_write_unlock_irqrestore(&q->lock, flags);
 }
@@ -72,7 +72,7 @@ void __init fork_init(unsigned long mempages)
         * value: the thread structures can take up at most half
         * of memory.
         */
-       max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 2;
+       max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 16;
 
        init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
        init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
@@ -133,7 +133,6 @@ static inline int dup_mmap(struct mm_struct * mm)
        flush_cache_mm(current->mm);
        mm->locked_vm = 0;
        mm->mmap = NULL;
-       mm->mmap_avl = NULL;
        mm->mmap_cache = NULL;
        mm->map_count = 0;
        mm->rss = 0;
@@ -200,8 +199,7 @@ static inline int dup_mmap(struct mm_struct * mm)
                        goto fail_nomem;
        }
        retval = 0;
-       if (mm->map_count >= AVL_MIN_MAP_COUNT)
-               build_mmap_avl(mm);
+       build_mmap_rb(mm);
 
 fail_nomem:
        flush_tlb_mm(current->mm);
@@ -651,6 +649,8 @@ int do_fork(unsigned long clone_flags, unsigned long stack_start,
        p->lock_depth = -1;             /* -1 = no lock */
        p->start_time = jiffies;
 
+       INIT_LIST_HEAD(&p->local_pages);
+
        retval = -ENOMEM;
        /* copy all the process information */
        if (copy_files(clone_flags, p))
index 0c1e8145662a0c64d7ef47830227538bc1959fbb..928d2239d3ba398de5eb62f17f841105cae479ab 100644 (file)
@@ -210,6 +210,7 @@ EXPORT_SYMBOL(waitfor_one_page);
 EXPORT_SYMBOL(generic_file_read);
 EXPORT_SYMBOL(do_generic_file_read);
 EXPORT_SYMBOL(generic_file_write);
+EXPORT_SYMBOL(generic_direct_IO);
 EXPORT_SYMBOL(generic_file_mmap);
 EXPORT_SYMBOL(generic_ro_fops);
 EXPORT_SYMBOL(generic_buffer_fdatasync);
@@ -284,8 +285,6 @@ EXPORT_SYMBOL(tty_unregister_driver);
 EXPORT_SYMBOL(tty_std_termios);
 
 /* block device driver support */
-EXPORT_SYMBOL(block_read);
-EXPORT_SYMBOL(block_write);
 EXPORT_SYMBOL(blksize_size);
 EXPORT_SYMBOL(hardsect_size);
 EXPORT_SYMBOL(blk_size);
@@ -315,8 +314,6 @@ EXPORT_SYMBOL(tty_hung_up_p);
 EXPORT_SYMBOL(tty_flip_buffer_push);
 EXPORT_SYMBOL(tty_get_baud_rate);
 EXPORT_SYMBOL(do_SAK);
-EXPORT_SYMBOL(console_print);
-EXPORT_SYMBOL(console_loglevel);
 
 /* filesystem registration */
 EXPORT_SYMBOL(register_filesystem);
@@ -449,7 +446,6 @@ EXPORT_SYMBOL(nr_running);
 
 /* misc */
 EXPORT_SYMBOL(panic);
-EXPORT_SYMBOL(printk);
 EXPORT_SYMBOL(sprintf);
 EXPORT_SYMBOL(snprintf);
 EXPORT_SYMBOL(sscanf);
@@ -492,6 +488,7 @@ EXPORT_SYMBOL(si_meminfo);
 EXPORT_SYMBOL(sys_tz);
 EXPORT_SYMBOL(file_fsync);
 EXPORT_SYMBOL(fsync_inode_buffers);
+EXPORT_SYMBOL(fsync_inode_data_buffers);
 EXPORT_SYMBOL(clear_inode);
 EXPORT_SYMBOL(nr_async_pages);
 EXPORT_SYMBOL(___strtok);
@@ -523,10 +520,6 @@ EXPORT_SYMBOL(disk_name);  /* for md.c */
 /* binfmt_aout */
 EXPORT_SYMBOL(get_write_access);
 
-/* dynamic registering of consoles */
-EXPORT_SYMBOL(register_console);
-EXPORT_SYMBOL(unregister_console);
-
 /* time */
 EXPORT_SYMBOL(get_fast_time);
 
index ac246f74589a5f3d53b322fddde86828f8f393bb..f9957aeef4d10b05f238b8b8308f5fd1bd23b2bd 100644 (file)
@@ -18,7 +18,6 @@
 #include <linux/interrupt.h>
 
 asmlinkage void sys_sync(void);        /* it's really int */
-extern void unblank_console(void);
 
 int panic_timeout;
 
@@ -36,9 +35,8 @@ __setup("panic=", panic_setup);
  *     panic - halt the system
  *     @fmt: The text string to print
  *
- *     Display a message, then unblank the console and perform
- *     cleanups. Functions in the panic notifier list are called
- *     after the filesystem cache is flushed (when possible).
+ *     Display a message, then perform cleanups. Functions in the panic
+ *     notifier list are called after the filesystem cache is flushed (when possible).
  *
  *     This function never returns.
  */
@@ -51,6 +49,7 @@ NORET_TYPE void panic(const char * fmt, ...)
         unsigned long caller = (unsigned long) __builtin_return_address(0);
 #endif
 
+       bust_spinlocks(1);
        va_start(args, fmt);
        vsprintf(buf, fmt, args);
        va_end(args);
@@ -61,8 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...)
                printk(KERN_EMERG "In idle task - not syncing\n");
        else
                sys_sync();
-
-       unblank_console();
+       bust_spinlocks(0);
 
 #ifdef CONFIG_SMP
        smp_send_stop();
index 2237fbd5964df48d1dd35daea2dd62b7e9825c57..f68bdb1146bed5cce72391bcdf47e01d38fba5eb 100644 (file)
@@ -12,6 +12,8 @@
  * Modified for sysctl support, 1/8/97, Chris Horn.
  * Fixed SMP synchronization, 08/08/99, Manfred Spraul 
  *     manfreds@colorfullife.com
+ * Rewrote bits to get rid of console_lock
+ *     01Mar01 Andrew Morton <andrewm@uow.edu.au>
  */
 
 #include <linux/mm.h>
 #include <linux/smp_lock.h>
 #include <linux/console.h>
 #include <linux/init.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>                   /* For in_interrupt() */
 
 #include <asm/uaccess.h>
 
-#define LOG_BUF_LEN    (16384)
+#define LOG_BUF_LEN    (16384)                 /* This must be a power of two */
 #define LOG_BUF_MASK   (LOG_BUF_LEN-1)
 
-static char buf[1024];
-
 /* printk's without a loglevel use this.. */
 #define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */
 
@@ -35,7 +37,6 @@ static char buf[1024];
 #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
 #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
 
-unsigned long log_size;
 DECLARE_WAIT_QUEUE_HEAD(log_wait);
 
 /* Keep together for sysctl support */
@@ -44,15 +45,41 @@ int default_message_loglevel = DEFAULT_MESSAGE_LOGLEVEL;
 int minimum_console_loglevel = MINIMUM_CONSOLE_LOGLEVEL;
 int default_console_loglevel = DEFAULT_CONSOLE_LOGLEVEL;
 
-spinlock_t console_lock = SPIN_LOCK_UNLOCKED;
+int oops_in_progress;
 
+/*
+ * console_sem protects the console_drivers list, and also
+ * provides serialisation for access to the entire console
+ * driver system.
+ */
+static DECLARE_MUTEX(console_sem);
 struct console *console_drivers;
+
+/*
+ * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
+ * It is also used in interesting ways to provide interlocking in
+ * release_console_sem().
+ */
+static spinlock_t logbuf_lock = SPIN_LOCK_UNLOCKED;
+
 static char log_buf[LOG_BUF_LEN];
-static unsigned long log_start;
-static unsigned long logged_chars;
+#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
+
+/*
+ * The indices into log_buf are not constrained to LOG_BUF_LEN - they
+ * must be masked before subscripting
+ */
+static unsigned long log_start;                        /* Index into log_buf: next char to be read by syslog() */
+static unsigned long con_start;                        /* Index into log_buf: next char to be sent to consoles */
+static unsigned long log_end;                  /* Index into log_buf: most-recently-written-char + 1 */
+static unsigned long logged_chars;             /* Number of chars produced since last read+clear operation */
+
 struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
 static int preferred_console = -1;
 
+/* Flag: console code may call schedule() */
+static int console_may_schedule;
+
 /*
  *     Setup a list of consoles. Called from init/main.c
  */
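
The hunk above replaces the old (log_start, log_size) bookkeeping with free-running indices that are only masked when subscripting log_buf, which is why LOG_BUF_LEN must stay a power of two. A minimal userspace sketch of the same indexing trick (illustrative only, not part of this patch; buf/start/end are invented names):

#include <stdio.h>

#define BUF_LEN  16             /* must be a power of two */
#define BUF_MASK (BUF_LEN - 1)
#define BUF(idx) (buf[(idx) & BUF_MASK])

static char buf[BUF_LEN];
static unsigned long start;     /* next char to be read */
static unsigned long end;       /* most-recently-written char + 1 */

static void emit(char c)
{
        BUF(end) = c;
        end++;
        /* If the writer laps the reader, drop the oldest characters */
        if (end - start > BUF_LEN)
                start = end - BUF_LEN;
}

int main(void)
{
        const char *msg = "hello, ring buffer";
        for (const char *p = msg; *p; p++)
                emit(*p);
        while (start != end)
                putchar(BUF(start++));
        putchar('\n');
        return 0;
}
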
@@ -120,6 +147,7 @@ __setup("console=", console_setup);
  *     6 -- Disable printk's to console
  *     7 -- Enable printk's to console
  *     8 -- Set level of messages printed to console
+ *     9 -- Return number of unread characters in the log buffer
  */
 int do_syslog(int type, char * buf, int len)
 {
@@ -143,22 +171,21 @@ int do_syslog(int type, char * buf, int len)
                error = verify_area(VERIFY_WRITE,buf,len);
                if (error)
                        goto out;
-               error = wait_event_interruptible(log_wait, log_size);
+               error = wait_event_interruptible(log_wait, (log_start - log_end));
                if (error)
                        goto out;
                i = 0;
-               spin_lock_irq(&console_lock);
-               while (log_size && i < len) {
-                       c = log_buf[log_start & LOG_BUF_MASK];
+               spin_lock_irq(&logbuf_lock);
+               while ((log_start != log_end) && i < len) {
+                       c = LOG_BUF(log_start);
                        log_start++;
-                       log_size--;
-                       spin_unlock_irq(&console_lock);
+                       spin_unlock_irq(&logbuf_lock);
                        __put_user(c,buf);
                        buf++;
                        i++;
-                       spin_lock_irq(&console_lock);
+                       spin_lock_irq(&logbuf_lock);
                }
-               spin_unlock_irq(&console_lock);
+               spin_unlock_irq(&logbuf_lock);
                error = i;
                break;
        case 4:         /* Read/clear last kernel messages */
@@ -177,12 +204,12 @@ int do_syslog(int type, char * buf, int len)
                count = len;
                if (count > LOG_BUF_LEN)
                        count = LOG_BUF_LEN;
-               spin_lock_irq(&console_lock);
+               spin_lock_irq(&logbuf_lock);
                if (count > logged_chars)
                        count = logged_chars;
                if (do_clear)
                        logged_chars = 0;
-               limit = log_start + log_size;
+               limit = log_end;
                /*
                 * __put_user() could sleep, and while we sleep
                 * printk() could overwrite the messages 
@@ -191,14 +218,14 @@ int do_syslog(int type, char * buf, int len)
                 */
                for(i=0;i < count;i++) {
                        j = limit-1-i;
-                       if (j+LOG_BUF_LEN < log_start+log_size)
+                       if (j+LOG_BUF_LEN < log_end)
                                break;
-                       c = log_buf[ j  & LOG_BUF_MASK ];
-                       spin_unlock_irq(&console_lock);
+                       c = LOG_BUF(j);
+                       spin_unlock_irq(&logbuf_lock);
                        __put_user(c,&buf[count-1-i]);
-                       spin_lock_irq(&console_lock);
+                       spin_lock_irq(&logbuf_lock);
                }
-               spin_unlock_irq(&console_lock);
+               spin_unlock_irq(&logbuf_lock);
                error = i;
                if(i != count) {
                        int offset = count-error;
@@ -211,31 +238,36 @@ int do_syslog(int type, char * buf, int len)
 
                break;
        case 5:         /* Clear ring buffer */
-               spin_lock_irq(&console_lock);
+               spin_lock_irq(&logbuf_lock);
                logged_chars = 0;
-               spin_unlock_irq(&console_lock);
+               spin_unlock_irq(&logbuf_lock);
                break;
        case 6:         /* Disable logging to console */
-               spin_lock_irq(&console_lock);
+               spin_lock_irq(&logbuf_lock);
                console_loglevel = minimum_console_loglevel;
-               spin_unlock_irq(&console_lock);
+               spin_unlock_irq(&logbuf_lock);
                break;
        case 7:         /* Enable logging to console */
-               spin_lock_irq(&console_lock);
+               spin_lock_irq(&logbuf_lock);
                console_loglevel = default_console_loglevel;
-               spin_unlock_irq(&console_lock);
+               spin_unlock_irq(&logbuf_lock);
                break;
-       case 8:
+       case 8:         /* Set level of messages printed to console */
                error = -EINVAL;
                if (len < 1 || len > 8)
                        goto out;
                if (len < minimum_console_loglevel)
                        len = minimum_console_loglevel;
-               spin_lock_irq(&console_lock);
+               spin_lock_irq(&logbuf_lock);
                console_loglevel = len;
-               spin_unlock_irq(&console_lock);
+               spin_unlock_irq(&logbuf_lock);
                error = 0;
                break;
+       case 9:         /* Number of chars in the log buffer */
+               spin_lock_irq(&logbuf_lock);
+               error = log_end - log_start;
+               spin_unlock_irq(&logbuf_lock);
+               break;
        default:
                error = -EINVAL;
                break;
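
The new case 9 lets user space ask how many unread characters are waiting in the log buffer. A small sketch of how a log daemon might poll it through glibc's klogctl() wrapper (illustrative only; error handling kept minimal):

#include <stdio.h>
#include <sys/klog.h>           /* klogctl() */

int main(void)
{
        /* Command 9: number of unread characters in the kernel log buffer */
        int unread = klogctl(9, NULL, 0);

        if (unread < 0) {
                perror("klogctl");
                return 1;
        }
        printf("%d unread characters in the kernel log buffer\n", unread);
        return 0;
}
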
@@ -251,98 +283,250 @@ asmlinkage long sys_syslog(int type, char * buf, int len)
        return do_syslog(type, buf, len);
 }
 
-asmlinkage int printk(const char *fmt, ...)
+/*
+ * Call the console drivers on a range of log_buf
+ */
+static void __call_console_drivers(unsigned long start, unsigned long end)
 {
-       va_list args;
-       int i;
-       char *msg, *p, *buf_end;
-       int line_feed;
-       static signed char msg_level = -1;
-       long flags;
+       struct console *con;
 
-       spin_lock_irqsave(&console_lock, flags);
-       va_start(args, fmt);
-       i = vsprintf(buf + 3, fmt, args); /* hopefully i < sizeof(buf)-4 */
-       buf_end = buf + 3 + i;
-       va_end(args);
-       for (p = buf + 3; p < buf_end; p++) {
-               msg = p;
-               if (msg_level < 0) {
-                       if (
-                               p[0] != '<' ||
-                               p[1] < '0' || 
-                               p[1] > '7' ||
-                               p[2] != '>'
-                       ) {
-                               p -= 3;
-                               p[0] = '<';
-                               p[1] = default_message_loglevel + '0';
-                               p[2] = '>';
-                       } else
-                               msg += 3;
-                       msg_level = p[1] - '0';
+       for (con = console_drivers; con; con = con->next) {
+               if ((con->flags & CON_ENABLED) && con->write)
+                       con->write(con, &LOG_BUF(start), end - start);
+       }
+}
+
+/*
+ * Write out chars from start to end - 1 inclusive
+ */
+static void _call_console_drivers(unsigned long start, unsigned long end, int msg_log_level)
+{
+       if (msg_log_level < console_loglevel && console_drivers && start != end) {
+               if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
+                       /* wrapped write */
+                       __call_console_drivers(start & LOG_BUF_MASK, LOG_BUF_LEN);
+                       __call_console_drivers(0, end & LOG_BUF_MASK);
+               } else {
+                       __call_console_drivers(start, end);
+               }
+       }
+}
+
+/*
+ * Call the console drivers, asking them to write out
+ * log_buf[start] to log_buf[end - 1].
+ * The console_sem must be held.
+ */
+static void call_console_drivers(unsigned long start, unsigned long end)
+{
+       unsigned long cur_index, start_print;
+       static int msg_level = -1;
+
+       if (((long)(start - end)) > 0)
+               BUG();
+
+       cur_index = start;
+       start_print = start;
+       while (cur_index != end) {
+               if (    msg_level < 0 &&
+                       ((end - cur_index) > 2) &&
+                       LOG_BUF(cur_index + 0) == '<' &&
+                       LOG_BUF(cur_index + 1) >= '0' &&
+                       LOG_BUF(cur_index + 1) <= '7' &&
+                       LOG_BUF(cur_index + 2) == '>')
+               {
+                       msg_level = LOG_BUF(cur_index + 1) - '0';
+                       cur_index += 3;
+                       start_print = cur_index;
                }
-               line_feed = 0;
-               for (; p < buf_end; p++) {
-                       log_buf[(log_start+log_size) & LOG_BUF_MASK] = *p;
-                       if (log_size < LOG_BUF_LEN)
-                               log_size++;
-                       else
-                               log_start++;
-
-                       logged_chars++;
-                       if (*p == '\n') {
-                               line_feed = 1;
+               while (cur_index != end) {
+                       char c = LOG_BUF(cur_index);
+                       cur_index++;
+
+                       if (c == '\n') {
+                               if (msg_level < 0) {
+                                       /*
+                                        * printk() has already given us loglevel tags in
+                                        * the buffer.  This code is here in case the
+                                        * log buffer has wrapped right round and scribbled
+                                        * on those tags
+                                        */
+                                       msg_level = default_message_loglevel;
+                               }
+                               _call_console_drivers(start_print, cur_index, msg_level);
+                               msg_level = -1;
+                               start_print = cur_index;
                                break;
                        }
                }
-               if (msg_level < console_loglevel && console_drivers) {
-                       struct console *c = console_drivers;
-                       while(c) {
-                               if ((c->flags & CON_ENABLED) && c->write)
-                                       c->write(c, msg, p - msg + line_feed);
-                               c = c->next;
+       }
+       _call_console_drivers(start_print, end, msg_level);
+}
+
+static void emit_log_char(char c)
+{
+       LOG_BUF(log_end) = c;
+       log_end++;
+       if (log_end - log_start > LOG_BUF_LEN)
+               log_start = log_end - LOG_BUF_LEN;
+       if (log_end - con_start > LOG_BUF_LEN)
+               con_start = log_end - LOG_BUF_LEN;
+       if (logged_chars < LOG_BUF_LEN)
+               logged_chars++;
+}
+
+/*
+ * This is printk.  It can be called from any context.  We want it to work.
+ * 
+ * We try to grab the console_sem.  If we succeed, it's easy - we log the output and
+ * call the console drivers.  If we fail to get the semaphore we place the output
+ * into the log buffer and return.  The current holder of the console_sem will
+ * notice the new output in release_console_sem() and will send it to the
+ * consoles before releasing the semaphore.
+ *
+ * One effect of this deferred printing is that code which calls printk() and
+ * then changes console_loglevel may break. This is because console_loglevel
+ * is inspected when the actual printing occurs.
+ */
+asmlinkage int printk(const char *fmt, ...)
+{
+       va_list args;
+       unsigned long flags;
+       int printed_len;
+       char *p;
+       static char printk_buf[1024];
+       static int log_level_unknown = 1;
+
+       if (oops_in_progress) {
+               /* If a crash is occurring, make sure we can't deadlock */
+               spin_lock_init(&logbuf_lock);
+               /* And make sure that we print immediately */
+               init_MUTEX(&console_sem);
+       }
+
+       /* This stops the holder of console_sem just where we want him */
+       spin_lock_irqsave(&logbuf_lock, flags);
+
+       /* Emit the output into the temporary buffer */
+       va_start(args, fmt);
+       printed_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args);
+       va_end(args);
+
+       /*
+        * Copy the output into log_buf.  If the caller didn't provide
+        * appropriate log level tags, we insert them here
+        */
+       for (p = printk_buf; *p; p++) {
+               if (log_level_unknown) {
+                       if (p[0] != '<' || p[1] < '0' || p[1] > '7' || p[2] != '>') {
+                               emit_log_char('<');
+                               emit_log_char(default_message_loglevel + '0');
+                               emit_log_char('>');
                        }
+                       log_level_unknown = 0;
                }
-               if (line_feed)
-                       msg_level = -1;
+               emit_log_char(*p);
+               if (*p == '\n')
+                       log_level_unknown = 1;
+       }
+
+       if (!down_trylock(&console_sem)) {
+               /*
+                * We own the drivers.  We can drop the spinlock and let
+                * release_console_sem() print the text
+                */
+               spin_unlock_irqrestore(&logbuf_lock, flags);
+               console_may_schedule = 0;
+               release_console_sem();
+       } else {
+               /*
+                * Someone else owns the drivers.  We drop the spinlock, which
+                * allows the semaphore holder to proceed and to call the
+                * console drivers with the output which we just produced.
+                */
+               spin_unlock_irqrestore(&logbuf_lock, flags);
        }
-       spin_unlock_irqrestore(&console_lock, flags);
-       wake_up_interruptible(&log_wait);
-       return i;
+       return printed_len;
 }
+EXPORT_SYMBOL(printk);
 
-void console_print(const char *s)
+/**
+ * acquire_console_sem - lock the console system for exclusive use.
+ *
+ * Acquires a semaphore which guarantees that the caller has
+ * exclusive access to the console system and the console_drivers list.
+ *
+ * Can sleep, returns nothing.
+ */
+void acquire_console_sem(void)
+{
+       if (in_interrupt())
+               BUG();
+       down(&console_sem);
+       console_may_schedule = 1;
+}
+EXPORT_SYMBOL(acquire_console_sem);
+
+/**
+ * release_console_sem - unlock the console system
+ *
+ * Releases the semaphore which the caller holds on the console system
+ * and the console driver list.
+ *
+ * While the semaphore was held, console output may have been buffered
+ * by printk().  If this is the case, release_console_sem() emits
+ * the output prior to releasing the semaphore.
+ *
+ * If there is output waiting for klogd, we wake it up.
+ *
+ * release_console_sem() may be called from any context.
+ */
+void release_console_sem(void)
 {
-       struct console *c;
        unsigned long flags;
-       int len = strlen(s);
-
-       spin_lock_irqsave(&console_lock, flags);
-       c = console_drivers;
-       while(c) {
-               if ((c->flags & CON_ENABLED) && c->write)
-                       c->write(c, s, len);
-               c = c->next;
+       unsigned long _con_start, _log_end;
+       unsigned long must_wake_klogd = 0;
+
+       for ( ; ; ) {
+               spin_lock_irqsave(&logbuf_lock, flags);
+               must_wake_klogd |= log_start - log_end;
+               if (con_start == log_end)
+                       break;                  /* Nothing to print */
+               _con_start = con_start;
+               _log_end = log_end;
+               con_start = log_end;            /* Flush */
+               spin_unlock_irqrestore(&logbuf_lock, flags);
+               call_console_drivers(_con_start, _log_end);
        }
-       spin_unlock_irqrestore(&console_lock, flags);
+       console_may_schedule = 0;
+       up(&console_sem);
+       spin_unlock_irqrestore(&logbuf_lock, flags);
+       if (must_wake_klogd && !oops_in_progress)
+               wake_up_interruptible(&log_wait);
 }
 
-void unblank_console(void)
+/** console_conditional_schedule - yield the CPU if required
+ *
+ * If the console code is currently allowed to sleep, and
+ * if this CPU should yield the CPU to another task, do
+ * so here.
+ *
+ * Must be called within acquire_console_sem().
+ */
+void console_conditional_schedule(void)
 {
-       struct console *c;
-       unsigned long flags;
-       
-       spin_lock_irqsave(&console_lock, flags);
-       c = console_drivers;
-       while(c) {
-               if ((c->flags & CON_ENABLED) && c->unblank)
-                       c->unblank();
-               c = c->next;
+       if (console_may_schedule && current->need_resched) {
+               set_current_state(TASK_RUNNING);
+               schedule();
        }
-       spin_unlock_irqrestore(&console_lock, flags);
 }
 
+void console_print(const char *s)
+{
+       printk(KERN_EMERG "%s", s);
+}
+EXPORT_SYMBOL(console_print);
+
 /*
  * The console driver calls this routine during kernel initialization
  * to register the console printing procedure with printk() and to
@@ -351,11 +535,7 @@ void unblank_console(void)
  */
 void register_console(struct console * console)
 {
-       int     i, j,len;
-       int     p;
-       char    buf[16];
-       signed char msg_level = -1;
-       char    *q;
+       int     i;
        unsigned long flags;
 
        /*
@@ -402,7 +582,7 @@ void register_console(struct console * console)
         *      Put this console in the list - keep the
         *      preferred driver at the head of the list.
         */
-       spin_lock_irqsave(&console_lock, flags);
+       acquire_console_sem();
        if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
                console->next = console_drivers;
                console_drivers = console;
@@ -410,57 +590,28 @@ void register_console(struct console * console)
                console->next = console_drivers->next;
                console_drivers->next = console;
        }
-       if ((console->flags & CON_PRINTBUFFER) == 0)
-               goto done;
-       /*
-        *      Print out buffered log messages.
-        */
-       p = log_start & LOG_BUF_MASK;
-
-       for (i=0,j=0; i < log_size; i++) {
-               buf[j++] = log_buf[p];
-               p = (p+1) & LOG_BUF_MASK;
-               if (buf[j-1] != '\n' && i < log_size - 1 && j < sizeof(buf)-1)
-                       continue;
-               buf[j] = 0;
-               q = buf;
-               len = j;
-               if (msg_level < 0) {
-                       if(buf[0] == '<' &&
-                               buf[1] >= '0' &&
-                               buf[1] <= '7' &&
-                               buf[2] == '>') {
-                               msg_level = buf[1] - '0';
-                               q = buf + 3;
-                               len -= 3;
-                       } else
-                       {
-                               msg_level = default_message_loglevel; 
-                       }
-               }
-               if (msg_level < console_loglevel)
-                       console->write(console, q, len);
-               if (buf[j-1] == '\n')
-                       msg_level = -1;
-               j = 0;
+       if (console->flags & CON_PRINTBUFFER) {
+               /*
+                * release_console_sem() will print out the buffered messages for us.
+                */
+               spin_lock_irqsave(&logbuf_lock, flags);
+               con_start = log_start;
+               spin_unlock_irqrestore(&logbuf_lock, flags);
        }
-done:
-       spin_unlock_irqrestore(&console_lock, flags);
+       release_console_sem();
 }
-
+EXPORT_SYMBOL(register_console);
 
 int unregister_console(struct console * console)
 {
         struct console *a,*b;
-       unsigned long flags;
        int res = 1;
 
-       spin_lock_irqsave(&console_lock, flags);
+       acquire_console_sem();
        if (console_drivers == console) {
                console_drivers=console->next;
                res = 0;
-       } else
-       {
+       } else {
                for (a=console_drivers->next, b=console_drivers ;
                     a; b=a, a=b->next) {
                        if (a == console) {
@@ -479,13 +630,15 @@ int unregister_console(struct console * console)
                preferred_console = -1;
                
 
-       spin_unlock_irqrestore(&console_lock, flags);
+       release_console_sem();
        return res;
 }
+EXPORT_SYMBOL(unregister_console);
        
-/*
- * Write a message to a certain tty, not just the console. This is used for
- * messages that need to be redirected to a specific tty.
+/**
+ * tty_write_message - write a message to a certain tty, not just the console.
+ *
+ * This is used for messages that need to be redirected to a specific tty.
  * We don't put it into the syslog queue right now maybe in the future if
  * really needed.
  */
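
The printk() comment earlier in this file is the core of the new locking scheme: output is always appended to the log buffer under logbuf_lock, and is only pushed to the consoles directly when console_sem can be taken without blocking; otherwise the current holder flushes it in release_console_sem(). A rough userspace analogy of that try-lock-or-defer pattern using a pthread mutex (buffer_add() and flush_buffered() are invented names for illustration):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t console_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;
static char buffered[16][80];
static int nbuffered;

static void buffer_add(const char *msg)
{
        pthread_mutex_lock(&buf_lock);
        if (nbuffered < 16)
                strncpy(buffered[nbuffered++], msg, 79);
        pthread_mutex_unlock(&buf_lock);
}

static void flush_buffered(void)
{
        pthread_mutex_lock(&buf_lock);
        for (int i = 0; i < nbuffered; i++)
                printf("console: %s\n", buffered[i]);
        nbuffered = 0;
        pthread_mutex_unlock(&buf_lock);
}

static void log_message(const char *msg)
{
        /* Always record the message first; this never blocks on the console */
        buffer_add(msg);

        if (pthread_mutex_trylock(&console_lock) == 0) {
                /* We own the console: emit everything that has accumulated */
                flush_buffered();
                pthread_mutex_unlock(&console_lock);
        }
        /* Otherwise return at once; the current owner flushes on release */
}

int main(void)
{
        log_message("first message");
        log_message("second message");
        return 0;
}
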
index deb5854e89c1ef007ee3eecbf19aa43ef7b7f952..3d7c4659bb752d4f674e32758c6f63edd17fdc37 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
 #include <linux/completion.h>
+#include <linux/prefetch.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -246,7 +247,7 @@ send_now_idle:
         */
        oldest_idle = (cycles_t) -1;
        target_tsk = NULL;
-       max_prio = 1;
+       max_prio = 0;
 
        for (i = 0; i < smp_num_cpus; i++) {
                cpu = cpu_logical_map(i);
@@ -292,7 +293,7 @@ send_now_idle:
        struct task_struct *tsk;
 
        tsk = cpu_curr(this_cpu);
-       if (preemption_goodness(tsk, p, this_cpu) > 1)
+       if (preemption_goodness(tsk, p, this_cpu) > 0)
                tsk->need_resched = 1;
 #endif
 }
@@ -535,6 +536,8 @@ asmlinkage void schedule(void)
        struct list_head *tmp;
        int this_cpu, c;
 
+       spin_lock_prefetch(&runqueue_lock);
+
        if (!current->active_mm) BUG();
 need_resched_back:
        prev = current;
@@ -1173,7 +1176,7 @@ static void show_task(struct task_struct * p)
        else
                printk(" (NOTLB)\n");
 
-#if defined(CONFIG_X86) || defined(CONFIG_SPARC64) || defined(CONFIG_ARM)
+#if defined(CONFIG_X86) || defined(CONFIG_SPARC64) || defined(CONFIG_ARM) || defined(CONFIG_ALPHA)
 /* This is very useful, but only works on ARM, x86 and sparc64 right now */
        {
                extern void show_trace_task(struct task_struct *tsk);
@@ -1211,8 +1214,14 @@ void show_state(void)
        printk("  task                 PC        stack   pid father child younger older\n");
 #endif
        read_lock(&tasklist_lock);
-       for_each_task(p)
+       for_each_task(p) {
+               /*
+                * reset the NMI-timeout; listing all tasks on a slow
+                * console might take a lot of time:
+                */
+               touch_nmi_watchdog();
                show_task(p);
+       }
        read_unlock(&tasklist_lock);
 }
 
index 5ac2d6112f00ae97a722d0813324f0b025360086..7f4d2e71c751d017f7787bf4d42d2b3fa9913e1e 100644 (file)
@@ -467,11 +467,6 @@ static inline void signal_wake_up(struct task_struct *t)
 {
        t->sigpending = 1;
 
-       if (t->state & TASK_INTERRUPTIBLE) {
-               wake_up_process(t);
-               return;
-       }
-
 #ifdef CONFIG_SMP
        /*
         * If the task is running on a different CPU 
@@ -488,6 +483,11 @@ static inline void signal_wake_up(struct task_struct *t)
                smp_send_reschedule(t->processor);
        spin_unlock(&runqueue_lock);
 #endif /* CONFIG_SMP */
+
+       if (t->state & TASK_INTERRUPTIBLE) {
+               wake_up_process(t);
+               return;
+       }
 }
 
 static int deliver_signal(int sig, struct siginfo *info, struct task_struct *t)
@@ -544,8 +544,6 @@ printk("SIG queue (%s:%d): %d ", t->comm, t->pid, sig);
        ret = deliver_signal(sig, info, t);
 out:
        spin_unlock_irqrestore(&t->sigmask_lock, flags);
-       if ((t->state & TASK_INTERRUPTIBLE) && signal_pending(t))
-               wake_up_process(t);
 out_nolock:
 #if DEBUG_SIG
 printk(" %d -> %d\n", signal_pending(t), ret);
index f30c1a0b82a2ea14d90e28f1694d7284fe28387a..af54f02302b77d82bbf30420f8873b887c6ff213 100644 (file)
@@ -254,17 +254,11 @@ static ctl_table kern_table[] = {
 };
 
 static ctl_table vm_table[] = {
-       {VM_FREEPG, "freepages", 
-        &freepages, sizeof(freepages_t), 0444, NULL, &proc_dointvec},
        {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL,
         &proc_dointvec_minmax, &sysctl_intvec, NULL,
         &bdflush_min, &bdflush_max},
        {VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory,
         sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
-       {VM_BUFFERMEM, "buffermem",
-        &buffer_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
-       {VM_PAGECACHE, "pagecache",
-        &page_cache, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
        {VM_PAGERDAEMON, "kswapd",
         &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
        {VM_PGT_CACHE, "pagetable_cache", 
index c9aaf53f43eb425b9243751fdd7abbd670f256bd..38a91fa0c9e7294b1bdf952476152d7caa25b12c 100644 (file)
@@ -10,7 +10,7 @@ L_TARGET := lib.a
 
 export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o
 
-obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o
+obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o bust_spinlocks.o rbtree.o
 
 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
diff --git a/lib/bust_spinlocks.c b/lib/bust_spinlocks.c
new file mode 100644 (file)
index 0000000..07f3eda
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * lib/bust_spinlocks.c
+ *
+ * Provides a minimal bust_spinlocks for architectures which don't have one of their own.
+ *
+ * bust_spinlocks() clears any spinlocks which would prevent oops, die(), BUG()
+ * and panic() information from reaching the user.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/tty.h>
+#include <linux/wait.h>
+#include <linux/vt_kern.h>
+
+extern spinlock_t timerlist_lock;
+
+void bust_spinlocks(int yes)
+{
+       spin_lock_init(&timerlist_lock);
+       if (yes) {
+               oops_in_progress = 1;
+       } else {
+               int loglevel_save = console_loglevel;
+#ifdef CONFIG_VT
+               unblank_screen();
+#endif
+               oops_in_progress = 0;
+               /*
+                * OK, the message is on the console.  Now we call printk()
+                * without oops_in_progress set so that printk() will give klogd
+                * and the blanked console a poke.  Hold onto your hats...
+                */
+               console_loglevel = 15;          /* NMI oopser may have shut the console up */
+               printk(" ");
+               console_loglevel = loglevel_save;
+       }
+}
+
+
diff --git a/lib/rbtree.c b/lib/rbtree.c
new file mode 100644 (file)
index 0000000..ee6d971
--- /dev/null
@@ -0,0 +1,293 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/lib/rbtree.c
+*/
+
+#include <linux/rbtree.h>
+
+static void __rb_rotate_left(rb_node_t * node, rb_root_t * root)
+{
+       rb_node_t * right = node->rb_right;
+
+       if ((node->rb_right = right->rb_left))
+               right->rb_left->rb_parent = node;
+       right->rb_left = node;
+
+       if ((right->rb_parent = node->rb_parent))
+       {
+               if (node == node->rb_parent->rb_left)
+                       node->rb_parent->rb_left = right;
+               else
+                       node->rb_parent->rb_right = right;
+       }
+       else
+               root->rb_node = right;
+       node->rb_parent = right;
+}
+
+static void __rb_rotate_right(rb_node_t * node, rb_root_t * root)
+{
+       rb_node_t * left = node->rb_left;
+
+       if ((node->rb_left = left->rb_right))
+               left->rb_right->rb_parent = node;
+       left->rb_right = node;
+
+       if ((left->rb_parent = node->rb_parent))
+       {
+               if (node == node->rb_parent->rb_right)
+                       node->rb_parent->rb_right = left;
+               else
+                       node->rb_parent->rb_left = left;
+       }
+       else
+               root->rb_node = left;
+       node->rb_parent = left;
+}
+
+void rb_insert_color(rb_node_t * node, rb_root_t * root)
+{
+       rb_node_t * parent, * gparent;
+
+       while ((parent = node->rb_parent) && parent->rb_color == RB_RED)
+       {
+               gparent = parent->rb_parent;
+
+               if (parent == gparent->rb_left)
+               {
+                       {
+                               register rb_node_t * uncle = gparent->rb_right;
+                               if (uncle && uncle->rb_color == RB_RED)
+                               {
+                                       uncle->rb_color = RB_BLACK;
+                                       parent->rb_color = RB_BLACK;
+                                       gparent->rb_color = RB_RED;
+                                       node = gparent;
+                                       continue;
+                               }
+                       }
+
+                       if (parent->rb_right == node)
+                       {
+                               register rb_node_t * tmp;
+                               __rb_rotate_left(parent, root);
+                               tmp = parent;
+                               parent = node;
+                               node = tmp;
+                       }
+
+                       parent->rb_color = RB_BLACK;
+                       gparent->rb_color = RB_RED;
+                       __rb_rotate_right(gparent, root);
+               } else {
+                       {
+                               register rb_node_t * uncle = gparent->rb_left;
+                               if (uncle && uncle->rb_color == RB_RED)
+                               {
+                                       uncle->rb_color = RB_BLACK;
+                                       parent->rb_color = RB_BLACK;
+                                       gparent->rb_color = RB_RED;
+                                       node = gparent;
+                                       continue;
+                               }
+                       }
+
+                       if (parent->rb_left == node)
+                       {
+                               register rb_node_t * tmp;
+                               __rb_rotate_right(parent, root);
+                               tmp = parent;
+                               parent = node;
+                               node = tmp;
+                       }
+
+                       parent->rb_color = RB_BLACK;
+                       gparent->rb_color = RB_RED;
+                       __rb_rotate_left(gparent, root);
+               }
+       }
+
+       root->rb_node->rb_color = RB_BLACK;
+}
+
+static void __rb_erase_color(rb_node_t * node, rb_node_t * parent,
+                            rb_root_t * root)
+{
+       rb_node_t * other;
+
+       while ((!node || node->rb_color == RB_BLACK) && node != root->rb_node)
+       {
+               if (parent->rb_left == node)
+               {
+                       other = parent->rb_right;
+                       if (other->rb_color == RB_RED)
+                       {
+                               other->rb_color = RB_BLACK;
+                               parent->rb_color = RB_RED;
+                               __rb_rotate_left(parent, root);
+                               other = parent->rb_right;
+                       }
+                       if ((!other->rb_left ||
+                            other->rb_left->rb_color == RB_BLACK)
+                           && (!other->rb_right ||
+                               other->rb_right->rb_color == RB_BLACK))
+                       {
+                               other->rb_color = RB_RED;
+                               node = parent;
+                               parent = node->rb_parent;
+                       }
+                       else
+                       {
+                               if (!other->rb_right ||
+                                   other->rb_right->rb_color == RB_BLACK)
+                               {
+                                       register rb_node_t * o_left;
+                                       if ((o_left = other->rb_left))
+                                               o_left->rb_color = RB_BLACK;
+                                       other->rb_color = RB_RED;
+                                       __rb_rotate_right(other, root);
+                                       other = parent->rb_right;
+                               }
+                               other->rb_color = parent->rb_color;
+                               parent->rb_color = RB_BLACK;
+                               if (other->rb_right)
+                                       other->rb_right->rb_color = RB_BLACK;
+                               __rb_rotate_left(parent, root);
+                               node = root->rb_node;
+                               break;
+                       }
+               }
+               else
+               {
+                       other = parent->rb_left;
+                       if (other->rb_color == RB_RED)
+                       {
+                               other->rb_color = RB_BLACK;
+                               parent->rb_color = RB_RED;
+                               __rb_rotate_right(parent, root);
+                               other = parent->rb_left;
+                       }
+                       if ((!other->rb_left ||
+                            other->rb_left->rb_color == RB_BLACK)
+                           && (!other->rb_right ||
+                               other->rb_right->rb_color == RB_BLACK))
+                       {
+                               other->rb_color = RB_RED;
+                               node = parent;
+                               parent = node->rb_parent;
+                       }
+                       else
+                       {
+                               if (!other->rb_left ||
+                                   other->rb_left->rb_color == RB_BLACK)
+                               {
+                                       register rb_node_t * o_right;
+                                       if ((o_right = other->rb_right))
+                                               o_right->rb_color = RB_BLACK;
+                                       other->rb_color = RB_RED;
+                                       __rb_rotate_left(other, root);
+                                       other = parent->rb_left;
+                               }
+                               other->rb_color = parent->rb_color;
+                               parent->rb_color = RB_BLACK;
+                               if (other->rb_left)
+                                       other->rb_left->rb_color = RB_BLACK;
+                               __rb_rotate_right(parent, root);
+                               node = root->rb_node;
+                               break;
+                       }
+               }
+       }
+       if (node)
+               node->rb_color = RB_BLACK;
+}
+
+void rb_erase(rb_node_t * node, rb_root_t * root)
+{
+       rb_node_t * child, * parent;
+       int color;
+
+       if (!node->rb_left)
+               child = node->rb_right;
+       else if (!node->rb_right)
+               child = node->rb_left;
+       else
+       {
+               rb_node_t * old = node, * left;
+
+               node = node->rb_right;
+               while ((left = node->rb_left))
+                       node = left;
+               child = node->rb_right;
+               parent = node->rb_parent;
+               color = node->rb_color;
+
+               if (child)
+                       child->rb_parent = parent;
+               if (parent)
+               {
+                       if (parent->rb_left == node)
+                               parent->rb_left = child;
+                       else
+                               parent->rb_right = child;
+               }
+               else
+                       root->rb_node = child;
+
+               if (node->rb_parent == old)
+                       parent = node;
+               node->rb_parent = old->rb_parent;
+               node->rb_color = old->rb_color;
+               node->rb_right = old->rb_right;
+               node->rb_left = old->rb_left;
+
+               if (old->rb_parent)
+               {
+                       if (old->rb_parent->rb_left == old)
+                               old->rb_parent->rb_left = node;
+                       else
+                               old->rb_parent->rb_right = node;
+               } else
+                       root->rb_node = node;
+
+               old->rb_left->rb_parent = node;
+               if (old->rb_right)
+                       old->rb_right->rb_parent = node;
+               goto color;
+       }
+
+       parent = node->rb_parent;
+       color = node->rb_color;
+
+       if (child)
+               child->rb_parent = parent;
+       if (parent)
+       {
+               if (parent->rb_left == node)
+                       parent->rb_left = child;
+               else
+                       parent->rb_right = child;
+       }
+       else
+               root->rb_node = child;
+
+ color:
+       if (color == RB_BLACK)
+               __rb_erase_color(child, parent, root);
+}
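
The rebalancing code in rb_insert_color() and __rb_erase_color() is built entirely out of the two rotation helpers above. A standalone sketch of a left rotation on a simplified node type (not the kernel's rb_node_t; the struct and names here are invented for illustration):

#include <stdio.h>

struct node {
        struct node *left, *right, *parent;
        int key;
};

/* Rotate 'n' down to the left; its right child takes its place. */
static void rotate_left(struct node *n, struct node **root)
{
        struct node *r = n->right;

        if ((n->right = r->left))
                r->left->parent = n;
        r->left = n;

        if ((r->parent = n->parent)) {
                if (n == n->parent->left)
                        n->parent->left = r;
                else
                        n->parent->right = r;
        } else {
                *root = r;
        }
        n->parent = r;
}

int main(void)
{
        struct node a = { .key = 1 }, b = { .key = 2 };
        struct node *root = &a;

        a.right = &b;
        b.parent = &a;

        rotate_left(&a, &root);
        printf("new root key: %d\n", root->key);        /* prints 2 */
        return 0;
}
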
index 60dfccdaea85aedb11c1dc4317e23bdc9c3d26cb..d9624c4cc3515db01576212f1a58d5d144b88919 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/swapctl.h>
 #include <linux/init.h>
 #include <linux/mm.h>
+#include <linux/iobuf.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -45,12 +46,12 @@ atomic_t page_cache_size = ATOMIC_INIT(0);
 unsigned int page_hash_bits;
 struct page **page_hash_table;
 
-spinlock_t __cacheline_aligned pagecache_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t pagecache_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
 /*
  * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
  *       the pagemap_lru_lock held.
  */
-spinlock_t __cacheline_aligned pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t pagemap_lru_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
 
 #define CLUSTER_PAGES          (1 << page_cluster)
 #define CLUSTER_OFFSET(x)      (((x) >> page_cluster) << page_cluster)
@@ -200,7 +201,7 @@ static inline void truncate_partial_page(struct page *page, unsigned partial)
 
 }
 
-static inline void truncate_complete_page(struct page *page)
+static void truncate_complete_page(struct page *page)
 {
        /* Leave it on the LRU if it gets converted into anonymous buffers */
        if (!page->buffers || block_flushpage(page, 0))
@@ -224,8 +225,10 @@ static int truncate_list_pages(struct list_head *head, unsigned long start, unsi
 {
        struct list_head *curr;
        struct page * page;
+       int unlocked = 0;
 
-       curr = head->next;
+ restart:
+       curr = head->prev;
        while (curr != head) {
                unsigned long offset;
 
@@ -234,33 +237,46 @@ static int truncate_list_pages(struct list_head *head, unsigned long start, unsi
 
                /* Is one of the pages to truncate? */
                if ((offset >= start) || (*partial && (offset + 1) == start)) {
-                       list_del(head);
-                       list_add(head, curr);
-                       if (TryLockPage(page)) {
-                               page_cache_get(page);
-                               spin_unlock(&pagecache_lock);
-                               wait_on_page(page);
-                               goto out_restart;
-                       }
+                       int failed;
+
                        page_cache_get(page);
+                       failed = TryLockPage(page);
+
+                       list_del(head);
+                       if (!failed)
+                               /* Restart after this page */
+                               list_add_tail(head, curr);
+                       else
+                               /* Restart on this page */
+                               list_add(head, curr);
+
                        spin_unlock(&pagecache_lock);
+                       unlocked = 1;
 
-                       if (*partial && (offset + 1) == start) {
-                               truncate_partial_page(page, *partial);
-                               *partial = 0;
-                       } else 
-                               truncate_complete_page(page);
+                       if (!failed) {
+                               if (*partial && (offset + 1) == start) {
+                                       truncate_partial_page(page, *partial);
+                                       *partial = 0;
+                               } else 
+                                       truncate_complete_page(page);
 
-                       UnlockPage(page);
-                       goto out_restart;
+                               UnlockPage(page);
+                       } else
+                               wait_on_page(page);
+
+                       page_cache_release(page);
+
+                       if (current->need_resched) {
+                               __set_current_state(TASK_RUNNING);
+                               schedule();
+                       }
+
+                       spin_lock(&pagecache_lock);
+                       goto restart;
                }
-               curr = curr->next;
+               curr = curr->prev;
        }
-       return 0;
-out_restart:
-       page_cache_release(page);
-       spin_lock(&pagecache_lock);
-       return 1;
+       return unlocked;
 }
 
 
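
truncate_list_pages() now reports whether it had to drop pagecache_lock, and its caller (rewritten in the next hunk) simply rescans all the lists until one pass completes without the lock ever being dropped. A compact userspace sketch of that rescan-until-stable pattern (a pthread mutex standing in for the spinlock; all names invented for illustration):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int work_items = 3;

/*
 * Process one item that needs the lock dropped around the slow part.
 * Returns 1 if the lock was dropped (caller must restart), 0 otherwise.
 */
static int process_one(void)
{
        if (work_items == 0)
                return 0;               /* nothing left: lock never dropped */

        work_items--;
        pthread_mutex_unlock(&list_lock);
        printf("doing slow work outside the lock\n");
        pthread_mutex_lock(&list_lock);
        return 1;
}

int main(void)
{
        int unlocked;

        pthread_mutex_lock(&list_lock);
        do {
                /* keep rescanning until a pass completes without unlocking */
                unlocked = process_one();
        } while (unlocked);
        pthread_mutex_unlock(&list_lock);
        return 0;
}
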
@@ -277,22 +293,118 @@ void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
 {
        unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
-       int complete;
+       int unlocked;
 
        spin_lock(&pagecache_lock);
        do {
-               complete = 1;
-               while (truncate_list_pages(&mapping->clean_pages, start, &partial))
-                       complete = 0;
-               while (truncate_list_pages(&mapping->dirty_pages, start, &partial))
-                       complete = 0;
-               while (truncate_list_pages(&mapping->locked_pages, start, &partial))
-                       complete = 0;
-       } while (!complete);
+               unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
+               unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
+               unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
+       } while (unlocked);
        /* Traversed all three lists without dropping the lock */
        spin_unlock(&pagecache_lock);
 }
 
+static inline int invalidate_this_page2(struct page * page,
+                                       struct list_head * curr,
+                                       struct list_head * head)
+{
+       int unlocked = 1;
+
+       /*
+        * The page is locked and we hold the pagecache_lock as well
+        * so both page_count(page) and page->buffers stays constant here.
+        */
+       if (page_count(page) == 1 + !!page->buffers) {
+               /* Restart after this page */
+               list_del(head);
+               list_add_tail(head, curr);
+
+               page_cache_get(page);
+               spin_unlock(&pagecache_lock);
+               truncate_complete_page(page);
+       } else {
+               if (page->buffers) {
+                       /* Restart after this page */
+                       list_del(head);
+                       list_add_tail(head, curr);
+
+                       page_cache_get(page);
+                       spin_unlock(&pagecache_lock);
+                       block_invalidate_page(page);
+               } else
+                       unlocked = 0;
+
+               ClearPageDirty(page);
+               ClearPageUptodate(page);
+       }
+
+       return unlocked;
+}
+
+static int FASTCALL(invalidate_list_pages2(struct list_head *));
+static int invalidate_list_pages2(struct list_head *head)
+{
+       struct list_head *curr;
+       struct page * page;
+       int unlocked = 0;
+
+ restart:
+       curr = head->prev;
+       while (curr != head) {
+               page = list_entry(curr, struct page, list);
+
+               if (!TryLockPage(page)) {
+                       int __unlocked;
+
+                       __unlocked = invalidate_this_page2(page, curr, head);
+                       UnlockPage(page);
+                       unlocked |= __unlocked;
+                       if (!__unlocked) {
+                               curr = curr->prev;
+                               continue;
+                       }
+               } else {
+                       /* Restart on this page */
+                       list_del(head);
+                       list_add(head, curr);
+
+                       page_cache_get(page);
+                       spin_unlock(&pagecache_lock);
+                       unlocked = 1;
+                       wait_on_page(page);
+               }
+
+               page_cache_release(page);
+               if (current->need_resched) {
+                       __set_current_state(TASK_RUNNING);
+                       schedule();
+               }
+
+               spin_lock(&pagecache_lock);
+               goto restart;
+       }
+       return unlocked;
+}
+
+/**
+ * invalidate_inode_pages2 - Clear the dirty bits of pages that cannot be
+ * freed because they are mapped.
+ * @mapping: the address_space whose pages we want to invalidate
+ */
+void invalidate_inode_pages2(struct address_space * mapping)
+{
+       int unlocked;
+
+       spin_lock(&pagecache_lock);
+       do {
+               unlocked = invalidate_list_pages2(&mapping->clean_pages);
+               unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
+               unlocked |= invalidate_list_pages2(&mapping->locked_pages);
+       } while (unlocked);
+       spin_unlock(&pagecache_lock);
+}
+
 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
 {
        goto inside;
@@ -307,6 +419,9 @@ inside:
                if (page->index == offset)
                        break;
        }
+
+       SetPageReferenced(page);
+
 not_found:
        return page;
 }
@@ -484,9 +599,9 @@ void add_to_page_cache_locked(struct page * page, struct address_space *mapping,
        if (!PageLocked(page))
                BUG();
 
+       page->index = index;
        page_cache_get(page);
        spin_lock(&pagecache_lock);
-       page->index = index;
        add_page_to_inode_queue(mapping, page);
        add_page_to_hash_queue(page, page_hash(mapping, index));
        lru_cache_add(page);
@@ -506,7 +621,7 @@ static inline void __add_to_page_cache(struct page * page,
        if (PageLocked(page))
                BUG();
 
-       flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1) | (1 << PG_checked));
+       flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked);
        page->flags = flags | (1 << PG_locked);
        page_cache_get(page);
        page->index = offset;
@@ -546,7 +661,8 @@ static int add_to_page_cache_unique(struct page * page,
  * This adds the requested page to the page cache if it isn't already there,
  * and schedules an I/O to read in its contents from disk.
  */
-static inline int page_cache_read(struct file * file, unsigned long offset) 
+static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
+static int page_cache_read(struct file * file, unsigned long offset)
 {
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
@@ -554,7 +670,7 @@ static inline int page_cache_read(struct file * file, unsigned long offset)
        struct page *page; 
 
        spin_lock(&pagecache_lock);
-       page = __find_page_nolock(mapping, offset, *hash); 
+       page = __find_page_nolock(mapping, offset, *hash);
        spin_unlock(&pagecache_lock);
        if (page)
                return 0;
@@ -572,7 +688,7 @@ static inline int page_cache_read(struct file * file, unsigned long offset)
         * We arrive here in the unlikely event that someone 
         * raced with us and added our page to the cache first.
         */
-       page_cache_free(page);
+       page_cache_release(page);
        return 0;
 }
 
@@ -580,6 +696,8 @@ static inline int page_cache_read(struct file * file, unsigned long offset)
  * Read in an entire cluster at once.  A cluster is usually a 64k-
  * aligned block that includes the page requested in "offset."
  */
+static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset,
+                                            unsigned long filesize));
 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
        unsigned long filesize)
 {
@@ -610,11 +728,10 @@ void ___wait_on_page(struct page *page)
 
        add_wait_queue(&page->wait, &wait);
        do {
-               sync_page(page);
                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
                if (!PageLocked(page))
                        break;
-               run_task_queue(&tq_disk);
+               sync_page(page);
                schedule();
        } while (PageLocked(page));
        tsk->state = TASK_RUNNING;
@@ -632,12 +749,10 @@ static void __lock_page(struct page *page)
 
        add_wait_queue_exclusive(&page->wait, &wait);
        for (;;) {
-               sync_page(page);
                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
                if (PageLocked(page)) {
-                       run_task_queue(&tq_disk);
+                       sync_page(page);
                        schedule();
-                       continue;
                }
                if (!TryLockPage(page))
                        break;
@@ -843,16 +958,42 @@ static inline int get_max_readahead(struct inode * inode)
        return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
 }
 
+static inline unsigned long calc_end_index(struct inode * inode)
+{
+       unsigned long end_index;
+
+       if (!S_ISBLK(inode->i_mode))
+               end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+       else
+               end_index = buffered_blk_size(inode->i_rdev) >> (PAGE_CACHE_SHIFT - BLOCK_SIZE_BITS);
+
+       return end_index;
+}
+
+static inline loff_t calc_rsize(struct inode * inode)
+{
+       loff_t rsize;
+
+       if (!S_ISBLK(inode->i_mode))
+               rsize = inode->i_size;
+       else
+               rsize = (loff_t) buffered_blk_size(inode->i_rdev) << BLOCK_SIZE_BITS;
+
+       return rsize;
+}
+
 static void generic_file_readahead(int reada_ok,
        struct file * filp, struct inode * inode,
        struct page * page)
 {
-       unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+       unsigned long end_index;
        unsigned long index = page->index;
        unsigned long max_ahead, ahead;
        unsigned long raend;
        int max_readahead = get_max_readahead(inode);
 
+       end_index = calc_end_index(inode);
+
        raend = filp->f_raend;
        max_ahead = 0;
 
@@ -945,26 +1086,6 @@ static void generic_file_readahead(int reada_ok,
        return;
 }
 
-/*
- * Mark a page as having seen activity.
- *
- * If it was already so marked, move it
- * to the active queue and drop the referenced
- * bit. Otherwise, just mark it for future
- * action..
- */
-void mark_page_accessed(struct page *page)
-{
-       if (!PageActive(page) && PageReferenced(page)) {
-               activate_page(page);
-               ClearPageReferenced(page);
-               return;
-       }
-
-       /* Mark the page referenced, AFTER checking for previous usage.. */
-       SetPageReferenced(page);
-}
-
 /*
  * This is a generic file read routine, and uses the
  * inode->i_op->readpage() function for the actual low-level
@@ -1030,12 +1151,13 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t *
                struct page *page, **hash;
                unsigned long end_index, nr, ret;
 
-               end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+               end_index = calc_end_index(inode);
+                       
                if (index > end_index)
                        break;
                nr = PAGE_CACHE_SIZE;
                if (index == end_index) {
-                       nr = inode->i_size & ~PAGE_CACHE_MASK;
+                       nr = calc_rsize(inode) & ~PAGE_CACHE_MASK;
                        if (nr <= offset)
                                break;
                }
@@ -1081,7 +1203,6 @@ page_ok:
                index += offset >> PAGE_CACHE_SHIFT;
                offset &= ~PAGE_CACHE_MASK;
 
-               mark_page_accessed(page);
                page_cache_release(page);
                if (ret == nr && desc->count)
                        continue;
@@ -1172,10 +1293,96 @@ no_cached_page:
        *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
        filp->f_reada = 1;
        if (cached_page)
-               page_cache_free(cached_page);
+               page_cache_release(cached_page);
        UPDATE_ATIME(inode);
 }
 
+static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
+{
+       ssize_t retval;
+       int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress;
+       struct kiobuf * iobuf;
+       struct inode * inode = filp->f_dentry->d_inode;
+       struct address_space * mapping = inode->i_mapping;
+
+       new_iobuf = 0;
+       iobuf = filp->f_iobuf;
+       if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
+               /*
+                * A parallel read/write is using the preallocated iobuf
+                * so just run slow and allocate a new one.
+                */
+               retval = alloc_kiovec(1, &iobuf);
+               if (retval)
+                       goto out;
+               new_iobuf = 1;
+       }
+
+       if (!S_ISBLK(inode->i_mode)) {
+               blocksize = inode->i_sb->s_blocksize;
+               blocksize_bits = inode->i_sb->s_blocksize_bits;
+       } else {
+               blocksize = BUFFERED_BLOCKSIZE;
+               blocksize_bits = BUFFERED_BLOCKSIZE_BITS;
+       }
+       blocksize_mask = blocksize - 1;
+       chunk_size = KIO_MAX_ATOMIC_IO << 10;
+
+       retval = -EINVAL;
+       if ((offset & blocksize_mask) || (count & blocksize_mask))
+               goto out_free;
+       if (!mapping->a_ops->direct_IO)
+               goto out_free;
+
+       /*
+        * Flush to disk exclusively the _data_, metadata must remain
+        * completely asynchronous or performance will go to /dev/null.
+        */
+       filemap_fdatasync(mapping);
+       retval = fsync_inode_data_buffers(inode);
+       filemap_fdatawait(mapping);
+       if (retval < 0)
+               goto out_free;
+
+       progress = retval = 0;
+       while (count > 0) {
+               iosize = count;
+               if (iosize > chunk_size)
+                       iosize = chunk_size;
+
+               retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
+               if (retval)
+                       break;
+
+               retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize);
+
+               if (rw == READ && retval > 0)
+                       mark_dirty_kiobuf(iobuf, retval);
+               
+               if (retval >= 0) {
+                       count -= retval;
+                       buf += retval;
+                       progress += retval;
+               }
+
+               unmap_kiobuf(iobuf);
+
+               if (retval != iosize)
+                       break;
+       }
+
+       if (progress)
+               retval = progress;
+
+ out_free:
+       if (!new_iobuf)
+               clear_bit(0, &filp->f_iobuf_lock);
+       else
+               free_kiovec(1, &iobuf);
+ out:  
+       return retval;
+}
+
 int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
 {
        char *kaddr;
@@ -1206,6 +1413,12 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *
 {
        ssize_t retval;
 
+       if ((ssize_t) count < 0)
+               return -EINVAL;
+
+       if (filp->f_flags & O_DIRECT)
+               goto o_direct;
+
        retval = -EFAULT;
        if (access_ok(VERIFY_WRITE, buf, count)) {
                retval = 0;
@@ -1224,7 +1437,28 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *
                                retval = desc.error;
                }
        }
+ out:
        return retval;
+
+ o_direct:
+       {
+               loff_t pos = *ppos, size;
+               struct inode * inode = filp->f_dentry->d_inode;
+
+               retval = 0;
+               if (!count)
+                       goto out; /* skip atime */
+               size = calc_rsize(inode);
+               if (pos < size) {
+                       if (pos + count > size)
+                               count = size - pos;
+                       retval = generic_file_direct_IO(READ, filp, buf, count, pos);
+                       if (retval > 0)
+                               *ppos = pos + retval;
+               }
+               UPDATE_ATIME(filp->f_dentry->d_inode);
+               goto out;
+       }
 }
 
 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
@@ -1409,6 +1643,7 @@ struct page * filemap_nopage(struct vm_area_struct * area,
        struct address_space *mapping = inode->i_mapping;
        struct page *page, **hash, *old_page;
        unsigned long size, pgoff;
+       loff_t rsize;
 
        pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
 
@@ -1417,7 +1652,8 @@ retry_all:
         * An external ptracer can access pages that normally aren't
         * accessible..
         */
-       size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       rsize = calc_rsize(inode);
+       size = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if ((pgoff >= size) && (area->vm_mm == current->mm))
                return NULL;
 
@@ -1658,22 +1894,7 @@ int filemap_sync(struct vm_area_struct * vma, unsigned long address,
        return error;
 }
 
-/*
- * Shared mappings need to be able to do the right thing at
- * close/unmap/sync. They will also use the private file as
- * backing-store for swapping..
- */
-static struct vm_operations_struct file_shared_mmap = {
-       nopage:         filemap_nopage,
-};
-
-/*
- * Private mappings just need to be able to load in the map.
- *
- * (This is actually used for shared mappings as well, if we
- * know they can't ever get write permissions..)
- */
-static struct vm_operations_struct file_private_mmap = {
+static struct vm_operations_struct generic_file_vm_ops = {
        nopage:         filemap_nopage,
 };
 
@@ -1681,21 +1902,18 @@ static struct vm_operations_struct file_private_mmap = {
 
 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 {
-       struct vm_operations_struct * ops;
        struct inode *inode = file->f_dentry->d_inode;
 
-       ops = &file_private_mmap;
        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
                if (!inode->i_mapping->a_ops->writepage)
                        return -EINVAL;
-               ops = &file_shared_mmap;
        }
        if (!inode->i_sb || !S_ISREG(inode->i_mode))
                return -EACCES;
        if (!inode->i_mapping->a_ops->readpage)
                return -ENOEXEC;
        UPDATE_ATIME(inode);
-       vma->vm_ops = ops;
+       vma->vm_ops = &generic_file_vm_ops;
        return 0;
 }
 
@@ -1802,6 +2020,7 @@ static long madvise_fixup_start(struct vm_area_struct * vma,
        unsigned long end, int behavior)
 {
        struct vm_area_struct * n;
+       struct mm_struct * mm = vma->vm_mm;
 
        n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        if (!n)
@@ -1814,12 +2033,12 @@ static long madvise_fixup_start(struct vm_area_struct * vma,
                get_file(n->vm_file);
        if (n->vm_ops && n->vm_ops->open)
                n->vm_ops->open(n);
-       lock_vma_mappings(vma);
-       spin_lock(&vma->vm_mm->page_table_lock);
        vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
+       lock_vma_mappings(vma);
+       spin_lock(&mm->page_table_lock);
        vma->vm_start = end;
-       __insert_vm_struct(current->mm, n);
-       spin_unlock(&vma->vm_mm->page_table_lock);
+       __insert_vm_struct(mm, n);
+       spin_unlock(&mm->page_table_lock);
        unlock_vma_mappings(vma);
        return 0;
 }
@@ -1828,6 +2047,7 @@ static long madvise_fixup_end(struct vm_area_struct * vma,
        unsigned long start, int behavior)
 {
        struct vm_area_struct * n;
+       struct mm_struct * mm = vma->vm_mm;
 
        n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        if (!n)
@@ -1842,10 +2062,10 @@ static long madvise_fixup_end(struct vm_area_struct * vma,
        if (n->vm_ops && n->vm_ops->open)
                n->vm_ops->open(n);
        lock_vma_mappings(vma);
-       spin_lock(&vma->vm_mm->page_table_lock);
+       spin_lock(&mm->page_table_lock);
        vma->vm_end = start;
-       __insert_vm_struct(current->mm, n);
-       spin_unlock(&vma->vm_mm->page_table_lock);
+       __insert_vm_struct(mm, n);
+       spin_unlock(&mm->page_table_lock);
        unlock_vma_mappings(vma);
        return 0;
 }
@@ -1854,6 +2074,7 @@ static long madvise_fixup_middle(struct vm_area_struct * vma,
        unsigned long start, unsigned long end, int behavior)
 {
        struct vm_area_struct * left, * right;
+       struct mm_struct * mm = vma->vm_mm;
 
        left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        if (!left)
@@ -1877,16 +2098,16 @@ static long madvise_fixup_middle(struct vm_area_struct * vma,
                vma->vm_ops->open(left);
                vma->vm_ops->open(right);
        }
-       lock_vma_mappings(vma);
-       spin_lock(&vma->vm_mm->page_table_lock);
        vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
+       vma->vm_raend = 0;
+       lock_vma_mappings(vma);
+       spin_lock(&mm->page_table_lock);
        vma->vm_start = start;
        vma->vm_end = end;
        setup_read_behavior(vma, behavior);
-       vma->vm_raend = 0;
-       __insert_vm_struct(current->mm, left);
-       __insert_vm_struct(current->mm, right);
-       spin_unlock(&vma->vm_mm->page_table_lock);
+       __insert_vm_struct(mm, left);
+       __insert_vm_struct(mm, right);
+       spin_unlock(&mm->page_table_lock);
        unlock_vma_mappings(vma);
        return 0;
 }
@@ -1930,13 +2151,14 @@ static long madvise_willneed(struct vm_area_struct * vma,
        long error = -EBADF;
        struct file * file;
        unsigned long size, rlim_rss;
+       loff_t rsize;
 
        /* Doesn't work if there's no mapped file. */
        if (!vma->vm_file)
                return error;
        file = vma->vm_file;
-       size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
-                                                       PAGE_CACHE_SHIFT;
+       rsize = calc_rsize(file->f_dentry->d_inode);
+       size = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
@@ -2313,8 +2535,7 @@ repeat:
                }
        }
        if (cached_page)
-               page_cache_free(cached_page);
-       mark_page_accessed(page);
+               page_cache_release(cached_page);
        return page;
 }
 
@@ -2383,7 +2604,7 @@ struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
        struct page *cached_page = NULL;
        struct page *page = __grab_cache_page(mapping,index,&cached_page);
        if (cached_page)
-               page_cache_free(cached_page);
+               page_cache_release(cached_page);
        return page;
 }
 
@@ -2418,7 +2639,7 @@ inline void remove_suid(struct inode *inode)
  *                                                     okir@monad.swb.de
  */
 ssize_t
-generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
+generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
 {
        struct inode    *inode = file->f_dentry->d_inode; 
        struct address_space *mapping = inode->i_mapping;
@@ -2426,15 +2647,18 @@ generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
        loff_t          pos;
        struct page     *page, *cached_page;
        unsigned long   written;
-       long            status;
+       long            status = 0;
        int             err;
        unsigned        bytes;
 
-       cached_page = NULL;
+       if ((ssize_t) count < 0)
+               return -EINVAL;
 
        if (!access_ok(VERIFY_READ, buf, count))
                return -EFAULT;
-               
+
+       cached_page = NULL;
+
        down(&inode->i_sem);
 
        pos = *ppos;
@@ -2450,7 +2674,8 @@ generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
 
        written = 0;
 
-       if (file->f_flags & O_APPEND)
+       /* FIXME: this is for backwards compatibility with 2.4 */
+       if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
                pos = inode->i_size;
 
        /*
@@ -2493,30 +2718,49 @@ generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
         *      Linus frestrict idea will clean these up nicely..
         */
         
-       if (pos >= inode->i_sb->s_maxbytes)
-       {
-               if (count || pos > inode->i_sb->s_maxbytes) {
-                       send_sig(SIGXFSZ, current, 0);
-                       err = -EFBIG;
+       if (!S_ISBLK(inode->i_mode)) {
+               if (pos >= inode->i_sb->s_maxbytes)
+               {
+                       if (count || pos > inode->i_sb->s_maxbytes) {
+                               send_sig(SIGXFSZ, current, 0);
+                               err = -EFBIG;
+                               goto out;
+                       }
+                       /* zero-length writes at ->s_maxbytes are OK */
+               }
+
+               if (pos + count > inode->i_sb->s_maxbytes)
+                       count = inode->i_sb->s_maxbytes - pos;
+       } else {
+               if (is_read_only(inode->i_rdev)) {
+                       err = -EPERM;
                        goto out;
                }
-               /* zero-length writes at ->s_maxbytes are OK */
-       }
+               if (pos >= calc_rsize(inode)) {
+                       if (count || pos > calc_rsize(inode)) {
+                               /* FIXME: this is for backwards compatibility with 2.4 */
+                               err = -ENOSPC;
+                               goto out;
+                       }
+                       /* zero-length writes at blkdev end are OK */
+               }
 
-       if (pos + count > inode->i_sb->s_maxbytes)
-               count = inode->i_sb->s_maxbytes - pos;
+               if (pos + count > calc_rsize(inode))
+                       count = calc_rsize(inode) - pos;
+       }
 
-       if (count == 0) {
-               err = 0;
+       err = 0;
+       if (count == 0)
                goto out;
-       }
 
-       status  = 0;
        remove_suid(inode);
        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
        mark_inode_dirty_sync(inode);
 
-       while (count) {
+       if (file->f_flags & O_DIRECT)
+               goto o_direct;
+
+       do {
                unsigned long index, offset;
                long page_fault;
                char *kaddr;
@@ -2578,17 +2822,18 @@ unlock:
 
                if (status < 0)
                        break;
-       }
+       } while (count);
        *ppos = pos;
 
        if (cached_page)
-               page_cache_free(cached_page);
+               page_cache_release(cached_page);
 
        /* For now, when the user asks for O_SYNC, we'll actually
         * provide O_DSYNC. */
        if ((status >= 0) && (file->f_flags & O_SYNC))
-               status = generic_osync_inode(inode, 1); /* 1 means datasync */
+               status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
        
+out_status:    
        err = written ? written : status;
 out:
 
@@ -2597,6 +2842,25 @@ out:
 fail_write:
        status = -EFAULT;
        goto unlock;
+
+o_direct:
+       written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
+       if (written > 0) {
+               loff_t end = pos + written;
+               if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
+                       inode->i_size = end;
+                       mark_inode_dirty(inode);
+               }
+               *ppos = end;
+               invalidate_inode_pages2(mapping);
+       }
+       /*
+        * Sync the fs metadata but not the minor inode changes and
+        * of course not the data as we did direct DMA for the IO.
+        */
+       if (written >= 0 && file->f_flags & O_SYNC)
+               status = generic_osync_inode(inode, OSYNC_METADATA);
+       goto out_status;
 }
 
 void __init page_cache_init(unsigned long mempages)
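
The O_DIRECT support added to filemap.c above funnels direct reads and writes through generic_file_direct_IO(), which returns -EINVAL unless both the file offset and the transfer length are multiples of the block size (the filesystem block size, or BUFFERED_BLOCKSIZE for block devices). A minimal user-space sketch of a read that respects those constraints; the path, the 4096-byte block size and the buffer alignment are illustrative assumptions, not something this patch dictates:

#define _GNU_SOURCE             /* for O_DIRECT on Linux */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	const size_t blksz = 4096;      /* assumed block size; must match the fs/device */
	void *buf;
	int fd;
	ssize_t n;

	/* an aligned buffer is the conventional way to use O_DIRECT */
	if (posix_memalign(&buf, blksz, blksz))
		return 1;

	fd = open("/tmp/testfile", O_RDONLY | O_DIRECT);        /* example path */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* offset 0 and length blksz are both block aligned, so the
	 * direct-I/O path has no reason to reject the request as
	 * misaligned */
	n = read(fd, buf, blksz);
	if (n < 0)
		perror("read");
	else
		printf("read %zd bytes via O_DIRECT\n", n);

	close(fd);
	free(buf);
	return 0;
}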
index 61540ce78c4c7775f145ca95bc8f183d8427e79a..3b5d92903c5e972dd2603134a0c8b0540faef0a2 100644 (file)
@@ -212,9 +212,9 @@ static inline void copy_from_high_bh (struct buffer_head *to,
 
        p_from = from->b_page;
 
-       vfrom = kmap_atomic(p_from, KM_BOUNCE_WRITE);
+       vfrom = kmap_atomic(p_from, KM_USER0);
        memcpy(to->b_data, vfrom + bh_offset(from), to->b_size);
-       kunmap_atomic(vfrom, KM_BOUNCE_WRITE);
+       kunmap_atomic(vfrom, KM_USER0);
 }
 
 static inline void copy_to_high_bh_irq (struct buffer_head *to,
index 0cd6915a45a10b07dbebddfb3bb201b4c87e4fce..30ffea7676342b6f4c547f15ae87352f7096a038 100644 (file)
@@ -128,11 +128,13 @@ void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
 {
        pgd_t * page_dir = mm->pgd;
 
+       spin_lock(&mm->page_table_lock);
        page_dir += first;
        do {
                free_one_pgd(page_dir);
                page_dir++;
        } while (--nr);
+       spin_unlock(&mm->page_table_lock);
 
        /* keep the page table cache within bounds */
        check_pgt_cache();
@@ -272,12 +274,8 @@ static inline int free_pte(pte_t pte)
                 * free_page() used to be able to clear swap cache
                 * entries.  We may now have to do it manually.  
                 */
-               if (page->mapping) {
-                       if (pte_dirty(pte))
-                               set_page_dirty(page);
-                       if (pte_young(pte))
-                               mark_page_accessed(page);
-               }
+               if (pte_dirty(pte) && page->mapping)
+                       set_page_dirty(page);
                free_page_and_swap_cache(page);
                return 1;
        }
@@ -924,6 +922,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
                        break;
                /* Recheck swapcachedness once the page is locked */
                can_reuse = exclusive_swap_page(old_page);
+#if 1
+               if (can_reuse)
+                       delete_from_swap_cache_nolock(old_page);
+#endif
                UnlockPage(old_page);
                if (!can_reuse)
                        break;
@@ -1104,6 +1106,7 @@ static int do_swap_page(struct mm_struct * mm,
        struct page *page;
        swp_entry_t entry = pte_to_swp_entry(orig_pte);
        pte_t pte;
+       int ret = 1;
 
        spin_unlock(&mm->page_table_lock);
        page = lookup_swap_cache(entry);
@@ -1120,6 +1123,9 @@ static int do_swap_page(struct mm_struct * mm,
                         */
                        return pte_same(*page_table, orig_pte) ? -1 : 1;
                }
+
+               /* Had to read the page from swap area: Major fault */
+               ret = 2;
        }
 
        /*
@@ -1146,12 +1152,13 @@ static int do_swap_page(struct mm_struct * mm,
 
        swap_free(entry);
        if (exclusive_swap_page(page)) {        
+#if 0
                if (write_access)
                        pte = pte_mkwrite(pte_mkdirty(pte));
-               if (vm_swap_full()) {
-                       delete_from_swap_cache_nolock(page);
-                       pte = pte_mkdirty(pte);
-               }
+#else
+               delete_from_swap_cache_nolock(page);
+               pte = pte_mkwrite(pte_mkdirty(pte));
+#endif
        }
        UnlockPage(page);
 
@@ -1161,7 +1168,7 @@ static int do_swap_page(struct mm_struct * mm,
 
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, address, pte);
-       return 1;       /* Minor fault */
+       return ret;
 }
 
 /*
@@ -1378,7 +1385,7 @@ pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
                 * Because we dropped the lock, we should re-check the
                 * entry, as somebody else could have populated it..
                 */
-               if (pgd_present(*pgd)) {
+               if (!pgd_none(*pgd)) {
                        pmd_free(new);
                        goto out;
                }
@@ -1396,7 +1403,7 @@ out:
  */
 pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
-       if (!pmd_present(*pmd)) {
+       if (pmd_none(*pmd)) {
                pte_t *new;
 
                /* "fast" allocation can happen without dropping the lock.. */
@@ -1412,7 +1419,7 @@ pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
                         * Because we dropped the lock, we should re-check the
                         * entry, as somebody else could have populated it..
                         */
-                       if (pmd_present(*pmd)) {
+                       if (!pmd_none(*pmd)) {
                                pte_free(new);
                                goto out;
                        }
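
The __pmd_alloc() and pte_alloc() hunks above keep the existing scheme of dropping mm->page_table_lock around the allocation and re-checking the entry once the lock is retaken; the change is only that the re-check now uses the negated *_none() tests instead of *_present(). A rough user-space analogue of that re-check-after-reacquire pattern, with hypothetical names and pthreads standing in for the kernel primitives:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *slot;                      /* stands in for a pgd/pmd entry */

/* Return the populated slot, allocating it if nobody else has yet. */
static void *get_slot(void)
{
	pthread_mutex_lock(&lock);
	if (!slot) {
		void *new;

		/* drop the lock while allocating, as __pmd_alloc() does */
		pthread_mutex_unlock(&lock);
		new = malloc(128);
		pthread_mutex_lock(&lock);

		/* re-check: someone else may have populated the slot
		 * while the lock was dropped */
		if (slot)
			free(new);
		else
			slot = new;
	}
	pthread_mutex_unlock(&lock);
	return slot;
}

int main(void)
{
	return get_slot() ? 0 : 1;
}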
index 75bca3639a586f742bbc06d4b64dcfdeb06f7503..3524645ed2627bf2a39517f9c65f1adc482e4718 100644 (file)
@@ -36,9 +36,9 @@ static inline int mlock_fixup_start(struct vm_area_struct * vma,
                get_file(n->vm_file);
        if (n->vm_ops && n->vm_ops->open)
                n->vm_ops->open(n);
+       vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
        lock_vma_mappings(vma);
        spin_lock(&vma->vm_mm->page_table_lock);
-       vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
        vma->vm_start = end;
        __insert_vm_struct(current->mm, n);
        spin_unlock(&vma->vm_mm->page_table_lock);
@@ -100,13 +100,13 @@ static inline int mlock_fixup_middle(struct vm_area_struct * vma,
                vma->vm_ops->open(left);
                vma->vm_ops->open(right);
        }
+       vma->vm_raend = 0;
+       vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
        lock_vma_mappings(vma);
        spin_lock(&vma->vm_mm->page_table_lock);
-       vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
        vma->vm_start = start;
        vma->vm_end = end;
        vma->vm_flags = newflags;
-       vma->vm_raend = 0;
        __insert_vm_struct(current->mm, left);
        __insert_vm_struct(current->mm, right);
        spin_unlock(&vma->vm_mm->page_table_lock);
index e73a5f950b0f43abb38a597b1a2bf238e6cc84f0..eb1a2a33695bc9f2616dca62a2a951435b729244 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/personality.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 
+/*
+ * WARNING: the debugging will use recursive algorithms so never enable this
+ * unless you know what you are doing.
+ */
+#undef DEBUG_MM_RB
+
 /* description of effects of mapping type and prot in current implementation.
  * this is due to the limited x86 page protection hardware.  The expected
  * behavior is in parens:
@@ -204,14 +211,193 @@ static inline unsigned long calc_vm_flags(unsigned long prot, unsigned long flag
 #undef _trans
 }
 
+#ifdef DEBUG_MM_RB
+static int browse_rb(rb_node_t * rb_node) {
+       int i = 0;
+       if (rb_node) {
+               i++;
+               i += browse_rb(rb_node->rb_left);
+               i += browse_rb(rb_node->rb_right);
+       }
+       return i;
+}
+
+static void validate_mm(struct mm_struct * mm) {
+       int bug = 0;
+       int i = 0;
+       struct vm_area_struct * tmp = mm->mmap;
+       while (tmp) {
+               tmp = tmp->vm_next;
+               i++;
+       }
+       if (i != mm->map_count)
+               printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
+       i = browse_rb(mm->mm_rb.rb_node);
+       if (i != mm->map_count)
+               printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
+       if (bug)
+               BUG();
+}
+#else
+#define validate_mm(mm) do { } while (0)
+#endif
+
+static struct vm_area_struct * find_vma_prepare(struct mm_struct * mm, unsigned long addr,
+                                               struct vm_area_struct ** pprev,
+                                               rb_node_t *** rb_link, rb_node_t ** rb_parent)
+{
+       struct vm_area_struct * vma;
+       rb_node_t ** __rb_link, * __rb_parent, * rb_prev;
+
+       __rb_link = &mm->mm_rb.rb_node;
+       rb_prev = __rb_parent = NULL;
+       vma = NULL;
+
+       while (*__rb_link) {
+               struct vm_area_struct *vma_tmp;
+
+               __rb_parent = *__rb_link;
+               vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
+
+               if (vma_tmp->vm_end > addr) {
+                       vma = vma_tmp;
+                       if (vma_tmp->vm_start <= addr)
+                               return vma;
+                       __rb_link = &__rb_parent->rb_left;
+               } else {
+                       rb_prev = __rb_parent;
+                       __rb_link = &__rb_parent->rb_right;
+               }
+       }
+
+       *pprev = NULL;
+       if (rb_prev)
+               *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
+       *rb_link = __rb_link;
+       *rb_parent = __rb_parent;
+       return vma;
+}
+
+static inline void __vma_link_list(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
+                                  rb_node_t * rb_parent)
+{
+       if (prev) {
+               vma->vm_next = prev->vm_next;
+               prev->vm_next = vma;
+       } else {
+               mm->mmap = vma;
+               if (rb_parent)
+                       vma->vm_next = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
+               else
+                       vma->vm_next = NULL;
+       }
+}
+
+static inline void __vma_link_rb(struct mm_struct * mm, struct vm_area_struct * vma,
+                                rb_node_t ** rb_link, rb_node_t * rb_parent)
+{
+       rb_link_node(&vma->vm_rb, rb_parent, rb_link);
+       rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+}
+
+static inline void __vma_link_file(struct vm_area_struct * vma)
+{
+       struct file * file;
+
+       file = vma->vm_file;
+       if (file) {
+               struct inode * inode = file->f_dentry->d_inode;
+               struct address_space *mapping = inode->i_mapping;
+               struct vm_area_struct **head;
+
+               if (vma->vm_flags & VM_DENYWRITE)
+                       atomic_dec(&inode->i_writecount);
+
+               head = &mapping->i_mmap;
+               if (vma->vm_flags & VM_SHARED)
+                       head = &mapping->i_mmap_shared;
+      
+               /* insert vma into inode's share list */
+               if((vma->vm_next_share = *head) != NULL)
+                       (*head)->vm_pprev_share = &vma->vm_next_share;
+               *head = vma;
+               vma->vm_pprev_share = head;
+       }
+}
+
+static void __vma_link(struct mm_struct * mm, struct vm_area_struct * vma,  struct vm_area_struct * prev,
+                      rb_node_t ** rb_link, rb_node_t * rb_parent)
+{
+       __vma_link_list(mm, vma, prev, rb_parent);
+       __vma_link_rb(mm, vma, rb_link, rb_parent);
+       __vma_link_file(vma);
+}
+
+static inline void vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
+                           rb_node_t ** rb_link, rb_node_t * rb_parent)
+{
+       lock_vma_mappings(vma);
+       spin_lock(&mm->page_table_lock);
+       __vma_link(mm, vma, prev, rb_link, rb_parent);
+       spin_unlock(&mm->page_table_lock);
+       unlock_vma_mappings(vma);
+
+       mm->map_count++;
+       validate_mm(mm);
+}
+
+static int vma_merge(struct mm_struct * mm, struct vm_area_struct * prev,
+                    rb_node_t * rb_parent, unsigned long addr, unsigned long end, unsigned long vm_flags)
+{
+       spinlock_t * lock = &mm->page_table_lock;
+       if (!prev) {
+               prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
+               goto merge_next;
+       }
+       if (prev->vm_end == addr && can_vma_merge(prev, vm_flags)) {
+               struct vm_area_struct * next;
+
+               spin_lock(lock);
+               prev->vm_end = end;
+               next = prev->vm_next;
+               if (next && prev->vm_end == next->vm_start && can_vma_merge(next, vm_flags)) {
+                       prev->vm_end = next->vm_end;
+                       __vma_unlink(mm, next, prev);
+                       spin_unlock(lock);
+
+                       mm->map_count--;
+                       kmem_cache_free(vm_area_cachep, next);
+                       return 1;
+               }
+               spin_unlock(lock);
+               return 1;
+       }
+
+       prev = prev->vm_next;
+       if (prev) {
+ merge_next:
+               if (!can_vma_merge(prev, vm_flags))
+                       return 0;
+               if (end == prev->vm_start) {
+                       spin_lock(lock);
+                       prev->vm_start = addr;
+                       spin_unlock(lock);
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
 unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
        unsigned long prot, unsigned long flags, unsigned long pgoff)
 {
        struct mm_struct * mm = current->mm;
-       struct vm_area_struct * vma;
+       struct vm_area_struct * vma, * prev;
        unsigned int vm_flags;
        int correct_wcount = 0;
        int error;
+       rb_node_t ** rb_link, * rb_parent;
 
        if (file && (!file->f_op || !file->f_op->mmap))
                return -ENODEV;
@@ -219,7 +405,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
        if ((len = PAGE_ALIGN(len)) == 0)
                return addr;
 
-       if (len > TASK_SIZE || addr > TASK_SIZE-len)
+       if (len > TASK_SIZE)
                return -EINVAL;
 
        /* offset overflow? */
@@ -293,8 +479,13 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
 
        /* Clear old maps */
        error = -ENOMEM;
-       if (do_munmap(mm, addr, len))
-               return -ENOMEM;
+munmap_back:
+       vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+       if (vma && vma->vm_start < addr + len) {
+               if (do_munmap(mm, addr, len))
+                       return -ENOMEM;
+               goto munmap_back;
+       }
 
        /* Check against address space limit. */
        if ((mm->total_vm << PAGE_SHIFT) + len
@@ -308,14 +499,9 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
                return -ENOMEM;
 
        /* Can we just expand an old anonymous mapping? */
-       if (addr && !file && !(vm_flags & VM_SHARED)) {
-               struct vm_area_struct * vma = find_vma(mm, addr-1);
-               if (vma && vma->vm_end == addr && !vma->vm_file && 
-                   vma->vm_flags == vm_flags) {
-                       vma->vm_end = addr + len;
+       if (!file && !(vm_flags & VM_SHARED) && rb_parent)
+               if (vma_merge(mm, prev, rb_parent, addr, addr + len, vm_flags))
                        goto out;
-               }
-       }
 
        /* Determine the object being mapped and call the appropriate
         * specific mapper. the address has already been validated, but
@@ -364,7 +550,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
         */
        addr = vma->vm_start;
 
-       insert_vm_struct(mm, vma);
+       vma_link(mm, vma, prev, rb_link, rb_parent);
        if (correct_wcount)
                atomic_inc(&file->f_dentry->d_inode->i_writecount);
 
@@ -408,9 +594,15 @@ static inline unsigned long arch_get_unmapped_area(struct file *filp, unsigned l
 
        if (len > TASK_SIZE)
                return -ENOMEM;
-       if (!addr)
-               addr = TASK_UNMAPPED_BASE;
-       addr = PAGE_ALIGN(addr);
+
+       if (addr) {
+               addr = PAGE_ALIGN(addr);
+               vma = find_vma(current->mm, addr);
+               if (TASK_SIZE - len >= addr &&
+                   (!vma || addr + len <= vma->vm_start))
+                       return addr;
+       }
+       addr = PAGE_ALIGN(TASK_UNMAPPED_BASE);
 
        for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
                /* At this point:  (!vma || addr < vma->vm_end). */
@@ -428,6 +620,8 @@ extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsign
 unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
 {
        if (flags & MAP_FIXED) {
+               if (addr > TASK_SIZE - len)
+                       return -EINVAL;
                if (addr & ~PAGE_MASK)
                        return -EINVAL;
                return addr;
@@ -439,10 +633,6 @@ unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned
        return arch_get_unmapped_area(file, addr, len, pgoff, flags);
 }
 
-#define vm_avl_empty   (struct vm_area_struct *) NULL
-
-#include "mmap_avl.c"
-
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
 {
@@ -453,26 +643,23 @@ struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
                /* (Cache hit rate is typically around 35%.) */
                vma = mm->mmap_cache;
                if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
-                       if (!mm->mmap_avl) {
-                               /* Go through the linear list. */
-                               vma = mm->mmap;
-                               while (vma && vma->vm_end <= addr)
-                                       vma = vma->vm_next;
-                       } else {
-                               /* Then go through the AVL tree quickly. */
-                               struct vm_area_struct * tree = mm->mmap_avl;
-                               vma = NULL;
-                               for (;;) {
-                                       if (tree == vm_avl_empty)
+                       rb_node_t * rb_node;
+
+                       rb_node = mm->mm_rb.rb_node;
+                       vma = NULL;
+
+                       while (rb_node) {
+                               struct vm_area_struct * vma_tmp;
+
+                               vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+                               if (vma_tmp->vm_end > addr) {
+                                       vma = vma_tmp;
+                                       if (vma_tmp->vm_start <= addr)
                                                break;
-                                       if (tree->vm_end > addr) {
-                                               vma = tree;
-                                               if (tree->vm_start <= addr)
-                                                       break;
-                                               tree = tree->vm_avl_left;
-                                       } else
-                                               tree = tree->vm_avl_right;
-                               }
+                                       rb_node = rb_node->rb_left;
+                               } else
+                                       rb_node = rb_node->rb_right;
                        }
                        if (vma)
                                mm->mmap_cache = vma;
@@ -486,47 +673,42 @@ struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
                                      struct vm_area_struct **pprev)
 {
        if (mm) {
-               if (!mm->mmap_avl) {
-                       /* Go through the linear list. */
-                       struct vm_area_struct * prev = NULL;
-                       struct vm_area_struct * vma = mm->mmap;
-                       while (vma && vma->vm_end <= addr) {
-                               prev = vma;
-                               vma = vma->vm_next;
-                       }
-                       *pprev = prev;
-                       return vma;
-               } else {
-                       /* Go through the AVL tree quickly. */
-                       struct vm_area_struct * vma = NULL;
-                       struct vm_area_struct * last_turn_right = NULL;
-                       struct vm_area_struct * prev = NULL;
-                       struct vm_area_struct * tree = mm->mmap_avl;
-                       for (;;) {
-                               if (tree == vm_avl_empty)
+               /* Go through the RB tree quickly. */
+               struct vm_area_struct * vma;
+               rb_node_t * rb_node, * rb_last_right, * rb_prev;
+               
+               rb_node = mm->mm_rb.rb_node;
+               rb_last_right = rb_prev = NULL;
+               vma = NULL;
+
+               while (rb_node) {
+                       struct vm_area_struct * vma_tmp;
+
+                       vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+                       if (vma_tmp->vm_end > addr) {
+                               vma = vma_tmp;
+                               rb_prev = rb_last_right;
+                               if (vma_tmp->vm_start <= addr)
                                        break;
-                               if (tree->vm_end > addr) {
-                                       vma = tree;
-                                       prev = last_turn_right;
-                                       if (tree->vm_start <= addr)
-                                               break;
-                                       tree = tree->vm_avl_left;
-                               } else {
-                                       last_turn_right = tree;
-                                       tree = tree->vm_avl_right;
-                               }
+                               rb_node = rb_node->rb_left;
+                       } else {
+                               rb_last_right = rb_node;
+                               rb_node = rb_node->rb_right;
                        }
-                       if (vma) {
-                               if (vma->vm_avl_left != vm_avl_empty) {
-                                       prev = vma->vm_avl_left;
-                                       while (prev->vm_avl_right != vm_avl_empty)
-                                               prev = prev->vm_avl_right;
-                               }
-                               if ((prev ? prev->vm_next : mm->mmap) != vma)
-                                       printk("find_vma_prev: tree inconsistent with list\n");
-                               *pprev = prev;
-                               return vma;
+               }
+               if (vma) {
+                       if (vma->vm_rb.rb_left) {
+                               rb_prev = vma->vm_rb.rb_left;
+                               while (rb_prev->rb_right)
+                                       rb_prev = rb_prev->rb_right;
                        }
+                       *pprev = NULL;
+                       if (rb_prev)
+                               *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
+                       if ((rb_prev ? (*pprev)->vm_next : mm->mmap) != vma)
+                               BUG();
+                       return vma;
                }
        }
        *pprev = NULL;
@@ -601,11 +783,16 @@ static struct vm_area_struct * unmap_fixup(struct mm_struct *mm,
 
        /* Work out to one of the ends. */
        if (end == area->vm_end) {
+               /*
+                * here area isn't visible to the semaphore-less readers
+                * so we don't need to update it under the spinlock.
+                */
                area->vm_end = addr;
                lock_vma_mappings(area);
                spin_lock(&mm->page_table_lock);
        } else if (addr == area->vm_start) {
                area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT;
+               /* same locking considerations of the above case */
                area->vm_start = end;
                lock_vma_mappings(area);
                spin_lock(&mm->page_table_lock);
@@ -751,8 +938,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
                *npp = mpnt->vm_next;
                mpnt->vm_next = free;
                free = mpnt;
-               if (mm->mmap_avl)
-                       avl_remove(mpnt, &mm->mmap_avl);
+               rb_erase(&mpnt->vm_rb, &mm->mm_rb);
        }
        mm->mmap_cache = NULL;  /* Kill the cache. */
        spin_unlock(&mm->page_table_lock);
@@ -793,6 +979,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
                if (file)
                        atomic_inc(&file->f_dentry->d_inode->i_writecount);
        }
+       validate_mm(mm);
 
        /* Release the extra vma struct if it wasn't used */
        if (extra)
@@ -822,8 +1009,9 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len)
 unsigned long do_brk(unsigned long addr, unsigned long len)
 {
        struct mm_struct * mm = current->mm;
-       struct vm_area_struct * vma;
-       unsigned long flags, retval;
+       struct vm_area_struct * vma, * prev;
+       unsigned long flags;
+       rb_node_t ** rb_link, * rb_parent;
 
        len = PAGE_ALIGN(len);
        if (!len)
@@ -842,9 +1030,13 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
        /*
         * Clear old maps.  this also does some error checking for us
         */
-       retval = do_munmap(mm, addr, len);
-       if (retval != 0)
-               return retval;
+ munmap_back:
+       vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+       if (vma && vma->vm_start < addr + len) {
+               if (do_munmap(mm, addr, len))
+                       return -ENOMEM;
+               goto munmap_back;
+       }
 
        /* Check against address space limits *after* clearing old maps... */
        if ((mm->total_vm << PAGE_SHIFT) + len
@@ -861,16 +1053,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
                                MAP_FIXED|MAP_PRIVATE) | mm->def_flags;
 
        flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
-       
+
        /* Can we just expand an old anonymous mapping? */
-       if (addr) {
-               struct vm_area_struct * vma = find_vma(mm, addr-1);
-               if (vma && vma->vm_end == addr && !vma->vm_file && 
-                   vma->vm_flags == flags) {
-                       vma->vm_end = addr + len;
-                       goto out;
-               }
-       }       
+       if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len, flags))
+               goto out;
 
        /*
         * create a vma struct for an anonymous mapping
@@ -889,7 +1075,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
        vma->vm_file = NULL;
        vma->vm_private_data = NULL;
 
-       insert_vm_struct(mm, vma);
+       vma_link(mm, vma, prev, rb_link, rb_parent);
 
 out:
        mm->total_vm += len >> PAGE_SHIFT;
@@ -900,14 +1086,20 @@ out:
        return addr;
 }
 
-/* Build the AVL tree corresponding to the VMA list. */
-void build_mmap_avl(struct mm_struct * mm)
+/* Build the RB tree corresponding to the VMA list. */
+void build_mmap_rb(struct mm_struct * mm)
 {
        struct vm_area_struct * vma;
-
-       mm->mmap_avl = NULL;
-       for (vma = mm->mmap; vma; vma = vma->vm_next)
-               avl_insert(vma, &mm->mmap_avl);
+       rb_node_t ** rb_link, * rb_parent;
+
+       mm->mm_rb = RB_ROOT;
+       rb_link = &mm->mm_rb.rb_node;
+       rb_parent = NULL;
+       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+               __vma_link_rb(mm, vma, rb_link, rb_parent);
+               rb_parent = &vma->vm_rb;
+               rb_link = &rb_parent->rb_right;
+       }
 }
 
 /* Release all mmaps. */
@@ -918,7 +1110,8 @@ void exit_mmap(struct mm_struct * mm)
        release_segments(mm);
        spin_lock(&mm->page_table_lock);
        mpnt = mm->mmap;
-       mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
+       mm->mmap = mm->mmap_cache = NULL;
+       mm->mm_rb = RB_ROOT;
        mm->rss = 0;
        spin_unlock(&mm->page_table_lock);
        mm->total_vm = 0;
@@ -947,7 +1140,7 @@ void exit_mmap(struct mm_struct * mm)
 
        /* This is just debugging */
        if (mm->map_count)
-               printk("exit_mmap: map count is %d\n", mm->map_count);
+               BUG();
 
        clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
 }
@@ -956,55 +1149,27 @@ void exit_mmap(struct mm_struct * mm)
  * and into the inode's i_mmap ring.  If vm_file is non-NULL
  * then the i_shared_lock must be held here.
  */
-void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
+void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
-       struct vm_area_struct **pprev;
-       struct file * file;
-
-       if (!mm->mmap_avl) {
-               pprev = &mm->mmap;
-               while (*pprev && (*pprev)->vm_start <= vmp->vm_start)
-                       pprev = &(*pprev)->vm_next;
-       } else {
-               struct vm_area_struct *prev, *next;
-               avl_insert_neighbours(vmp, &mm->mmap_avl, &prev, &next);
-               pprev = (prev ? &prev->vm_next : &mm->mmap);
-               if (*pprev != next)
-                       printk("insert_vm_struct: tree inconsistent with list\n");
-       }
-       vmp->vm_next = *pprev;
-       *pprev = vmp;
+       struct vm_area_struct * __vma, * prev;
+       rb_node_t ** rb_link, * rb_parent;
 
+       __vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent);
+       if (__vma && __vma->vm_start < vma->vm_end)
+               BUG();
+       __vma_link(mm, vma, prev, rb_link, rb_parent);
        mm->map_count++;
-       if (mm->map_count >= AVL_MIN_MAP_COUNT && !mm->mmap_avl)
-               build_mmap_avl(mm);
-
-       file = vmp->vm_file;
-       if (file) {
-               struct inode * inode = file->f_dentry->d_inode;
-               struct address_space *mapping = inode->i_mapping;
-               struct vm_area_struct **head;
-
-               if (vmp->vm_flags & VM_DENYWRITE)
-                       atomic_dec(&inode->i_writecount);
-
-               head = &mapping->i_mmap;
-               if (vmp->vm_flags & VM_SHARED)
-                       head = &mapping->i_mmap_shared;
-      
-               /* insert vmp into inode's share list */
-               if((vmp->vm_next_share = *head) != NULL)
-                       (*head)->vm_pprev_share = &vmp->vm_next_share;
-               *head = vmp;
-               vmp->vm_pprev_share = head;
-       }
+       validate_mm(mm);
 }
 
-void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
+void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
-       lock_vma_mappings(vmp);
-       spin_lock(&current->mm->page_table_lock);
-       __insert_vm_struct(mm, vmp);
-       spin_unlock(&current->mm->page_table_lock);
-       unlock_vma_mappings(vmp);
+       struct vm_area_struct * __vma, * prev;
+       rb_node_t ** rb_link, * rb_parent;
+
+       __vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent);
+       if (__vma && __vma->vm_start < vma->vm_end)
+               BUG();
+       vma_link(mm, vma, prev, rb_link, rb_parent);
+       validate_mm(mm);
 }
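
Both find_vma() and the new find_vma_prepare() above descend mm->mm_rb looking for the lowest VMA whose vm_end lies above the target address: go left whenever the current node already satisfies end > addr (remembering it as the best candidate so far), right otherwise. A standalone sketch of the same search over a plain, unbalanced binary tree; the struct and function names are invented for illustration and the kernel naturally uses the rbtree helpers instead:

#include <assert.h>
#include <stddef.h>

struct area {
	unsigned long start, end;       /* [start, end), like vm_start/vm_end */
	struct area *left, *right;      /* tree ordered by end */
};

/* First area with end > addr, or NULL; the caller checks start <= addr
 * to distinguish "addr inside this area" from "addr in the gap below it". */
static struct area *find_area(struct area *root, unsigned long addr)
{
	struct area *best = NULL;

	while (root) {
		if (root->end > addr) {
			best = root;            /* candidate; try to find a lower one */
			if (root->start <= addr)
				break;          /* addr falls inside this area */
			root = root->left;
		} else {
			root = root->right;     /* everything here ends at or below addr */
		}
	}
	return best;
}

int main(void)
{
	struct area a = { 0x1000, 0x2000, NULL, NULL };
	struct area b = { 0x4000, 0x5000, NULL, NULL };
	struct area root = { 0x2000, 0x3000, &a, &b };  /* keyed by end */

	assert(find_area(&root, 0x1800) == &a);         /* inside a */
	assert(find_area(&root, 0x3800) == &b);         /* gap below b */
	assert(find_area(&root, 0x6000) == NULL);       /* above everything */
	return 0;
}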
diff --git a/mm/mmap_avl.c b/mm/mmap_avl.c
deleted file mode 100644 (file)
index 5a48ce8..0000000
--- a/mm/mmap_avl.c
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * Searching a VMA in the linear list task->mm->mmap is horribly slow.
- * Use an AVL (Adelson-Velskii and Landis) tree to speed up this search
- * from O(n) to O(log n), where n is the number of VMAs of the task
- * n is typically around 6, but may reach 3000 in some cases: object-oriented
- * databases, persistent store, generational garbage collection (Java, Lisp),
- * ElectricFence.
- * Written by Bruno Haible <haible@ma2s2.mathematik.uni-karlsruhe.de>.
- */
-
-/* We keep the list and tree sorted by address. */
-#define vm_avl_key     vm_end
-#define vm_avl_key_t   unsigned long   /* typeof(vma->avl_key) */
-
-/*
- * task->mm->mmap_avl is the AVL tree corresponding to task->mm->mmap
- * or, more exactly, its root.
- * A vm_area_struct has the following fields:
- *   vm_avl_left     left son of a tree node
- *   vm_avl_right    right son of a tree node
- *   vm_avl_height   1+max(heightof(left),heightof(right))
- * The empty tree is represented as NULL.
- */
-
-/* Since the trees are balanced, their height will never be large. */
-#define avl_maxheight  41      /* why this? a small exercise */
-#define heightof(tree) ((tree) == vm_avl_empty ? 0 : (tree)->vm_avl_height)
-/*
- * Consistency and balancing rules:
- * 1. tree->vm_avl_height == 1+max(heightof(tree->vm_avl_left),heightof(tree->vm_avl_right))
- * 2. abs( heightof(tree->vm_avl_left) - heightof(tree->vm_avl_right) ) <= 1
- * 3. foreach node in tree->vm_avl_left: node->vm_avl_key <= tree->vm_avl_key,
- *    foreach node in tree->vm_avl_right: node->vm_avl_key >= tree->vm_avl_key.
- */
-
-#ifdef DEBUG_AVL
-
-/* Look up the nodes at the left and at the right of a given node. */
-static void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right)
-{
-       vm_avl_key_t key = node->vm_avl_key;
-
-       *to_the_left = *to_the_right = NULL;
-       for (;;) {
-               if (tree == vm_avl_empty) {
-                       printk("avl_neighbours: node not found in the tree\n");
-                       return;
-               }
-               if (key == tree->vm_avl_key)
-                       break;
-               if (key < tree->vm_avl_key) {
-                       *to_the_right = tree;
-                       tree = tree->vm_avl_left;
-               } else {
-                       *to_the_left = tree;
-                       tree = tree->vm_avl_right;
-               }
-       }
-       if (tree != node) {
-               printk("avl_neighbours: node not exactly found in the tree\n");
-               return;
-       }
-       if (tree->vm_avl_left != vm_avl_empty) {
-               struct vm_area_struct * node;
-               for (node = tree->vm_avl_left; node->vm_avl_right != vm_avl_empty; node = node->vm_avl_right)
-                       continue;
-               *to_the_left = node;
-       }
-       if (tree->vm_avl_right != vm_avl_empty) {
-               struct vm_area_struct * node;
-               for (node = tree->vm_avl_right; node->vm_avl_left != vm_avl_empty; node = node->vm_avl_left)
-                       continue;
-               *to_the_right = node;
-       }
-       if ((*to_the_left && ((*to_the_left)->vm_next != node)) || (node->vm_next != *to_the_right))
-               printk("avl_neighbours: tree inconsistent with list\n");
-}
-
-#endif
-
-/*
- * Rebalance a tree.
- * After inserting or deleting a node of a tree we have a sequence of subtrees
- * nodes[0]..nodes[k-1] such that
- * nodes[0] is the root and nodes[i+1] = nodes[i]->{vm_avl_left|vm_avl_right}.
- */
-static void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count)
-{
-       for ( ; count > 0 ; count--) {
-               struct vm_area_struct ** nodeplace = *--nodeplaces_ptr;
-               struct vm_area_struct * node = *nodeplace;
-               struct vm_area_struct * nodeleft = node->vm_avl_left;
-               struct vm_area_struct * noderight = node->vm_avl_right;
-               int heightleft = heightof(nodeleft);
-               int heightright = heightof(noderight);
-               if (heightright + 1 < heightleft) {
-                       /*                                                      */
-                       /*                            *                         */
-                       /*                          /   \                       */
-                       /*                       n+2      n                     */
-                       /*                                                      */
-                       struct vm_area_struct * nodeleftleft = nodeleft->vm_avl_left;
-                       struct vm_area_struct * nodeleftright = nodeleft->vm_avl_right;
-                       int heightleftright = heightof(nodeleftright);
-                       if (heightof(nodeleftleft) >= heightleftright) {
-                               /*                                                        */
-                               /*                *                    n+2|n+3            */
-                               /*              /   \                  /    \             */
-                               /*           n+2      n      -->      /   n+1|n+2         */
-                               /*           / \                      |    /    \         */
-                               /*         n+1 n|n+1                 n+1  n|n+1  n        */
-                               /*                                                        */
-                               node->vm_avl_left = nodeleftright; nodeleft->vm_avl_right = node;
-                               nodeleft->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightleftright);
-                               *nodeplace = nodeleft;
-                       } else {
-                               /*                                                        */
-                               /*                *                     n+2               */
-                               /*              /   \                 /     \             */
-                               /*           n+2      n      -->    n+1     n+1           */
-                               /*           / \                    / \     / \           */
-                               /*          n  n+1                 n   L   R   n          */
-                               /*             / \                                        */
-                               /*            L   R                                       */
-                               /*                                                        */
-                               nodeleft->vm_avl_right = nodeleftright->vm_avl_left;
-                               node->vm_avl_left = nodeleftright->vm_avl_right;
-                               nodeleftright->vm_avl_left = nodeleft;
-                               nodeleftright->vm_avl_right = node;
-                               nodeleft->vm_avl_height = node->vm_avl_height = heightleftright;
-                               nodeleftright->vm_avl_height = heightleft;
-                               *nodeplace = nodeleftright;
-                       }
-               }
-               else if (heightleft + 1 < heightright) {
-                       /* similar to the above, just interchange 'left' <--> 'right' */
-                       struct vm_area_struct * noderightright = noderight->vm_avl_right;
-                       struct vm_area_struct * noderightleft = noderight->vm_avl_left;
-                       int heightrightleft = heightof(noderightleft);
-                       if (heightof(noderightright) >= heightrightleft) {
-                               node->vm_avl_right = noderightleft; noderight->vm_avl_left = node;
-                               noderight->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightrightleft);
-                               *nodeplace = noderight;
-                       } else {
-                               noderight->vm_avl_left = noderightleft->vm_avl_right;
-                               node->vm_avl_right = noderightleft->vm_avl_left;
-                               noderightleft->vm_avl_right = noderight;
-                               noderightleft->vm_avl_left = node;
-                               noderight->vm_avl_height = node->vm_avl_height = heightrightleft;
-                               noderightleft->vm_avl_height = heightright;
-                               *nodeplace = noderightleft;
-                       }
-               }
-               else {
-                       int height = (heightleft<heightright ? heightright : heightleft) + 1;
-                       if (height == node->vm_avl_height)
-                               break;
-                       node->vm_avl_height = height;
-               }
-       }
-}
-
-/* Insert a node into a tree. */
-static inline void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct ** ptree)
-{
-       vm_avl_key_t key = new_node->vm_avl_key;
-       struct vm_area_struct ** nodeplace = ptree;
-       struct vm_area_struct ** stack[avl_maxheight];
-       int stack_count = 0;
-       struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */
-       for (;;) {
-               struct vm_area_struct * node = *nodeplace;
-               if (node == vm_avl_empty)
-                       break;
-               *stack_ptr++ = nodeplace; stack_count++;
-               if (key < node->vm_avl_key)
-                       nodeplace = &node->vm_avl_left;
-               else
-                       nodeplace = &node->vm_avl_right;
-       }
-       new_node->vm_avl_left = vm_avl_empty;
-       new_node->vm_avl_right = vm_avl_empty;
-       new_node->vm_avl_height = 1;
-       *nodeplace = new_node;
-       avl_rebalance(stack_ptr,stack_count);
-}
-
-/* Insert a node into a tree, and
- * return the node to the left of it and the node to the right of it.
- */
-static inline void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree,
-       struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right)
-{
-       vm_avl_key_t key = new_node->vm_avl_key;
-       struct vm_area_struct ** nodeplace = ptree;
-       struct vm_area_struct ** stack[avl_maxheight];
-       int stack_count = 0;
-       struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */
-       *to_the_left = *to_the_right = NULL;
-       for (;;) {
-               struct vm_area_struct * node = *nodeplace;
-               if (node == vm_avl_empty)
-                       break;
-               *stack_ptr++ = nodeplace; stack_count++;
-               if (key < node->vm_avl_key) {
-                       *to_the_right = node;
-                       nodeplace = &node->vm_avl_left;
-               } else {
-                       *to_the_left = node;
-                       nodeplace = &node->vm_avl_right;
-               }
-       }
-       new_node->vm_avl_left = vm_avl_empty;
-       new_node->vm_avl_right = vm_avl_empty;
-       new_node->vm_avl_height = 1;
-       *nodeplace = new_node;
-       avl_rebalance(stack_ptr,stack_count);
-}
-
-/* Removes a node out of a tree. */
-static void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree)
-{
-       vm_avl_key_t key = node_to_delete->vm_avl_key;
-       struct vm_area_struct ** nodeplace = ptree;
-       struct vm_area_struct ** stack[avl_maxheight];
-       int stack_count = 0;
-       struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */
-       struct vm_area_struct ** nodeplace_to_delete;
-       for (;;) {
-               struct vm_area_struct * node = *nodeplace;
-#ifdef DEBUG_AVL
-               if (node == vm_avl_empty) {
-                       /* what? node_to_delete not found in tree? */
-                       printk("avl_remove: node to delete not found in tree\n");
-                       return;
-               }
-#endif
-               *stack_ptr++ = nodeplace; stack_count++;
-               if (key == node->vm_avl_key)
-                       break;
-               if (key < node->vm_avl_key)
-                       nodeplace = &node->vm_avl_left;
-               else
-                       nodeplace = &node->vm_avl_right;
-       }
-       nodeplace_to_delete = nodeplace;
-       /* Have to remove node_to_delete = *nodeplace_to_delete. */
-       if (node_to_delete->vm_avl_left == vm_avl_empty) {
-               *nodeplace_to_delete = node_to_delete->vm_avl_right;
-               stack_ptr--; stack_count--;
-       } else {
-               struct vm_area_struct *** stack_ptr_to_delete = stack_ptr;
-               struct vm_area_struct ** nodeplace = &node_to_delete->vm_avl_left;
-               struct vm_area_struct * node;
-               for (;;) {
-                       node = *nodeplace;
-                       if (node->vm_avl_right == vm_avl_empty)
-                               break;
-                       *stack_ptr++ = nodeplace; stack_count++;
-                       nodeplace = &node->vm_avl_right;
-               }
-               *nodeplace = node->vm_avl_left;
-               /* node replaces node_to_delete */
-               node->vm_avl_left = node_to_delete->vm_avl_left;
-               node->vm_avl_right = node_to_delete->vm_avl_right;
-               node->vm_avl_height = node_to_delete->vm_avl_height;
-               *nodeplace_to_delete = node; /* replace node_to_delete */
-               *stack_ptr_to_delete = &node->vm_avl_left; /* replace &node_to_delete->vm_avl_left */
-       }
-       avl_rebalance(stack_ptr,stack_count);
-}
-
-#ifdef DEBUG_AVL
-
-/* print a list */
-static void printk_list (struct vm_area_struct * vma)
-{
-       printk("[");
-       while (vma) {
-               printk("%08lX-%08lX", vma->vm_start, vma->vm_end);
-               vma = vma->vm_next;
-               if (!vma)
-                       break;
-               printk(" ");
-       }
-       printk("]");
-}
-
-/* print a tree */
-static void printk_avl (struct vm_area_struct * tree)
-{
-       if (tree != vm_avl_empty) {
-               printk("(");
-               if (tree->vm_avl_left != vm_avl_empty) {
-                       printk_avl(tree->vm_avl_left);
-                       printk("<");
-               }
-               printk("%08lX-%08lX", tree->vm_start, tree->vm_end);
-               if (tree->vm_avl_right != vm_avl_empty) {
-                       printk(">");
-                       printk_avl(tree->vm_avl_right);
-               }
-               printk(")");
-       }
-}
-
-static char *avl_check_point = "somewhere";
-
-/* check a tree's consistency and balancing */
-static void avl_checkheights (struct vm_area_struct * tree)
-{
-       int h, hl, hr;
-
-       if (tree == vm_avl_empty)
-               return;
-       avl_checkheights(tree->vm_avl_left);
-       avl_checkheights(tree->vm_avl_right);
-       h = tree->vm_avl_height;
-       hl = heightof(tree->vm_avl_left);
-       hr = heightof(tree->vm_avl_right);
-       if ((h == hl+1) && (hr <= hl) && (hl <= hr+1))
-               return;
-       if ((h == hr+1) && (hl <= hr) && (hr <= hl+1))
-               return;
-       printk("%s: avl_checkheights: heights inconsistent\n",avl_check_point);
-}
-
-/* check that all values stored in a tree are < key */
-static void avl_checkleft (struct vm_area_struct * tree, vm_avl_key_t key)
-{
-       if (tree == vm_avl_empty)
-               return;
-       avl_checkleft(tree->vm_avl_left,key);
-       avl_checkleft(tree->vm_avl_right,key);
-       if (tree->vm_avl_key < key)
-               return;
-       printk("%s: avl_checkleft: left key %lu >= top key %lu\n",avl_check_point,tree->vm_avl_key,key);
-}
-
-/* check that all values stored in a tree are > key */
-static void avl_checkright (struct vm_area_struct * tree, vm_avl_key_t key)
-{
-       if (tree == vm_avl_empty)
-               return;
-       avl_checkright(tree->vm_avl_left,key);
-       avl_checkright(tree->vm_avl_right,key);
-       if (tree->vm_avl_key > key)
-               return;
-       printk("%s: avl_checkright: right key %lu <= top key %lu\n",avl_check_point,tree->vm_avl_key,key);
-}
-
-/* check that all values are properly increasing */
-static void avl_checkorder (struct vm_area_struct * tree)
-{
-       if (tree == vm_avl_empty)
-               return;
-       avl_checkorder(tree->vm_avl_left);
-       avl_checkorder(tree->vm_avl_right);
-       avl_checkleft(tree->vm_avl_left,tree->vm_avl_key);
-       avl_checkright(tree->vm_avl_right,tree->vm_avl_key);
-}
-
-/* all checks */
-static void avl_check (struct task_struct * task, char *caller)
-{
-       avl_check_point = caller;
-/*     printk("task \"%s\", %s\n",task->comm,caller); */
-/*     printk("task \"%s\" list: ",task->comm); printk_list(task->mm->mmap); printk("\n"); */
-/*     printk("task \"%s\" tree: ",task->comm); printk_avl(task->mm->mmap_avl); printk("\n"); */
-       avl_checkheights(task->mm->mmap_avl);
-       avl_checkorder(task->mm->mmap_avl);
-}
-
-#endif
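
The mmap_avl.c code deleted above keeps each node's vm_avl_height equal to 1 plus the taller child's height and never lets the two subtree heights differ by more than one; avl_checkheights() walks the tree to assert exactly that. As an illustration only, here is a minimal userspace sketch of the same invariant check, using a simplified stand-in node type rather than the kernel's vm_area_struct and a NULL sentinel instead of vm_avl_empty:

/*
 * Illustrative sketch, not kernel code: the AVL height invariant that the
 * removed avl_checkheights() verified, on a simplified node structure.
 */
#include <stdio.h>

struct node {
        struct node *left, *right;
        int height;             /* 1 + max(height(left), height(right)) */
};

static int heightof(struct node *n)
{
        return n ? n->height : 0;
}

/* Returns 0 if every node satisfies the AVL balance rule, -1 otherwise. */
static int check_heights(struct node *n)
{
        int hl, hr;

        if (!n)
                return 0;
        if (check_heights(n->left) || check_heights(n->right))
                return -1;
        hl = heightof(n->left);
        hr = heightof(n->right);
        if (n->height != 1 + (hl > hr ? hl : hr))
                return -1;              /* stored height is stale */
        if (hl - hr > 1 || hr - hl > 1)
                return -1;              /* subtrees out of balance */
        return 0;
}

int main(void)
{
        struct node leaf = { NULL, NULL, 1 };
        struct node root = { &leaf, NULL, 2 };

        printf("balanced: %s\n", check_heights(&root) == 0 ? "yes" : "no");
        return 0;
}
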
index 10c500100b5799e9ff7fd8988e62f504580e316a..fe69b33dabebb710517fdbe8f0807c0cd33a0933 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -91,22 +91,52 @@ static void change_protection(unsigned long start, unsigned long end, pgprot_t n
        return;
 }
 
-static inline int mprotect_fixup_all(struct vm_area_struct * vma,
+static inline int mprotect_fixup_all(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
        int newflags, pgprot_t prot)
 {
-       spin_lock(&vma->vm_mm->page_table_lock);
+       struct vm_area_struct * prev = *pprev;
+       struct mm_struct * mm = vma->vm_mm;
+
+       if (prev && prev->vm_end == vma->vm_start && can_vma_merge(prev, newflags) &&
+           !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
+               spin_lock(&mm->page_table_lock);
+               prev->vm_end = vma->vm_end;
+               __vma_unlink(mm, vma, prev);
+               spin_unlock(&mm->page_table_lock);
+
+               kmem_cache_free(vm_area_cachep, vma);
+               mm->map_count--;
+
+               return 0;
+       }
+
+       spin_lock(&mm->page_table_lock);
        vma->vm_flags = newflags;
        vma->vm_page_prot = prot;
-       spin_unlock(&vma->vm_mm->page_table_lock);
+       spin_unlock(&mm->page_table_lock);
+
+       *pprev = vma;
+
        return 0;
 }
 
-static inline int mprotect_fixup_start(struct vm_area_struct * vma,
+static inline int mprotect_fixup_start(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
        unsigned long end,
        int newflags, pgprot_t prot)
 {
-       struct vm_area_struct * n;
+       struct vm_area_struct * n, * prev = *pprev;
+
+       *pprev = vma;
+
+       if (prev && prev->vm_end == vma->vm_start && can_vma_merge(prev, newflags) &&
+           !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
+               spin_lock(&vma->vm_mm->page_table_lock);
+               prev->vm_end = end;
+               vma->vm_start = end;
+               spin_unlock(&vma->vm_mm->page_table_lock);
 
+               return 0;
+       }
        n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        if (!n)
                return -ENOMEM;
@@ -119,17 +149,18 @@ static inline int mprotect_fixup_start(struct vm_area_struct * vma,
                get_file(n->vm_file);
        if (n->vm_ops && n->vm_ops->open)
                n->vm_ops->open(n);
+       vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
        lock_vma_mappings(vma);
        spin_lock(&vma->vm_mm->page_table_lock);
-       vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
        vma->vm_start = end;
        __insert_vm_struct(current->mm, n);
        spin_unlock(&vma->vm_mm->page_table_lock);
        unlock_vma_mappings(vma);
+
        return 0;
 }
 
-static inline int mprotect_fixup_end(struct vm_area_struct * vma,
+static inline int mprotect_fixup_end(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
        unsigned long start,
        int newflags, pgprot_t prot)
 {
@@ -154,10 +185,13 @@ static inline int mprotect_fixup_end(struct vm_area_struct * vma,
        __insert_vm_struct(current->mm, n);
        spin_unlock(&vma->vm_mm->page_table_lock);
        unlock_vma_mappings(vma);
+
+       *pprev = n;
+
        return 0;
 }
 
-static inline int mprotect_fixup_middle(struct vm_area_struct * vma,
+static inline int mprotect_fixup_middle(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
        unsigned long start, unsigned long end,
        int newflags, pgprot_t prot)
 {
@@ -184,39 +218,44 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma,
                vma->vm_ops->open(left);
                vma->vm_ops->open(right);
        }
+       vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
+       vma->vm_raend = 0;
+       vma->vm_page_prot = prot;
        lock_vma_mappings(vma);
        spin_lock(&vma->vm_mm->page_table_lock);
-       vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
        vma->vm_start = start;
        vma->vm_end = end;
        vma->vm_flags = newflags;
-       vma->vm_raend = 0;
-       vma->vm_page_prot = prot;
        __insert_vm_struct(current->mm, left);
        __insert_vm_struct(current->mm, right);
        spin_unlock(&vma->vm_mm->page_table_lock);
        unlock_vma_mappings(vma);
+
+       *pprev = right;
+
        return 0;
 }
 
-static int mprotect_fixup(struct vm_area_struct * vma, 
+static int mprotect_fixup(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
        unsigned long start, unsigned long end, unsigned int newflags)
 {
        pgprot_t newprot;
        int error;
 
-       if (newflags == vma->vm_flags)
+       if (newflags == vma->vm_flags) {
+               *pprev = vma;
                return 0;
+       }
        newprot = protection_map[newflags & 0xf];
        if (start == vma->vm_start) {
                if (end == vma->vm_end)
-                       error = mprotect_fixup_all(vma, newflags, newprot);
+                       error = mprotect_fixup_all(vma, pprev, newflags, newprot);
                else
-                       error = mprotect_fixup_start(vma, end, newflags, newprot);
+                       error = mprotect_fixup_start(vma, pprev, end, newflags, newprot);
        } else if (end == vma->vm_end)
-               error = mprotect_fixup_end(vma, start, newflags, newprot);
+               error = mprotect_fixup_end(vma, pprev, start, newflags, newprot);
        else
-               error = mprotect_fixup_middle(vma, start, end, newflags, newprot);
+               error = mprotect_fixup_middle(vma, pprev, start, end, newflags, newprot);
 
        if (error)
                return error;
@@ -228,7 +267,7 @@ static int mprotect_fixup(struct vm_area_struct * vma,
 asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot)
 {
        unsigned long nstart, end, tmp;
-       struct vm_area_struct * vma, * next;
+       struct vm_area_struct * vma, * next, * prev;
        int error = -EINVAL;
 
        if (start & ~PAGE_MASK)
@@ -242,42 +281,56 @@ asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot
        if (end == start)
                return 0;
 
-       /* XXX: maybe this could be down_read ??? - Rik */
        down_write(&current->mm->mmap_sem);
 
-       vma = find_vma(current->mm, start);
+       vma = find_vma_prev(current->mm, start, &prev);
        error = -EFAULT;
        if (!vma || vma->vm_start > start)
                goto out;
 
        for (nstart = start ; ; ) {
                unsigned int newflags;
+               int last = 0;
 
                /* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
 
                newflags = prot | (vma->vm_flags & ~(PROT_READ | PROT_WRITE | PROT_EXEC));
                if ((newflags & ~(newflags >> 4)) & 0xf) {
                        error = -EACCES;
-                       break;
+                       goto out;
                }
 
-               if (vma->vm_end >= end) {
-                       error = mprotect_fixup(vma, nstart, end, newflags);
-                       break;
+               if (vma->vm_end > end) {
+                       error = mprotect_fixup(vma, &prev, nstart, end, newflags);
+                       goto out;
                }
+               if (vma->vm_end == end)
+                       last = 1;
 
                tmp = vma->vm_end;
                next = vma->vm_next;
-               error = mprotect_fixup(vma, nstart, tmp, newflags);
+               error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
                if (error)
+                       goto out;
+               if (last)
                        break;
                nstart = tmp;
                vma = next;
                if (!vma || vma->vm_start != nstart) {
                        error = -EFAULT;
-                       break;
+                       goto out;
                }
        }
+       if (next && prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags) &&
+           !prev->vm_file && !(prev->vm_flags & VM_SHARED)) {
+               spin_lock(&prev->vm_mm->page_table_lock);
+               prev->vm_end = next->vm_end;
+               __vma_unlink(prev->vm_mm, next, prev);
+               spin_unlock(&prev->vm_mm->page_table_lock);
+
+               kmem_cache_free(vm_area_cachep, next);
+               prev->vm_mm->map_count--;
+       }
 out:
        up_write(&current->mm->mmap_sem);
        return error;
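
The reworked mprotect_fixup_*() helpers above avoid splitting a vma when the protection change can instead be absorbed by extending the previous vma: the two must be directly adjacent, the previous vma must pass can_vma_merge() for the new flags, and the vma being changed must be anonymous and not VM_SHARED. Below is a sketch of that predicate only, on a simplified stand-in structure; the can_vma_merge() stand-in here just compares flags, which is an assumption for the sketch rather than the kernel's exact definition:

/*
 * Illustrative sketch, not kernel code: the merge test the new mprotect
 * paths apply before extending 'prev' instead of splitting the vma.
 */
#include <stdbool.h>
#include <stdio.h>

#define VM_SHARED 0x08          /* assumed flag value, for the sketch only */

struct vma {
        unsigned long vm_start, vm_end;
        unsigned long vm_flags;
        void *vm_file;          /* NULL for anonymous mappings */
};

/* Stand-in for can_vma_merge(): here, "same protection flags". */
static bool can_vma_merge(const struct vma *prev, unsigned long newflags)
{
        return prev->vm_flags == newflags;
}

static bool can_extend_prev(const struct vma *prev, const struct vma *vma,
                            unsigned long newflags)
{
        return prev &&
               prev->vm_end == vma->vm_start &&  /* adjacent */
               can_vma_merge(prev, newflags) &&  /* compatible flags */
               !vma->vm_file &&                  /* anonymous */
               !(vma->vm_flags & VM_SHARED);     /* not shared */
}

int main(void)
{
        struct vma prev = { 0x1000, 0x2000, 0x3, NULL };
        struct vma vma  = { 0x2000, 0x3000, 0x1, NULL };

        printf("merge: %s\n", can_extend_prev(&prev, &vma, 0x3) ? "yes" : "no");
        return 0;
}
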
index 118e71f5d27b6eed60dbe4c26bcf5895045de2b0..dd423d93e3cae45474bbc7503e286baebd7bad38 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -127,11 +127,58 @@ static inline unsigned long move_vma(struct vm_area_struct * vma,
        unsigned long addr, unsigned long old_len, unsigned long new_len,
        unsigned long new_addr)
 {
-       struct vm_area_struct * new_vma;
+       struct mm_struct * mm = vma->vm_mm;
+       struct vm_area_struct * new_vma, * next, * prev;
+       int allocated_vma;
+
+       new_vma = NULL;
+       next = find_vma_prev(mm, new_addr, &prev);
+       if (next) {
+               if (prev && prev->vm_end == new_addr &&
+                   can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
+                       spin_lock(&mm->page_table_lock);
+                       prev->vm_end = new_addr + new_len;
+                       spin_unlock(&mm->page_table_lock);
+                       new_vma = prev;
+                       if (next != prev->vm_next)
+                               BUG();
+                       if (prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags)) {
+                               spin_lock(&mm->page_table_lock);
+                               prev->vm_end = next->vm_end;
+                               __vma_unlink(mm, next, prev);
+                               spin_unlock(&mm->page_table_lock);
+
+                               mm->map_count--;
+                               kmem_cache_free(vm_area_cachep, next);
+                       }
+               } else if (next->vm_start == new_addr + new_len &&
+                          can_vma_merge(next, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
+                       spin_lock(&mm->page_table_lock);
+                       next->vm_start = new_addr;
+                       spin_unlock(&mm->page_table_lock);
+                       new_vma = next;
+               }
+       } else {
+               prev = find_vma(mm, new_addr-1);
+               if (prev && prev->vm_end == new_addr &&
+                   can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
+                       spin_lock(&mm->page_table_lock);
+                       prev->vm_end = new_addr + new_len;
+                       spin_unlock(&mm->page_table_lock);
+                       new_vma = prev;
+               }
+       }
 
-       new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
-       if (new_vma) {
-               if (!move_page_tables(current->mm, new_addr, addr, old_len)) {
+       allocated_vma = 0;
+       if (!new_vma) {
+               new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+               if (!new_vma)
+                       goto out;
+               allocated_vma = 1;
+       }
+
+       if (!move_page_tables(current->mm, new_addr, addr, old_len)) {
+               if (allocated_vma) {
                        *new_vma = *vma;
                        new_vma->vm_start = new_addr;
                        new_vma->vm_end = new_addr+new_len;
@@ -142,17 +189,19 @@ static inline unsigned long move_vma(struct vm_area_struct * vma,
                        if (new_vma->vm_ops && new_vma->vm_ops->open)
                                new_vma->vm_ops->open(new_vma);
                        insert_vm_struct(current->mm, new_vma);
-                       do_munmap(current->mm, addr, old_len);
-                       current->mm->total_vm += new_len >> PAGE_SHIFT;
-                       if (new_vma->vm_flags & VM_LOCKED) {
-                               current->mm->locked_vm += new_len >> PAGE_SHIFT;
-                               make_pages_present(new_vma->vm_start,
-                                                  new_vma->vm_end);
-                       }
-                       return new_addr;
                }
-               kmem_cache_free(vm_area_cachep, new_vma);
+               do_munmap(current->mm, addr, old_len);
+               current->mm->total_vm += new_len >> PAGE_SHIFT;
+               if (new_vma->vm_flags & VM_LOCKED) {
+                       current->mm->locked_vm += new_len >> PAGE_SHIFT;
+                       make_pages_present(new_vma->vm_start,
+                                          new_vma->vm_end);
+               }
+               return new_addr;
        }
+       if (allocated_vma)
+               kmem_cache_free(vm_area_cachep, new_vma);
+ out:
        return -ENOMEM;
 }
 
index 662e7c5449502d4f4a4ca2929b9d571395f81457..0b602eff9acea4fc8071e37f309f5f396b59b0b4 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -31,7 +31,7 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
 
 #endif /* !CONFIG_DISCONTIGMEM */
 
-struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order)
+struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order)
 {
 #ifdef CONFIG_NUMA
        return __alloc_pages(gfp_mask, order, NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK));
@@ -82,8 +82,8 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
        memset(pgdat->valid_addr_bitmap, 0, size);
 }
 
-static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,
-       unsigned long order)
+static struct page * alloc_pages_pgdat(pg_data_t *pgdat, unsigned int gfp_mask,
+       unsigned int order)
 {
        return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK));
 }
@@ -92,7 +92,7 @@ static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,
  * This can be refined. Currently, tries to do round robin, instead
  * should do concentric circle search, starting from current node.
  */
-struct page * _alloc_pages(unsigned int gfp_mask, unsigned long order)
+struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order)
 {
        struct page *ret = 0;
        pg_data_t *start, *temp;
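
The comment above describes _alloc_pages() walking the nodes round robin from a rotating start point, falling back to whichever node can satisfy the request. A reduction of that idea to plain arrays, purely as an illustration (the real code walks pg_data_t->node_next and calls alloc_pages_pgdat() per node):

/*
 * Illustrative sketch, not kernel code: round-robin node selection with a
 * rotating starting node, as the comment above describes.
 */
#include <stdio.h>

#define NR_NODES 4

static int next_node;                   /* rotates so allocations spread out */

static int pick_node(const int free_pages[NR_NODES])
{
        int i, node = -1;

        for (i = 0; i < NR_NODES; i++) {
                int candidate = (next_node + i) % NR_NODES;
                if (free_pages[candidate] > 0) {
                        node = candidate;
                        break;
                }
        }
        next_node = (next_node + 1) % NR_NODES; /* advance regardless */
        return node;
}

int main(void)
{
        int free_pages[NR_NODES] = { 0, 5, 5, 0 };

        printf("%d\n", pick_node(free_pages)); /* 1 */
        printf("%d\n", pick_node(free_pages)); /* 1 */
        printf("%d\n", pick_node(free_pages)); /* 2 */
        return 0;
}
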
index 23b0580164e51943d0256266908634fda76bb847..0b8e02aca809f7e465d9df6c856911112c6a7b37 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -192,43 +192,3 @@ void oom_kill(void)
        schedule();
        return;
 }
-
-/**
- * out_of_memory - is the system out of memory?
- *
- * Returns 0 if there is still enough memory left,
- * 1 when we are out of memory (otherwise).
- */
-int out_of_memory(void)
-{
-       long cache_mem, limit;
-
-       /* Enough free memory?  Not OOM. */
-       if (nr_free_pages() > freepages.min)
-               return 0;
-
-       if (nr_free_pages() + nr_inactive_clean_pages() > freepages.low)
-               return 0;
-
-       /*
-        * If the buffer and page cache (excluding swap cache) are over
-        * their (/proc tunable) minimum, we're still not OOM.  We test
-        * this to make sure we don't return OOM when the system simply
-        * has a hard time with the cache.
-        */
-       cache_mem = atomic_read(&page_cache_size);
-       cache_mem += atomic_read(&buffermem_pages);
-       cache_mem -= swapper_space.nrpages;
-       limit = (page_cache.min_percent + buffer_mem.min_percent);
-       limit *= num_physpages / 100;
-
-       if (cache_mem > limit)
-               return 0;
-
-       /* Enough swap space left?  Not OOM. */
-       if (nr_swap_pages > 0)
-               return 0;
-
-       /* Else... */
-       return 1;
-}
index 423beaac2c24657298d534273e9f71335655905c..9f1f3125630a10f79e87984858048e931a28f4ed 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
 
 int nr_swap_pages;
 int nr_active_pages;
-int nr_inactive_dirty_pages;
+int nr_inactive_pages;
+struct list_head inactive_list;
+struct list_head active_list;
 pg_data_t *pgdat_list;
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
-static int zone_balance_ratio[MAX_NR_ZONES] = { 32, 128, 128, };
-static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, };
-static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, };
+static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 32, 128, 128, };
+static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
+static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
 
-struct list_head active_list;
-struct list_head inactive_dirty_list;
 /*
  * Free_page() adds the page to the free lists. This is optimized for
  * fast normal cases (no error jumps taken normally).
@@ -61,8 +61,8 @@ struct list_head inactive_dirty_list;
  * Hint: -mask = 1+~mask
  */
 
-static void FASTCALL(__free_pages_ok (struct page *page, unsigned long order));
-static void __free_pages_ok (struct page *page, unsigned long order)
+static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
+static void __free_pages_ok (struct page *page, unsigned int order)
 {
        unsigned long index, page_idx, mask, flags;
        free_area_t *area;
@@ -83,14 +83,15 @@ static void __free_pages_ok (struct page *page, unsigned long order)
                BUG();
        if (PageActive(page))
                BUG();
-       if (PageInactiveDirty(page))
+       if (PageInactive(page))
                BUG();
-       if (PageInactiveClean(page))
+       if (PageDirty(page))
                BUG();
 
-       page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
-       page->age = PAGE_AGE_START;
-       
+       if (current->flags & PF_FREE_PAGES)
+               goto local_freelist;
+ back_local_freelist:
+
        zone = page->zone;
 
        mask = (~0UL) << order;
@@ -135,14 +136,21 @@ static void __free_pages_ok (struct page *page, unsigned long order)
        memlist_add_head(&(base + page_idx)->list, &area->free_list);
 
        spin_unlock_irqrestore(&zone->lock, flags);
+       return;
 
+ local_freelist:
        /*
-        * We don't want to protect this variable from race conditions
-        * since it's nothing important, but we do want to make sure
-        * it never gets negative.
+        * This is a little subtle: if the allocation order
+        * wanted is greater than zero we'd better take all the pages
+        * local since we must deal with fragmentation too and we
+        * can't rely on the nr_local_pages information.
         */
-       if (memory_pressure > NR_CPUS)
-               memory_pressure--;
+       if (current->nr_local_pages && !current->allocation_order)
+               goto back_local_freelist;
+
+       list_add(&page->list, &current->local_pages);
+       page->index = order;
+       current->nr_local_pages++;
 }
 
 #define MARK_USED(index, order, area) \
@@ -169,11 +177,11 @@ static inline struct page * expand (zone_t *zone, struct page *page,
        return page;
 }
 
-static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
-static struct page * rmqueue(zone_t *zone, unsigned long order)
+static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
+static struct page * rmqueue(zone_t *zone, unsigned int order)
 {
        free_area_t * area = zone->free_area + order;
-       unsigned long curr_order = order;
+       unsigned int curr_order = order;
        struct list_head *head, *curr;
        unsigned long flags;
        struct page *page;
@@ -193,7 +201,7 @@ static struct page * rmqueue(zone_t *zone, unsigned long order)
                        index = page - zone->zone_mem_map;
                        if (curr_order != MAX_ORDER-1)
                                MARK_USED(index, curr_order, area);
-                       zone->free_pages -= 1 << order;
+                       zone->free_pages -= 1UL << order;
 
                        page = expand(zone, page, index, order, curr_order, area);
                        spin_unlock_irqrestore(&zone->lock, flags);
@@ -201,7 +209,7 @@ static struct page * rmqueue(zone_t *zone, unsigned long order)
                        set_page_count(page, 1);
                        if (BAD_RANGE(zone,page))
                                BUG();
-                       DEBUG_ADD_PAGE
+                       DEBUG_LRU_PAGE(page);
                        return page;    
                }
                curr_order++;
@@ -212,305 +220,193 @@ static struct page * rmqueue(zone_t *zone, unsigned long order)
        return NULL;
 }
 
-#define PAGES_MIN      0
-#define PAGES_LOW      1
-#define PAGES_HIGH     2
+#ifndef CONFIG_DISCONTIGMEM
+struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
+{
+       return __alloc_pages(gfp_mask, order,
+               contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
+}
+#endif
 
-/*
- * This function does the dirty work for __alloc_pages
- * and is separated out to keep the code size smaller.
- * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
- */
-static struct page * __alloc_pages_limit(zonelist_t *zonelist,
-                       unsigned long order, int limit, int direct_reclaim)
+static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
+static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
 {
-       zone_t **zone = zonelist->zones;
+       struct page * page = NULL;
+       int __freed = 0;
 
-       for (;;) {
-               zone_t *z = *(zone++);
-               unsigned long water_mark;
+       if (!(gfp_mask & __GFP_WAIT))
+               goto out;
+       if (in_interrupt())
+               BUG();
 
-               if (!z)
-                       break;
-               if (!z->size)
-                       BUG();
+       current->allocation_order = order;
+       current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
+
+       __freed = try_to_free_pages(classzone, gfp_mask, order);
+
+       current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
+
+       if (current->nr_local_pages) {
+               struct list_head * entry, * local_pages;
+               struct page * tmp;
+               int nr_pages;
+
+               local_pages = &current->local_pages;
+
+               if (__freed) {
+                       /* pick from the last inserted so we're lifo */
+                       entry = local_pages->next;
+                       do {
+                               tmp = list_entry(entry, struct page, list);
+                               if (tmp->index == order && memclass(tmp->zone, classzone)) {
+                                       list_del(entry);
+                                       current->nr_local_pages--;
+                                       set_page_count(tmp, 1);
+                                       page = tmp;
+
+                                       if (page->buffers)
+                                               BUG();
+                                       if (page->mapping)
+                                               BUG();
+                                       if (!VALID_PAGE(page))
+                                               BUG();
+                                       if (PageSwapCache(page))
+                                               BUG();
+                                       if (PageLocked(page))
+                                               BUG();
+                                       if (PageDecrAfter(page))
+                                               BUG();
+                                       if (PageActive(page))
+                                               BUG();
+                                       if (PageInactive(page))
+                                               BUG();
+                                       if (PageDirty(page))
+                                               BUG();
 
-               /*
-                * We allocate if the number of free + inactive_clean
-                * pages is above the watermark.
-                */
-               switch (limit) {
-                       default:
-                       case PAGES_MIN:
-                               water_mark = z->pages_min;
-                               break;
-                       case PAGES_LOW:
-                               water_mark = z->pages_low;
-                               break;
-                       case PAGES_HIGH:
-                               water_mark = z->pages_high;
+                                       break;
+                               }
+                       } while ((entry = entry->next) != local_pages);
                }
 
-               if (z->free_pages + z->inactive_clean_pages >= water_mark) {
-                       struct page *page = NULL;
-                       /* If possible, reclaim a page directly. */
-                       if (direct_reclaim)
-                               page = reclaim_page(z);
-                       /* If that fails, fall back to rmqueue. */
-                       if (!page)
-                               page = rmqueue(z, order);
-                       if (page)
-                               return page;
+               nr_pages = current->nr_local_pages;
+               /* free in reverse order so that the global order will be lifo */
+               while ((entry = local_pages->prev) != local_pages) {
+                       list_del(entry);
+                       tmp = list_entry(entry, struct page, list);
+                       __free_pages_ok(tmp, tmp->index);
+                       if (!nr_pages--)
+                               BUG();
                }
+               current->nr_local_pages = 0;
        }
-
-       /* Found nothing. */
-       return NULL;
+ out:
+       *freed = __freed;
+       return page;
 }
 
-#ifndef CONFIG_DISCONTIGMEM
-struct page *_alloc_pages(unsigned int gfp_mask, unsigned long order)
+static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
 {
-       return __alloc_pages(gfp_mask, order,
-               contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
+       long free = zone->free_pages - (1UL << order);
+       return free >= 0 ? free : 0;
 }
-#endif
 
 /*
  * This is the 'heart' of the zoned buddy allocator:
  */
-struct page * __alloc_pages(unsigned int gfp_mask, unsigned long order, zonelist_t *zonelist)
+struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
 {
-       zone_t **zone;
-       int direct_reclaim = 0;
+       zone_t **zone, * classzone;
        struct page * page;
+       int freed;
 
-       /*
-        * Allocations put pressure on the VM subsystem.
-        */
-       memory_pressure++;
+       zone = zonelist->zones;
+       classzone = *zone;
+       for (;;) {
+               zone_t *z = *(zone++);
+               if (!z)
+                       break;
 
-       /*
-        * (If anyone calls gfp from interrupts nonatomically then it
-        * will sooner or later tripped up by a schedule().)
-        *
-        * We are falling back to lower-level zones if allocation
-        * in a higher zone fails.
-        */
+               if (zone_free_pages(z, order) > z->pages_low) {
+                       page = rmqueue(z, order);
+                       if (page)
+                               return page;
+               }
+       }
 
-       /*
-        * Can we take pages directly from the inactive_clean
-        * list?
-        */
-       if (order == 0 && (gfp_mask & __GFP_WAIT))
-               direct_reclaim = 1;
+       classzone->need_balance = 1;
+       mb();
+       if (waitqueue_active(&kswapd_wait))
+               wake_up_interruptible(&kswapd_wait);
 
-try_again:
-       /*
-        * First, see if we have any zones with lots of free memory.
-        *
-        * We allocate free memory first because it doesn't contain
-        * any data ... DUH!
-        */
        zone = zonelist->zones;
        for (;;) {
                zone_t *z = *(zone++);
                if (!z)
                        break;
-               if (!z->size)
-                       BUG();
 
-               if (z->free_pages >= z->pages_low) {
+               if (zone_free_pages(z, order) > (gfp_mask & __GFP_HIGH ? z->pages_min / 2 : z->pages_min)) {
                        page = rmqueue(z, order);
                        if (page)
                                return page;
-               } else if (z->free_pages < z->pages_min &&
-                                       waitqueue_active(&kreclaimd_wait)) {
-                               wake_up_interruptible(&kreclaimd_wait);
                }
        }
 
-       /*
-        * Try to allocate a page from a zone with a HIGH
-        * amount of free + inactive_clean pages.
-        *
-        * If there is a lot of activity, inactive_target
-        * will be high and we'll have a good chance of
-        * finding a page using the HIGH limit.
-        */
-       page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
-       if (page)
-               return page;
+       /* here we're in the low on memory slow path */
 
-       /*
-        * Then try to allocate a page from a zone with more
-        * than zone->pages_low free + inactive_clean pages.
-        *
-        * When the working set is very large and VM activity
-        * is low, we're most likely to have our allocation
-        * succeed here.
-        */
-       page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
-       if (page)
-               return page;
+       if (current->flags & PF_MEMALLOC) {
+               zone = zonelist->zones;
+               for (;;) {
+                       zone_t *z = *(zone++);
+                       if (!z)
+                               break;
 
-       /*
-        * OK, none of the zones on our zonelist has lots
-        * of pages free.
-        *
-        * We wake up kswapd, in the hope that kswapd will
-        * resolve this situation before memory gets tight.
-        *
-        * We also yield the CPU, because that:
-        * - gives kswapd a chance to do something
-        * - slows down allocations, in particular the
-        *   allocations from the fast allocator that's
-        *   causing the problems ...
-        * - ... which minimises the impact the "bad guys"
-        *   have on the rest of the system
-        * - if we don't have __GFP_IO set, kswapd may be
-        *   able to free some memory we can't free ourselves
-        */
-       wakeup_kswapd();
-       if (gfp_mask & __GFP_WAIT) {
-               __set_current_state(TASK_RUNNING);
-               current->policy |= SCHED_YIELD;
-               schedule();
+                       page = rmqueue(z, order);
+                       if (page)
+                               return page;
+               }
+               return NULL;
        }
 
-       /*
-        * After waking up kswapd, we try to allocate a page
-        * from any zone which isn't critical yet.
-        *
-        * Kswapd should, in most situations, bring the situation
-        * back to normal in no time.
-        */
-       page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
+       page = balance_classzone(classzone, gfp_mask, order, &freed);
        if (page)
                return page;
 
-       /*
-        * Damn, we didn't succeed.
-        *
-        * This can be due to 2 reasons:
-        * - we're doing a higher-order allocation
-        *      --> move pages to the free list until we succeed
-        * - we're /really/ tight on memory
-        *      --> try to free pages ourselves with page_launder
-        */
-       if (!(current->flags & PF_MEMALLOC)) {
-               /*
-                * Are we dealing with a higher order allocation?
-                *
-                * Move pages from the inactive_clean to the free list
-                * in the hope of creating a large, physically contiguous
-                * piece of free memory.
-                */
-               if (order > 0 && (gfp_mask & __GFP_WAIT)) {
-                       zone = zonelist->zones;
-                       /* First, clean some dirty pages. */
-                       current->flags |= PF_MEMALLOC;
-                       page_launder(gfp_mask, 1);
-                       current->flags &= ~PF_MEMALLOC;
-                       for (;;) {
-                               zone_t *z = *(zone++);
-                               if (!z)
-                                       break;
-                               if (!z->size)
-                                       continue;
-                               while (z->inactive_clean_pages) {
-                                       struct page * page;
-                                       /* Move one page to the free list. */
-                                       page = reclaim_page(z);
-                                       if (!page)
-                                               break;
-                                       __free_page(page);
-                                       /* Try if the allocation succeeds. */
-                                       page = rmqueue(z, order);
-                                       if (page)
-                                               return page;
-                               }
-                       }
-               }
-               /*
-                * When we arrive here, we are really tight on memory.
-                * Since kswapd didn't succeed in freeing pages for us,
-                * we try to help it.
-                *
-                * Single page allocs loop until the allocation succeeds.
-                * Multi-page allocs can fail due to memory fragmentation;
-                * in that case we bail out to prevent infinite loops and
-                * hanging device drivers ...
-                *
-                * Another issue are GFP_NOFS allocations; because they
-                * do not have __GFP_FS set it's possible we cannot make
-                * any progress freeing pages, in that case it's better
-                * to give up than to deadlock the kernel looping here.
-                */
-               if (gfp_mask & __GFP_WAIT) {
-                       if (!order || free_shortage()) {
-                               int progress = try_to_free_pages(gfp_mask);
-                               if (progress || (gfp_mask & __GFP_FS))
-                                       goto try_again;
-                               /*
-                                * Fail in case no progress was made and the
-                                * allocation may not be able to block on IO.
-                                */
-                               return NULL;
-                       }
-               }
-       }
-
-       /*
-        * Final phase: allocate anything we can!
-        *
-        * Higher order allocations, GFP_ATOMIC allocations and
-        * recursive allocations (PF_MEMALLOC) end up here.
-        *
-        * Only recursive allocations can use the very last pages
-        * in the system, otherwise it would be just too easy to
-        * deadlock the system...
-        */
        zone = zonelist->zones;
-       for (;;) {
-               zone_t *z = *(zone++);
-               struct page * page = NULL;
-               if (!z)
-                       break;
-               if (!z->size)
-                       BUG();
+       if (__builtin_expect(freed, 1)) {
+               for (;;) {
+                       zone_t *z = *(zone++);
+                       if (!z)
+                               break;
 
-               /*
-                * SUBTLE: direct_reclaim is only possible if the task
-                * becomes PF_MEMALLOC while looping above. This will
-                * happen when the OOM killer selects this task for
-                * instant execution...
-                */
-               if (direct_reclaim) {
-                       page = reclaim_page(z);
-                       if (page)
-                               return page;
+                       if (zone_free_pages(z, order) > (gfp_mask & __GFP_HIGH ? z->pages_min / 2 : z->pages_min)) {
+                               page = rmqueue(z, order);
+                               if (page)
+                                       return page;
+                       }
                }
+       } else {
+               for (;;) {
+                       zone_t *z = *(zone++);
+                       if (!z)
+                               break;
 
-               /* XXX: is pages_min/4 a good amount to reserve for this? */
-               if (z->free_pages < z->pages_min / 4 &&
-                               !(current->flags & PF_MEMALLOC))
-                       continue;
-               page = rmqueue(z, order);
-               if (page)
-                       return page;
+                       if (zone_free_pages(z, order) > z->pages_high) {
+                               page = rmqueue(z, order);
+                               if (page)
+                                       return page;
+                       }
+               }
        }
 
-       /* No luck.. */
-       printk(KERN_ERR "__alloc_pages: %lu-order allocation failed (gfp=0x%x/%i).\n",
-               order, gfp_mask, !!(current->flags & PF_MEMALLOC));
+       printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i) from %p\n",
+              order, gfp_mask, !!(current->flags & PF_MEMALLOC), __builtin_return_address(0));
        return NULL;
 }
 
 /*
  * Common helper functions.
  */
-unsigned long __get_free_pages(int gfp_mask, unsigned long order)
+unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
 {
        struct page * page;
 
@@ -520,7 +416,7 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order)
        return (unsigned long) page_address(page);
 }
 
-unsigned long get_zeroed_page(int gfp_mask)
+unsigned long get_zeroed_page(unsigned int gfp_mask)
 {
        struct page * page;
 
@@ -533,13 +429,13 @@ unsigned long get_zeroed_page(int gfp_mask)
        return 0;
 }
 
-void __free_pages(struct page *page, unsigned long order)
+void __free_pages(struct page *page, unsigned int order)
 {
        if (!PageReserved(page) && put_page_testzero(page))
                __free_pages_ok(page, order);
 }
 
-void free_pages(unsigned long addr, unsigned long order)
+void free_pages(unsigned long addr, unsigned int order)
 {
        if (addr != 0)
                __free_pages(virt_to_page(addr), order);
@@ -563,48 +459,27 @@ unsigned int nr_free_pages (void)
        return sum;
 }
 
-/*
- * Total amount of inactive_clean (allocatable) RAM:
- */
-unsigned int nr_inactive_clean_pages (void)
-{
-       unsigned int sum;
-       zone_t *zone;
-       pg_data_t *pgdat = pgdat_list;
-
-       sum = 0;
-       while (pgdat) {
-               for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
-                       sum += zone->inactive_clean_pages;
-               pgdat = pgdat->node_next;
-       }
-       return sum;
-}
-
 /*
  * Amount of free RAM allocatable as buffer memory:
  */
 unsigned int nr_free_buffer_pages (void)
 {
+       pg_data_t *pgdat = pgdat_list;
        unsigned int sum = 0;
        zonelist_t *zonelist;
        zone_t **zonep, *zone;
 
-       zonelist = contig_page_data.node_zonelists + (GFP_NOFS & GFP_ZONEMASK);
-       zonep = zonelist->zones;
+       do {
+               zonelist = pgdat->node_zonelists + __GFP_HIGHMEM;
+               zonep = zonelist->zones;
 
-       for (zone = *zonep++; zone; zone = *zonep++) {
-               unsigned int pages = zone->free_pages +
-                       zone->inactive_clean_pages +
-                       zone->inactive_dirty_pages;
+               for (zone = *zonep++; zone; zone = *zonep++)
+                       sum += zone->free_pages;
 
-               /* Allow the buffer cache to fill up at least "pages_high" pages */
-               if (pages < zone->pages_high)
-                       pages = zone->pages_high;
-               sum += pages;
-       }
+               pgdat = pgdat->node_next;
+       } while (pgdat);
 
-       return sum;
+       return sum + nr_active_pages + nr_inactive_pages;
 }
 
 #if CONFIG_HIGHMEM
@@ -628,21 +503,17 @@ unsigned int nr_free_highpages (void)
  */
 void show_free_areas_core(pg_data_t *pgdat)
 {
-       unsigned long order;
+       unsigned int order;
        unsigned type;
 
        printk("Free pages:      %6dkB (%6dkB HighMem)\n",
                nr_free_pages() << (PAGE_SHIFT-10),
                nr_free_highpages() << (PAGE_SHIFT-10));
 
-       printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n",
-               nr_active_pages,
-               nr_inactive_dirty_pages,
-               nr_inactive_clean_pages(),
-               nr_free_pages(),
-               freepages.min,
-               freepages.low,
-               freepages.high);
+       printk("( Active: %d, inactive: %d, free: %d )\n",
+              nr_active_pages,
+              nr_inactive_pages,
+              nr_free_pages());
 
        for (type = 0; type < MAX_NR_ZONES; type++) {
                struct list_head *head, *curr;
@@ -762,8 +633,8 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
                        
        printk("On node %d totalpages: %lu\n", nid, realtotalpages);
 
-       memlist_init(&active_list);
-       memlist_init(&inactive_dirty_list);
+       INIT_LIST_HEAD(&active_list);
+       INIT_LIST_HEAD(&inactive_list);
 
        /*
         * Some architectures (with lots of mem and discontinous memory
@@ -782,6 +653,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
        pgdat->node_size = totalpages;
        pgdat->node_start_paddr = zone_start_paddr;
        pgdat->node_start_mapnr = (lmem_map - mem_map);
+       pgdat->nr_zones = 0;
 
        /*
         * Initially all pages are reserved - free ones are freed
@@ -811,12 +683,11 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
                zone->lock = SPIN_LOCK_UNLOCKED;
                zone->zone_pgdat = pgdat;
                zone->free_pages = 0;
-               zone->inactive_clean_pages = 0;
-               zone->inactive_dirty_pages = 0;
-               memlist_init(&zone->inactive_clean_list);
                if (!size)
                        continue;
 
+               pgdat->nr_zones = j+1;
+
                mask = (realsize / zone_balance_ratio[j]);
                if (mask < zone_balance_min[j])
                        mask = zone_balance_min[j];
@@ -825,20 +696,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
                zone->pages_min = mask;
                zone->pages_low = mask*2;
                zone->pages_high = mask*3;
-               /*
-                * Add these free targets to the global free target;
-                * we have to be SURE that freepages.high is higher
-                * than SUM [zone->pages_min] for all zones, otherwise
-                * we may have bad bad problems.
-                *
-                * This means we cannot make the freepages array writable
-                * in /proc, but have to add a separate extra_free_target
-                * for people who require it to catch load spikes in eg.
-                * gigabit ethernet routing...
-                */
-               freepages.min += mask;
-               freepages.low += mask*2;
-               freepages.high += mask*3;
+
                zone->zone_mem_map = mem_map + offset;
                zone->zone_start_mapnr = offset;
                zone->zone_start_paddr = zone_start_paddr;
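
The rewritten allocator above drops the PAGES_MIN/LOW/HIGH limit walker and compares zone_free_pages() directly against per-zone watermarks: the first pass only takes zones that stay above pages_low, after kswapd is woken a second pass accepts pages_min (or half of it for __GFP_HIGH), PF_MEMALLOC callers take whatever rmqueue() finds, and after balance_classzone() the retry uses pages_min again or pages_high depending on whether any pages were freed. Below is a userspace reduction of just the watermark comparisons, with a stand-in zone struct and made-up numbers:

/*
 * Illustrative sketch, not kernel code: the watermark tests used by the
 * new __alloc_pages(). Only the comparisons mirror the code above.
 */
#include <stdbool.h>
#include <stdio.h>

struct zone { long free_pages, pages_min, pages_low, pages_high; };

/* Free pages left in the zone if this order-'order' allocation succeeded. */
static long zone_free_pages(const struct zone *z, unsigned int order)
{
        long free = z->free_pages - (1UL << order);
        return free >= 0 ? free : 0;
}

/* First pass: only allocate from zones comfortably above pages_low. */
static bool fast_path_ok(const struct zone *z, unsigned int order)
{
        return zone_free_pages(z, order) > z->pages_low;
}

/* Second pass (kswapd woken): dip to pages_min, or half that for __GFP_HIGH. */
static bool slow_path_ok(const struct zone *z, unsigned int order, bool gfp_high)
{
        long min = gfp_high ? z->pages_min / 2 : z->pages_min;
        return zone_free_pages(z, order) > min;
}

int main(void)
{
        struct zone z = { .free_pages = 30, .pages_min = 20,
                          .pages_low = 40, .pages_high = 60 };

        printf("order-0 fast path: %d\n", fast_path_ok(&z, 0));        /* 0 */
        printf("order-0 slow path: %d\n", slow_path_ok(&z, 0, false)); /* 1 */
        return 0;
}
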
index 45eb00d9af9fbf3421d7d5c8faf94a867fcb156b..b917e1ea1320540ae54e8ed3d43b9961d0df130c 100644
@@ -353,7 +353,7 @@ repeat:
                swap_free(*entry);
                *entry = (swp_entry_t) {0};
                delete_from_swap_cache_nolock(page);
-               flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1));
+               flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1);
                page->flags = flags | (1 << PG_dirty);
                add_to_page_cache_locked(page, mapping, idx);
                info->swapped--;
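
The only change in this hunk is dropping the inner parentheses around each shift; since << binds more tightly than |, the new spelling builds exactly the same mask. A trivial standalone check, with bit positions invented for the demonstration:

/*
 * Illustrative sketch: '<<' has higher precedence than '|', so the
 * rewritten mask above equals the fully parenthesised original.
 */
#include <assert.h>
#include <stdio.h>

int main(void)
{
        const int PG_uptodate = 3, PG_error = 4, PG_referenced = 5, PG_arch_1 = 6;

        unsigned long a = ~((1 << PG_uptodate) | (1 << PG_error) |
                            (1 << PG_referenced) | (1 << PG_arch_1));
        unsigned long b = ~(1 << PG_uptodate | 1 << PG_error |
                            1 << PG_referenced | 1 << PG_arch_1);

        assert(a == b);
        printf("masks equal: 0x%lx\n", a);
        return 0;
}
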
index aa84825d6652a926ecca142fb24fd9cc2ede4ebe..e7bf381df8499fa9b4b6cb1917018036869eeaea 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -926,8 +926,10 @@ static int __kmem_cache_shrink(kmem_cache_t *cachep)
                        break;
 
                slabp = list_entry(cachep->slabs_free.prev, slab_t, list);
+#if DEBUG
                if (slabp->inuse)
                        BUG();
+#endif
                list_del(&slabp->list);
 
                spin_unlock_irq(&cachep->spinlock);
@@ -1205,7 +1207,6 @@ static int kmem_extra_free_checks (kmem_cache_t * cachep,
 
 static inline void kmem_cache_alloc_head(kmem_cache_t *cachep, int flags)
 {
-#if DEBUG
        if (flags & SLAB_DMA) {
                if (!(cachep->gfpflags & GFP_DMA))
                        BUG();
@@ -1213,11 +1214,10 @@ static inline void kmem_cache_alloc_head(kmem_cache_t *cachep, int flags)
                if (cachep->gfpflags & GFP_DMA)
                        BUG();
        }
-#endif
 }
 
 static inline void * kmem_cache_alloc_one_tail (kmem_cache_t *cachep,
-                                               slab_t *slabp, int partial)
+                                               slab_t *slabp)
 {
        void *objp;
 
@@ -1230,14 +1230,9 @@ static inline void * kmem_cache_alloc_one_tail (kmem_cache_t *cachep,
        objp = slabp->s_mem + slabp->free*cachep->objsize;
        slabp->free=slab_bufctl(slabp)[slabp->free];
 
-       if (slabp->free == BUFCTL_END) {
+       if (__builtin_expect(slabp->free == BUFCTL_END, 0)) {
                list_del(&slabp->list);
                list_add(&slabp->list, &cachep->slabs_full);
-       } else {
-               if (!partial) {
-                       list_del(&slabp->list);
-                       list_add(&slabp->list, &cachep->slabs_partial);
-               }
        }
 #if DEBUG
        if (cachep->flags & SLAB_POISON)
@@ -1264,20 +1259,23 @@ static inline void * kmem_cache_alloc_one_tail (kmem_cache_t *cachep,
  */
 #define kmem_cache_alloc_one(cachep)                           \
 ({                                                             \
-       slab_t  *slabp;                                         \
-       struct list_head * slab_freelist;                       \
-       int partial = 1;                                        \
+       struct list_head * slabs_partial, * entry;              \
+       slab_t *slabp;                                          \
                                                                \
-       slab_freelist = &(cachep)->slabs_partial;               \
-       if (list_empty(slab_freelist)) {                        \
-               partial = 0;                                    \
-               slab_freelist = &(cachep)->slabs_free;          \
-               if (list_empty(slab_freelist))                  \
+       slabs_partial = &(cachep)->slabs_partial;               \
+       entry = slabs_partial->next;                            \
+       if (__builtin_expect(entry == slabs_partial, 0)) {      \
+               struct list_head * slabs_free;                  \
+               slabs_free = &(cachep)->slabs_free;             \
+               entry = slabs_free->next;                       \
+               if (__builtin_expect(entry == slabs_free, 0))   \
                        goto alloc_new_slab;                    \
+               list_del(entry);                                \
+               list_add(entry, slabs_partial);                 \
        }                                                       \
                                                                \
-       slabp = list_entry(slab_freelist->next, slab_t, list);  \
-       kmem_cache_alloc_one_tail(cachep, slabp, partial);      \
+       slabp = list_entry(entry, slab_t, list);                \
+       kmem_cache_alloc_one_tail(cachep, slabp);               \
 })
 
 #ifdef CONFIG_SMP
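
The rewritten kmem_cache_alloc_one() above takes the next slab from slabs_partial, and only when that list is empty falls back to slabs_free, immediately moving the chosen slab onto the partial list; both empty-list tests are wrapped in __builtin_expect(..., 0) to mark them as the cold path. A small standalone sketch of the idiom follows; the unlikely() macro spelling is an assumption here, the patch writes __builtin_expect() directly:

/*
 * Illustrative sketch, not kernel code: pick from the partial list first,
 * fall back to the free list, and annotate both rare branches as unlikely.
 */
#include <stdio.h>

#define unlikely(x) __builtin_expect(!!(x), 0)

struct list_head { struct list_head *next, *prev; };

static int pick(struct list_head *partial, struct list_head *free_list)
{
        struct list_head *entry = partial->next;

        if (unlikely(entry == partial)) {       /* partial list empty: rare */
                entry = free_list->next;
                if (unlikely(entry == free_list))
                        return -1;              /* would go to alloc_new_slab */
                /* the real code moves this slab onto the partial list here */
        }
        return 0;
}

int main(void)
{
        struct list_head partial = { &partial, &partial };      /* empty list */
        struct list_head free_list, slab;

        free_list.next = free_list.prev = &slab;                /* one free slab */
        slab.next = slab.prev = &free_list;

        printf("%d\n", pick(&partial, &free_list));             /* 0 */
        return 0;
}
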
@@ -1285,25 +1283,27 @@ void* kmem_cache_alloc_batch(kmem_cache_t* cachep, int flags)
 {
        int batchcount = cachep->batchcount;
        cpucache_t* cc = cc_data(cachep);
-       struct list_head * slab_freelist;
-       int partial;
-       slab_t *slabp;
 
        spin_lock(&cachep->spinlock);
        while (batchcount--) {
+               struct list_head * slabs_partial, * entry;
+               slab_t *slabp;
                /* Get slab alloc is to come from. */
-               slab_freelist = &(cachep)->slabs_partial;
-               partial = 1;
-               if (list_empty(slab_freelist)) {
-                       partial = 0;
-                       slab_freelist = &(cachep)->slabs_free;
-                       if (list_empty(slab_freelist))
+               slabs_partial = &(cachep)->slabs_partial;
+               entry = slabs_partial->next;
+               if (__builtin_expect(entry == slabs_partial, 0)) {
+                       struct list_head * slabs_free;
+                       slabs_free = &(cachep)->slabs_free;
+                       entry = slabs_free->next;
+                       if (__builtin_expect(entry == slabs_free, 0))
                                break;
+                       list_del(entry);
+                       list_add(entry, slabs_partial);
                }
 
-               slabp = list_entry(slab_freelist->next, slab_t, list);
+               slabp = list_entry(entry, slab_t, list);
                cc_entry(cc)[cc->avail++] =
-                               kmem_cache_alloc_one_tail(cachep, slabp, partial);
+                               kmem_cache_alloc_one_tail(cachep, slabp);
        }
        spin_unlock(&cachep->spinlock);
 
@@ -1434,23 +1434,18 @@ static inline void kmem_cache_free_one(kmem_cache_t *cachep, void *objp)
        STATS_DEC_ACTIVE(cachep);
        
        /* fixup slab chains */
-       if (!--slabp->inuse)
-               goto moveslab_free;
-       if (slabp->inuse + 1 == cachep->num)
-               goto moveslab_partial;
-       return;
-
-moveslab_partial:
-       /* Was full. */
-       list_del(&slabp->list);
-       list_add(&slabp->list, &cachep->slabs_partial);
-       return;
-
-moveslab_free:
-       /* Was partial, now empty. */
-       list_del(&slabp->list);
-       list_add(&slabp->list, &cachep->slabs_free);
-       return;
+       {
+               int inuse = slabp->inuse;
+               if (__builtin_expect(!--slabp->inuse, 0)) {
+                       /* Was partial or full, now empty. */
+                       list_del(&slabp->list);
+                       list_add(&slabp->list, &cachep->slabs_free);
+               } else if (__builtin_expect(inuse == cachep->num, 0)) {
+                       /* Was full. */
+                       list_del(&slabp->list);
+                       list_add(&slabp->list, &cachep->slabs_partial);
+               }
+       }
 }
 
 #ifdef CONFIG_SMP
@@ -1709,7 +1704,7 @@ static void enable_all_cpucaches (void)
  *
  * Called from do_try_to_free_pages() and __alloc_pages()
  */
-void kmem_cache_reap (int gfp_mask)
+int kmem_cache_reap (int gfp_mask)
 {
        slab_t *slabp;
        kmem_cache_t *searchp;
@@ -1717,12 +1712,13 @@ void kmem_cache_reap (int gfp_mask)
        unsigned int best_pages;
        unsigned int best_len;
        unsigned int scan;
+       int ret = 0;
 
        if (gfp_mask & __GFP_WAIT)
                down(&cache_chain_sem);
        else
                if (down_trylock(&cache_chain_sem))
-                       return;
+                       return 0;
 
        scan = REAP_SCANLEN;
        best_len = 0;
@@ -1758,8 +1754,10 @@ void kmem_cache_reap (int gfp_mask)
                p = searchp->slabs_free.next;
                while (p != &searchp->slabs_free) {
                        slabp = list_entry(p, slab_t, list);
+#if DEBUG
                        if (slabp->inuse)
                                BUG();
+#endif
                        full_free++;
                        p = p->next;
                }
@@ -1809,8 +1807,10 @@ perfect:
                if (p == &best_cachep->slabs_free)
                        break;
                slabp = list_entry(p,slab_t,list);
+#if DEBUG
                if (slabp->inuse)
                        BUG();
+#endif
                list_del(&slabp->list);
                STATS_INC_REAPED(best_cachep);
 
@@ -1822,9 +1822,10 @@ perfect:
                spin_lock_irq(&best_cachep->spinlock);
        }
        spin_unlock_irq(&best_cachep->spinlock);
+       ret = scan * (1 << best_cachep->gfporder);
 out:
        up(&cache_chain_sem);
-       return;
+       return ret;
 }
 
 #ifdef CONFIG_PROC_FS
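With kmem_cache_reap() now returning roughly the number of pages it released (slabs reaped from the chosen cache times 1 << gfporder pages per slab), callers can credit slab reaping against their reclaim target, as shrink_caches() does later in this patch. A stand-alone model of that bookkeeping; model_kmem_cache_reap() and all numbers are invented for illustration:

#include <stdio.h>

/* Model only: made-up numbers standing in for kmem_cache_reap()'s new
 * return value, roughly (slabs reaped) * (1 << gfporder) pages. */
static int model_kmem_cache_reap(void)
{
	int slabs_reaped = 5;
	int gfporder = 1;		/* each slab spans 2 pages here */
	return slabs_reaped * (1 << gfporder);
}

int main(void)
{
	int nr_pages = 32;		/* a SWAP_CLUSTER_MAX-sized target */

	nr_pages -= model_kmem_cache_reap();
	if (nr_pages <= 0)
		printf("reclaim target met by slab reaping alone\n");
	else
		printf("still need %d pages from the LRU lists\n", nr_pages);
	return 0;
}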
index 271060f5f6e7879cb29001c4270c98c9b58cf4ce..37b9ea1babb6ec497a40cc0b02a0b5a8daacf808 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
 #include <asm/uaccess.h> /* for copy_to/from_user */
 #include <asm/pgtable.h>
 
-/*
- * We identify three levels of free memory.  We never let free mem
- * fall below the freepages.min except for atomic allocations.  We
- * start background swapping if we fall below freepages.high free
- * pages, and we begin intensive swapping below freepages.low.
- *
- * Actual initialization is done in mm/page_alloc.c
- */
-freepages_t freepages = {
-       0,      /* freepages.min */
-       0,      /* freepages.low */
-       0       /* freepages.high */
-};
-
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
-/*
- * This variable contains the amount of page steals the system
- * is doing, averaged over a minute. We use this to determine how
- * many inactive pages we should have.
- *
- * In reclaim_page and __alloc_pages: memory_pressure++
- * In __free_pages_ok: memory_pressure--
- * In recalculate_vm_stats the value is decayed (once a second)
- */
-int memory_pressure;
-
 /* We track the number of pages currently being asynchronously swapped
    out, so that we don't try to swap TOO many pages out at once */
 atomic_t nr_async_pages = ATOMIC_INIT(0);
 
-buffer_mem_t buffer_mem = {
-       2,      /* minimum percent buffer */
-       10,     /* borrow percent buffer */
-       60      /* maximum percent buffer */
-};
-
-buffer_mem_t page_cache = {
-       2,      /* minimum percent page cache */
-       15,     /* borrow percent page cache */
-       75      /* maximum */
-};
-
 pager_daemon_t pager_daemon = {
        512,    /* base number for calculating the number of tries */
        SWAP_CLUSTER_MAX,       /* minimum number of tries */
@@ -87,25 +50,9 @@ pager_daemon_t pager_daemon = {
  */
 void deactivate_page_nolock(struct page * page)
 {
-       /*
-        * One for the cache, one for the extra reference the
-        * caller has and (maybe) one for the buffers.
-        *
-        * This isn't perfect, but works for just about everything.
-        * Besides, as long as we don't move unfreeable pages to the
-        * inactive_clean list it doesn't need to be perfect...
-        */
-       int maxcount = (page->buffers ? 3 : 2);
-       page->age = 0;
-       ClearPageReferenced(page);
-
-       /*
-        * Don't touch it if it's not on the active list.
-        * (some pages aren't on any list at all)
-        */
-       if (PageActive(page) && page_count(page) <= maxcount && !page_ramdisk(page)) {
+       if (PageActive(page)) {
                del_page_from_active_list(page);
-               add_page_to_inactive_dirty_list(page);
+               add_page_to_inactive_list(page);
        }
 }      
 
@@ -121,22 +68,10 @@ void deactivate_page(struct page * page)
  */
 void activate_page_nolock(struct page * page)
 {
-       if (PageInactiveDirty(page)) {
-               del_page_from_inactive_dirty_list(page);
+       if (PageInactive(page)) {
+               del_page_from_inactive_list(page);
                add_page_to_active_list(page);
-       } else if (PageInactiveClean(page)) {
-               del_page_from_inactive_clean_list(page);
-               add_page_to_active_list(page);
-       } else {
-               /*
-                * The page was not on any list, so we take care
-                * not to do anything.
-                */
        }
-
-       /* Make sure the page gets a fair chance at staying active. */
-       if (page->age < PAGE_AGE_START)
-               page->age = PAGE_AGE_START;
 }
 
 void activate_page(struct page * page)
@@ -152,11 +87,10 @@ void activate_page(struct page * page)
  */
 void lru_cache_add(struct page * page)
 {
-       spin_lock(&pagemap_lru_lock);
        if (!PageLocked(page))
                BUG();
-       add_page_to_inactive_dirty_list(page);
-       page->age = 0;
+       spin_lock(&pagemap_lru_lock);
+       add_page_to_inactive_list(page);
        spin_unlock(&pagemap_lru_lock);
 }
 
@@ -171,14 +105,11 @@ void __lru_cache_del(struct page * page)
 {
        if (PageActive(page)) {
                del_page_from_active_list(page);
-       } else if (PageInactiveDirty(page)) {
-               del_page_from_inactive_dirty_list(page);
-       } else if (PageInactiveClean(page)) {
-               del_page_from_inactive_clean_list(page);
-       } else {
+       } else if (PageInactive(page)) {
+               del_page_from_inactive_list(page);
+       } else
                printk("VM: __lru_cache_del, found unknown page ?!\n");
-       }
-       DEBUG_ADD_PAGE
+       DEBUG_LRU_PAGE(page);
 }
 
 /**
@@ -194,22 +125,6 @@ void lru_cache_del(struct page * page)
        spin_unlock(&pagemap_lru_lock);
 }
 
-/**
- * recalculate_vm_stats - recalculate VM statistics
- *
- * This function should be called once a second to recalculate
- * some useful statistics the VM subsystem uses to determine
- * its behaviour.
- */
-void recalculate_vm_stats(void)
-{
-       /*
-        * Substract one second worth of memory_pressure from
-        * memory_pressure.
-        */
-       memory_pressure -= (memory_pressure >> INACTIVE_SHIFT);
-}
-
 /*
  * Perform any setup for the swap system
  */
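The net effect of this file's changes: the old three-queue LRU (active, inactive_dirty, inactive_clean) with per-page aging collapses into two plain lists, and activation/deactivation become simple list moves. A toy stand-alone model of those two transitions; the enum and function names below are illustrative, not the kernel's:

#include <stdio.h>

/* Toy model only: a page sits on at most one of two LRU lists and there
 * is no page->age any more; activation and deactivation just move it. */
enum lru_state { LRU_ACTIVE, LRU_INACTIVE };

static void model_activate(enum lru_state *s)
{
	if (*s == LRU_INACTIVE)
		*s = LRU_ACTIVE;	/* referenced again: promote */
}

static void model_deactivate(enum lru_state *s)
{
	if (*s == LRU_ACTIVE)
		*s = LRU_INACTIVE;	/* reclaim candidate: demote */
}

int main(void)
{
	enum lru_state s = LRU_ACTIVE;

	model_deactivate(&s);
	model_activate(&s);
	printf("page ends up on the %s list\n",
	       s == LRU_ACTIVE ? "active" : "inactive");
	return 0;
}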
index bbfb111895710387cdfcb70898302fa2e42ce145..0d9278742bbbcdf852af162c7e7a247f969404b4 100644 (file)
  */
 static int swap_writepage(struct page *page)
 {
-       /* One for the page cache, one for this user, one for page->buffers */
-       if (page_count(page) > 2 + !!page->buffers)
-               goto in_use;
-       if (swap_count(page) > 1)
-               goto in_use;
-
-       delete_from_swap_cache_nolock(page);
-       UnlockPage(page);
-       return 0;
-
-in_use:
        rw_swap_page(WRITE, page);
        return 0;
 }
@@ -81,9 +70,8 @@ void add_to_swap_cache(struct page *page, swp_entry_t entry)
                BUG();
 
        /* clear PG_dirty so a subsequent set_page_dirty takes effect */
-       flags = page->flags & ~((1 << PG_error) | (1 << PG_dirty) | (1 << PG_arch_1));
+       flags = page->flags & ~(1 << PG_error | 1 << PG_dirty | 1 << PG_arch_1 | 1 << PG_referenced);
        page->flags = flags | (1 << PG_uptodate);
-       page->age = PAGE_AGE_START;
        add_to_page_cache_locked(page, &swapper_space, entry.val);
 }
 
index 4ecae70a8ed8aeed92c857cac18d4c7c68829c5d..a7038a0824ebce173250364068af82fa07aa2f37 100644 (file)
@@ -31,25 +31,6 @@ struct swap_list_t swap_list = {-1, -1};
 
 struct swap_info_struct swap_info[MAX_SWAPFILES];
 
-/*
- * When swap space gets filled up, we will set this flag.
- * This will make do_swap_page(), in the page fault path,
- * free swap entries on swapin so we'll reclaim swap space
- * in order to be able to swap something out.
- *
- * At the moment we start reclaiming when swap usage goes
- * over 80% of swap space.
- *
- * XXX: Random numbers, fixme.
- */
-#define SWAP_FULL_PCT 80
-int vm_swap_full (void)
-{
-       int swap_used = total_swap_pages - nr_swap_pages;
-
-       return swap_used * 100 > total_swap_pages * SWAP_FULL_PCT;
-}
-
 #define SWAPFILE_CLUSTER 256
 
 static inline int scan_swap_map(struct swap_info_struct *si, unsigned short count)
@@ -471,7 +452,6 @@ static int try_to_unuse(unsigned int type)
                lock_page(page);
                if (PageSwapCache(page))
                        delete_from_swap_cache_nolock(page);
-               SetPageDirty(page);
                UnlockPage(page);
                flush_page_to_ram(page);
 
@@ -512,6 +492,7 @@ static int try_to_unuse(unsigned int type)
                        mmput(start_mm);
                        start_mm = new_start_mm;
                }
+               ClearPageDirty(page);
                page_cache_release(page);
 
                /*
index b9ad759fa27c9d4b3d2a8daf16be61b4d76b31fd..87fd3e0ed3a46cf4d40579ae42ba326120c98ac6 100644 (file)
@@ -144,7 +144,6 @@ inline int vmalloc_area_pages (unsigned long address, unsigned long size,
        int ret;
 
        dir = pgd_offset_k(address);
-       flush_cache_all();
        spin_lock(&init_mm.page_table_lock);
        do {
                pmd_t *pmd;
@@ -164,7 +163,6 @@ inline int vmalloc_area_pages (unsigned long address, unsigned long size,
                ret = 0;
        } while (address && (address < end));
        spin_unlock(&init_mm.page_table_lock);
-       flush_tlb_all();
        return ret;
 }
 
index 3025c2ca3620d15e27ada4b43db1b679889febcf..9bb4dbcba7194de8e5a7b6b84cefd04035ee6bf1 100644 (file)
  */
 #define DEF_PRIORITY (6)
 
-static inline void age_page_up(struct page *page)
-{
-       unsigned age = page->age + PAGE_AGE_ADV;
-       if (age > PAGE_AGE_MAX)
-               age = PAGE_AGE_MAX;
-       page->age = age;
-}
-
-static inline void age_page_down(struct page * page)
-{
-       page->age /= 2;
-}
-
 /*
  * The swap-out function returns 1 if it successfully
  * scanned all the pages it was asked to (`count').
@@ -54,55 +41,24 @@ static inline void age_page_down(struct page * page)
  * doesn't count as having freed a page.
  */
 
-/*
- * Estimate whether a zone has enough inactive or free pages..
- */
-static unsigned int zone_inactive_plenty(zone_t *zone)
-{
-       unsigned int inactive;
-
-       if (!zone->size)
-               return 0;
-               
-       inactive = zone->inactive_dirty_pages;
-       inactive += zone->inactive_clean_pages;
-       inactive += zone->free_pages;
-
-       return (inactive > (zone->size / 3));
-}
-
-static unsigned int zone_free_plenty(zone_t *zone)
-{
-       unsigned int free;
-
-       free = zone->free_pages;
-       free += zone->inactive_clean_pages;
-
-       return free > zone->pages_high*2;
-}
-
 /* mm->page_table_lock is held. mmap_sem is not held */
-static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
+static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
 {
        pte_t pte;
        swp_entry_t entry;
 
-       /* 
-        * If we are doing a zone-specific scan, do not
-        * touch pages from zones which don't have a 
-        * shortage.
-        */
-       if (zone_inactive_plenty(page->zone))
-               return;
-
        /* Don't look at this pte if it's been accessed recently. */
        if (ptep_test_and_clear_young(page_table)) {
-               mark_page_accessed(page);
-               return;
+               flush_tlb_page(vma, address);
+               SetPageReferenced(page);
+               return 0;
        }
 
+       if (!memclass(page->zone, classzone))
+               return 0;
+
        if (TryLockPage(page))
-               return;
+               return 0;
 
        /* From this point on, the odds are that we're going to
         * nuke this pte, so read and clear the pte.  This hook
@@ -127,11 +83,14 @@ set_swap_pte:
                set_pte(page_table, swp_entry_to_pte(entry));
 drop_pte:
                mm->rss--;
-               if (!PageReferenced(page))
-                       deactivate_page(page);
                UnlockPage(page);
-               page_cache_release(page);
-               return;
+               {
+                       int freeable = page_count(page) - !!page->buffers <= 2;
+                       if (freeable)
+                               deactivate_page(page);
+                       page_cache_release(page);
+                       return freeable;
+               }
        }
 
        /*
@@ -178,11 +137,11 @@ drop_pte:
 out_unlock_restore:
        set_pte(page_table, pte);
        UnlockPage(page);
-       return;
+       return 0;
 }
 
 /* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
+static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
 {
        pte_t * pte;
        unsigned long pmd_end;
@@ -206,20 +165,22 @@ static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_
                        struct page *page = pte_page(*pte);
 
                        if (VALID_PAGE(page) && !PageReserved(page)) {
-                               try_to_swap_out(mm, vma, address, pte, page);
-                               if (!--count)
+                               count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
+                               if (!count) {
+                                       address += PAGE_SIZE;
                                        break;
+                               }
                        }
                }
                address += PAGE_SIZE;
                pte++;
        } while (address && (address < end));
-       mm->swap_address = address + PAGE_SIZE;
+       mm->swap_address = address;
        return count;
 }
 
 /* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
+static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
 {
        pmd_t * pmd;
        unsigned long pgd_end;
@@ -239,7 +200,7 @@ static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vm
                end = pgd_end;
        
        do {
-               count = swap_out_pmd(mm, vma, pmd, address, end, count);
+               count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
                if (!count)
                        break;
                address = (address + PMD_SIZE) & PMD_MASK;
@@ -249,7 +210,7 @@ static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vm
 }
 
 /* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
+static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
 {
        pgd_t *pgdir;
        unsigned long end;
@@ -264,7 +225,7 @@ static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsi
        if (address >= end)
                BUG();
        do {
-               count = swap_out_pgd(mm, vma, pgdir, address, end, count);
+               count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
                if (!count)
                        break;
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
@@ -273,25 +234,26 @@ static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsi
        return count;
 }
 
+/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
+struct mm_struct *swap_mm = &init_mm;
+
 /*
  * Returns non-zero if we scanned all `count' pages
  */
-static int swap_out_mm(struct mm_struct * mm, int count)
+static inline int swap_out_mm(struct mm_struct * mm, int count, int * race, zone_t * classzone)
 {
        unsigned long address;
        struct vm_area_struct* vma;
 
-       if (!count)
-               return 1;
-       /*
-        * Go through process' page directory.
-        */
-
        /*
         * Find the proper vm-area after freezing the vma chain 
         * and ptes.
         */
        spin_lock(&mm->page_table_lock);
+       *race = 1;
+       if (swap_mm != mm)
+               goto out_unlock;
+       *race = 0;
        address = mm->swap_address;
        vma = find_vma(mm, address);
        if (vma) {
@@ -299,7 +261,7 @@ static int swap_out_mm(struct mm_struct * mm, int count)
                        address = vma->vm_start;
 
                for (;;) {
-                       count = swap_out_vma(mm, vma, address, count);
+                       count = swap_out_vma(mm, vma, address, count, classzone);
                        if (!count)
                                goto out_unlock;
                        vma = vma->vm_next;
@@ -311,224 +273,106 @@ static int swap_out_mm(struct mm_struct * mm, int count)
        /* Reset to 0 when we reach the end of address space */
        mm->swap_address = 0;
 
+       spin_lock(&mmlist_lock);
+       swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
+       spin_unlock(&mmlist_lock);
+
 out_unlock:
        spin_unlock(&mm->page_table_lock);
-       return !count;
-}
-
-#define SWAP_MM_SHIFT  4
-#define SWAP_SHIFT     5
-#define SWAP_MIN       8
 
-static inline int swap_amount(struct mm_struct *mm)
-{
-       int nr = mm->rss >> SWAP_SHIFT;
-       if (nr < SWAP_MIN) {
-               nr = SWAP_MIN;
-               if (nr > mm->rss)
-                       nr = mm->rss;
-       }
-       return nr;
+       return count;
 }
 
-/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
-struct mm_struct *swap_mm = &init_mm;
-
-static void swap_out(unsigned int priority, int gfp_mask)
+static int FASTCALL(swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
+static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
 {
-       int counter;
-       int retval = 0;
-       struct mm_struct *mm = current->mm;
-
-       /* Always start by trying to penalize the process that is allocating memory */
-       if (mm)
-               retval = swap_out_mm(mm, swap_amount(mm));
+       int counter, race;
+       struct mm_struct *mm;
 
        /* Then, look at the other mm's */
-       counter = (mmlist_nr << SWAP_MM_SHIFT) >> priority;
+       counter = mmlist_nr / priority;
        do {
+               if (current->need_resched)
+                       schedule();
+
                spin_lock(&mmlist_lock);
                mm = swap_mm;
                if (mm == &init_mm) {
                        mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
                        if (mm == &init_mm)
                                goto empty;
+                       swap_mm = mm;
                }
-               /* Set pointer for next call to next in the list */
-               swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
 
                /* Make sure the mm doesn't disappear when we drop the lock.. */
                atomic_inc(&mm->mm_users);
                spin_unlock(&mmlist_lock);
 
-               /* Walk about 6% of the address space each time */
-               retval |= swap_out_mm(mm, swap_amount(mm));
+               nr_pages = swap_out_mm(mm, nr_pages, &race, classzone);
+
                mmput(mm);
-       } while (--counter >= 0);
-       return;
+
+               if (!nr_pages)
+                       return 1;
+       } while (race || --counter >= 0);
+
+       return 0;
 
 empty:
        spin_unlock(&mmlist_lock);
+       return 0;
 }
 
-
-/**
- * reclaim_page -      reclaims one page from the inactive_clean list
- * @zone: reclaim a page from this zone
- *
- * The pages on the inactive_clean can be instantly reclaimed.
- * The tests look impressive, but most of the time we'll grab
- * the first page of the list and exit successfully.
- */
-struct page * reclaim_page(zone_t * zone)
+static int FASTCALL(shrink_cache(struct list_head * lru, int * max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask));
+static int shrink_cache(struct list_head * lru, int * max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask)
 {
-       struct page * page = NULL;
-       struct list_head * page_lru;
-       int maxscan;
+       LIST_HEAD(active_local_lru);
+       LIST_HEAD(inactive_local_lru);
+       struct list_head * entry;
+       int __max_scan = *max_scan;
 
-       /*
-        * We only need the pagemap_lru_lock if we don't reclaim the page,
-        * but we have to grab the pagecache_lock before the pagemap_lru_lock
-        * to avoid deadlocks and most of the time we'll succeed anyway.
-        */
-       spin_lock(&pagecache_lock);
        spin_lock(&pagemap_lru_lock);
-       maxscan = zone->inactive_clean_pages;
-       while ((page_lru = zone->inactive_clean_list.prev) !=
-                       &zone->inactive_clean_list && maxscan--) {
-               page = list_entry(page_lru, struct page, lru);
-
-               /* Wrong page on list?! (list corruption, should not happen) */
-               if (!PageInactiveClean(page)) {
-                       printk("VM: reclaim_page, wrong page on list.\n");
-                       list_del(page_lru);
-                       page->zone->inactive_clean_pages--;
-                       continue;
-               }
-
-               /* Page is referenced? Clear and move to the head of the list.. */
-               if (PageTestandClearReferenced(page)) {
-                       list_del(page_lru);
-                       list_add(page_lru, &zone->inactive_clean_list);
-               }
-
-               /* The page is dirty, or locked, move to inactive_dirty list. */
-               if (page->buffers || PageDirty(page) || TryLockPage(page)) {
-                       del_page_from_inactive_clean_list(page);
-                       add_page_to_inactive_dirty_list(page);
-                       continue;
-               }
+       while (__max_scan && (entry = lru->prev) != lru) {
+               struct page * page;
 
-               /* Page is in use?  Move it to the active list. */
-               if (page_count(page) > 1) {
-                       UnlockPage(page);
-                       del_page_from_inactive_clean_list(page);
-                       add_page_to_active_list(page);
+               if (__builtin_expect(current->need_resched, 0)) {
+                       spin_unlock(&pagemap_lru_lock);
+                       schedule();
+                       spin_lock(&pagemap_lru_lock);
                        continue;
                }
 
-               /* OK, remove the page from the caches. */
-               if (PageSwapCache(page)) {
-                       __delete_from_swap_cache(page);
-                       goto found_page;
-               }
+               page = list_entry(entry, struct page, lru);
 
-               if (page->mapping) {
-                       __remove_inode_page(page);
-                       goto found_page;
-               }
+               if (__builtin_expect(!PageInactive(page) && !PageActive(page), 0))
+                       BUG();
 
-               /* We should never ever get here. */
-               printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
-               list_del(page_lru);
-               zone->inactive_clean_pages--;
-               UnlockPage(page);
-       }
-       /* Reset page pointer, maybe we encountered an unfreeable page. */
-       page = NULL;
-       goto out;
-
-found_page:
-       memory_pressure++;
-       del_page_from_inactive_clean_list(page);
-       UnlockPage(page);
-       page->age = PAGE_AGE_START;
-       if (page_count(page) != 1)
-               printk("VM: reclaim_page, found page with count %d!\n",
-                               page_count(page));
-out:
-       spin_unlock(&pagemap_lru_lock);
-       spin_unlock(&pagecache_lock);
-       return page;
-}
-
-/**
- * page_launder - clean dirty inactive pages, move to inactive_clean list
- * @gfp_mask: what operations we are allowed to do
- * @sync: are we allowed to do synchronous IO in emergencies ?
- *
- * When this function is called, we are most likely low on free +
- * inactive_clean pages. Since we want to refill those pages as
- * soon as possible, we'll make two loops over the inactive list,
- * one to move the already cleaned pages to the inactive_clean lists
- * and one to (often asynchronously) clean the dirty inactive pages.
- *
- * In situations where kswapd cannot keep up, user processes will
- * end up calling this function. Since the user process needs to
- * have a page before it can continue with its allocation, we'll
- * do synchronous page flushing in that case.
- *
- * This code used to be heavily inspired by the FreeBSD source code. 
- * Thanks go out to Matthew Dillon.
- */
-#define CAN_DO_FS              (gfp_mask & __GFP_FS)
-int page_launder(int gfp_mask, int sync)
-{
-       int maxscan, cleaned_pages;
-       struct list_head * page_lru;
-       struct page * page;
-
-       cleaned_pages = 0;
-
-       /* Will we wait on IO? */
-       if (!sync)
-               gfp_mask &= ~__GFP_WAIT;
-
-       spin_lock(&pagemap_lru_lock);
-       maxscan = nr_inactive_dirty_pages >> DEF_PRIORITY;
-       while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
-                               maxscan-- > 0) {
-               page = list_entry(page_lru, struct page, lru);
-
-               /* Wrong page on list?! (list corruption, should not happen) */
-               if (!PageInactiveDirty(page)) {
-                       printk("VM: page_launder, wrong page on list.\n");
-                       list_del(page_lru);
-                       nr_inactive_dirty_pages--;
-                       page->zone->inactive_dirty_pages--;
+               if (PageTestandClearReferenced(page)) {
+                       if (PageInactive(page)) {
+                               del_page_from_inactive_list(page);
+                               add_page_to_active_list(page);
+                       } else if (PageActive(page)) {
+                               list_del(entry);
+                               list_add(entry, &active_list);
+                       } else
+                               BUG();
                        continue;
                }
 
-               /* Page is referenced? Clear and move to the head of the list.. */
-               if (PageTestandClearReferenced(page)) {
-                       list_del(page_lru);
-                       list_add(page_lru, &inactive_dirty_list);
-               }
+               deactivate_page_nolock(page);
+               list_del(entry);
+               list_add_tail(entry, &inactive_local_lru);
 
-               /* Page is in use?  Move it to the active list. */
-               if ((!page->buffers && page_count(page) > 1) || page_ramdisk(page)) {
-                       del_page_from_inactive_dirty_list(page);
-                       add_page_to_active_list(page);
+               if (__builtin_expect(!memclass(page->zone, classzone), 0))
                        continue;
-               }
 
-               /* 
-                * If this zone has plenty of pages free,
-                * don't spend time on cleaning it.
-                */
-               if (zone_free_plenty(page->zone)) {
-                       list_del(page_lru);
-                       list_add(page_lru, &inactive_dirty_list);
+               __max_scan--;
+
+               /* Racy check to avoid trylocking when not worthwhile */
+               if (!page->buffers && page_count(page) != 1) {
+                       activate_page_nolock(page);
+                       list_del(entry);
+                       list_add_tail(entry, &active_local_lru);
                        continue;
                }
 
@@ -536,362 +380,252 @@ int page_launder(int gfp_mask, int sync)
                 * The page is locked. IO in progress?
                 * Move it to the back of the list.
                 */
-               if (TryLockPage(page)) {
-                       list_del(page_lru);
-                       list_add(page_lru, &inactive_dirty_list);
+               if (__builtin_expect(TryLockPage(page), 0))
                        continue;
-               }
 
-               /*
-                * Dirty swap-cache page? Write it out if
-                * last copy..
-                */
-               if (PageDirty(page)) {
+               if (PageDirty(page) && is_page_cache_freeable(page)) {
+                       /*
+                        * It is not critical here to write the page out only
+                        * if it is unmapped, because any direct writer
+                        * like O_DIRECT would set the PG_dirty bitflag
+                        * on the physical page after having successfully
+                        * pinned it and after the I/O to the page is finished,
+                        * so direct writes to the page cannot get lost.
+                        */
                        int (*writepage)(struct page *);
 
-                       /* Can a page get here without page->mapping? */
-                       if (!page->mapping)
-                               goto page_active;
                        writepage = page->mapping->a_ops->writepage;
-                       if (!writepage)
-                               goto page_active;
+                       if (gfp_mask & __GFP_FS && writepage) {
+                               spin_unlock(&pagemap_lru_lock);
 
-                       /* Can't do it? Move it to the back of the list */
-                       if (!CAN_DO_FS) {
-                               list_del(page_lru);
-                               list_add(page_lru, &inactive_dirty_list);
-                               UnlockPage(page);
+                               ClearPageDirty(page);
+                               writepage(page);
+
+                               spin_lock(&pagemap_lru_lock);
                                continue;
                        }
-
-                       /* OK, do a physical asynchronous write to swap.  */
-                       ClearPageDirty(page);
-                       page_cache_get(page);
-                       spin_unlock(&pagemap_lru_lock);
-
-                       writepage(page);
-                       page_cache_release(page);
-
-                       /* And re-start the thing.. */
-                       spin_lock(&pagemap_lru_lock);
-                       continue;
                }
 
                /*
                 * If the page has buffers, try to free the buffer mappings
-                * associated with this page. If we succeed we either free
-                * the page (in case it was a buffercache only page) or we
-                * move the page to the inactive_clean list.
-                *
-                * On the first round, we should free all previously cleaned
-                * buffer pages
+                * associated with this page. If we succeed we try to free
+                * the page as well.
                 */
                if (page->buffers) {
-                       int clearedbuf;
-                       int freed_page = 0;
+                       spin_unlock(&pagemap_lru_lock);
 
-                       /*
-                        * Since we might be doing disk IO, we have to
-                        * drop the spinlock and take an extra reference
-                        * on the page so it doesn't go away from under us.
-                        */
-                       del_page_from_inactive_dirty_list(page);
+                       /* avoid freeing a locked page */
                        page_cache_get(page);
-                       spin_unlock(&pagemap_lru_lock);
 
-                       /* Try to free the page buffers. */
-                       clearedbuf = try_to_free_buffers(page, gfp_mask);
+                       if (try_to_free_buffers(page, gfp_mask)) {
+                               if (!page->mapping) {
+                                       UnlockPage(page);
 
-                       /*
-                        * Re-take the spinlock. Note that we cannot
-                        * unlock the page yet since we're still
-                        * accessing the page_struct here...
-                        */
-                       spin_lock(&pagemap_lru_lock);
+                                       /*
+                                        * Account that we successfully freed a page
+                                        * of buffer cache.
+                                        */
+                                       atomic_dec(&buffermem_pages);
 
-                       /* The buffers were not freed. */
-                       if (!clearedbuf) {
-                               add_page_to_inactive_dirty_list(page);
+                                       spin_lock(&pagemap_lru_lock);
+                                       __lru_cache_del(page);
 
-                       /* The page was only in the buffer cache. */
-                       } else if (!page->mapping) {
-                               atomic_dec(&buffermem_pages);
-                               freed_page = 1;
-                               cleaned_pages++;
+                                       /* effectively free the page here */
+                                       page_cache_release(page);
 
-                       /* The page has more users besides the cache and us. */
-                       } else if (page_count(page) > 2) {
-                               add_page_to_active_list(page);
+                                       if (--nr_pages)
+                                               continue;
+                                       break;
+                               } else {
+                                       /*
+                                        * The page is still in pagecache so undo the stuff
+                                        * The page is still in the pagecache, so undo
+                                        * the reference taken before try_to_free_buffers;
+                                        * we are not finished and can now try the next step.
+                                       page_cache_release(page);
+
+                                       spin_lock(&pagemap_lru_lock);
+                               }
+                       } else {
+                               /* failed to drop the buffers so stop here */
+                               UnlockPage(page);
+                               page_cache_release(page);
 
-                       /* OK, we "created" a freeable page. */
-                       } else /* page->mapping && page_count(page) == 2 */ {
-                               add_page_to_inactive_clean_list(page);
-                               cleaned_pages++;
+                               spin_lock(&pagemap_lru_lock);
+                               continue;
                        }
-
-                       /*
-                        * Unlock the page and drop the extra reference.
-                        * We can only do it here because we are accessing
-                        * the page struct above.
-                        */
-                       UnlockPage(page);
-                       page_cache_release(page);
-
-                       continue;
-               } else if (page->mapping && !PageDirty(page)) {
-                       /*
-                        * If a page had an extra reference in
-                        * deactivate_page(), we will find it here.
-                        * Now the page is really freeable, so we
-                        * move it to the inactive_clean list.
-                        */
-                       del_page_from_inactive_dirty_list(page);
-                       add_page_to_inactive_clean_list(page);
-                       UnlockPage(page);
-                       cleaned_pages++;
-               } else {
-page_active:
-                       /*
-                        * OK, we don't know what to do with the page.
-                        * It's no use keeping it here, so we move it to
-                        * the active list.
-                        */
-                       del_page_from_inactive_dirty_list(page);
-                       add_page_to_active_list(page);
-                       UnlockPage(page);
                }
-       }
-       spin_unlock(&pagemap_lru_lock);
 
-       /* Return the number of pages moved to the inactive_clean list. */
-       return cleaned_pages;
-}
+               if (__builtin_expect(!page->mapping, 0))
+                       BUG();
 
-/**
- * refill_inactive_scan - scan the active list and find pages to deactivate
- * @priority: the priority at which to scan
- *
- * This function will scan a portion of the active list to find
- * unused pages, those pages will then be moved to the inactive list.
- */
-static int refill_inactive_scan(unsigned int priority)
-{
-       struct list_head * page_lru;
-       struct page * page;
-       int maxscan = nr_active_pages >> priority;
-       int page_active = 0;
-       int nr_deactivated = 0;
+               if (__builtin_expect(!spin_trylock(&pagecache_lock), 0)) {
+                       /* we hold the page lock so the page cannot go away from under us */
+                       spin_unlock(&pagemap_lru_lock);
 
-       /* Take the lock while messing with the list... */
-       spin_lock(&pagemap_lru_lock);
-       while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
-               page = list_entry(page_lru, struct page, lru);
-
-               /* Wrong page on list?! (list corruption, should not happen) */
-               if (!PageActive(page)) {
-                       printk("VM: refill_inactive, wrong page on list.\n");
-                       list_del(page_lru);
-                       nr_active_pages--;
-                       continue;
+                       spin_lock(&pagecache_lock);
+                       spin_lock(&pagemap_lru_lock);
                }
 
                /*
-                * Do not deactivate pages from zones which 
-                * have plenty inactive pages.
+                * This is the non-racy check; it is critical to check
+                * PageDirty _after_ we have made sure the page is freeable,
+                * i.e. not in use by anybody.
                 */
-
-               if (zone_inactive_plenty(page->zone)) {
-                       page_active = 1;
-                       goto skip_page;
+               if (!is_page_cache_freeable(page) || PageDirty(page)) {
+                       spin_unlock(&pagecache_lock);
+                       UnlockPage(page);
+                       continue;
                }
 
-               /* Do aging on the pages. */
-               if (PageTestandClearReferenced(page)) {
-                       age_page_up(page);
-                       page_active = 1;
-               } else {
-                       age_page_down(page);
-                       /*
-                        * Since we don't hold a reference on the page
-                        * ourselves, we have to do our test a bit more
-                        * strict then deactivate_page(). This is needed
-                        * since otherwise the system could hang shuffling
-                        * unfreeable pages from the active list to the
-                        * inactive_dirty list and back again...
-                        *
-                        * SUBTLE: we can have buffer pages with count 1.
-                        */
-                       if (page_count(page) <= (page->buffers ? 2 : 1)) {
-                               deactivate_page_nolock(page);
-                               page_active = 0;
-                       } else {
-                               page_active = 1;
-                       }
-               }
-               /*
-                * If the page is still on the active list, move it
-                * to the other end of the list. Otherwise we exit if
-                * we have done enough work.
-                */
-               if (page_active || PageActive(page)) {
-skip_page:
-                       list_del(page_lru);
-                       list_add(page_lru, &active_list);
-               } else {
-                       nr_deactivated++;
-               }
+               /* point of no return */
+               if (__builtin_expect(!PageSwapCache(page), 1))
+                       __remove_inode_page(page);
+               else
+                       __delete_from_swap_cache(page);
+               spin_unlock(&pagecache_lock);
+
+               __lru_cache_del(page);
+
+               UnlockPage(page);
+
+               /* effectively free the page here */
+               page_cache_release(page);
+
+               if (--nr_pages)
+                       continue;
+               break;
        }
+
+       list_splice(&inactive_local_lru, &inactive_list);
+       list_splice(&active_local_lru, &active_list);
        spin_unlock(&pagemap_lru_lock);
 
-       return nr_deactivated;
+       *max_scan = __max_scan;
+       return nr_pages;
 }
 
-/*
- * Check if there are zones with a severe shortage of free pages,
- * or if all zones have a minor shortage.
- */
-int free_shortage(void)
+static int FASTCALL(shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
+static int shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
 {
-       pg_data_t *pgdat;
-       unsigned int global_free = 0;
-       unsigned int global_target = freepages.high;
+       int max_scan = (nr_inactive_pages + nr_active_pages / priority) / priority;
 
-       /* Are we low on free pages anywhere? */
-       pgdat = pgdat_list;
-       do {
-               int i;
-               for(i = 0; i < MAX_NR_ZONES; i++) {
-                       zone_t *zone = pgdat->node_zones+ i;
-                       unsigned int free;
-
-                       if (!zone->size)
-                               continue;
+       nr_pages -= kmem_cache_reap(gfp_mask);
+       if (nr_pages <= 0)
+               return 0;
 
-                       free = zone->free_pages;
-                       free += zone->inactive_clean_pages;
+       nr_pages = shrink_cache(&inactive_list, &max_scan, nr_pages, classzone, gfp_mask);
+       if (nr_pages <= 0)
+               return 0;
 
-                       /* Local shortage? */
-                       if (free < zone->pages_low)
-                               return 1;
+       nr_pages = shrink_cache(&active_list, &max_scan, nr_pages, classzone, gfp_mask);
+       if (nr_pages <= 0)
+               return 0;
 
-                       global_free += free;
-               }
-               pgdat = pgdat->node_next;
-       } while (pgdat);
+       shrink_dcache_memory(priority, gfp_mask);
+       shrink_icache_memory(priority, gfp_mask);
 
-       /* Global shortage? */
-       return global_free < global_target;
+       return nr_pages;
 }
 
-/*
- * Are we low on inactive pages globally or in any zone?
- */
-int inactive_shortage(void)
+int try_to_free_pages(zone_t * classzone, unsigned int gfp_mask, unsigned int order)
 {
-       pg_data_t *pgdat;
-       unsigned int global_target = freepages.high + inactive_target;
-       unsigned int global_inactive = 0;
+       int priority = DEF_PRIORITY;
 
-       pgdat = pgdat_list;
        do {
-               int i;
-               for(i = 0; i < MAX_NR_ZONES; i++) {
-                       zone_t *zone = pgdat->node_zones + i;
-                       unsigned int inactive;
+               int nr_pages = SWAP_CLUSTER_MAX;
+               nr_pages = shrink_caches(priority, classzone, gfp_mask, nr_pages);
+               if (nr_pages <= 0)
+                       return 1;
 
-                       if (!zone->size)
-                               continue;
+               swap_out(priority, classzone, gfp_mask, SWAP_CLUSTER_MAX);
+       } while (--priority);
 
-                       inactive  = zone->inactive_dirty_pages;
-                       inactive += zone->inactive_clean_pages;
-                       inactive += zone->free_pages;
+       return 0;
+}
 
-                       /* Local shortage? */
-                       if (inactive < zone->pages_high)
-                               return 1;
+DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
 
-                       global_inactive += inactive;
-               }
-               pgdat = pgdat->node_next;
-       } while (pgdat);
+static int check_classzone_need_balance(zone_t * classzone)
+{
+       zone_t * first_classzone;
 
-       /* Global shortage? */
-       return global_inactive < global_target;
+       first_classzone = classzone->zone_pgdat->node_zones;
+       while (classzone >= first_classzone) {
+               if (classzone->free_pages > classzone->pages_high)
+                       return 0;
+               classzone--;
+       }
+       return 1;
 }
 
-/*
- * Loop until we are no longer under an inactive or free
- * shortage. Return 1 on success, 0 if we failed to get
- * there even after "maxtry" loops.
- */
-#define INACTIVE_SHORTAGE 1
-#define FREE_SHORTAGE 2
-#define GENERAL_SHORTAGE 4
-static int do_try_to_free_pages(unsigned int gfp_mask, int user)
+static int kswapd_balance_pgdat(pg_data_t * pgdat)
 {
-       int shortage = 0;
-       int maxtry;
+       int need_more_balance = 0, i;
+       zone_t * zone;
 
-       /* Always walk at least the active queue when called */
-       refill_inactive_scan(DEF_PRIORITY);
+       for (i = pgdat->nr_zones-1; i >= 0; i--) {
+               zone = pgdat->node_zones + i;
+               if (current->need_resched)
+                       schedule();
+               if (!zone->need_balance)
+                       continue;
+               if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
+                       zone->need_balance = 0;
+                       continue;
+               }
+               if (check_classzone_need_balance(zone))
+                       need_more_balance = 1;
+               else
+                       zone->need_balance = 0;
+       }
 
-       maxtry = 1 << DEF_PRIORITY;
-       do {
-               /*
-                * If needed, we move pages from the active list
-                * to the inactive list.
-                */
-               if (shortage & INACTIVE_SHORTAGE) {
-                       /* Walk the VM space for a bit.. */
-                       swap_out(DEF_PRIORITY, gfp_mask);
+       return need_more_balance;
+}
 
-                       /* ..and refill the inactive list */
-                       refill_inactive_scan(DEF_PRIORITY);
-               }
+static void kswapd_balance(void)
+{
+       int need_more_balance;
+       pg_data_t * pgdat;
 
-               /*
-                * If we're low on free pages, move pages from the
-                * inactive_dirty list to the inactive_clean list.
-                *
-                * Usually bdflush will have pre-cleaned the pages
-                * before we get around to moving them to the other
-                * list, so this is a relatively cheap operation.
-                */
-               if (shortage & FREE_SHORTAGE)
-                       page_launder(gfp_mask, user);
+       do {
+               need_more_balance = 0;
+               pgdat = pgdat_list;
+               do
+                       need_more_balance |= kswapd_balance_pgdat(pgdat);
+               while ((pgdat = pgdat->node_next));
+       } while (need_more_balance);
+}
 
-               /*      
-                * Reclaim unused slab cache if we were short on memory.
-                */
-               if (shortage & GENERAL_SHORTAGE) {
-                       shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
-                       shrink_icache_memory(DEF_PRIORITY, gfp_mask);
+static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
+{
+       zone_t * zone;
+       int i;
 
-                       kmem_cache_reap(gfp_mask);
-               }
+       for (i = pgdat->nr_zones-1; i >= 0; i--) {
+               zone = pgdat->node_zones + i;
+               if (!zone->need_balance)
+                       continue;
+               return 0;
+       }
 
-               if (current->need_resched) {
-                        __set_current_state(TASK_RUNNING);
-                       schedule();
-               }
+       return 1;
+}
 
-               shortage = 0;
-               if (inactive_shortage())
-                       shortage |= INACTIVE_SHORTAGE | GENERAL_SHORTAGE;
-               if (free_shortage())
-                       shortage |= FREE_SHORTAGE | GENERAL_SHORTAGE;
+static int kswapd_can_sleep(void)
+{
+       pg_data_t * pgdat;
 
-               if (--maxtry <= 0)
-                       break;
-       } while (shortage);
+       pgdat = pgdat_list;
+       do {
+               if (kswapd_can_sleep_pgdat(pgdat))
+                       continue;
+               return 0;
+       } while ((pgdat = pgdat->node_next));
 
-       /* Return success if we're not "totally short" */
-       return shortage != (FREE_SHORTAGE | INACTIVE_SHORTAGE | GENERAL_SHORTAGE);
+       return 1;
 }
 
-DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
-DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
-
 /*
  * The background pageout daemon, started as a kernel thread
  * from the init process. 
@@ -908,6 +642,7 @@ DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
 int kswapd(void *unused)
 {
        struct task_struct *tsk = current;
+       DECLARE_WAITQUEUE(wait, tsk);
 
        daemonize();
        strcpy(tsk->comm, "kswapd");
@@ -931,107 +666,31 @@ int kswapd(void *unused)
         * Kswapd main loop.
         */
        for (;;) {
-               static long recalc = 0;
-
-               /* Once a second ... */
-               if (time_after(jiffies, recalc + HZ)) {
-                       recalc = jiffies;
+               __set_current_state(TASK_INTERRUPTIBLE);
+               add_wait_queue(&kswapd_wait, &wait);
 
-                       /* Recalculate VM statistics. */
-                       recalculate_vm_stats();
-               }
-
-               if (!do_try_to_free_pages(GFP_KSWAPD, 1)) {
-                       if (out_of_memory())
-                               oom_kill();
-                       continue;
-               }
-
-               run_task_queue(&tq_disk);
-               interruptible_sleep_on_timeout(&kswapd_wait, HZ);
-       }
-}
-
-void wakeup_kswapd(void)
-{
-       if (waitqueue_active(&kswapd_wait))
-               wake_up_interruptible(&kswapd_wait);
-}
-
-/*
- * Called by non-kswapd processes when they want more
- * memory but are unable to sleep on kswapd because
- * they might be holding some IO locks ...
- */
-int try_to_free_pages(unsigned int gfp_mask)
-{
-       int ret = 1;
-
-       if (gfp_mask & __GFP_WAIT) {
-               current->flags |= PF_MEMALLOC;
-               ret = do_try_to_free_pages(gfp_mask, 1);
-               current->flags &= ~PF_MEMALLOC;
-       }
-
-       return ret;
-}
-
-DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
-/*
- * Kreclaimd will move pages from the inactive_clean list to the
- * free list, in order to keep atomic allocations possible under
- * all circumstances.
- */
-int kreclaimd(void *unused)
-{
-       struct task_struct *tsk = current;
-       pg_data_t *pgdat;
-
-       daemonize();
-       strcpy(tsk->comm, "kreclaimd");
-       sigfillset(&tsk->blocked);
-       current->flags |= PF_MEMALLOC;
-
-       while (1) {
+               mb();
+               if (kswapd_can_sleep())
+                       schedule();
 
-               /*
-                * We sleep until someone wakes us up from
-                * page_alloc.c::__alloc_pages().
-                */
-               interruptible_sleep_on(&kreclaimd_wait);
+               __set_current_state(TASK_RUNNING);
+               remove_wait_queue(&kswapd_wait, &wait);
 
                /*
-                * Move some pages from the inactive_clean lists to
-                * the free lists, if it is needed.
+                * If we actually get into a low-memory situation,
+                * the processes needing more memory will wake us
+                * up on a more timely basis.
                 */
-               pgdat = pgdat_list;
-               do {
-                       int i;
-                       for(i = 0; i < MAX_NR_ZONES; i++) {
-                               zone_t *zone = pgdat->node_zones + i;
-                               if (!zone->size)
-                                       continue;
-
-                               while (zone->free_pages < zone->pages_low) {
-                                       struct page * page;
-                                       page = reclaim_page(zone);
-                                       if (!page)
-                                               break;
-                                       __free_page(page);
-                               }
-                       }
-                       pgdat = pgdat->node_next;
-               } while (pgdat);
+               kswapd_balance();
+               run_task_queue(&tq_disk);
        }
 }
 
-
 static int __init kswapd_init(void)
 {
-       printk("Starting kswapd v1.8\n");
+       printk("Starting kswapd\n");
        swap_setup();
        kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
-       kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
        return 0;
 }
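Taken together, the vmscan rewrite replaces the old global free/inactive shortage logic with per-classzone balancing: try_to_free_pages() now takes the zone class being allocated from, and kswapd walks each node's zones, reclaiming only where zone->need_balance is set and stopping once check_classzone_need_balance() sees enough free pages at or below the class zone. A stand-alone model of that stop condition; the struct, function name, and watermark numbers below are invented for illustration:

#include <stdio.h>

/* Model only: kswapd keeps reclaiming for a class zone until some zone at
 * or below it has free pages above its pages_high watermark. */
struct zone_model { long free_pages, pages_high; };

static int model_need_balance(const struct zone_model *zones, int classzone_idx)
{
	int i;

	for (i = classzone_idx; i >= 0; i--)
		if (zones[i].free_pages > zones[i].pages_high)
			return 0;	/* plenty somewhere below: balanced */
	return 1;			/* keep reclaiming */
}

int main(void)
{
	struct zone_model zones[] = {
		{ .free_pages = 100, .pages_high = 255 },	/* DMA */
		{ .free_pages = 900, .pages_high = 1020 },	/* Normal */
	};

	printf("need more balancing: %d\n", model_need_balance(zones, 1));
	return 0;
}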
 
index b3a91ffd088121133c8b66b09d61af02e3d9cee5..5f215ba103e8e3b69d9a038ba61243043a4db98f 100644 (file)
@@ -1218,7 +1218,7 @@ enqueue:
                        dev_hold(skb->dev);
                        __skb_queue_tail(&queue->input_pkt_queue,skb);
                        /* Runs from irqs or BH's, no need to wake BH */
-                       __cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
+                       cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
                        local_irq_restore(flags);
 #ifndef OFFLINE_SAMPLE
                        get_sample_stats(this_cpu);
@@ -1529,7 +1529,7 @@ softnet_break:
        local_irq_disable();
        netdev_rx_stat[this_cpu].time_squeeze++;
        /* This already runs in BH context, no need to wake up BH's */
-       __cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
+       cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
        local_irq_enable();
 
        NET_PROFILE_LEAVE(softnet_process);
index 1fb788cea75a461219fbe5f7ecc5a2091d8d80e4..8374d454f21a4267a328a67e297b28ef26e3e406 100644 (file)
@@ -1236,7 +1236,7 @@ static int tcp_recv_urg(struct sock * sk, long timeo,
                msg->msg_flags|=MSG_OOB;
 
                if(len>0) {
-                       if (!(flags & MSG_PEEK) && !(flags & MSG_TRUNC))
+                       if (!(flags & MSG_TRUNC))
                                err = memcpy_toiovec(msg->msg_iov, &c, 1);
                        len = 1;
                } else
index ff395981cfb7683acea8f8f585ccf31fc9381617..cd047cda9017b92544ea8de5a25160602ea092a1 100644 (file)
@@ -726,7 +726,7 @@ int ip6_forward(struct sk_buff *skb)
        struct ipv6hdr *hdr = skb->nh.ipv6h;
        struct inet6_skb_parm *opt =(struct inet6_skb_parm*)skb->cb;
        
-       if (ipv6_devconf.forwarding == 0 && opt->srcrt == 0)
+       if (ipv6_devconf.forwarding == 0)
                goto error;
 
        skb->ip_summed = CHECKSUM_NONE;