$(TOPDIR)/arch/i386/kernel/mca.c \
$(TOPDIR)/arch/i386/kernel/mtrr.c \
$(TOPDIR)/drivers/char/misc.c \
+ $(TOPDIR)/kernel/printk.c \
$(TOPDIR)/drivers/net/net_init.c \
$(TOPDIR)/drivers/net/8390.c \
$(TOPDIR)/drivers/char/serial.c \
VERSION = 2
PATCHLEVEL = 4
SUBLEVEL = 10
-EXTRAVERSION =-pre10
+EXTRAVERSION =-pre11
KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
static void register_irq_proc(unsigned int irq);
-unsigned long irq_err_count;
+volatile unsigned long irq_err_count;
/*
* Special irq handlers.
unsigned long __irq_attempt[NR_IRQS];
#endif
-extern unsigned long irq_err_count;
-
/* Hack minimum IPL during interrupt processing for broken hardware. */
#ifdef CONFIG_ALPHA_BROKEN_IRQ_MASK
int __min_ipl;
*/
unsigned long init_user_stack[1024] = { STACK_MAGIC, };
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
void
show_regs(struct pt_regs * regs)
{
- printk("\nps: %04lx pc: [<%016lx>]\n", regs->ps, regs->pc);
+ printk("\n");
+ printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
+ printk("ps: %04lx pc: [<%016lx>] CPU %d\n", regs->ps, regs->pc, smp_processor_id());
printk("rp: [<%016lx>] sp: %p\n", regs->r26, regs+1);
printk(" r0: %016lx r1: %016lx r2: %016lx r3: %016lx\n",
regs->r0, regs->r1, regs->r2, regs->r3);
printk("\n");
}
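/*
 * A note on the helper added below: thread->ksp is the task's saved
 * kernel stack pointer, and the checks confine both it and the frame
 * pointer fetched from offset 6*8 in that frame to the 16KB kernel
 * stack sitting above the thread_struct before the frame is handed
 * to dik_show_trace().
 */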
+void show_trace_task(struct task_struct * tsk)
+{
+ struct thread_struct * thread = &tsk->thread;
+ unsigned long fp, sp = thread->ksp, base = (unsigned long) thread;
+
+ if (sp > base && sp+6*8 < base + 16*1024) {
+ fp = ((unsigned long*)sp)[6];
+ if (fp > sp && fp < base + 16*1024)
+ dik_show_trace((unsigned long *)fp);
+ }
+}
+
int kstack_depth_to_print = 24;
void show_stack(unsigned long *sp)
case 3: /* FEN fault */
case 5: /* illoc */
default: /* unexpected instruction-fault type */
+ ;
}
send_sig(SIGILL, current, 1);
}
goto bad_area;
}
+ survive:
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* us unable to handle the page fault gracefully.
*/
out_of_memory:
+ if (current->pid == 1) {
+ current->policy |= SCHED_YIELD;
+ schedule();
+ down_read(&mm->mmap_sem);
+ goto survive;
+ }
printk(KERN_ALERT "VM: killing process %s(%d)\n",
current->comm, current->pid);
if (!user_mode(regs))
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
* setup.
*/
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
#include <asm/pgtable.h>
#include <asm/desc.h>
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
+ unsigned long flags;
/*
* Disable it in the IO-APIC irq-routing table:
*/
memset(&entry, 0, sizeof(entry));
entry.mask = 1;
+ spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
}
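/*
 * The hunks below all repeat the same locking pattern: disable local
 * interrupts, take ioapic_lock, perform the paired 32-bit accesses of a
 * routing entry, then unlock.  A minimal sketch of that pattern as a
 * hypothetical helper (not a function the kernel defines):
 *
 *	static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
 *	{
 *		struct IO_APIC_route_entry entry;
 *		unsigned long flags;
 *
 *		spin_lock_irqsave(&ioapic_lock, flags);
 *		*(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
 *		*(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
 *		spin_unlock_irqrestore(&ioapic_lock, flags);
 *		return entry;
 *	}
 */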
static void clear_IO_APIC (void)
{
struct IO_APIC_route_entry entry;
int apic, pin, idx, irq, first_notcon = 1, vector;
+ unsigned long flags;
printk(KERN_DEBUG "init IO_APIC IRQs\n");
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
+ spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
}
}
void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
{
struct IO_APIC_route_entry entry;
+ unsigned long flags;
memset(&entry,0,sizeof(entry));
/*
* Add it to the IO-APIC irq-routing table:
*/
+ spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
enable_8259A_irq(0);
}
struct IO_APIC_reg_00 reg_00;
struct IO_APIC_reg_01 reg_01;
struct IO_APIC_reg_02 reg_02;
+ unsigned long flags;
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
for (apic = 0; apic < nr_ioapics; apic++) {
+ spin_lock_irqsave(&ioapic_lock, flags);
*(int *)&reg_00 = io_apic_read(apic, 0);
*(int *)&reg_01 = io_apic_read(apic, 1);
if (reg_01.version >= 0x10)
*(int *)&reg_02 = io_apic_read(apic, 2);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
for (i = 0; i <= reg_01.entries; i++) {
struct IO_APIC_route_entry entry;
+ spin_lock_irqsave(&ioapic_lock, flags);
*(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
*(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
printk(KERN_DEBUG " %02x %03X %02X ",
i,
{
struct IO_APIC_reg_01 reg_01;
int i;
+ unsigned long flags;
for (i = 0; i < PIN_MAP_SIZE; i++) {
irq_2_pin[i].pin = -1;
* The number of IO-APIC IRQ registers (== #pins):
*/
for (i = 0; i < nr_ioapics; i++) {
+ spin_lock_irqsave(&ioapic_lock, flags);
*(int *)&reg_01 = io_apic_read(i, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
nr_ioapic_registers[i] = reg_01.entries+1;
}
int apic;
int i;
unsigned char old_id;
+ unsigned long flags;
/*
* Set the IOAPIC ID to the value stored in the MPC table.
for (apic = 0; apic < nr_ioapics; apic++) {
/* Read the register 0 value */
+ spin_lock_irqsave(&ioapic_lock, flags);
*(int *)&reg_00 = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
old_id = mp_ioapics[apic].mpc_apicid;
mp_ioapics[apic].mpc_apicid);
reg_00.ID = mp_ioapics[apic].mpc_apicid;
+ spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0, *(int *)&reg_00);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
/*
* Sanity check
*/
+ spin_lock_irqsave(&ioapic_lock, flags);
*(int *)&reg_00 = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
if (reg_00.ID != mp_ioapics[apic].mpc_apicid)
panic("could not set ID!\n");
else
int pin, i;
struct IO_APIC_route_entry entry0, entry1;
unsigned char save_control, save_freq_select;
+ unsigned long flags;
pin = find_isa_irq_pin(8, mp_INT);
if (pin == -1)
return;
+ spin_lock_irqsave(&ioapic_lock, flags);
*(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin);
*(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
clear_IO_APIC_pin(0, pin);
memset(&entry1, 0, sizeof(entry1));
entry1.trigger = 0;
entry1.vector = 0;
+ spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
save_control = CMOS_READ(RTC_CONTROL);
save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
clear_IO_APIC_pin(0, pin);
+ spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
printk("\n");
+ printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs,regs->eip, smp_processor_id());
if (regs->xcs & 3)
printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
spin_lock_bh(&call_lock);
call_data = &data;
+ wmb();
/* Send a message to all other CPUs and wait for them to respond */
send_IPI_allbutself(CALL_FUNCTION_VECTOR);
* Notify initiating CPU that I've grabbed the data and am
* about to execute the function
*/
+ mb();
atomic_inc(&call_data->started);
/*
* At this point the info structure may be out of scope unless wait==1
*/
(*func)(info);
- if (wait)
+ if (wait) {
+ mb();
atomic_inc(&call_data->finished);
+ }
}
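/*
 * The barriers added above pair up: the initiator's wmb() publishes
 * call_data (and the structure it points at) before the IPI goes out,
 * while each target's mb() orders its reads of call_data before the
 * atomic_inc() of ->started, and the effects of (*func)() before the
 * increment of ->finished, so the initiator can safely let the on-stack
 * data go out of scope once the counters reach their expected values.
 */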
static volatile unsigned long cpu_callout_map;
/* Per CPU bogomips and other parameters */
-struct cpuinfo_x86 cpu_data[NR_CPUS];
+struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
/* Set when the idlers are all forked */
int smp_threads_ready;
*/
struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
-extern void bust_spinlocks(void);
-
asmlinkage void divide_error(void);
asmlinkage void debug(void);
asmlinkage void nmi(void);
{
console_verbose();
spin_lock_irq(&die_lock);
+ bust_spinlocks(1);
printk("%s: %04lx\n", str, err & 0xffff);
show_registers(regs);
-
+ bust_spinlocks(0);
spin_unlock_irq(&die_lock);
do_exit(SIGSEGV);
}
static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
-inline void nmi_watchdog_tick(struct pt_regs * regs)
+static unsigned int
+ last_irq_sums [NR_CPUS],
+ alert_counter [NR_CPUS];
+
+/*
+ * Sometimes, we know that we're disabling interrupts for too long.
+ * This happens during long writes to slow console devices, and may
+ * happen in other places.
+ *
+ * To prevent the NMI watchdog from firing when we're doing these things,
+ * touch_nmi_watchdog() may be used to reset the NMI watchdog timer
+ * back to its full interval (five seconds).
+ */
+void touch_nmi_watchdog (void)
{
+ int i;
+
/*
- * the best way to detect wether a CPU has a 'hard lockup' problem
- * is to check it's local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are broadcasted to every CPU, here
- * we only have to check the current processor.
- *
- * since NMIs dont listen to _any_ locks, we have to be extremely
- * careful not to rely on unsafe variables. The printk might lock
- * up though, so we have to break up console_lock first ...
- * [when there will be more tty-related locks, break them up
- * here too!]
+ * Just reset the alert counters, (other CPUs might be
+ * spinning on locks we hold):
*/
+ for (i = 0; i < smp_num_cpus; i++)
+ alert_counter[i] = 0;
+}
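/*
 * Illustrative use only (not from this patch): a driver that legitimately
 * keeps interrupts disabled for a long stretch, such as a polled console
 * write, would pet the watchdog from inside its busy loop:
 *
 *	while (!(inb(port + UART_LSR) & UART_LSR_THRE))
 *		touch_nmi_watchdog();
 *
 * "port" here is a placeholder for the device's I/O base.
 */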
- static unsigned int last_irq_sums [NR_CPUS],
- alert_counter [NR_CPUS];
+/*
+ * The best way to detect whether a CPU has a 'hard lockup' problem
+ * is to check its local APIC timer IRQ counts. If they are not
+ * changing then that CPU has some problem.
+ *
+ * As these watchdog NMI IRQs are generated on every CPU, we only
+ * have to check the current processor.
+ *
+ * Since NMIs don't listen to _any_ locks, we have to be extremely
+ * careful not to rely on unsafe variables. The printk path might lock
+ * up though, so we use bust_spinlocks() to break up any console
+ * locks first. There may be other tty-related locks which require
+ * breaking as well. They can be broken in bust_spinlocks(), or the
+ * global variable `oops_in_progress' may be used to bypass the
+ * tty locking.
+ */
+inline void nmi_watchdog_tick(struct pt_regs * regs)
+{
/*
* Since current-> is always on the stack, and we always switch
* the stack NMI-atomically, it's safe to use smp_processor_id().
* We are in trouble anyway, let's at least try
* to get a message out.
*/
- bust_spinlocks();
+ bust_spinlocks(1);
printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu);
show_registers(regs);
printk("console shuts up ...\n");
console_silent();
spin_unlock(&nmi_print_lock);
+ bust_spinlocks(0);
do_exit(SIGSEGV);
}
} else {
search_exception_table(unsigned long addr)
{
unsigned long ret = 0;
- unsigned long flags;
#ifndef CONFIG_MODULES
/* There is only the kernel to search. */
ret = search_one_table(__start___ex_table, __stop___ex_table-1, addr);
return ret;
#else
+ unsigned long flags;
/* The kernel is the last "module" -- no need to treat it special. */
struct module *mp;
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
+#include <linux/vt_kern.h> /* For unblank_screen() */
#include <asm/system.h>
#include <asm/uaccess.h>
extern void die(const char *,struct pt_regs *,long);
+extern int console_loglevel;
+
/*
* Ugly, ugly, but the goto's result in better assembly..
*/
start &= PAGE_MASK;
for (;;) {
- if (handle_mm_fault(current->mm, vma, start, 1) <= 0)
- goto bad_area;
+ survive:
+ {
+ int fault = handle_mm_fault(current->mm, vma, start, 1);
+ if (!fault)
+ goto bad_area;
+ if (fault < 0)
+ goto out_of_memory;
+ }
if (!size)
break;
size--;
bad_area:
return 0;
+
+out_of_memory:
+ if (current->pid == 1) {
+ current->policy |= SCHED_YIELD;
+ schedule();
+ goto survive;
+ }
+ goto bad_area;
}
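/*
 * The out_of_memory handling added here follows the same pattern as the
 * alpha and i386 do_page_fault hunks: init (pid 1) must never be killed
 * on OOM, so the handler yields the CPU (SCHED_YIELD plus schedule()),
 * re-acquires mmap_sem where it had been dropped in the do_page_fault
 * variants, and retries from the "survive" label instead of killing the
 * task.
 */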
-extern spinlock_t console_lock, timerlist_lock;
+extern spinlock_t timerlist_lock;
/*
* Unlock any spinlocks which will prevent us from getting the
* message out (timerlist_lock is acquired through the
* console unblank code)
*/
-void bust_spinlocks(void)
+void bust_spinlocks(int yes)
{
- spin_lock_init(&console_lock);
spin_lock_init(&timerlist_lock);
+ if (yes) {
+ oops_in_progress = 1;
+#ifdef CONFIG_SMP
+ global_irq_lock = 0; /* Many serial drivers do __global_cli() */
+#endif
+ } else {
+ int loglevel_save = console_loglevel;
+ unblank_screen();
+ oops_in_progress = 0;
+ /*
+ * OK, the message is on the console. Now we call printk()
+ * without oops_in_progress set so that printk will give klogd
+ * a poke. Hold onto your hats...
+ */
+ console_loglevel = 15; /* NMI oopser may have shut the console up */
+ printk(" ");
+ console_loglevel = loglevel_save;
+ }
}
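/*
 * Typical use, as in the die() and NMI-watchdog hunks above: bracket the
 * oops report so printk and the console can make progress even when
 * their locks are already held:
 *
 *	bust_spinlocks(1);
 *	printk("..."); show_registers(regs);
 *	bust_spinlocks(0);
 */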
+#if 0
+/*
+ * Verbose bug reporting: call do_BUG(__FILE__, __LINE__) in page.h:BUG() to enable this
+ */
+void do_BUG(const char *file, int line)
+{
+ bust_spinlocks(1);
+ printk("kernel BUG at %s:%d!\n", file, line);
+}
+#endif
+
asmlinkage void do_invalid_op(struct pt_regs *, unsigned long);
extern unsigned long idt;
goto bad_area;
}
+ survive:
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* terminate things with extreme prejudice.
*/
- bust_spinlocks();
+ bust_spinlocks(1);
if (address < PAGE_SIZE)
printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
printk(KERN_ALERT "*pte = %08lx\n", page);
}
die("Oops", regs, error_code);
+ bust_spinlocks(0);
do_exit(SIGKILL);
/*
*/
out_of_memory:
up_read(&mm->mmap_sem);
+ if (tsk->pid == 1) {
+ tsk->policy |= SCHED_YIELD;
+ schedule();
+ down_read(&mm->mmap_sem);
+ goto survive;
+ }
printk("VM: killing process %s\n", tsk->comm);
if (error_code & 4)
do_exit(SIGKILL);
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
* alignment requirements and potentially different initial
* setup.
*/
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
printk("Got exception 0x%lx at 0x%lx\n", retaddr, regs.cp0_epc);
}
-extern spinlock_t console_lock, timerlist_lock;
+extern spinlock_t timerlist_lock;
/*
* Unlock any spinlocks which will prevent us from getting the
(unsigned int) regs->regs[31]);
die("Oops", regs, write);
do_exit(SIGKILL);
+ bust_spinlocks(0);
/*
* We ran out of memory, or some other thing happened to us that made
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
--pdc_console_initialized;
#ifdef CONFIG_VT_CONSOLE
- {
- /* fixme (needed?): Wait for console-tasklet to finish !*/
- extern struct tasklet_struct console_tasklet;
- tasklet_schedule(&console_tasklet);
- }
+ schedule_console_callback();
#endif
unregister_console(&pdc_cons);
BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd
+# All the instructions talk about "make bzImage".
+bzImage: zImage
+
$(BOOT_TARGETS): $(CHECKS) vmlinux
@$(MAKEBOOT) $@
return 0;
}
-void dbprintf(const char *fmt , ...)
-{
- static char buf[1024];
- va_list args;
- extern void console_print (const char *str);
- extern int vsprintf(char * buf, const char * fmt, va_list args);
-
- va_start(args, fmt);
- vsprintf(buf, fmt, args);
- va_end(args);
-
- console_print (buf);
-}
-
static NORET_TYPE void amiga_reset( void )
ATTRIB_NORET;
EXPORT_SYMBOL(tb_ticks_per_jiffy);
EXPORT_SYMBOL(get_wchan);
EXPORT_SYMBOL(console_drivers);
-EXPORT_SYMBOL(console_lock);
#ifdef CONFIG_XMON
EXPORT_SYMBOL(xmon);
#endif
struct task_struct *last_task_used_math = NULL;
struct task_struct *last_task_used_altivec = NULL;
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
#include <asm/pgtable.h>
#include <asm/uaccess.h>
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
static inline void show(char *str, spinlock_t *lock, unsigned long caller)
{
int cpu = smp_processor_id();
- extern spinlock_t console_lock;
- if (lock != &console_lock)
- printk("%s(%p) CPU#%d stuck at %08lx, owner PC(%08lx):CPU(%lx)\n",str,
- lock, cpu, caller, lock->owner_pc & ~3, lock->owner_pc & 3);
+ printk("%s(%p) CPU#%d stuck at %08lx, owner PC(%08lx):CPU(%lx)\n",str,
+ lock, cpu, caller, lock->owner_pc & ~3, lock->owner_pc & 3);
}
static inline void show_read(char *str, rwlock_t *lock, unsigned long caller)
#include <asm/pgtable.h>
#include <asm/uaccess.h>
-static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
major = MAJOR(bhs[0]->b_dev);
/* Determine correct block size for this device. */
- correct_size = BLOCK_SIZE;
- if (blksize_size[major]) {
- i = blksize_size[major][MINOR(bhs[0]->b_dev)];
- if (i)
- correct_size = i;
- }
+ correct_size = get_hardsect_size(bhs[0]->b_dev);
/* Verify requested block sizes. */
for (i = 0; i < nr; i++) {
static int transfer_none(struct loop_device *lo, int cmd, char *raw_buf,
char *loop_buf, int size, int real_block)
{
- if (cmd == READ)
- memcpy(loop_buf, raw_buf, size);
- else
- memcpy(raw_buf, loop_buf, size);
+ if (raw_buf != loop_buf) {
+ if (cmd == READ)
+ memcpy(loop_buf, raw_buf, size);
+ else
+ memcpy(raw_buf, loop_buf, size);
+ }
return 0;
}
static int none_status(struct loop_device *lo, struct loop_info *info)
{
+ lo->lo_flags |= LO_FLAGS_BH_REMAP;
return 0;
}
return ret;
}
+static void loop_end_io_transfer(struct buffer_head *bh, int uptodate);
static void loop_put_buffer(struct buffer_head *bh)
{
- if (bh) {
+ /*
+ * check b_end_io, may just be a remapped bh and not an allocated one
+ */
+ if (bh && bh->b_end_io == loop_end_io_transfer) {
__free_page(bh->b_page);
kmem_cache_free(bh_cachep, bh);
}
{
struct buffer_head *bh;
+ /*
+ * for xfer_funcs that can operate on the same bh, do that
+ */
+ if (lo->lo_flags & LO_FLAGS_BH_REMAP) {
+ bh = rbh;
+ goto out_bh;
+ }
+
do {
bh = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
if (bh)
bh->b_size = rbh->b_size;
bh->b_dev = rbh->b_rdev;
- spin_lock_irq(&lo->lo_lock);
- bh->b_rdev = lo->lo_device;
- spin_unlock_irq(&lo->lo_lock);
bh->b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock);
/*
bh->b_data = page_address(bh->b_page);
bh->b_end_io = loop_end_io_transfer;
- bh->b_rsector = rbh->b_rsector + (lo->lo_offset >> 9);
+ bh->b_private = rbh;
init_waitqueue_head(&bh->b_wait);
+out_bh:
+ bh->b_rsector = rbh->b_rsector + (lo->lo_offset >> 9);
+ spin_lock_irq(&lo->lo_lock);
+ bh->b_rdev = lo->lo_device;
+ spin_unlock_irq(&lo->lo_lock);
+
return bh;
}
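/*
 * The remap path above: when the transfer function is a no-op,
 * none_status() now sets LO_FLAGS_BH_REMAP and the original buffer_head
 * is reused, with only b_rsector/b_rdev redirected at the backing
 * device, so no bounce page has to be allocated.  loop_put_buffer()
 * tells the two cases apart by checking b_end_io against
 * loop_end_io_transfer before freeing anything.
 */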
* piggy old buffer on original, and submit for I/O
*/
bh = loop_get_buffer(lo, rbh);
- bh->b_private = rbh;
- IV = loop_get_iv(lo, bh->b_rsector);
+ IV = loop_get_iv(lo, rbh->b_rsector);
if (rw == WRITE) {
set_bit(BH_Dirty, &bh->b_state);
if (lo_do_transfer(lo, WRITE, bh->b_data, rbh->b_data,
error = -EBUSY;
if (lo->lo_state != Lo_unbound)
goto out;
-
+
error = -EBADF;
file = fget(arg);
if (!file)
* If we can't read - sorry. If we only can't write - well,
* it's going to be read-only.
*/
- error = -EINVAL;
if (!aops->readpage)
goto out_putf;
static int rd_blocksizes[NUM_RAMDISKS]; /* Size of 1024 byte blocks :) */
static int rd_kbsize[NUM_RAMDISKS]; /* Size in blocks of 1024 bytes */
static devfs_handle_t devfs_handle;
-static struct block_device *rd_bdev[NUM_RAMDISKS];/* Protected device data */
+static struct inode *rd_inode[NUM_RAMDISKS]; /* Protected device inodes */
/*
* Parameters for the boot-loading of the RAM disk. These are set by
#endif
+static int rd_blkdev_pagecache_IO(int rw, struct buffer_head * sbh, int minor)
+{
+ struct address_space * mapping = rd_inode[minor]->i_mapping;
+ unsigned long index;
+ int offset, size, err = 0;
+
+ if (sbh->b_page->mapping == mapping) {
+ if (rw != READ)
+ SetPageDirty(sbh->b_page);
+ goto out;
+ }
+
+ index = sbh->b_rsector >> (PAGE_CACHE_SHIFT - 9);
+ offset = (sbh->b_rsector << 9) & ~PAGE_CACHE_MASK;
+ size = sbh->b_size;
+
+ do {
+ int count;
+ struct page ** hash;
+ struct page * page;
+ const char * src;
+ char * dst;
+ int unlock = 0;
+
+ count = PAGE_CACHE_SIZE - offset;
+ if (count > size)
+ count = size;
+ size -= count;
+
+ hash = page_hash(mapping, index);
+ page = __find_get_page(mapping, index, hash);
+ if (!page && rw != READ) {
+ page = grab_cache_page(mapping, index);
+ err = -ENOMEM;
+ if (!page)
+ goto out;
+ err = 0;
+ unlock = 1;
+ }
+
+ index++;
+ if (!page) {
+ offset = 0;
+ continue;
+ }
+
+ if (rw == READ) {
+ src = kmap(page);
+ src += offset;
+ dst = bh_kmap(sbh);
+ } else {
+ dst = kmap(page);
+ dst += offset;
+ src = bh_kmap(sbh);
+ }
+ offset = 0;
+
+ memcpy(dst, src, count);
+
+ kunmap(page);
+ bh_kunmap(sbh);
+
+ if (rw != READ)
+ SetPageDirty(page);
+ if (unlock)
+ UnlockPage(page);
+ __free_page(page);
+ } while (size);
+
+ out:
+ return err;
+}
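/*
 * In short: the ramdisk's backing store is now the page cache of the
 * protected per-minor inode (rd_inode[minor]).  A buffer that already
 * belongs to that mapping is merely marked dirty; otherwise the request
 * is satisfied by copying between the buffer and the relevant page-cache
 * pages, grabbing (and dirtying) pages on writes and treating absent
 * pages as holes on reads.
 */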
+
/*
* Basically, my strategy here is to set up a buffer-head which can't be
* deleted, and make that my Ramdisk. If the request is outside of the
{
unsigned int minor;
unsigned long offset, len;
- struct buffer_head *rbh;
- char *bdata;
-
minor = MINOR(sbh->b_rdev);
if (minor >= NUM_RAMDISKS)
goto fail;
}
- rbh = getblk(sbh->b_rdev, sbh->b_rsector/(sbh->b_size>>9), sbh->b_size);
- /* I think that it is safe to assume that rbh is not in HighMem, though
- * sbh might be - NeilBrown
- */
- bdata = bh_kmap(sbh);
- if (rw == READ) {
- if (sbh != rbh)
- memcpy(bdata, rbh->b_data, rbh->b_size);
- } else
- if (sbh != rbh)
- memcpy(rbh->b_data, bdata, rbh->b_size);
- bh_kunmap(sbh);
- mark_buffer_protected(rbh);
- brelse(rbh);
+ if (rd_blkdev_pagecache_IO(rw, sbh, minor))
+ goto fail;
sbh->b_end_io(sbh,1);
return 0;
/* special: we want to release the ramdisk memory,
it's not like with the other blockdevices where
this ioctl only flushes away the buffer cache. */
- if ((atomic_read(&rd_bdev[minor]->bd_openers) > 2))
- return -EBUSY;
- destroy_buffers(inode->i_rdev);
- rd_blocksizes[minor] = 0;
+ {
+ struct block_device * bdev = inode->i_bdev;
+
+ down(&bdev->bd_sem);
+ if (bdev->bd_openers > 2) {
+ up(&bdev->bd_sem);
+ return -EBUSY;
+ }
+ bdev->bd_openers--;
+ bdev->bd_cache_openers--;
+ iput(rd_inode[minor]);
+ rd_inode[minor] = NULL;
+ rd_blocksizes[minor] = rd_blocksize;
+ up(&bdev->bd_sem);
+ }
break;
case BLKGETSIZE: /* Return device size */
{
extern void free_initrd_mem(unsigned long, unsigned long);
- lock_kernel();
if (!--initrd_users) {
- blkdev_put(inode->i_bdev, BDEV_FILE);
free_initrd_mem(initrd_start, initrd_end);
initrd_start = 0;
}
- unlock_kernel();
return 0;
}
static struct file_operations initrd_fops = {
read: initrd_read,
- release: initrd_release,
};
#endif
static int rd_open(struct inode * inode, struct file * filp)
{
- int unit = DEVICE_NR(inode->i_rdev);
-
#ifdef CONFIG_BLK_DEV_INITRD
- if (unit == INITRD_MINOR) {
+ if (DEVICE_NR(inode->i_rdev) == INITRD_MINOR) {
+ static struct block_device_operations initrd_bd_op = {
+ open: rd_open,
+ release: initrd_release,
+ };
+
if (!initrd_start) return -ENODEV;
initrd_users++;
filp->f_op = &initrd_fops;
+ inode->i_bdev->bd_op = &initrd_bd_op;
return 0;
}
#endif
- if (unit >= NUM_RAMDISKS)
+ if (DEVICE_NR(inode->i_rdev) >= NUM_RAMDISKS)
return -ENXIO;
/*
* Immunize device against invalidate_buffers() and prune_icache().
*/
- if (rd_bdev[unit] == NULL) {
- rd_bdev[unit] = bdget(kdev_t_to_nr(inode->i_rdev));
- atomic_inc(&rd_bdev[unit]->bd_openers);
+ if (rd_inode[DEVICE_NR(inode->i_rdev)] == NULL) {
+ if (!inode->i_bdev) return -ENXIO;
+ if ((rd_inode[DEVICE_NR(inode->i_rdev)] = igrab(inode)) != NULL) {
+ struct block_device *bdev = inode->i_bdev;
+
+ /* bdev->bd_sem is held by caller */
+ bdev->bd_openers++;
+ bdev->bd_cache_openers++;
+ bdev->bd_inode = inode;
+ }
}
MOD_INC_USE_COUNT;
return 0;
}
-static struct block_device_operations fd_fops = {
+static struct block_device_operations rd_bd_op = {
open: rd_open,
release: rd_release,
ioctl: rd_ioctl,
int i;
for (i = 0 ; i < NUM_RAMDISKS; i++) {
- struct block_device *bdev = rd_bdev[i];
- rd_bdev[i] = NULL;
- if (bdev) {
- blkdev_put(bdev, BDEV_FILE);
- bdput(bdev);
+ if (rd_inode[i]) {
+ /* withdraw invalidate_buffers() and prune_icache() immunity */
+ struct block_device *bdev = rd_inode[i]->i_bdev;
+
+ down(&bdev->bd_sem);
+ bdev->bd_openers--;
+ bdev->bd_cache_openers--;
+ up(&bdev->bd_sem);
+
+ /* remove stale pointer to module address space */
+ rd_inode[i]->i_bdev->bd_op = NULL;
+ iput(rd_inode[i]);
}
destroy_buffers(MKDEV(MAJOR_NR, i));
}
rd_blocksize = BLOCK_SIZE;
}
- if (register_blkdev(MAJOR_NR, "ramdisk", &fd_fops)) {
+ if (register_blkdev(MAJOR_NR, "ramdisk", &rd_bd_op)) {
printk("RAMDISK: Could not get major %d", MAJOR_NR);
return -EIO;
}
devfs_register_series (devfs_handle, "%u", NUM_RAMDISKS,
DEVFS_FL_DEFAULT, MAJOR_NR, 0,
S_IFBLK | S_IRUSR | S_IWUSR,
- &fd_fops, NULL);
+ &rd_bd_op, NULL);
for (i = 0; i < NUM_RAMDISKS; i++)
- register_disk(NULL, MKDEV(MAJOR_NR,i), 1, &fd_fops, rd_size<<1);
+ register_disk(NULL, MKDEV(MAJOR_NR,i), 1, &rd_bd_op, rd_size<<1);
#ifdef CONFIG_BLK_DEV_INITRD
/* We ought to separate initrd operations here */
- register_disk(NULL, MKDEV(MAJOR_NR,INITRD_MINOR), 1, &fd_fops, rd_size<<1);
+ register_disk(NULL, MKDEV(MAJOR_NR,INITRD_MINOR), 1, &rd_bd_op, rd_size<<1);
#endif
hardsect_size[MAJOR_NR] = rd_hardsec; /* Size of the RAM disk blocks */
outfile.f_op = &def_blk_fops;
init_special_inode(out_inode, S_IFBLK | S_IRUSR | S_IWUSR, kdev_t_to_nr(ram_device));
- if (blkdev_open(inode, &infile) != 0)
+ if (blkdev_open(inode, &infile) != 0) {
+ iput(out_inode);
goto free_inode;
+ }
if (blkdev_open(out_inode, &outfile) != 0)
goto free_inodes;
if (i && (i % devblocks == 0)) {
printk("done disk #%d.\n", i/devblocks);
rotate = 0;
- invalidate_buffers(device);
- if (infile.f_op->release)
- infile.f_op->release(inode, &infile);
+ if (blkdev_close(inode, &infile) != 0) {
+ printk("Error closing the disk.\n");
+ goto noclose_input;
+ }
printk("Please insert disk #%d and press ENTER\n", i/devblocks+1);
wait_for_keypress();
if (blkdev_open(inode, &infile) != 0) {
printk("Error opening disk.\n");
- goto done;
+ goto noclose_input;
}
infile.f_pos = 0;
printk("Loading disk #%d... ", i/devblocks+1);
kfree(buf);
successful_load:
- invalidate_buffers(device);
ROOT_DEV = MKDEV(MAJOR_NR, unit);
if (ROOT_DEVICE_NAME != NULL) strcpy (ROOT_DEVICE_NAME, "rd/0");
done:
- if (infile.f_op->release)
- infile.f_op->release(inode, &infile);
+ blkdev_close(inode, &infile);
+noclose_input:
+ blkdev_close(out_inode, &outfile);
+ iput(inode);
+ iput(out_inode);
set_fs(fs);
return;
free_inodes: /* free inodes on error */
iput(out_inode);
- blkdev_put(inode->i_bdev, BDEV_FILE);
+ blkdev_close(inode, &infile);
free_inode:
iput(inode);
}
*
* Removed old-style timers, introduced console_timer, made timer
* deletion SMP-safe. 17Jun00, Andrew Morton <andrewm@uow.edu.au>
+ *
+ * Removed console_lock, enabled interrupts across all console operations
+ * 13 March 2001, Andrew Morton
*/
#include <linux/module.h>
static void set_cursor(int currcons);
static void hide_cursor(int currcons);
static void unblank_screen_t(unsigned long dummy);
+static void console_callback(void *ignored);
static int printable; /* Is console ready for printing? */
static int blankinterval = 10*60*HZ;
static int vesa_off_interval;
+static struct tq_struct console_callback_tq = {
+ routine: console_callback,
+};
+
/*
* fg_console is the current virtual console,
* last_console is the last used one,
/*
* Unfortunately, we need to delay tty echo when we're currently writing to the
- * console since the code is (and always was) not re-entrant, so we insert
- * all filp requests to con_task_queue instead of tq_timer and run it from
- * the console_tasklet. The console_tasklet is protected by the IRQ
- * protected console_lock.
+ * console since the code is (and always was) not re-entrant, so we schedule
+ * all flip requests to process context with schedule_task() and run it from
+ * console_callback().
*/
-DECLARE_TASK_QUEUE(con_task_queue);
/*
- * For the same reason, we defer scrollback to the console tasklet.
+ * For the same reason, we defer scrollback to the console callback.
*/
static int scrollback_delta;
static inline void scrolldelta(int lines)
{
scrollback_delta += lines;
- tasklet_schedule(&console_tasklet);
+ schedule_console_callback();
+}
+
+void schedule_console_callback(void)
+{
+ schedule_task(&console_callback_tq);
}
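/*
 * This is the stock 2.4 deferral idiom that replaces the console tasklet:
 * a statically initialised tq_struct whose ->routine runs in keventd's
 * process context once schedule_task() has queued it.  A minimal sketch
 * (my_tq and my_callback are placeholder names):
 *
 *	static struct tq_struct my_tq = { routine: my_callback };
 *	...
 *	schedule_task(&my_tq);		(safe from irq or timer context)
 */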
static void scrup(int currcons, unsigned int t, unsigned int b, int nr)
void vc_disallocate(unsigned int currcons)
{
+ acquire_console_sem();
if (vc_cons_allocated(currcons)) {
sw->con_deinit(vc_cons[currcons].d);
if (kmalloced)
kfree(vc_cons[currcons].d);
vc_cons[currcons].d = NULL;
}
+ release_console_sem();
}
/*
color = def_color;
}
+/* console_sem is held */
static void csi_m(int currcons)
{
int i;
return report_mouse;
}
+/* console_sem is held */
static void set_mode(int currcons, int on_off)
{
int i;
}
}
+/* console_sem is held */
static void setterm_command(int currcons)
{
switch(par[0]) {
}
}
-static void insert_line(int currcons, unsigned int nr)
-{
- scrdown(currcons,y,bottom,nr);
- need_wrap = 0;
-}
-
-
-static void delete_line(int currcons, unsigned int nr)
-{
- scrup(currcons,y,bottom,nr);
- need_wrap = 0;
-}
-
+/* console_sem is held */
static void csi_at(int currcons, unsigned int nr)
{
if (nr > video_num_columns - x)
insert_char(currcons, nr);
}
+/* console_sem is held */
static void csi_L(int currcons, unsigned int nr)
{
if (nr > video_num_lines - y)
nr = video_num_lines - y;
else if (!nr)
nr = 1;
- insert_line(currcons, nr);
+ scrdown(currcons,y,bottom,nr);
+ need_wrap = 0;
}
+/* console_sem is held */
static void csi_P(int currcons, unsigned int nr)
{
if (nr > video_num_columns - x)
delete_char(currcons, nr);
}
+/* console_sem is held */
static void csi_M(int currcons, unsigned int nr)
{
if (nr > video_num_lines - y)
nr = video_num_lines - y;
else if (!nr)
nr=1;
- delete_line(currcons, nr);
+ scrup(currcons,y,bottom,nr);
+ need_wrap = 0;
}
+/* console_sem is held (except via vc_init->reset_terminal) */
static void save_cur(int currcons)
{
saved_x = x;
saved_G1 = G1_charset;
}
+/* console_sem is held */
static void restore_cur(int currcons)
{
gotoxy(currcons,saved_x,saved_y);
EShash, ESsetG0, ESsetG1, ESpercent, ESignore, ESnonstd,
ESpalette };
+/* console_sem is held (except via vc_init()) */
static void reset_terminal(int currcons, int do_clear)
{
top = 0;
csi_J(currcons,2);
}
+/* console_sem is held */
static void do_con_trol(struct tty_struct *tty, unsigned int currcons, int c)
{
/*
#define CON_BUF_SIZE PAGE_SIZE
DECLARE_MUTEX(con_buf_sem);
+/* acquires console_sem */
static int do_con_write(struct tty_struct * tty, int from_user,
const unsigned char *buf, int count)
{
const unsigned char *orig_buf = NULL;
int orig_count;
+ if (in_interrupt())
+ return count;
+
currcons = vt->vc_num;
if (!vc_cons_allocated(currcons)) {
/* could this happen? */
again:
if (count > CON_BUF_SIZE)
count = CON_BUF_SIZE;
+ console_conditional_schedule();
if (copy_from_user(con_buf, buf, count)) {
n = 0; /* ?? are error codes legal here ?? */
goto out;
* the console spinlock during the entire write.
*/
- spin_lock_irq(&console_lock);
+ acquire_console_sem();
himask = hi_font_mask;
charmask = himask ? 0x1ff : 0xff;
do_con_trol(tty, currcons, c);
}
FLUSH
- spin_unlock_irq(&console_lock);
+ console_conditional_schedule();
+ release_console_sem();
out:
if (from_user) {
}
/*
- * This is the console switching tasklet.
+ * This is the console switching callback.
*
- * Doing console switching in a tasklet allows
+ * Doing console switching in a process context allows
* us to do the switches asynchronously (needed when we want
* to switch due to a keyboard interrupt). Synchronization
* with other console code and prevention of re-entrancy is
- * ensured with console_lock.
+ * ensured with console_sem.
*/
-static void console_softint(unsigned long ignored)
+static void console_callback(void *ignored)
{
- /* Runs the task queue outside of the console lock. These
- * callbacks can come back into the console code and thus
- * will perform their own locking.
- */
- run_task_queue(&con_task_queue);
-
- spin_lock_irq(&console_lock);
+ acquire_console_sem();
if (want_console >= 0) {
if (want_console != fg_console && vc_cons_allocated(want_console)) {
scrollback_delta = 0;
}
- spin_unlock_irq(&console_lock);
+ release_console_sem();
+}
+
+void set_console(int nr)
+{
+ want_console = nr;
+ schedule_console_callback();
}
#ifdef CONFIG_VT_CONSOLE
/*
* Console on virtual terminal
*
- * The console_lock must be held when we get here.
+ * The console must be locked when we get here.
*/
void vt_console_print(struct console *co, const char * b, unsigned count)
}
set_cursor(currcons);
+ if (!oops_in_progress)
+ poke_blanked_console();
+
quit:
clear_bit(0, &printing);
}
* Handling of Linux-specific VC ioctls
*/
+/*
+ * Generally a bit racy with respect to console_sem.
+ *
+ * There are some functions which don't need it.
+ *
+ * There are some functions which can sleep for arbitrary periods (paste_selection)
+ * but we don't need the lock there anyway.
+ *
+ * set_selection has locking, and definitely needs it
+ */
+
int tioclinux(struct tty_struct *tty, unsigned long arg)
{
char type, data;
+ int ret;
if (tty->driver.type != TTY_DRIVER_TYPE_CONSOLE)
return -EINVAL;
- if (current->tty != tty && !suser())
+ if (current->tty != tty && !capable(CAP_SYS_ADMIN))
return -EPERM;
if (get_user(type, (char *)arg))
return -EFAULT;
+ ret = 0;
switch (type)
{
case 2:
- return set_selection(arg, tty, 1);
+ acquire_console_sem();
+ ret = set_selection(arg, tty, 1);
+ release_console_sem();
+ break;
case 3:
- return paste_selection(tty);
+ ret = paste_selection(tty);
+ break;
case 4:
unblank_screen();
- return 0;
+ break;
case 5:
- return sel_loadlut(arg);
+ ret = sel_loadlut(arg);
+ break;
case 6:
/*
* related to the kernel should not use this.
*/
data = shift_state;
- return __put_user(data, (char *) arg);
+ ret = __put_user(data, (char *) arg);
+ break;
case 7:
data = mouse_reporting();
- return __put_user(data, (char *) arg);
+ ret = __put_user(data, (char *) arg);
+ break;
case 10:
set_vesa_blanking(arg);
- return 0;
+ break;
case 11: /* set kmsg redirect */
- if (!suser())
- return -EPERM;
- if (get_user(data, (char *)arg+1))
- return -EFAULT;
- kmsg_redirect = data;
- return 0;
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ } else {
+ if (get_user(data, (char *)arg+1))
+ ret = -EFAULT;
+ else
+ kmsg_redirect = data;
+ }
+ break;
case 12: /* get fg_console */
- return fg_console;
+ ret = fg_console;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
}
- return -EINVAL;
+ return ret;
}
/*
static void con_put_char(struct tty_struct *tty, unsigned char ch)
{
+ if (in_interrupt())
+ return; /* n_r3964 calls put_char() from interrupt context */
pm_access(pm_con);
do_con_write(tty, 0, &ch, 1);
}
static void con_flush_chars(struct tty_struct *tty)
{
- unsigned long flags;
struct vt_struct *vt = (struct vt_struct *)tty->driver_data;
+ if (in_interrupt()) /* from flush_to_ldisc */
+ return;
+
pm_access(pm_con);
- spin_lock_irqsave(&console_lock, flags);
+ acquire_console_sem();
set_cursor(vt->vc_num);
- spin_unlock_irqrestore(&console_lock, flags);
+ release_console_sem();
}
/*
struct tty_driver console_driver;
static int console_refcount;
-DECLARE_TASKLET_DISABLED(console_tasklet, console_softint, 0);
-
void __init con_init(void)
{
const char *display_desc = NULL;
#ifdef CONFIG_VT_CONSOLE
register_console(&vt_console_driver);
#endif
-
- tasklet_enable(&console_tasklet);
- tasklet_schedule(&console_tasklet);
}
#ifndef VT_SINGLE_DRIVER
console_driver.minor_start + i);
}
+/*
+ * This is called by a timer handler
+ */
static void vesa_powerdown(void)
{
struct vc_data *c = vc_cons[fg_console].d;
}
}
+/*
+ * This is a timer handler
+ */
static void vesa_powerdown_screen(unsigned long dummy)
{
- console_timer.function = unblank_screen_t; /* I don't have a clue why this is necessary */
+ console_timer.function = unblank_screen_t;
vesa_powerdown();
}
timer_do_blank_screen(entering_gfx, 0);
}
+/*
+ * This is a timer handler
+ */
static void unblank_screen_t(unsigned long dummy)
{
unblank_screen();
}
+/*
+ * Called by timer as well as from vt_console_driver
+ */
void unblank_screen(void)
{
int currcons;
set_cursor(fg_console);
}
+/*
+ * This is both a user-level callable and a timer handler
+ */
static void blank_screen(unsigned long dummy)
{
timer_do_blank_screen(0, 1);
void poke_blanked_console(void)
{
- del_timer(&console_timer); /* Can't use _sync here: called from tasklet */
+ del_timer(&console_timer);
if (!vt_cons[fg_console] || vt_cons[fg_console]->vc_mode == KD_GRAPHICS)
return;
if (console_blanked) {
op->data = temp;
}
- spin_lock_irq(&console_lock);
+ acquire_console_sem();
rc = sw->con_font_op(vc_cons[currcons].d, op);
- spin_unlock_irq(&console_lock);
+ release_console_sem();
op->data = old_op.data;
if (!rc && !set) {
char raw_mode;
pm_access(pm_kbd);
-
- do_poke_blanked_console = 1;
- tasklet_schedule(&console_tasklet);
add_keyboard_randomness(scancode | up_flag);
tty = ttytab? ttytab[fg_console]: NULL;
if (tty && (!tty->driver_data)) {
/*
- * We touch the tty structure via the the ttytab array
+ * We touch the tty structure via the ttytab array
* without knowing whether or not tty is open, which
* is inherently dangerous. We currently rely on that
* fact that console_open sets tty->driver_data when
* Convert scancode to keycode
*/
if (!kbd_translate(scancode, &keycode, raw_mode))
- return;
+ goto out;
/*
* At this point the variable `keycode' contains the keycode.
#ifdef CONFIG_MAGIC_SYSRQ /* Handle the SysRq Hack */
if (keycode == SYSRQ_KEY) {
sysrq_pressed = !up_flag;
- return;
+ goto out;
} else if (sysrq_pressed) {
if (!up_flag) {
handle_sysrq(kbd_sysrq_xlate[keycode], kbd_pt_regs, kbd, tty);
- return;
+ goto out;
}
}
#endif
if (type >= 0xf0) {
type -= 0xf0;
if (raw_mode && ! (TYPES_ALLOWED_IN_RAW_MODE & (1 << type)))
- return;
+ goto out;
if (type == KT_LETTER) {
type = KT_LATIN;
if (vc_kbd_led(kbd, VC_CAPSLOCK)) {
compute_shiftstate();
kbd->slockstate = 0; /* play it safe */
#else
- keysym = U(plain_map[keycode]);
+ keysym = U(plain_map[keycode]);
type = KTYP(keysym);
if (type == KT_SHIFT)
(*key_handler[type])(keysym & 0xff, up_flag);
#endif
}
}
+out:
+ do_poke_blanked_console = 1;
+ schedule_console_callback();
}
#include <linux/kbd_kern.h>
#include <linux/vt_kern.h>
#include <linux/smp_lock.h>
+#include <linux/kd.h>
#include <asm/keyboard.h>
#include <asm/bitops.h>
#define dprintk(x...)
typedef struct raw_device_data_s {
- struct kiobuf * iobuf;
- long iobuf_lock;
struct block_device *binding;
int inuse, sector_size, sector_bits;
struct semaphore mutex;
return 0;
}
+ if (!filp->f_iobuf) {
+ err = alloc_kiovec(1, &filp->f_iobuf);
+ if (err)
+ return err;
+ }
+
down(&raw_devices[minor].mutex);
/*
* No, it is a normal raw device. All we need to do on open is
if (raw_devices[minor].inuse++)
goto out;
- /*
- * We'll just use one kiobuf
- */
-
- err = alloc_kiovec(1, &raw_devices[minor].iobuf);
- if (err) {
- raw_devices[minor].inuse--;
- up(&raw_devices[minor].mutex);
- blkdev_put(bdev, BDEV_RAW);
- return err;
- }
-
-
/*
* Don't interfere with mounted devices: we cannot safely set
* the blocksize on a device which is already mounted.
minor = MINOR(inode->i_rdev);
down(&raw_devices[minor].mutex);
bdev = raw_devices[minor].binding;
- if (!--raw_devices[minor].inuse)
- free_kiovec(1, &raw_devices[minor].iobuf);
+ raw_devices[minor].inuse--;
up(&raw_devices[minor].mutex);
blkdev_put(bdev, BDEV_RAW);
return 0;
minor = MINOR(filp->f_dentry->d_inode->i_rdev);
new_iobuf = 0;
- iobuf = raw_devices[minor].iobuf;
- if (test_and_set_bit(0, &raw_devices[minor].iobuf_lock)) {
+ iobuf = filp->f_iobuf;
+ if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
/*
* A parallel read/write is using the preallocated iobuf
* so just run slow and allocate a new one.
out_free:
if (!new_iobuf)
- clear_bit(0, &raw_devices[minor].iobuf_lock);
+ clear_bit(0, &filp->f_iobuf_lock);
else
free_kiovec(1, &iobuf);
out:
if (I_IGNPAR(info->tty))
info->ignore_status_mask |= UART_LSR_OE;
}
+#if 0 /* breaks serial console during boot stage */
/*
* !!! ignore all characters if CREAD is not set
*/
if ((cflag & CREAD) == 0)
info->ignore_status_mask |= UART_LSR_DR;
+#endif
save_flags(flags); cli();
if (uart_config[info->state->type].flags & UART_STARTECH) {
serial_outp(info, UART_LCR, 0xBF);
* Print a string to the serial port trying not to disturb
* any possible real use of the port...
*
- * The console_lock must be held when we get here.
+ * The console must be locked when we get here.
*/
static void serial_console_write(struct console *co, const char *s,
unsigned count)
/* Select the proper current console and verify
* sanity of the situation under the console lock.
*/
- spin_lock_irq(&console_lock);
+ acquire_console_sem();
attr = (currcons & 128);
currcons = (currcons & 127);
}
}
- /* Finally, temporarily drop the console lock and push
+ /* Finally, release the console semaphore while we push
* all the data to userspace from our temporary buffer.
+ *
+ * AKPM: Even though it's a semaphore, we should drop it because
+ * the pagefault handling code may want to call printk().
*/
- spin_unlock_irq(&console_lock);
+ release_console_sem();
ret = copy_to_user(buf, con_buf_start, orig_count);
- spin_lock_irq(&console_lock);
+ acquire_console_sem();
if (ret) {
read += (orig_count - ret);
if (read)
ret = read;
unlock_out:
- spin_unlock_irq(&console_lock);
+ release_console_sem();
up(&con_buf_sem);
return ret;
}
/* Select the proper current console and verify
* sanity of the situation under the console lock.
*/
- spin_lock_irq(&console_lock);
+ acquire_console_sem();
attr = (currcons & 128);
currcons = (currcons & 127);
/* Temporarily drop the console lock so that we can read
* in the write data from userspace safely.
*/
- spin_unlock_irq(&console_lock);
+ release_console_sem();
ret = copy_from_user(con_buf, buf, this_round);
- spin_lock_irq(&console_lock);
+ acquire_console_sem();
if (ret) {
this_round -= ret;
ret = written;
unlock_out:
- spin_unlock_irq(&console_lock);
+ release_console_sem();
up(&con_buf_sem);
* make sure we are atomic with respect to
* other console switches..
*/
- spin_lock_irq(&console_lock);
+ acquire_console_sem();
complete_change_console(newvt);
- spin_unlock_irq(&console_lock);
+ release_console_sem();
}
}
vt_cons[new_console]->vt_mode.frsig = 0;
vt_cons[new_console]->vt_pid = -1;
vt_cons[new_console]->vt_newvt = -1;
- reset_palette (new_console) ;
+ if (!in_interrupt()) /* Via keyboard.c:SAK() - akpm */
+ reset_palette(new_console) ;
}
/*
cur & 0xf,
IN_BYTE(IDE_SECTOR_REG));
}
- if (HWGROUP(drive)->rq)
+ if (HWGROUP(drive) && HWGROUP(drive)->rq)
printk(", sector=%ld", HWGROUP(drive)->rq->sector);
}
}
#include "lvm-snap.h"
-#define LVM_CORRECT_READ_AHEAD( a) \
- if ( a < LVM_MIN_READ_AHEAD || \
- a > LVM_MAX_READ_AHEAD) a = LVM_MAX_READ_AHEAD;
+#define LVM_CORRECT_READ_AHEAD(a) \
+do { \
+ if ((a) < LVM_MIN_READ_AHEAD || \
+ (a) > LVM_MAX_READ_AHEAD) \
+ (a) = LVM_DEFAULT_READ_AHEAD; \
+ read_ahead[MAJOR_NR] = (a); \
+} while(0)
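/*
 * The do { ... } while(0) wrapper turns the multi-statement macro into a
 * single statement, so a caller such as
 *
 *	if (cond)
 *		LVM_CORRECT_READ_AHEAD(a);
 *	else
 *		...
 *
 * parses as intended; the old bare-if version would have silently paired
 * the else with the macro's own if.
 */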
#ifndef WRITEA
# define WRITEA WRITE
(long) arg > LVM_MAX_READ_AHEAD)
return -EINVAL;
lv_ptr->lv_read_ahead = (long) arg;
+ read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead;
break;
mddev->param.chunk_size = chunk_size;
mddev->param.personality = pnum;
- if ((pnum != MULTIPATH) && (pnum != RAID1) && (pnum != LINEAR)) {
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
if (!chunk_size) {
/*
* 'default chunksize' in the old md code used to
}
} else
if (chunk_size)
- printk(KERN_INFO "RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level);
+ printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level);
if (pnum >= MAX_PERSONALITY) {
MD_BUG();
if (!pers[pnum])
#endif
{
- printk(KERN_ERR "md.c: personality %d is not loaded!\n",
+ printk(KERN_ERR "md: personality %d is not loaded!\n",
pnum);
return -EINVAL;
}
if (mddev->pers->restart_resync)
mddev->pers->restart_resync(mddev);
} else {
- printk (KERN_ERR "md.c: md%d has no personality assigned.\n",
+ printk (KERN_ERR "md: md%d has no personality assigned.\n",
mdidx(mddev));
err = -EINVAL;
}
if (!mddev->pers)
return -ENODEV;
- printk("trying to generate %s error in md%d ... \n",
+ printk("md: trying to generate %s error in md%d ... \n",
partition_name(dev), mdidx(mddev));
rdev = find_rdev(mddev, dev);
MD_BUG();
return -ENODEV;
}
- printk("okay, generating error!\n");
+ printk("md: okay, generating error!\n");
// q->oneshot_error = 1; // disabled for now
return 0;
unsigned long max_blocks, resync, res, dt, db, rt;
resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
- max_blocks = mddev->sb->size << 1;
+ max_blocks = mddev->sb->size;
/*
* Should not happen.
/* The 3c59x-specific entries in the device structure. */
dev->open = vortex_open;
if (vp->full_bus_master_tx) {
- struct sysinfo sysinfo;
-
dev->hard_start_xmit = boomerang_start_xmit;
- si_meminfo(&sysinfo);
- if (sysinfo.totalhigh == 0) {
- /* Actually, it still should work with iommu. */
- dev->features |= NETIF_F_SG;
- }
+ /* Actually, it still should work with iommu. */
+ dev->features |= NETIF_F_SG;
if (((hw_checksums[card_idx] == -1) && (vp->drv_flags & HAS_HWCKSM)) ||
(hw_checksums[card_idx] == 1)) {
dev->features |= NETIF_F_IP_CSUM;
/* Clear CmdSuspend (1<<30) avoiding interference with the card access to the
status bits. Previous driver versions used separate 16 bit fields for
commands and statuses. --SAW
- FIXME: it may not work on non-IA32 architectures.
*/
-#if defined(__LITTLE_ENDIAN)
-#define clear_suspend(cmd) ((__u16 *)&(cmd)->cmd_status)[1] &= ~0x4000
-#elif defined(__BIG_ENDIAN)
-#define clear_suspend(cmd) ((__u16 *)&(cmd)->cmd_status)[1] &= ~0x0040
+#if defined(__alpha__)
+# define clear_suspend(cmd) clear_bit(30, &(cmd)->cmd_status);
#else
-#error Unsupported byteorder
+# if defined(__LITTLE_ENDIAN)
+# define clear_suspend(cmd) ((__u16 *)&(cmd)->cmd_status)[1] &= ~0x4000
+# elif defined(__BIG_ENDIAN)
+# define clear_suspend(cmd) ((__u16 *)&(cmd)->cmd_status)[1] &= ~0x0040
+# else
+# error Unsupported byteorder
+# endif
#endif
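/*
 * A plausible reading of the Alpha special case (an inference, not stated
 * in the patch): without the BWX extension a 16-bit store becomes a
 * non-atomic 32-bit read-modify-write, which could clobber status bits
 * the card is updating concurrently, whereas clear_bit() clears bit 30
 * with an atomic ll/sc sequence.
 */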
enum SCBCmdBits {
}
do_poke_blanked_console = 1;
- tasklet_schedule(&console_tasklet);
+ schedule_console_callback();
add_keyboard_randomness(keycode);
tty = ttytab? ttytab[fg_console]: NULL;
if (!host)
goto err_unmap;
+#if 0
/*
* Comment the following initialization if you know 'max_sectors' is
* not defined for this kernel.
* greatly increases the IO performance - AM
*/
host->max_sectors = 1024;
+#endif
scsi_set_pci_device(host, pdev);
megaCfg = (mega_host_config *) host->hostdata;
#define SG_SEGMENTS 32 /* Cmd entry + 6 continuations */
-typedef struct timer_list timer_t; /* timer */
-
/*
* SCSI Request Block structure
*/
}
}
scr_writew(c, d);
+ console_conditional_schedule();
s++;
d++;
} while (s < le);
if (s > start)
p->dispsw->putcs(conp, p, start, s - start, real_y(p, line), x);
+ console_conditional_schedule();
if (offset > 0)
line++;
else {
* linux/fs/block_dev.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
*/
#include <linux/config.h>
#include <linux/major.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/smp_lock.h>
+#include <linux/iobuf.h>
+#include <linux/highmem.h>
+#include <linux/blkdev.h>
#include <asm/uaccess.h>
-extern int *blk_size[];
-extern int *blksize_size[];
+static inline int blkdev_get_block(struct inode * inode, long iblock, struct buffer_head * bh_result)
+{
+ int err;
+
+ err = -EIO;
+ if (iblock >= buffered_blk_size(inode->i_rdev) >> (BUFFERED_BLOCKSIZE_BITS - BLOCK_SIZE_BITS))
+ goto out;
-#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
-#define NBUF 64
+ bh_result->b_blocknr = iblock;
+ bh_result->b_state |= 1UL << BH_Mapped;
+ err = 0;
+
+ out:
+ return err;
+}
-ssize_t block_write(struct file * filp, const char * buf,
- size_t count, loff_t *ppos)
+static int blkdev_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize)
{
- struct inode * inode = filp->f_dentry->d_inode;
- ssize_t blocksize, blocksize_bits, i, buffercount, write_error;
- ssize_t block, blocks;
- loff_t offset;
- ssize_t chars;
- ssize_t written, retval;
- struct buffer_head * bhlist[NBUF];
- size_t size;
- kdev_t dev = inode->i_rdev;
- struct buffer_head * bh, *bufferlist[NBUF];
- register char * p;
-
- if (is_read_only(dev))
- return -EPERM;
-
- retval = written = write_error = buffercount = 0;
- blocksize = BLOCK_SIZE;
- if (blksize_size[MAJOR(dev)] && blksize_size[MAJOR(dev)][MINOR(dev)])
- blocksize = blksize_size[MAJOR(dev)][MINOR(dev)];
-
- i = blocksize;
- blocksize_bits = 0;
- while(i != 1) {
- blocksize_bits++;
- i >>= 1;
- }
+ int i, nr_blocks, retval, dev = inode->i_rdev;
+ unsigned long * blocks = iobuf->blocks;
- block = *ppos >> blocksize_bits;
- offset = *ppos & (blocksize-1);
-
- if (blk_size[MAJOR(dev)])
- size = ((loff_t) blk_size[MAJOR(dev)][MINOR(dev)] << BLOCK_SIZE_BITS) >> blocksize_bits;
- else
- size = INT_MAX;
- while (count>0) {
- if (block >= size) {
- retval = -ENOSPC;
- goto cleanup;
- }
- chars = blocksize - offset;
- if (chars > count)
- chars=count;
-
-#if 0
- /* get the buffer head */
- {
- struct buffer_head * (*fn)(kdev_t, int, int) = getblk;
- if (chars != blocksize)
- fn = bread;
- bh = fn(dev, block, blocksize);
- if (!bh) {
- retval = -EIO;
- goto cleanup;
- }
- if (!buffer_uptodate(bh))
- wait_on_buffer(bh);
- }
-#else
- bh = getblk(dev, block, blocksize);
- if (!bh) {
- retval = -EIO;
- goto cleanup;
- }
+ if (blocksize != BUFFERED_BLOCKSIZE)
+ BUG();
- if (!buffer_uptodate(bh))
- {
- if (chars == blocksize)
- wait_on_buffer(bh);
- else
- {
- bhlist[0] = bh;
- if (!filp->f_reada || !read_ahead[MAJOR(dev)]) {
- /* We do this to force the read of a single buffer */
- blocks = 1;
- } else {
- /* Read-ahead before write */
- blocks = read_ahead[MAJOR(dev)] / (blocksize >> 9) / 2;
- if (block + blocks > size) blocks = size - block;
- if (blocks > NBUF) blocks=NBUF;
- if (!blocks) blocks = 1;
- for(i=1; i<blocks; i++)
- {
- bhlist[i] = getblk (dev, block+i, blocksize);
- if (!bhlist[i])
- {
- while(i >= 0) brelse(bhlist[i--]);
- retval = -EIO;
- goto cleanup;
- }
- }
- }
- ll_rw_block(READ, blocks, bhlist);
- for(i=1; i<blocks; i++) brelse(bhlist[i]);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- brelse(bh);
- retval = -EIO;
- goto cleanup;
- }
- };
- };
-#endif
- block++;
- p = offset + bh->b_data;
- offset = 0;
- *ppos += chars;
- written += chars;
- count -= chars;
- copy_from_user(p,buf,chars);
- p += chars;
- buf += chars;
- mark_buffer_uptodate(bh, 1);
- mark_buffer_dirty(bh);
- if (filp->f_flags & O_SYNC)
- bufferlist[buffercount++] = bh;
- else
- brelse(bh);
- if (buffercount == NBUF){
- ll_rw_block(WRITE, buffercount, bufferlist);
- for(i=0; i<buffercount; i++){
- wait_on_buffer(bufferlist[i]);
- if (!buffer_uptodate(bufferlist[i]))
- write_error=1;
- brelse(bufferlist[i]);
- }
- buffercount=0;
- }
- balance_dirty();
- if (write_error)
- break;
+ nr_blocks = iobuf->length >> BUFFERED_BLOCKSIZE_BITS;
+ /* build the blocklist */
+ for (i = 0; i < nr_blocks; i++, blocknr++) {
+ struct buffer_head bh;
+
+ retval = blkdev_get_block(inode, blocknr, &bh);
+ if (retval)
+ goto out;
+
+ blocks[i] = bh.b_blocknr;
}
- cleanup:
- if ( buffercount ){
- ll_rw_block(WRITE, buffercount, bufferlist);
- for(i=0; i<buffercount; i++){
- wait_on_buffer(bufferlist[i]);
- if (!buffer_uptodate(bufferlist[i]))
- write_error=1;
- brelse(bufferlist[i]);
+
+ retval = brw_kiovec(rw, 1, &iobuf, dev, iobuf->blocks, blocksize);
+
+ out:
+ return retval;
+}
+
+static int blkdev_writepage(struct page * page)
+{
+ int err, i;
+ unsigned long block;
+ struct buffer_head *bh, *head;
+ struct inode *inode = page->mapping->host;
+
+ if (!PageLocked(page))
+ BUG();
+
+ if (!page->buffers)
+ create_empty_buffers(page, inode->i_rdev, BUFFERED_BLOCKSIZE);
+ head = page->buffers;
+
+ block = page->index << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS);
+
+ bh = head;
+ i = 0;
+
+ /* Stage 1: make sure we have all the buffers mapped! */
+ do {
+ /*
+ * If the buffer isn't up-to-date, we can't be sure
+ * that the buffer has been initialized with the proper
+ * block number information etc..
+ *
+ * Leave it to the low-level FS to make all those
+ * decisions (block #0 may actually be a valid block)
+ */
+ if (!buffer_mapped(bh)) {
+ err = blkdev_get_block(inode, block, bh);
+ if (err)
+ goto out;
}
- }
- if(!retval)
- filp->f_reada = 1;
- if(write_error)
- return -EIO;
- return written ? written : retval;
+ bh = bh->b_this_page;
+ block++;
+ } while (bh != head);
+
+ /* Stage 2: lock the buffers, mark them clean */
+ do {
+ lock_buffer(bh);
+ set_buffer_async_io(bh);
+ set_bit(BH_Uptodate, &bh->b_state);
+ clear_bit(BH_Dirty, &bh->b_state);
+ bh = bh->b_this_page;
+ } while (bh != head);
+
+ /* Stage 3: submit the IO */
+ do {
+ submit_bh(WRITE, bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
+
+ /* Done - end_buffer_io_async will unlock */
+ SetPageUptodate(page);
+ return 0;
+
+out:
+ ClearPageUptodate(page);
+ UnlockPage(page);
+ return err;
}
-ssize_t block_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+static int blkdev_readpage(struct file * file, struct page * page)
{
- struct inode * inode = filp->f_dentry->d_inode;
- size_t block;
- loff_t offset;
- ssize_t blocksize;
- ssize_t blocksize_bits, i;
- size_t blocks, rblocks, left;
- int bhrequest, uptodate;
- struct buffer_head ** bhb, ** bhe;
- struct buffer_head * buflist[NBUF];
- struct buffer_head * bhreq[NBUF];
- unsigned int chars;
- loff_t size;
- kdev_t dev;
- ssize_t read;
-
- dev = inode->i_rdev;
- blocksize = BLOCK_SIZE;
- if (blksize_size[MAJOR(dev)] && blksize_size[MAJOR(dev)][MINOR(dev)])
- blocksize = blksize_size[MAJOR(dev)][MINOR(dev)];
- i = blocksize;
- blocksize_bits = 0;
- while (i != 1) {
- blocksize_bits++;
- i >>= 1;
- }
+ struct inode *inode = page->mapping->host;
+ kdev_t dev = inode->i_rdev;
+ unsigned long iblock, lblock;
+ struct buffer_head *bh, *head, *arr[1 << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS)];
+ unsigned int blocks;
+ int nr, i;
+
+ if (!PageLocked(page))
+ PAGE_BUG(page);
+ if (!page->buffers)
+ create_empty_buffers(page, dev, BUFFERED_BLOCKSIZE);
+ head = page->buffers;
+
+ blocks = PAGE_CACHE_SIZE >> BUFFERED_BLOCKSIZE_BITS;
+ iblock = page->index << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS);
+ lblock = buffered_blk_size(dev) >> (BUFFERED_BLOCKSIZE_BITS - BLOCK_SIZE_BITS);
+ bh = head;
+ nr = 0;
+ i = 0;
+
+ do {
+ if (buffer_uptodate(bh))
+ continue;
- offset = *ppos;
- if (blk_size[MAJOR(dev)])
- size = (loff_t) blk_size[MAJOR(dev)][MINOR(dev)] << BLOCK_SIZE_BITS;
- else
- size = (loff_t) INT_MAX << BLOCK_SIZE_BITS;
-
- if (offset > size)
- left = 0;
- /* size - offset might not fit into left, so check explicitly. */
- else if (size - offset > INT_MAX)
- left = INT_MAX;
- else
- left = size - offset;
- if (left > count)
- left = count;
- if (left <= 0)
+ if (!buffer_mapped(bh)) {
+ if (iblock <= lblock) {
+ if (blkdev_get_block(inode, iblock, bh))
+ continue;
+ }
+ if (!buffer_mapped(bh)) {
+ memset(kmap(page) + i * BUFFERED_BLOCKSIZE, 0, BUFFERED_BLOCKSIZE);
+ flush_dcache_page(page);
+ kunmap(page);
+ set_bit(BH_Uptodate, &bh->b_state);
+ continue;
+ }
+ /* get_block() might have updated the buffer synchronously */
+ if (buffer_uptodate(bh))
+ continue;
+ }
+
+ arr[nr] = bh;
+ nr++;
+ } while (i++, iblock++, (bh = bh->b_this_page) != head);
+
+ if (!nr) {
+ /*
+ * all buffers are uptodate - we can set the page
+ * uptodate as well.
+ */
+ SetPageUptodate(page);
+ UnlockPage(page);
return 0;
- read = 0;
- block = offset >> blocksize_bits;
- offset &= blocksize-1;
- size >>= blocksize_bits;
- rblocks = blocks = (left + offset + blocksize - 1) >> blocksize_bits;
- bhb = bhe = buflist;
- if (filp->f_reada) {
- if (blocks < read_ahead[MAJOR(dev)] / (blocksize >> 9))
- blocks = read_ahead[MAJOR(dev)] / (blocksize >> 9);
- if (rblocks > blocks)
- blocks = rblocks;
-
}
- if (block + blocks > size) {
- blocks = size - block;
- if (blocks == 0)
- return 0;
+
+ /* Stage two: lock the buffers */
+ for (i = 0; i < nr; i++) {
+ struct buffer_head * bh = arr[i];
+ lock_buffer(bh);
+ set_buffer_async_io(bh);
}
- /* We do this in a two stage process. We first try to request
- as many blocks as we can, then we wait for the first one to
- complete, and then we try to wrap up as many as are actually
- done. This routine is rather generic, in that it can be used
- in a filesystem by substituting the appropriate function in
- for getblk.
+ /* Stage 3: start the IO */
+ for (i = 0; i < nr; i++)
+ submit_bh(READ, arr[i]);
- This routine is optimized to make maximum use of the various
- buffers and caches. */
+ return 0;
+}
- do {
- bhrequest = 0;
- uptodate = 1;
- while (blocks) {
- --blocks;
- *bhb = getblk(dev, block++, blocksize);
- if (*bhb && !buffer_uptodate(*bhb)) {
- uptodate = 0;
- bhreq[bhrequest++] = *bhb;
- }
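+/*
+ * Map the buffers covering [from, to) and read in any block that would
+ * only be partially overwritten, so that commit_write can mark the whole
+ * range uptodate.
+ */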
+static int __blkdev_prepare_write(struct inode *inode, struct page *page,
+ unsigned from, unsigned to)
+{
+ kdev_t dev = inode->i_rdev;
+ unsigned block_start, block_end;
+ unsigned long block;
+ int err = 0;
+ struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
+ kmap(page);
- if (++bhb == &buflist[NBUF])
- bhb = buflist;
+ if (!page->buffers)
+ create_empty_buffers(page, dev, BUFFERED_BLOCKSIZE);
+ head = page->buffers;
- /* If the block we have on hand is uptodate, go ahead
- and complete processing. */
- if (uptodate)
- break;
- if (bhb == bhe)
- break;
- }
+ block = page->index << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS);
- /* Now request them all */
- if (bhrequest) {
- ll_rw_block(READ, bhrequest, bhreq);
+ for(bh = head, block_start = 0; bh != head || !block_start;
+ block++, block_start=block_end, bh = bh->b_this_page) {
+ if (!bh)
+ BUG();
+ block_end = block_start + BUFFERED_BLOCKSIZE;
+ if (block_end <= from)
+ continue;
+ if (block_start >= to)
+ break;
+ if (!buffer_mapped(bh)) {
+ err = blkdev_get_block(inode, block, bh);
+ if (err)
+ goto out;
+ }
+ if (Page_Uptodate(page)) {
+ set_bit(BH_Uptodate, &bh->b_state);
+ continue;
}
+ if (!buffer_uptodate(bh) &&
+ (block_start < from || block_end > to)) {
+ ll_rw_block(READ, 1, &bh);
+ *wait_bh++=bh;
+ }
+ }
+ /*
+ * If we issued read requests - let them complete.
+ */
+ while(wait_bh > wait) {
+ wait_on_buffer(*--wait_bh);
+ err = -EIO;
+ if (!buffer_uptodate(*wait_bh))
+ goto out;
+ }
+ return 0;
+out:
+ return err;
+}
- do { /* Finish off all I/O that has actually completed */
- if (*bhe) {
- wait_on_buffer(*bhe);
- if (!buffer_uptodate(*bhe)) { /* read error? */
- brelse(*bhe);
- if (++bhe == &buflist[NBUF])
- bhe = buflist;
- left = 0;
- break;
- }
- }
- if (left < blocksize - offset)
- chars = left;
- else
- chars = blocksize - offset;
- *ppos += chars;
- left -= chars;
- read += chars;
- if (*bhe) {
- copy_to_user(buf,offset+(*bhe)->b_data,chars);
- brelse(*bhe);
- buf += chars;
- } else {
- while (chars-- > 0)
- put_user(0,buf++);
+static int blkdev_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
+{
+ struct inode *inode = page->mapping->host;
+ int err = __blkdev_prepare_write(inode, page, from, to);
+ if (err) {
+ ClearPageUptodate(page);
+ kunmap(page);
+ }
+ return err;
+}
+
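+/*
+ * Mark the buffers covering [from, to) uptodate and dirty, queue them on
+ * the inode's dirty data list, and set the page uptodate once every
+ * buffer on it is uptodate.
+ */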
+static int __blkdev_commit_write(struct inode *inode, struct page *page,
+ unsigned from, unsigned to)
+{
+ unsigned block_start, block_end;
+ int partial = 0, need_balance_dirty = 0;
+ struct buffer_head *bh, *head;
+
+ for(bh = head = page->buffers, block_start = 0;
+ bh != head || !block_start;
+ block_start=block_end, bh = bh->b_this_page) {
+ block_end = block_start + BUFFERED_BLOCKSIZE;
+ if (block_end <= from || block_start >= to) {
+ if (!buffer_uptodate(bh))
+ partial = 1;
+ } else {
+ set_bit(BH_Uptodate, &bh->b_state);
+ if (!atomic_set_buffer_dirty(bh)) {
+ __mark_dirty(bh);
+ buffer_insert_inode_data_queue(bh, inode);
+ need_balance_dirty = 1;
}
- offset = 0;
- if (++bhe == &buflist[NBUF])
- bhe = buflist;
- } while (left > 0 && bhe != bhb && (!*bhe || !buffer_locked(*bhe)));
- if (bhe == bhb && !blocks)
- break;
- } while (left > 0);
-
-/* Release the read-ahead blocks */
- while (bhe != bhb) {
- brelse(*bhe);
- if (++bhe == &buflist[NBUF])
- bhe = buflist;
- };
- if (!read)
- return -EIO;
- filp->f_reada = 1;
- return read;
+ }
+ }
+
+ if (need_balance_dirty)
+ balance_dirty();
+ /*
+	 * If this is a partial write that happened to make all buffers
+	 * uptodate then we can optimize away a bogus readpage() for
+	 * the next read(). Here we 'discover' whether the page went
+	 * uptodate as a result of this (potentially partial) write.
+ */
+ if (!partial)
+ SetPageUptodate(page);
+ return 0;
+}
+
+static int blkdev_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ struct inode *inode = page->mapping->host;
+ __blkdev_commit_write(inode,page,from,to);
+ kunmap(page);
+ return 0;
}
/*
}
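+/*
+ * Write out the block device's dirty pagecache pages and pinned buffers,
+ * then wait for both to reach the disk.
+ */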
+static int __block_fsync(struct inode * inode)
+{
+ int ret;
+
+ filemap_fdatasync(inode->i_mapping);
+ ret = sync_buffers(inode->i_rdev, 1);
+ filemap_fdatawait(inode->i_mapping);
+
+ return ret;
+}
+
/*
* Filp may be NULL when we are called by an msync of a vma
* since the vma has no handle.
static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
- return fsync_dev(dentry->d_inode->i_rdev);
+ struct inode * inode = dentry->d_inode;
+
+ return __block_fsync(inode);
}
/*
atomic_set(&new_bdev->bd_count,1);
new_bdev->bd_dev = dev;
new_bdev->bd_op = NULL;
+ new_bdev->bd_inode = NULL;
spin_lock(&bdev_lock);
bdev = bdfind(dev, head);
if (!bdev) {
void bdput(struct block_device *bdev)
{
if (atomic_dec_and_test(&bdev->bd_count)) {
- spin_lock(&bdev_lock);
- if (atomic_read(&bdev->bd_openers))
+ if (bdev->bd_openers)
+ BUG();
+ if (bdev->bd_cache_openers)
BUG();
+ spin_lock(&bdev_lock);
list_del(&bdev->bd_hash);
spin_unlock(&bdev_lock);
destroy_bdev(bdev);
int ret = -ENODEV;
kdev_t rdev = to_kdev_t(bdev->bd_dev); /* this should become bdev */
down(&bdev->bd_sem);
+ lock_kernel();
if (!bdev->bd_op)
bdev->bd_op = get_blkfops(MAJOR(rdev));
if (bdev->bd_op) {
ret = 0;
if (bdev->bd_op->open)
ret = bdev->bd_op->open(fake_inode, &fake_file);
- if (!ret)
- atomic_inc(&bdev->bd_openers);
- else if (!atomic_read(&bdev->bd_openers))
+ if (!ret) {
+ bdev->bd_openers++;
+ atomic_inc(&bdev->bd_count);
+ } else if (!bdev->bd_openers)
bdev->bd_op = NULL;
iput(fake_inode);
}
}
+ unlock_kernel();
up(&bdev->bd_sem);
return ret;
}
{
int ret = -ENXIO;
struct block_device *bdev = inode->i_bdev;
+
+ /*
+ * Preserve backwards compatibility and allow large file access
+	 * even if userspace doesn't ask for it explicitly. Some mkfs
+	 * binaries need it. We might want to drop this workaround
+	 * once an unstable branch opens.
+ */
+ filp->f_flags |= O_LARGEFILE;
+
down(&bdev->bd_sem);
lock_kernel();
if (!bdev->bd_op)
ret = 0;
if (bdev->bd_op->open)
ret = bdev->bd_op->open(inode,filp);
- if (!ret)
- atomic_inc(&bdev->bd_openers);
- else if (!atomic_read(&bdev->bd_openers))
+ if (!ret) {
+ bdev->bd_openers++;
+ if (!bdev->bd_cache_openers && bdev->bd_inode)
+ BUG();
+ if (bdev->bd_cache_openers && !bdev->bd_inode)
+ BUG();
+ if (!bdev->bd_cache_openers++)
+ bdev->bd_inode = inode;
+ else {
+ if (bdev->bd_inode != inode && !inode->i_mapping_overload++) {
+ inode->i_mapping = bdev->bd_inode->i_mapping;
+ atomic_inc(&bdev->bd_inode->i_count);
+ }
+ }
+ } else if (!bdev->bd_openers)
bdev->bd_op = NULL;
}
unlock_kernel();
int ret = 0;
kdev_t rdev = to_kdev_t(bdev->bd_dev); /* this should become bdev */
down(&bdev->bd_sem);
- /* syncing will go here */
lock_kernel();
if (kind == BDEV_FILE)
fsync_dev(rdev);
else if (kind == BDEV_FS)
fsync_no_super(rdev);
- if (atomic_dec_and_test(&bdev->bd_openers)) {
- /* invalidating buffers will go here */
+	/* only filesystems use the buffer cache for metadata these days */
+ if (kind == BDEV_FS)
invalidate_buffers(rdev);
- }
if (bdev->bd_op->release) {
struct inode * fake_inode = get_empty_inode();
ret = -ENOMEM;
fake_inode->i_rdev = rdev;
ret = bdev->bd_op->release(fake_inode, NULL);
iput(fake_inode);
- }
+ } else
+		printk(KERN_WARNING "blkdev_put: ->release couldn't be run due to -ENOMEM\n");
}
- if (!atomic_read(&bdev->bd_openers))
+ if (!--bdev->bd_openers)
bdev->bd_op = NULL; /* we can't rely on driver being */
/* kind to stay around. */
unlock_kernel();
up(&bdev->bd_sem);
+ bdput(bdev);
return ret;
}
-static int blkdev_close(struct inode * inode, struct file * filp)
+int blkdev_close(struct inode * inode, struct file * filp)
{
- return blkdev_put(inode->i_bdev, BDEV_FILE);
+ struct block_device *bdev = inode->i_bdev;
+ int ret = 0;
+ struct inode * bd_inode = bdev->bd_inode;
+
+ if (bd_inode->i_mapping != inode->i_mapping)
+ BUG();
+ down(&bdev->bd_sem);
+ lock_kernel();
+ /* cache coherency protocol */
+ if (!--bdev->bd_cache_openers) {
+ struct super_block * sb;
+
+ /* flush the pagecache to disk */
+ __block_fsync(inode);
+ /* drop the pagecache, uptodate info is on disk by now */
+ truncate_inode_pages(inode->i_mapping, 0);
+ /* forget the bdev pagecache address space */
+ bdev->bd_inode = NULL;
+
+ /* if the fs was mounted ro just throw away most of its caches */
+ sb = get_super(inode->i_rdev);
+ if (sb) {
+ if (sb->s_flags & MS_RDONLY) {
+ /*
+ * This call is not destructive in terms of
+ * dirty cache, so it is safe to run it
+ * even if the fs gets mounted read write
+ * under us.
+ */
+ invalidate_device(inode->i_rdev, 0);
+ }
+
+ /*
+			 * Only if the underlying fs is mounted read-only will
+			 * we try to refill its pinned buffer cache from disk.
+			 * The fs cannot go away under us because we hold
+			 * the read semaphore of the superblock, but
+			 * we must also serialize against the ->remount_fs and
+			 * ->read_super callbacks to prevent MS_RDONLY from
+			 * going away under us.
+ */
+ lock_super(sb);
+ if (sb->s_flags & MS_RDONLY)
+ /* now refill the obsolete pinned buffers from disk */
+ update_buffers(inode->i_rdev);
+ unlock_super(sb);
+
+ drop_super(sb);
+ }
+ }
+ if (inode != bd_inode && !--inode->i_mapping_overload) {
+ inode->i_mapping = &inode->i_data;
+ iput(bd_inode);
+ }
+
+ /* release the device driver */
+ if (bdev->bd_op->release)
+ ret = bdev->bd_op->release(inode, NULL);
+ if (!--bdev->bd_openers)
+ bdev->bd_op = NULL;
+ unlock_kernel();
+ up(&bdev->bd_sem);
+
+ return ret;
}
static int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
return -EINVAL;
}
+struct address_space_operations def_blk_aops = {
+ readpage: blkdev_readpage,
+ writepage: blkdev_writepage,
+ sync_page: block_sync_page,
+ prepare_write: blkdev_prepare_write,
+ commit_write: blkdev_commit_write,
+ direct_IO: blkdev_direct_IO,
+};
+
struct file_operations def_blk_fops = {
open: blkdev_open,
release: blkdev_close,
llseek: block_llseek,
- read: block_read,
- write: block_write,
+ read: generic_file_read,
+ write: generic_file_write,
+ mmap: generic_file_mmap,
fsync: block_fsync,
ioctl: blkdev_ioctl,
};
/* These are the min and max parameter values that we will allow to be assigned */
int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0};
-int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 100, 0, 0};
+int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 6000*HZ, 100, 0, 0};
inline void unlock_buffer(struct buffer_head *bh)
{
+ clear_bit(BH_Wait_IO, &bh->b_state);
clear_bit(BH_Lock, &bh->b_state);
smp_mb__after_clear_bit();
if (waitqueue_active(&bh->b_wait))
int nr;
next = lru_list[BUF_DIRTY];
- nr = nr_buffers_type[BUF_DIRTY] * 2;
+ nr = nr_buffers_type[BUF_DIRTY];
count = 0;
while (next && --nr >= 0) {
struct buffer_head * bh = next;
int nr;
next = lru_list[index];
- nr = nr_buffers_type[index] * 2;
+ nr = nr_buffers_type[index];
while (next && --nr >= 0) {
struct buffer_head *bh = next;
next = bh->b_next_free;
* We will ultimately want to put these in a separate list, but for
* now we search all of the lists for dirty buffers.
*/
-static int sync_buffers(kdev_t dev, int wait)
+int sync_buffers(kdev_t dev, int wait)
{
int err = 0;
spin_unlock(&lru_list_lock);
}
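+/*
+ * Queue a buffer on the inode's list of dirty *data* buffers; the
+ * metadata counterpart is buffer_insert_inode_queue().
+ */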
+void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
+{
+ spin_lock(&lru_list_lock);
+ if (bh->b_inode)
+ list_del(&bh->b_inode_buffers);
+ bh->b_inode = inode;
+ list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers);
+ spin_unlock(&lru_list_lock);
+}
+
/* The caller must have the lru_list lock before calling the
remove_inode_queue functions. */
static void __remove_inode_queue(struct buffer_head *bh)
int ret;
spin_lock(&lru_list_lock);
- ret = !list_empty(&inode->i_dirty_buffers);
+ ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
spin_unlock(&lru_list_lock);
return ret;
}
-
/* If invalidate_buffers() will trash dirty buffers, it means some kind
of fs corruption is going on. Trashing dirty data always imply losing
information that was supposed to be just stored on the physical layer
These are two special cases. Normal usage imply the device driver
to issue a sync on the device (without waiting I/O completion) and
- then an invalidate_buffers call that doesn't trash dirty buffers. */
-void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
+ then an invalidate_buffers call that doesn't trash dirty buffers.
+
+   To handle cache coherency with the blkdev pagecache, the 'update' case
+   has been introduced. It is needed to re-read any pinned buffer from
+   disk. NOTE: re-reading from disk is destructive, so we can do it only
+   when we assume nobody is changing the buffercache under our I/O and
+   when we think the disk contains more recent information than the
+   buffercache. The update == 1 pass marks the buffers that need updating;
+   the update == 2 pass does the actual I/O. */
+void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers, int update)
{
int i, nlist, slept;
struct buffer_head * bh, * bh_next;
}
write_lock(&hash_table_lock);
- if (!atomic_read(&bh->b_count) &&
- (destroy_dirty_buffers || !buffer_dirty(bh))) {
- remove_inode_queue(bh);
- __remove_from_queues(bh);
- put_last_free(bh);
+ /* All buffers in the lru lists are mapped */
+ if (!buffer_mapped(bh))
+ BUG();
+ if (!atomic_read(&bh->b_count)) {
+ if (destroy_dirty_buffers || !buffer_dirty(bh)) {
+ remove_inode_queue(bh);
+ __remove_from_queues(bh);
+ put_last_free(bh);
+ }
+ } else if (update) {
+ if ((update == 2) ^ buffer_uptodate(bh) &&
+ (update == 2) ^ buffer_req(bh)) {
+ write_unlock(&hash_table_lock);
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
+
+ if (update == 2) {
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ } else {
+ lock_buffer(bh);
+ clear_bit(BH_Uptodate, &bh->b_state);
+ clear_bit(BH_Req, &bh->b_state);
+ unlock_buffer(bh);
+ }
+
+ atomic_dec(&bh->b_count);
+ goto retry;
+ }
}
- /* else complain loudly? */
write_unlock(&hash_table_lock);
if (slept)
static void free_more_memory(void)
{
balance_dirty();
- page_launder(GFP_NOFS, 0);
wakeup_bdflush();
- wakeup_kswapd();
current->policy |= SCHED_YIELD;
__set_current_state(TASK_RUNNING);
schedule();
* that unlock the page..
*/
spin_lock_irqsave(&page_uptodate_lock, flags);
+ mark_buffer_async(bh, 0);
unlock_buffer(bh);
tmp = bh->b_this_page;
while (tmp != bh) {
- if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
+ if (buffer_async(tmp) && buffer_locked(tmp))
goto still_busy;
tmp = tmp->b_this_page;
}
/* OK, the async IO on this page is complete. */
spin_unlock_irqrestore(&page_uptodate_lock, flags);
- put_bh(bh);
/*
* if none of the buffers had errors then we can set the
return;
still_busy:
- put_bh(bh);
spin_unlock_irqrestore(&page_uptodate_lock, flags);
return;
}
-void set_buffer_async_io(struct buffer_head *bh) {
+inline void set_buffer_async_io(struct buffer_head *bh) {
bh->b_end_io = end_buffer_io_async ;
+ mark_buffer_async(bh, 1);
}
/*
return err2;
}
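+/*
+ * Write out and wait upon the dirty data buffers attached to an inode;
+ * the data counterpart of fsync_inode_buffers().
+ */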
+int fsync_inode_data_buffers(struct inode *inode)
+{
+ struct buffer_head *bh;
+ struct inode tmp;
+ int err = 0, err2;
+
+ INIT_LIST_HEAD(&tmp.i_dirty_data_buffers);
+
+ spin_lock(&lru_list_lock);
+
+ while (!list_empty(&inode->i_dirty_data_buffers)) {
+ bh = BH_ENTRY(inode->i_dirty_data_buffers.next);
+ list_del(&bh->b_inode_buffers);
+ if (!buffer_dirty(bh) && !buffer_locked(bh))
+ bh->b_inode = NULL;
+ else {
+ bh->b_inode = &tmp;
+ list_add(&bh->b_inode_buffers, &tmp.i_dirty_data_buffers);
+ if (buffer_dirty(bh)) {
+ get_bh(bh);
+ spin_unlock(&lru_list_lock);
+ ll_rw_block(WRITE, 1, &bh);
+ brelse(bh);
+ spin_lock(&lru_list_lock);
+ }
+ }
+ }
+
+ while (!list_empty(&tmp.i_dirty_data_buffers)) {
+ bh = BH_ENTRY(tmp.i_dirty_data_buffers.prev);
+ remove_inode_queue(bh);
+ get_bh(bh);
+ spin_unlock(&lru_list_lock);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ err = -EIO;
+ brelse(bh);
+ spin_lock(&lru_list_lock);
+ }
+
+ spin_unlock(&lru_list_lock);
+ err2 = osync_inode_data_buffers(inode);
+
+ if (err)
+ return err;
+ else
+ return err2;
+}
/*
* osync is designed to support O_SYNC io. It waits synchronously for
return err;
}
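+/*
+ * Wait for an inode's in-flight data buffer writes to complete without
+ * starting any new I/O; the data counterpart of osync_inode_buffers().
+ */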
+int osync_inode_data_buffers(struct inode *inode)
+{
+ struct buffer_head *bh;
+ struct list_head *list;
+ int err = 0;
+
+ spin_lock(&lru_list_lock);
+
+ repeat:
+
+ for (list = inode->i_dirty_data_buffers.prev;
+ bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers;
+ list = bh->b_inode_buffers.prev) {
+ if (buffer_locked(bh)) {
+ get_bh(bh);
+ spin_unlock(&lru_list_lock);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ err = -EIO;
+ brelse(bh);
+ spin_lock(&lru_list_lock);
+ goto repeat;
+ }
+ }
+
+ spin_unlock(&lru_list_lock);
+ return err;
+}
+
/*
* Invalidate any and all dirty buffers on a given inode. We are
*/
void invalidate_inode_buffers(struct inode *inode)
{
- struct list_head *list, *next;
+ struct list_head * entry;
spin_lock(&lru_list_lock);
- list = inode->i_dirty_buffers.next;
- while (list != &inode->i_dirty_buffers) {
- next = list->next;
- remove_inode_queue(BH_ENTRY(list));
- list = next;
- }
+ while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
+ remove_inode_queue(BH_ENTRY(entry));
+ while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
+ remove_inode_queue(BH_ENTRY(entry));
spin_unlock(&lru_list_lock);
}
out:
write_unlock(&hash_table_lock);
spin_unlock(&lru_list_lock);
+ touch_buffer(bh);
return bh;
}
}
}
-static __inline__ void __mark_dirty(struct buffer_head *bh)
+inline void __mark_dirty(struct buffer_head *bh)
{
bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
refile_buffer(bh);
dispose = BUF_LOCKED;
if (buffer_dirty(bh))
dispose = BUF_DIRTY;
- if (buffer_protected(bh))
- dispose = BUF_PROTECTED;
if (dispose != bh->b_list) {
__remove_from_lru_list(bh, bh->b_list);
bh->b_list = dispose;
/* grab the lru lock here to block bdflush. */
spin_lock(&lru_list_lock);
write_lock(&hash_table_lock);
- if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf) || buffer_protected(buf))
+ if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
goto in_use;
__hash_unlink(buf);
- remove_inode_queue(buf);
write_unlock(&hash_table_lock);
+ remove_inode_queue(buf);
__remove_from_lru_list(buf, buf->b_list);
spin_unlock(&lru_list_lock);
put_last_free(buf);
struct buffer_head * bh;
bh = getblk(dev, block, size);
- touch_buffer(bh);
if (buffer_uptodate(bh))
return bh;
ll_rw_block(READ, 1, &bh);
* we have truncated the file and are going to free the
* blocks on-disk..
*/
-int block_flushpage(struct page *page, unsigned long offset)
+int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
*/
if (!offset) {
if (!try_to_free_buffers(page, 0)) {
- atomic_inc(&buffermem_pages);
+ if (drop_pagecache)
+ atomic_inc(&buffermem_pages);
return 0;
}
}
return 1;
}
-static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
+void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
{
struct buffer_head *bh, *head, *tail;
/* Stage 2: lock the buffers, mark them clean */
do {
lock_buffer(bh);
- bh->b_end_io = end_buffer_io_async;
- get_bh(bh);
+ set_buffer_async_io(bh);
set_bit(BH_Uptodate, &bh->b_state);
clear_bit(BH_Dirty, &bh->b_state);
bh = bh->b_this_page;
set_bit(BH_Uptodate, &bh->b_state);
if (!atomic_set_buffer_dirty(bh)) {
__mark_dirty(bh);
- buffer_insert_inode_queue(bh, inode);
+ buffer_insert_inode_data_queue(bh, inode);
need_balance_dirty = 1;
}
}
for (i = 0; i < nr; i++) {
struct buffer_head * bh = arr[i];
lock_buffer(bh);
- bh->b_end_io = end_buffer_io_async;
- get_bh(bh);
+ set_buffer_async_io(bh);
}
/* Stage 3: start the IO */
return tmp.b_blocknr;
}
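+/*
+ * Resolve the on-disk blocks backing a kiobuf with the filesystem's
+ * get_block(), recording holes as -1UL for reads, and hand the whole
+ * block list to brw_kiovec() for the actual transfer.
+ */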
+int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
+{
+ int i, nr_blocks, retval;
+ unsigned long * blocks = iobuf->blocks;
+
+ nr_blocks = iobuf->length / blocksize;
+ /* build the blocklist */
+ for (i = 0; i < nr_blocks; i++, blocknr++) {
+ struct buffer_head bh;
+
+ bh.b_state = 0;
+ bh.b_dev = inode->i_dev;
+ bh.b_size = blocksize;
+
+ retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1);
+ if (retval)
+ goto out;
+
+ if (rw == READ) {
+ if (buffer_new(&bh))
+ BUG();
+ if (!buffer_mapped(&bh)) {
+				/* there was a hole in the filesystem */
+ blocks[i] = -1UL;
+ continue;
+ }
+ } else {
+ if (buffer_new(&bh))
+ unmap_underlying_metadata(&bh);
+ if (!buffer_mapped(&bh))
+ BUG();
+ }
+ blocks[i] = bh.b_blocknr;
+ }
+
+ retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
+
+ out:
+ return retval;
+}
+
/*
* IO completion routine for a buffer_head being used for kiobuf IO: we
* can't dispatch the kiobuf callback until io_count reaches 0.
while (length > 0) {
blocknr = b[bufind++];
+ if (blocknr == -1UL) {
+ if (rw == READ) {
+				/* there was a hole in the filesystem */
+ memset(kmap(map) + offset, 0, size);
+ flush_dcache_page(map);
+ kunmap(map);
+
+ transferred += size;
+ goto skip_block;
+ } else
+ BUG();
+ }
tmp = bhs[bhind++];
tmp->b_dev = B_FREE;
} else
set_bit(BH_Uptodate, &tmp->b_state);
- length -= size;
- offset += size;
-
atomic_inc(&iobuf->io_count);
submit_bh(rw, tmp);
/*
goto finished;
bhind = 0;
}
-
+
+ skip_block:
+ length -= size;
+ offset += size;
+
if (offset >= PAGE_SIZE) {
offset = 0;
break;
lock_buffer(bh);
bh->b_blocknr = *(b++);
set_bit(BH_Mapped, &bh->b_state);
- bh->b_end_io = end_buffer_io_async;
- get_bh(bh);
+ set_buffer_async_io(bh);
bh = bh->b_this_page;
} while (bh != head);
return 0;
}
-/*
- * Sync all the buffers on one page..
- *
- * If we have old buffers that are locked, we'll
- * wait on them, but we won't wait on the new ones
- * we're writing out now.
- *
- * This all is required so that we can free up memory
- * later.
- *
- * Wait:
- * 0 - no wait (this does not get called - see try_to_free_buffers below)
- * 1 - start IO for dirty buffers
- * 2 - wait for completion of locked buffers
- */
-static void sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask)
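+/*
+ * Start write-out on the page's dirty buffers and, if __GFP_WAIT allows,
+ * wait on the locked ones; returns nonzero when it is worth retrying
+ * try_to_free_buffers() on the page.
+ */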
+static int sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask)
{
- struct buffer_head * tmp = bh;
+ struct buffer_head * p = bh;
+ int tryagain = 1;
do {
- struct buffer_head *p = tmp;
- tmp = tmp->b_this_page;
- if (buffer_locked(p)) {
- if (gfp_mask & __GFP_WAIT)
- __wait_on_buffer(p);
- } else if (buffer_dirty(p))
- ll_rw_block(WRITE, 1, &p);
- } while (tmp != bh);
+ if (buffer_dirty(p) || buffer_locked(p)) {
+ if (test_and_set_bit(BH_Wait_IO, &p->b_state)) {
+ if (buffer_dirty(p)) {
+ ll_rw_block(WRITE, 1, &p);
+ tryagain = 0;
+ } else if (buffer_locked(p)) {
+ if (gfp_mask & __GFP_WAIT) {
+ wait_on_buffer(p);
+ tryagain = 1;
+ } else
+ tryagain = 0;
+ }
+ } else
+ tryagain = 0;
+ }
+ p = p->b_this_page;
+ } while (p != bh);
+
+ return tryagain;
}
/*
* Can the buffer be thrown out?
*/
-#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
+#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock))
#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
/*
write_unlock(&hash_table_lock);
spin_unlock(&lru_list_lock);
if (gfp_mask & __GFP_IO) {
- if (!(gfp_mask & __GFP_HIGHIO) && PageHighMem(page))
- return 0;
- sync_page_buffers(bh, gfp_mask);
- /* We waited synchronously, so we can free the buffers. */
- if (gfp_mask & __GFP_WAIT) {
- gfp_mask = 0; /* no IO or waiting this time around */
- goto cleaned_buffers_try_again;
+ if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
+ if (sync_page_buffers(bh, gfp_mask)) {
+ /* no IO or waiting next time */
+ gfp_mask = 0;
+ goto cleaned_buffers_try_again;
+ }
}
- wakeup_bdflush();
}
+ if (balance_dirty_state() >= 0)
+ wakeup_bdflush();
return 0;
}
#ifdef CONFIG_SMP
struct buffer_head * bh;
int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
- int protected = 0;
int nlist;
- static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
+ static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
#endif
printk("Buffer memory: %6dkB\n",
if (!spin_trylock(&lru_list_lock))
return;
for(nlist = 0; nlist < NR_LIST; nlist++) {
- found = locked = dirty = used = lastused = protected = 0;
+ found = locked = dirty = used = lastused = 0;
bh = lru_list[nlist];
if(!bh) continue;
found++;
if (buffer_locked(bh))
locked++;
- if (buffer_protected(bh))
- protected++;
if (buffer_dirty(bh))
dirty++;
if (atomic_read(&bh->b_count))
buf_types[nlist], found, tmp);
}
printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
- "%d locked, %d protected, %d dirty\n",
+ "%d locked, %d dirty\n",
buf_types[nlist], found, size_buffers_type[nlist]>>10,
- used, lastused, locked, protected, dirty);
+ used, lastused, locked, dirty);
}
spin_unlock(&lru_list_lock);
#endif
if (!(gfp_mask & __GFP_FS))
return 0;
- count = dentry_stat.nr_unused >> priority;
+ count = dentry_stat.nr_unused / priority;
prune_dcache(count);
kmem_cache_shrink(dentry_cache);
inode->i_cdev = cdget(rdev);
} else if (S_ISBLK(mode)) {
inode->i_fop = &def_blk_fops;
+ inode->i_mapping->a_ops = &def_blk_aops;
inode->i_rdev = to_kdev_t(rdev);
inode->i_bdev = bdget(rdev);
} else if (S_ISFIFO(mode))
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
+#include <linux/personality.h>
#define __NO_VERSION__
#include <linux/module.h>
int len;
unsigned long pos;
- if (get_user(str, argv+argc) || !str || !(len = strnlen_user(str, bprm->p)))
+ if (get_user(str, argv+argc) || !(len = strnlen_user(str, bprm->p)))
return -EFAULT;
if (bprm->p < len)
return -E2BIG;
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
unsigned reclen = EXT2_DIR_REC_LEN(namelen);
- unsigned long n;
+ unsigned long start, n;
unsigned long npages = dir_pages(dir);
struct page *page = NULL;
ext2_dirent * de;
/* OFFSET_CACHE */
*res_page = NULL;
- for (n = 0; n < npages; n++) {
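+	/* start scanning at the page where the previous lookup succeeded */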
+ start = dir->u.ext2_i.i_dir_start_lookup;
+ if (start >= npages)
+ start = 0;
+ n = start;
+ do {
char *kaddr;
page = ext2_get_page(dir, n);
- if (IS_ERR(page))
- continue;
-
- kaddr = page_address(page);
- de = (ext2_dirent *) kaddr;
- kaddr += PAGE_CACHE_SIZE - reclen;
- for ( ; (char *) de <= kaddr ; de = ext2_next_entry(de))
- if (ext2_match (namelen, name, de))
- goto found;
- ext2_put_page(page);
- }
+ if (!IS_ERR(page)) {
+ kaddr = page_address(page);
+ de = (ext2_dirent *) kaddr;
+ kaddr += PAGE_CACHE_SIZE - reclen;
+ while ((char *) de <= kaddr) {
+ if (ext2_match (namelen, name, de))
+ goto found;
+ de = ext2_next_entry(de);
+ }
+ ext2_put_page(page);
+ }
+ if (++n >= npages)
+ n = 0;
+ } while (n != start);
return NULL;
found:
*res_page = page;
+ dir->u.ext2_i.i_dir_start_lookup = n;
return de;
}
int err;
err = fsync_inode_buffers(inode);
+ err |= fsync_inode_data_buffers(inode);
if (!(inode->i_state & I_DIRTY))
return err;
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
{
return generic_block_bmap(mapping,block,ext2_get_block);
}
+static int ext2_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize)
+{
+ return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, ext2_get_block);
+}
struct address_space_operations ext2_aops = {
readpage: ext2_readpage,
writepage: ext2_writepage,
sync_page: block_sync_page,
prepare_write: ext2_prepare_write,
commit_write: generic_commit_write,
- bmap: ext2_bmap
+ bmap: ext2_bmap,
+ direct_IO: ext2_direct_IO,
};
/*
#include <linux/dnotify.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
+#include <linux/iobuf.h>
#include <asm/poll.h>
#include <asm/siginfo.h>
return ret;
}
-#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC)
+#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT)
static int setfl(int fd, struct file * filp, unsigned long arg)
{
}
}
+ if (arg & O_DIRECT) {
+ /*
+ * alloc_kiovec() can sleep and we are only serialized by
+ * the big kernel lock here, so abuse the i_sem to serialize
+		 * this case too. We wouldn't strictly need to go all the way
+		 * down to the inode layer (we could stay at the file layer),
+		 * but we don't want to pay for an extra semaphore in every
+		 * file structure, so we reuse the inode semaphore that we
+		 * already pay for anyway.
+ */
+ error = 0;
+ down(&inode->i_sem);
+ if (!filp->f_iobuf)
+ error = alloc_kiovec(1, &filp->f_iobuf);
+ up(&inode->i_sem);
+ if (error < 0)
+ return error;
+ }
+
/* required for strict SunOS emulation */
if (O_NONBLOCK != O_NDELAY)
if (arg & O_NDELAY)
#include <linux/init.h>
#include <linux/module.h>
#include <linux/smp_lock.h>
+#include <linux/iobuf.h>
/* sysctl tunables... */
struct files_stat_struct files_stat = {0, 0, NR_FILE};
if (atomic_dec_and_test(&file->f_count)) {
locks_remove_flock(file);
+
+ if (file->f_iobuf)
+ free_kiovec(1, &file->f_iobuf);
+
if (file->f_op && file->f_op->release)
file->f_op->release(inode, file);
fops_put(file->f_op);
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/prefetch.h>
+#include <linux/locks.h>
/*
* New inode.c implementation.
((struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL))
static void destroy_inode(struct inode *inode)
{
- if (!list_empty(&inode->i_dirty_buffers))
+ if (inode_has_buffers(inode))
BUG();
kmem_cache_free(inode_cachep, (inode));
}
INIT_LIST_HEAD(&inode->i_data.locked_pages);
INIT_LIST_HEAD(&inode->i_dentry);
INIT_LIST_HEAD(&inode->i_dirty_buffers);
+ INIT_LIST_HEAD(&inode->i_dirty_data_buffers);
sema_init(&inode->i_sem, 1);
sema_init(&inode->i_zombie, 1);
spin_lock_init(&inode->i_data.i_shared_lock);
{
struct super_block * sb = inode->i_sb;
+ if (!sb)
+ return;
+
/* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
if (sb->s_op && sb->s_op->dirty_inode)
}
}
-static inline int try_to_sync_unused_list(struct list_head *head)
+static inline int try_to_sync_unused_list(struct list_head *head, int nr_inodes)
{
struct list_head *tmp = head;
struct inode *inode;
- while ((tmp = tmp->prev) != head) {
+ while (nr_inodes && (tmp = tmp->prev) != head) {
inode = list_entry(tmp, struct inode, i_list);
if (!atomic_read(&inode->i_count)) {
- /*
- * We're under PF_MEMALLOC here, and syncing the
- * inode may have to allocate memory. To avoid
- * running into a OOM deadlock, we write one
- * inode synchronously and stop syncing in case
- * we're under freepages.low
- */
+ __sync_one(inode, 0);
+ nr_inodes--;
- int sync = nr_free_pages() < freepages.low;
- __sync_one(inode, sync);
- if (sync)
- return 0;
/*
* __sync_one moved the inode to another list,
* so we have to start looking from the list head.
tmp = head;
}
}
- return 1;
+
+ return nr_inodes;
}
void sync_inodes_sb(struct super_block *sb)
}
}
-/*
- * Called with the spinlock already held..
- */
-static void try_to_sync_unused_inodes(void)
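+/*
+ * Runs via schedule_task() (see unused_inodes_flush_task) so that
+ * prune_icache() never has to write inodes out synchronously itself.
+ */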
+static void try_to_sync_unused_inodes(void * arg)
{
struct super_block * sb;
+ int nr_inodes = inodes_stat.nr_unused;
+ spin_lock(&inode_lock);
spin_lock(&sb_lock);
sb = sb_entry(super_blocks.next);
- for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
+ for (; nr_inodes && sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
spin_unlock(&sb_lock);
- if (!try_to_sync_unused_list(&sb->s_dirty))
- return;
+ nr_inodes = try_to_sync_unused_list(&sb->s_dirty, nr_inodes);
spin_lock(&sb_lock);
}
spin_unlock(&sb_lock);
+ spin_unlock(&inode_lock);
}
+static struct tq_struct unused_inodes_flush_task;
+
/**
* write_inode_now - write an inode to disk
* @inode: inode to write to disk
while (inode->i_state & I_DIRTY)
sync_one(inode, sync);
spin_unlock(&inode_lock);
+ if (sync)
+ wait_on_inode(inode);
}
else
printk(KERN_ERR "write_inode_now: no super block\n");
* O_SYNC flag set, to flush dirty writes to disk.
*/
-int generic_osync_inode(struct inode *inode, int datasync)
+int generic_osync_inode(struct inode *inode, int what)
{
- int err;
+ int err = 0, err2 = 0, need_write_inode_now = 0;
/*
* WARNING
* every O_SYNC write, not just the synchronous I/Os. --sct
*/
-#ifdef WRITERS_QUEUE_IO
- err = osync_inode_buffers(inode);
-#else
- err = fsync_inode_buffers(inode);
-#endif
+ if (what & OSYNC_METADATA)
+ err = fsync_inode_buffers(inode);
+ if (what & OSYNC_DATA)
+ err2 = fsync_inode_data_buffers(inode);
+ if (!err)
+ err = err2;
spin_lock(&inode_lock);
- if (!(inode->i_state & I_DIRTY))
- goto out;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- goto out;
+ if ((inode->i_state & I_DIRTY) &&
+ ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
+ need_write_inode_now = 1;
spin_unlock(&inode_lock);
- write_inode_now(inode, 1);
- return err;
- out:
- spin_unlock(&inode_lock);
+ if (need_write_inode_now)
+ write_inode_now(inode, 1);
+ else
+ wait_on_inode(inode);
+
return err;
}
void clear_inode(struct inode *inode)
{
- if (!list_empty(&inode->i_dirty_buffers))
- invalidate_inode_buffers(inode);
+ invalidate_inode_buffers(inode);
if (inode->i_data.nrpages)
BUG();
res = 0;
sb = get_super(dev);
if (sb) {
+ /*
+	 * No need to lock the super: get_super holds the
+	 * read semaphore, so the filesystem cannot go away
+	 * under us (->put_super runs with the write lock
+	 * held).
+ */
+ shrink_dcache_sb(sb);
res = invalidate_inodes(sb);
drop_super(sb);
}
{
LIST_HEAD(list);
struct list_head *entry, *freeable = &list;
- int count, synced = 0;
+ int count;
struct inode * inode;
spin_lock(&inode_lock);
-free_unused:
count = 0;
entry = inode_unused.prev;
while (entry != &inode_unused)
dispose_list(freeable);
/*
- * If we freed enough clean inodes, avoid writing
- * dirty ones. Also giveup if we already tried to
- * sync dirty inodes.
+	 * If we didn't free enough clean inodes, schedule
+	 * a sync of the dirty inodes; we cannot do it
+	 * from here or we would either be synchronously
+	 * dog-slow or deadlock with the oom handling.
*/
- if (!goal || synced)
- return;
-
- synced = 1;
-
- spin_lock(&inode_lock);
- try_to_sync_unused_inodes();
- goto free_unused;
+ if (goal)
+ schedule_task(&unused_inodes_flush_task);
}
int shrink_icache_memory(int priority, int gfp_mask)
if (!(gfp_mask & __GFP_FS))
return 0;
- count = inodes_stat.nr_unused >> priority;
+ count = inodes_stat.nr_unused / priority;
prune_icache(count);
kmem_cache_shrink(inode_cachep);
inode->i_nlink = 1;
atomic_set(&inode->i_writecount, 0);
inode->i_size = 0;
+ inode->i_blocks = 0;
inode->i_generation = 0;
memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
inode->i_pipe = NULL;
if (inode) {
struct super_operations *op = NULL;
+ if (inode->i_state == I_CLEAR)
+ BUG();
+
if (inode->i_sb && inode->i_sb->s_op)
op = inode->i_sb->s_op;
if (op && op->put_inode)
NULL);
if (!inode_cachep)
panic("cannot create inode slab cache");
+
+ unused_inodes_flush_task.routine = try_to_sync_unused_inodes;
}
/**
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/tty.h>
+#include <linux/iobuf.h>
#include <asm/uaccess.h>
f->f_reada = 0;
f->f_op = fops_get(inode->i_fop);
file_move(f, &inode->i_sb->s_files);
+
+ /* preallocate kiobuf for O_DIRECT */
+ f->f_iobuf = NULL;
+ f->f_iobuf_lock = 0;
+ if (f->f_flags & O_DIRECT) {
+ error = alloc_kiovec(1, &f->f_iobuf);
+ if (error)
+ goto cleanup_all;
+ }
+
if (f->f_op && f->f_op->open) {
error = f->f_op->open(inode,f);
if (error)
return f;
cleanup_all:
+ if (f->f_iobuf)
+ free_kiovec(1, &f->f_iobuf);
fops_put(f->f_op);
if (f->f_mode & FMODE_WRITE)
put_write_access(inode);
#include <asm/uaccess.h>
#include <asm/io.h>
-extern unsigned long log_size;
extern wait_queue_head_t log_wait;
extern int do_syslog(int type, char * bug, int count);
static unsigned int kmsg_poll(struct file *file, poll_table * wait)
{
poll_wait(file, &log_wait, wait);
- if (log_size)
+ if (do_syslog(9, 0, 0))
return POLLIN | POLLRDNORM;
return 0;
}
* display in kilobytes.
*/
#define K(x) ((x) << (PAGE_SHIFT - 10))
-#define B(x) ((x) << PAGE_SHIFT)
+#define B(x) ((unsigned long long)(x) << PAGE_SHIFT)
si_meminfo(&i);
si_swapinfo(&i);
len = sprintf(page, " total: used: free: shared: buffers: cached:\n"
- "Mem: %8lu %8lu %8lu %8lu %8lu %8u\n"
- "Swap: %8lu %8lu %8lu\n",
+ "Mem: %8Lu %8Lu %8Lu %8Lu %8Lu %8Lu\n"
+ "Swap: %8Lu %8Lu %8Lu\n",
B(i.totalram), B(i.totalram-i.freeram), B(i.freeram),
B(i.sharedram), B(i.bufferram),
B(atomic_read(&page_cache_size)), B(i.totalswap),
"Cached: %8lu kB\n"
"SwapCached: %8lu kB\n"
"Active: %8u kB\n"
- "Inact_dirty: %8u kB\n"
- "Inact_clean: %8u kB\n"
- "Inact_target: %8lu kB\n"
+ "Inactive: %8u kB\n"
"HighTotal: %8lu kB\n"
"HighFree: %8lu kB\n"
"LowTotal: %8lu kB\n"
K(atomic_read(&page_cache_size) - swapper_space.nrpages),
K(swapper_space.nrpages),
K(nr_active_pages),
- K(nr_inactive_dirty_pages),
- K(nr_inactive_clean_pages()),
- K(inactive_target),
+ K(nr_inactive_pages),
K(i.totalhigh),
K(i.freehigh),
K(i.totalram-i.totalhigh),
) {
struct inode * p_s_inode = p_s_dentry->d_inode;
struct reiserfs_transaction_handle th ;
- int n_err = 0;
+ int n_err;
int windex ;
int jbegin_count = 1 ;
BUG ();
n_err = fsync_inode_buffers(p_s_inode) ;
+ n_err |= fsync_inode_data_buffers(p_s_inode);
/* commit the current transaction to flush any metadata
** changes. sys_fsync takes care of flushing the dirty pages for us
*/
for(i = 0 ; i < nr ; i++) {
bh = bhp[i] ;
lock_buffer(bh) ;
- get_bh(bh) ; /* async end_io handler puts this */
set_buffer_async_io(bh) ;
/* submit_bh doesn't care if the buffer is dirty, but nobody
** later on in the call chain will be cleaning it. So, we
blivet = do_umount(old_rootmnt, 0);
mntput(old_rootmnt);
if (!blivet) {
- ioctl_by_bdev(ramdisk, BLKFLSBUF, 0);
+ int ioctl_err;
+
+ ioctl_err = ioctl_by_bdev(ramdisk, BLKFLSBUF, 0);
+ if (ioctl_err)
+ printk("failed to release ramdisk %d...", ioctl_err);
printk("okay\n");
error = 0;
}
#define O_NDELAY O_NONBLOCK
#define O_SYNC 040000
#define FASYNC 020000 /* fcntl, for BSD compatibility */
-#define O_DIRECT 040000 /* direct disk access - should check with OSF/1 */
#define O_DIRECTORY 0100000 /* must be a directory */
#define O_NOFOLLOW 0200000 /* don't follow links */
#define O_LARGEFILE 0400000 /* will be set by the kernel on every open */
+#define O_DIRECT 02000000 /* direct disk access - should check with OSF/1 */
#define F_DUPFD 0 /* dup */
#define F_GETFD 1 /* get close_on_exec */
int bpt_nsaved;
};
-#define INIT_MMAP { &init_mm, PAGE_OFFSET, PAGE_OFFSET+0x10000000, \
- NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
#define INIT_THREAD { \
0, 0, 0, \
0, 0, 0, \
#define init_task (init_task_union.task)
#define init_stack (init_task_union.stack)
+#define ARCH_HAS_PREFETCH
+#define ARCH_HAS_PREFETCHW
+#define ARCH_HAS_SPINLOCK_PREFETCH
+
+extern inline void prefetch(const void *ptr)
+{
+ __asm__ ("ldl $31,%0" : : "m"(*(char *)ptr));
+}
+
+extern inline void prefetchw(const void *ptr)
+{
+ __asm__ ("ldl $31,%0" : : "m"(*(char *)ptr));
+}
+
+extern inline void spin_lock_prefetch(const void *ptr)
+{
+ __asm__ ("ldl $31,%0" : : "m"(*(char *)ptr));
+}
+
+
+
#endif /* __ASM_ALPHA_PROCESSOR_H */
EXTRA_THREAD_STRUCT
};
-#define INIT_MMAP { \
- vm_mm: &init_mm, \
- vm_page_prot: PAGE_SHARED, \
- vm_flags: VM_READ | VM_WRITE | VM_EXEC, \
- vm_avl_height: 1, \
-}
-
#define INIT_THREAD { \
refcount: ATOMIC_INIT(1), \
EXTRA_THREAD_STRUCT_INIT \
#define current_regs() user_regs(current)
-/* INIT_MMAP is the kernels map of memory, between KSEG_C and KSEG_D */
-
-#ifdef CONFIG_CRIS_LOW_MAP
-#define INIT_MMAP { &init_mm, KSEG_6, KSEG_7, NULL, PAGE_SHARED, \
- VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-#else
-#define INIT_MMAP { &init_mm, KSEG_C, KSEG_D, NULL, PAGE_SHARED, \
- VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-#endif
-
#define INIT_THREAD { \
0, 0, 0x20 } /* ccr = int enable, nothing else */
#define O_NDELAY O_NONBLOCK
#define O_SYNC 010000
#define FASYNC 020000 /* fcntl, for BSD compatibility */
-#define O_DIRECT 040000 /* direct disk access hint - currently ignored */
+#define O_DIRECT 040000 /* direct disk access hint */
#define O_LARGEFILE 0100000
#define O_DIRECTORY 0200000 /* must be a directory */
#define O_NOFOLLOW 0400000 /* don't follow links */
"\n" __ALIGN_STR"\n" \
"common_interrupt:\n\t" \
SAVE_ALL \
- "pushl $ret_from_intr\n\t" \
SYMBOL_NAME_STR(call_do_IRQ)":\n\t" \
- "jmp "SYMBOL_NAME_STR(do_IRQ));
+ "call " SYMBOL_NAME_STR(do_IRQ) "\n\t" \
+ "jmp ret_from_intr\n");
/*
* subtle. orig_eax is used by the signal code to distinct between
extern void disable_irq_nosync(unsigned int);
extern void enable_irq(unsigned int);
+#ifdef CONFIG_X86_LOCAL_APIC
+#define ARCH_HAS_NMI_WATCHDOG /* See include/linux/irq.h */
+#endif
+
#endif /* _ASM_IRQ_H */
enum km_type {
KM_BOUNCE_READ,
- KM_BOUNCE_WRITE,
KM_SKB_DATA,
KM_SKB_DATA_SOFTIRQ,
KM_USER0,
* Tell the user there is some problem. Beep too, so we can
* see^H^H^Hhear bugs in early bootup as well!
*/
-#define BUG() do { \
- __asm__ __volatile__(".byte 0x0f,0x0b"); \
+#define BUG() do { \
+ __asm__ __volatile__(".byte 0x0f,0x0b"); \
} while (0)
#define PAGE_BUG(page) do { \
free_page((unsigned long)pte);
}
-#define pte_free(pte) pte_free_slow(pte)
+#define pte_free(pte) pte_free_fast(pte)
+#ifdef CONFIG_X86_PAE
+#define pgd_alloc(mm) get_pgd_slow()
#define pgd_free(pgd) free_pgd_slow(pgd)
+#else
#define pgd_alloc(mm) get_pgd_fast()
+#define pgd_free(pgd) free_pgd_fast(pgd)
+#endif
/*
* allocating and freeing a pmd is trivial: the 1-entry pmd is
#include <asm/types.h>
#include <asm/sigcontext.h>
#include <asm/cpufeature.h>
+#include <linux/cache.h>
#include <linux/config.h>
#include <linux/threads.h>
unsigned long *pmd_quick;
unsigned long *pte_quick;
unsigned long pgtable_cache_sz;
-};
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
#define X86_VENDOR_INTEL 0
#define X86_VENDOR_CYRIX 1
0,{~0,} /* io permissions */ \
}
-#define INIT_MMAP \
-{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
#define INIT_TSS { \
0,0, /* back_link, __blh */ \
sizeof(init_stack) + (long) &init_stack, /* esp0 */ \
struct ia64_fpreg fph[96]; /* saved/loaded on demand */
};
-#define INIT_MMAP { \
- &init_mm, PAGE_OFFSET, PAGE_OFFSET + 0x10000000, NULL, PAGE_SHARED, \
- VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL \
-}
-
#define INIT_THREAD { \
0, /* ksp */ \
0, /* flags */ \
return result;
}
+
+#define ARCH_HAS_PREFETCH
+#define ARCH_HAS_PREFETCHW
+#define ARCH_HAS_SPINLOCK_PREFETCH
+#define PREFETCH_STRIDE 256
+
+extern inline void prefetch(const void *x)
+{
+ __asm__ __volatile__ ("lfetch [%0]" : : "r"(x));
+}
+
+extern inline void prefetchw(const void *x)
+{
+ __asm__ __volatile__ ("lfetch.excl [%0]" : : "r"(x));
+}
+
+#define spin_lock_prefetch(x) prefetchw(x)
+
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_IA64_PROCESSOR_H */
unsigned char fpstate[FPSTATESIZE]; /* floating point state */
};
-#define INIT_MMAP { &init_mm, 0, 0x40000000, NULL, __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED), VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
#define INIT_THREAD { \
sizeof(init_stack) + (unsigned long) init_stack, 0, \
PS_S, __KERNEL_DS, \
#endif /* !defined (_LANGUAGE_ASSEMBLY) */
-#define INIT_MMAP { &init_mm, KSEG0, KSEG1, NULL, PAGE_SHARED, \
- VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
#define INIT_THREAD { \
/* \
* saved main processor registers \
#endif /* !defined (_LANGUAGE_ASSEMBLY) */
-#define INIT_MMAP { &init_mm, KSEG0, KSEG1, NULL, PAGE_SHARED, \
- VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
#define INIT_THREAD { \
/* \
* saved main processor registers \
/* Thread struct flags. */
#define PARISC_KERNEL_DEATH (1UL << 31) /* see die_if_kernel()... */
-#define INIT_MMAP { &init_mm, 0, 0, NULL, PAGE_SHARED, \
- VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
#define INIT_THREAD { { \
{ 0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, \
enum km_type {
KM_BOUNCE_READ,
- KM_BOUNCE_WRITE,
KM_SKB_DATA,
KM_SKB_DATA_SOFTIRQ,
KM_USER0,
{0}, 0, 0 \
}
-/*
- * Note: the vm_start and vm_end fields here should *not*
- * be in kernel space. (Could vm_end == vm_start perhaps?)
- */
-#define INIT_MMAP { &init_mm, 0, 0x1000, NULL, \
- PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, \
- 1, NULL, NULL }
-
/*
* Return saved PC of a blocked thread. For now, this is the "user" PC
*/
typedef struct thread_struct thread_struct;
-#define INIT_MMAP \
-{ &init_mm, 0, 0, NULL, PAGE_SHARED, \
-VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
#define INIT_THREAD { (struct pt_regs *) 0, \
{ 0,{{0},{0},{0},{0},{0},{0},{0},{0},{0},{0}, \
{0},{0},{0},{0},{0},{0}}}, \
typedef struct thread_struct thread_struct;
-#define INIT_MMAP \
-{ &init_mm, 0, 0, NULL, PAGE_SHARED, \
-VM_READ | VM_WRITE | VM_EXEC, 1, NULL,NULL }
-
#define INIT_THREAD { (struct pt_regs *) 0, \
{ 0,{{0},{0},{0},{0},{0},{0},{0},{0},{0},{0}, \
{0},{0},{0},{0},{0},{0}}}, \
union sh_fpu_union fpu;
};
-#define INIT_MMAP \
-{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
#define INIT_THREAD { \
sizeof(init_stack) + (long) &init_stack, /* sp */ \
0, /* pc */ \
#define O_DIRECTORY 0x10000 /* must be a directory */
#define O_NOFOLLOW 0x20000 /* don't follow links */
#define O_LARGEFILE 0x40000
+#define O_DIRECT 0x100000 /* direct disk access hint */
#define F_DUPFD 0 /* dup */
#define F_GETFD 1 /* get close_on_exec */
enum km_type {
KM_BOUNCE_READ,
- KM_BOUNCE_WRITE,
KM_SKB_DATA,
KM_SKB_DATA_SOFTIRQ,
KM_USER0,
#define SPARC_FLAG_KTHREAD 0x1 /* task is a kernel thread */
#define SPARC_FLAG_UNALIGNED 0x2 /* is allowed to do unaligned accesses */
-#define INIT_MMAP { &init_mm, (0), (0), \
- NULL, __pgprot(0x0) , VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
#define INIT_THREAD { \
/* uwinmask, kregs, ksp, kpc, kpsr, kwim */ \
0, 0, 0, 0, 0, 0, \
#define O_DIRECTORY 0x10000 /* must be a directory */
#define O_NOFOLLOW 0x20000 /* don't follow links */
#define O_LARGEFILE 0x40000
+#define O_DIRECT 0x100000 /* direct disk access hint */
+
#define F_DUPFD 0 /* dup */
#define F_GETFD 1 /* get close_on_exec */
#define FAULT_CODE_ITLB 0x04 /* Miss happened in I-TLB */
#define FAULT_CODE_WINFIXUP 0x08 /* Miss happened during spill/fill */
-#define INIT_MMAP { &init_mm, 0xfffff80000000000, 0xfffff80001000000, \
- NULL, PAGE_SHARED , VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
#define INIT_THREAD { \
/* ksp, wstate, cwp, flags, current_ds, */ \
0, 0, 0, 0, KERNEL_DS, \
#define blk_finished_io(nsects) do { } while (0)
#define blk_started_io(nsects) do { } while (0)
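+/*
+ * Device size in BLOCK_SIZE (1KB) units for the blkdev pagecache,
+ * rounded up so a trailing partial BUFFERED_BLOCKSIZE block is covered;
+ * INT_MAX if the driver does not publish blk_size[].
+ */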
+static inline int buffered_blk_size(kdev_t dev)
+{
+ int ret = INT_MAX;
+ int major = MAJOR(dev);
+
+ if (blk_size[major])
+ ret = blk_size[major][MINOR(dev)] + ((BUFFERED_BLOCKSIZE-1) >> BLOCK_SIZE_BITS);
+
+ return ret;
+}
+
#endif
#ifndef __LINUX_CACHE_H
#define __LINUX_CACHE_H
+#include <linux/config.h>
#include <asm/cache.h>
#ifndef L1_CACHE_ALIGN
#define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
#endif
+#ifndef ____cacheline_aligned_in_smp
+#ifdef CONFIG_SMP
+#define ____cacheline_aligned_in_smp ____cacheline_aligned
+#else
+#define ____cacheline_aligned_in_smp
+#endif /* CONFIG_SMP */
+#endif
+
#ifndef __cacheline_aligned
#ifdef MODULE
#define __cacheline_aligned ____cacheline_aligned
#define CON_CONSDEV (2) /* Last on the command line */
#define CON_ENABLED (4)
-extern spinlock_t console_lock;
-
struct console
{
char name[8];
extern void register_console(struct console *);
extern int unregister_console(struct console *);
extern struct console *console_drivers;
+extern void acquire_console_sem(void);
+extern void release_console_sem(void);
+extern void console_conditional_schedule(void);
/* VESA Blanking Levels */
#define VESA_NO_BLANKING 0
__u32 i_next_alloc_goal;
__u32 i_prealloc_block;
__u32 i_prealloc_count;
+ __u32 i_dir_start_lookup;
int i_new_inode:1; /* Is a freshly allocated inode */
};
#define BLOCK_SIZE_BITS 10
#define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
+/* fixed buffer-head size for blkdev I/O through the pagecache */
+#define BUFFERED_BLOCKSIZE_BITS 10
+#define BUFFERED_BLOCKSIZE (1 << BUFFERED_BLOCKSIZE_BITS)
+
/* And dynamically-tunable limits and defaults: */
struct files_stat_struct {
int nr_files; /* read only */
BH_Req, /* 0 if the buffer has been invalidated */
BH_Mapped, /* 1 if the buffer has a disk mapping */
BH_New, /* 1 if the buffer is new and not yet written out */
- BH_Protected, /* 1 if the buffer is protected */
+ BH_Async, /* 1 if the buffer is under end_buffer_io_async I/O */
+ BH_Wait_IO, /* 1 if we should throttle on this buffer */
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
#define buffer_req(bh) __buffer_state(bh,Req)
#define buffer_mapped(bh) __buffer_state(bh,Mapped)
#define buffer_new(bh) __buffer_state(bh,New)
-#define buffer_protected(bh) __buffer_state(bh,Protected)
+#define buffer_async(bh) __buffer_state(bh,Async)
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
*/
struct page;
struct address_space;
+struct kiobuf;
struct address_space_operations {
int (*writepage)(struct page *);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
int (*bmap)(struct address_space *, long);
+#define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
+ int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
};
struct address_space {
struct block_device {
struct list_head bd_hash;
atomic_t bd_count;
-/* struct address_space bd_data; */
+ struct inode * bd_inode;
dev_t bd_dev; /* not a kdev_t - it's a search key */
- atomic_t bd_openers;
+ int bd_openers;
+ int bd_cache_openers;
const struct block_device_operations *bd_op;
struct semaphore bd_sem; /* open/close mutex */
};
struct list_head i_dentry;
struct list_head i_dirty_buffers;
+ struct list_head i_dirty_data_buffers;
unsigned long i_ino;
atomic_t i_count;
wait_queue_head_t i_wait;
struct file_lock *i_flock;
struct address_space *i_mapping;
- struct address_space i_data;
+ struct address_space i_data;
+ int i_mapping_overload;
struct dquot *i_dquot[MAXQUOTAS];
/* These three should probably be a union */
struct pipe_inode_info *i_pipe;
/* needed for tty driver, and maybe others */
void *private_data;
+
+	/* preallocated helper kiobuf to speed up O_DIRECT */
+ struct kiobuf *f_iobuf;
+ long f_iobuf_lock;
};
extern spinlock_t files_lock;
#define file_list_lock() spin_lock(&files_lock);
extern struct char_device *cdget(dev_t);
extern void cdput(struct char_device *);
extern int blkdev_open(struct inode *, struct file *);
+extern int blkdev_close(struct inode *, struct file *);
extern struct file_operations def_blk_fops;
+extern struct address_space_operations def_blk_aops;
extern struct file_operations def_fifo_fops;
extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
extern int blkdev_get(struct block_device *, mode_t, unsigned, int);
#define BUF_CLEAN 0
#define BUF_LOCKED 1 /* Buffers scheduled for write */
#define BUF_DIRTY 2 /* Dirty buffers, not yet scheduled for write */
-#define BUF_PROTECTED 3 /* Ramdisk persistent storage */
-#define NR_LIST 4
+#define NR_LIST 3
static inline void get_bh(struct buffer_head * bh)
{
__mark_buffer_clean(bh);
}
-#define atomic_set_buffer_protected(bh) test_and_set_bit(BH_Protected, &(bh)->b_state)
-
-static inline void __mark_buffer_protected(struct buffer_head *bh)
-{
- refile_buffer(bh);
-}
-
-static inline void mark_buffer_protected(struct buffer_head * bh)
-{
- if (!atomic_set_buffer_protected(bh))
- __mark_buffer_protected(bh);
-}
-
+extern void FASTCALL(__mark_dirty(struct buffer_head *bh));
extern void FASTCALL(__mark_buffer_dirty(struct buffer_head *bh));
extern void FASTCALL(mark_buffer_dirty(struct buffer_head *bh));
+extern void FASTCALL(buffer_insert_inode_data_queue(struct buffer_head *, struct inode *));
#define atomic_set_buffer_dirty(bh) test_and_set_bit(BH_Dirty, &(bh)->b_state)
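+/*
+ * Flag (or clear) a buffer as taking part in asynchronous page I/O so
+ * that end_buffer_io_async() can tell which buffers on the page it owns.
+ */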
+static inline void mark_buffer_async(struct buffer_head * bh, int on)
+{
+ if (on)
+ set_bit(BH_Async, &bh->b_state);
+ else
+ clear_bit(BH_Async, &bh->b_state);
+}
+
/*
* If an error happens during the make_request, this function
* has to be recalled. It marks the buffer as clean and not
extern int invalidate_inodes(struct super_block *);
extern int invalidate_device(kdev_t, int);
extern void invalidate_inode_pages(struct inode *);
+extern void invalidate_inode_pages2(struct address_space *);
extern void invalidate_inode_buffers(struct inode *);
-#define invalidate_buffers(dev) __invalidate_buffers((dev), 0)
-#define destroy_buffers(dev) __invalidate_buffers((dev), 1)
-extern void __invalidate_buffers(kdev_t dev, int);
+#define invalidate_buffers(dev) __invalidate_buffers((dev), 0, 0)
+#define destroy_buffers(dev) __invalidate_buffers((dev), 1, 0)
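+/* re-read pinned buffers from disk: pass 1 marks them stale, pass 2 does the I/O */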
+#define update_buffers(dev) \
+do { \
+ __invalidate_buffers((dev), 0, 1); \
+ __invalidate_buffers((dev), 0, 2); \
+} while (0)
+extern void __invalidate_buffers(kdev_t dev, int, int);
extern void sync_inodes(kdev_t);
extern void sync_unlocked_inodes(void);
extern void write_inode_now(struct inode *, int);
+extern int sync_buffers(kdev_t, int);
extern void sync_dev(kdev_t);
extern int fsync_dev(kdev_t);
extern int fsync_super(struct super_block *);
extern int fsync_no_super(kdev_t);
extern void sync_inodes_sb(struct super_block *);
-extern int fsync_inode_buffers(struct inode *);
extern int osync_inode_buffers(struct inode *);
+extern int osync_inode_data_buffers(struct inode *);
+extern int fsync_inode_buffers(struct inode *);
+extern int fsync_inode_data_buffers(struct inode *);
extern int inode_has_buffers(struct inode *);
extern void filemap_fdatasync(struct address_space *);
extern void filemap_fdatawait(struct address_space *);
typedef int (get_block_t)(struct inode*,long,struct buffer_head*,int);
/* Generic buffer handling for block filesystems.. */
-extern int block_flushpage(struct page *, unsigned long);
+extern int discard_bh_page(struct page *, unsigned long, int);
+#define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
+#define block_invalidate_page(page) discard_bh_page(page, 0, 0)
extern int block_symlink(struct inode *, const char *, int);
extern int block_write_full_page(struct page*, get_block_t*);
extern int block_read_full_page(struct page*, get_block_t*);
int generic_block_bmap(struct address_space *, long, get_block_t *);
int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
int block_truncate_page(struct address_space *, loff_t, get_block_t *);
+extern int generic_direct_IO(int, struct inode *, struct kiobuf *, unsigned long, int, get_block_t *);
+extern void create_empty_buffers(struct page *, kdev_t, unsigned long);
extern int waitfor_one_page(struct page*);
extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int file_fsync(struct file *, struct dentry *, int);
extern int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx);
extern int generic_osync_inode(struct inode *, int);
+#define OSYNC_METADATA (1<<0)
+#define OSYNC_DATA (1<<1)
+#define OSYNC_INODE (1<<2)
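/*
 * A minimal sketch (not part of this patch) of how a filesystem's fsync
 * path might combine the OSYNC_* bits when calling generic_osync_inode();
 * example_fsync() is a hypothetical helper, not an existing kernel function.
 */
static int example_fsync(struct inode *inode)
{
	/* wait on data pages, then metadata buffers, then the inode itself */
	return generic_osync_inode(inode, OSYNC_DATA | OSYNC_METADATA | OSYNC_INODE);
}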
extern int inode_change_ok(struct inode *, struct iattr *);
extern int inode_setattr(struct inode *, struct iattr *);
#include <asm/highmem.h>
/* declarations for linux/mm/highmem.c */
-FASTCALL(unsigned int nr_free_highpages(void));
+unsigned int nr_free_highpages(void);
extern struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig);
#include <asm/hw_irq.h> /* the arch dependent stuff */
+/**
+ * touch_nmi_watchdog - restart NMI watchdog timeout.
+ *
+ * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
+ * may be used to reset the timeout - for code which intentionally
+ * disables interrupts for a long time. This call is stateless.
+ */
+#ifdef ARCH_HAS_NMI_WATCHDOG
+extern void touch_nmi_watchdog(void);
+#else
+# define touch_nmi_watchdog() do { } while(0)
+#endif
+
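/*
 * A minimal usage sketch (hypothetical driver code, not from this patch):
 * a polling loop that runs with interrupts disabled calls
 * touch_nmi_watchdog() on every iteration so the NMI oopser does not
 * declare a lockup. hardware_ready() is an assumed predicate and the
 * delay is illustrative; udelay() comes from <linux/delay.h>.
 */
static void wait_for_hardware(void)
{
	while (!hardware_ready()) {
		touch_nmi_watchdog();	/* restart the NMI watchdog timeout */
		udelay(100);
	}
}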
extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
extern int setup_irq(unsigned int , struct irqaction * );
extern unsigned char getledstate(void);
extern void setledstate(struct kbd_struct *kbd, unsigned int led);
-extern struct tasklet_struct console_tasklet;
-
extern int do_poke_blanked_console;
extern void (*kbd_ledfunc)(unsigned int led);
-static inline void show_console(void)
-{
- do_poke_blanked_console = 1;
- tasklet_schedule(&console_tasklet);
-}
-
-static inline void set_console(int nr)
-{
- want_console = nr;
- tasklet_schedule(&console_tasklet);
-}
+extern void set_console(int nr);
+extern void schedule_console_callback(void);
static inline void set_leds(void)
{
/* console.c */
-extern task_queue con_task_queue;
-
static inline void con_schedule_flip(struct tty_struct *t)
{
- queue_task(&t->flip.tqueue, &con_task_queue);
- tasklet_schedule(&console_tasklet);
+ schedule_task(&t->flip.tqueue);
}
#endif
console_loglevel = 15;
}
+extern void bust_spinlocks(int yes);
+extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */
+
#if DEBUG
#define pr_debug(fmt,arg...) \
printk(KERN_DEBUG fmt,##arg)
#if defined(__KERNEL__) || defined(_LVM_H_INCLUDE)
+#include <linux/prefetch.h>
+
/*
* Simple doubly linked list implementation.
*
static __inline__ void list_del(struct list_head *entry)
{
__list_del(entry->prev, entry->next);
+ entry->next = entry->prev = 0;
}
/**
* @head: the head for your list.
*/
#define list_for_each(pos, head) \
- for (pos = (head)->next; pos != (head); pos = pos->next)
-
+ for (pos = (head)->next, prefetch(pos->next); pos != (head); \
+ pos = pos->next, prefetch(pos->next))
+
#endif /* __KERNEL__ || _LVM_H_INCLUDE */
#endif
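/*
 * A minimal sketch (illustrative types and names, not part of this patch)
 * of walking a list with the prefetching list_for_each(); list_entry()
 * recovers the containing structure from the embedded list_head.
 */
struct foo {
	struct list_head list;
	int value;
};

static int sum_foo(struct list_head *foo_list)
{
	struct list_head *pos;
	int sum = 0;

	/* each step prefetch()es pos->next before it is dereferenced */
	list_for_each(pos, foo_list)
		sum += list_entry(pos, struct foo, list)->value;
	return sum;
}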
*/
#define LO_FLAGS_DO_BMAP 1
#define LO_FLAGS_READ_ONLY 2
+#define LO_FLAGS_BH_REMAP 4
/*
* Note that this structure gets the wrong offsets when directly used
#define LVM_MAX_STRIPES 128 /* max # of stripes */
#define LVM_MAX_SIZE ( 1024LU * 1024 / SECTOR_SIZE * 1024 * 1024) /* 1TB[sectors] */
#define LVM_MAX_MIRRORS 2 /* future use */
-#define LVM_MIN_READ_AHEAD 2 /* minimum read ahead sectors */
-#define LVM_MAX_READ_AHEAD 120 /* maximum read ahead sectors */
+#define LVM_MIN_READ_AHEAD 0 /* minimum read ahead sectors */
+#define LVM_DEFAULT_READ_AHEAD 1024 /* default read ahead sectors for 512k scsi segments */
+#define LVM_MAX_READ_AHEAD 10000 /* maximum read ahead sectors */
#define LVM_MAX_LV_IO_TIMEOUT 60 /* seconds I/O timeout (future use) */
#define LVM_PARTITION 0xfe /* LVM partition id */
#define LVM_NEW_PARTITION 0x8e /* new LVM partition id (10/09/1999) */
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/swap.h>
+#include <linux/rbtree.h>
extern unsigned long max_mapnr;
extern unsigned long num_physpages;
extern int page_cluster;
/* The inactive_clean lists are per zone. */
extern struct list_head active_list;
-extern struct list_head inactive_dirty_list;
+extern struct list_head inactive_list;
#include <asm/page.h>
#include <asm/pgtable.h>
pgprot_t vm_page_prot; /* Access permissions of this VMA. */
unsigned long vm_flags; /* Flags, listed below. */
- /* AVL tree of VM areas per task, sorted by address */
- short vm_avl_height;
- struct vm_area_struct * vm_avl_left;
- struct vm_area_struct * vm_avl_right;
+ rb_node_t vm_rb;
/*
* For areas with an address space and backing store,
updated asynchronously */
struct list_head lru; /* Pageout list, eg. active_list;
protected by pagemap_lru_lock !! */
- unsigned long age; /* Page aging counter. */
wait_queue_head_t wait; /* Page locked? Stand in line... */
struct page **pprev_hash; /* Complement to *next_hash. */
struct buffer_head * buffers; /* Buffer maps us to a disk block. */
#define PG_dirty 4
#define PG_decr_after 5
#define PG_active 6
-#define PG_inactive_dirty 7
+#define PG_inactive 7
#define PG_slab 8
#define PG_swap_cache 9
#define PG_skip 10
-#define PG_inactive_clean 11
-#define PG_highmem 12
-#define PG_checked 13 /* kill me in 2.5.<early>. */
- /* bits 21-29 unused */
-#define PG_arch_1 30
-#define PG_reserved 31
+#define PG_highmem 11
+#define PG_checked 12 /* kill me in 2.5.<early>. */
+#define PG_arch_1 13
+#define PG_reserved 14
/* Make it prettier to test the above... */
#define Page_Uptodate(page) test_bit(PG_uptodate, &(page)->flags)
#define PageActive(page) test_bit(PG_active, &(page)->flags)
#define SetPageActive(page) set_bit(PG_active, &(page)->flags)
#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags)
+#define TestandSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags)
+#define TestandClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags)
-#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags)
-#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags)
-#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags)
-
-#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags)
-#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags)
-#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags)
+#define PageInactive(page) test_bit(PG_inactive, &(page)->flags)
+#define SetPageInactive(page) set_bit(PG_inactive, &(page)->flags)
+#define ClearPageInactive(page) clear_bit(PG_inactive, &(page)->flags)
+#define TestandSetPageInactive(page) test_and_set_bit(PG_inactive, &(page)->flags)
+#define TestandClearPageInactive(page) test_and_clear_bit(PG_inactive, &(page)->flags)
#ifdef CONFIG_HIGHMEM
#define PageHighMem(page) test_bit(PG_highmem, &(page)->flags)
* can allocate highmem pages, the *get*page*() variants return
* virtual kernel addresses to the allocated page(s).
*/
-extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned long order));
-extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned long order, zonelist_t *zonelist));
-extern struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order);
+extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order));
+extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist));
+extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order);
-static inline struct page * alloc_pages(int gfp_mask, unsigned long order)
+static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order)
{
/*
* Gets optimized away by the compiler.
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
-extern unsigned long FASTCALL(__get_free_pages(int gfp_mask, unsigned long order));
-extern unsigned long FASTCALL(get_zeroed_page(int gfp_mask));
+extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order));
+extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask));
#define __get_free_page(gfp_mask) \
__get_free_pages((gfp_mask),0)
/*
* There is only one 'core' page-freeing function.
*/
-extern void FASTCALL(__free_pages(struct page *page, unsigned long order));
-extern void FASTCALL(free_pages(unsigned long addr, unsigned long order));
+extern void FASTCALL(__free_pages(struct page *page, unsigned int order));
+extern void FASTCALL(free_pages(unsigned long addr, unsigned int order));
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr),0)
*/
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
- if (!pgd_present(*pgd))
+ if (pgd_none(*pgd))
return __pmd_alloc(mm, pgd, address);
return pmd_offset(pgd, address);
}
extern void si_meminfo(struct sysinfo * val);
extern void swapin_readahead(swp_entry_t);
+static inline int is_page_cache_freeable(struct page * page)
+{
+ return page_count(page) - !!page->buffers == 1;
+}
+
/*
* Work out if there are any other processes sharing this
* swap cache page. Never mind the buffers.
extern void unlock_vma_mappings(struct vm_area_struct *);
extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void __insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
-extern void build_mmap_avl(struct mm_struct *);
+extern void build_mmap_rb(struct mm_struct *);
extern void exit_mmap(struct mm_struct *);
extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
extern unsigned long do_brk(unsigned long, unsigned long);
+static inline void __vma_unlink(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev)
+{
+ prev->vm_next = vma->vm_next;
+ rb_erase(&vma->vm_rb, &mm->mm_rb);
+ if (mm->mmap_cache == vma)
+ mm->mmap_cache = prev;
+}
+
+static inline int can_vma_merge(struct vm_area_struct * vma, unsigned long vm_flags)
+{
+ if (!vma->vm_file && vma->vm_flags == vm_flags)
+ return 1;
+ else
+ return 0;
+}
+
struct zone_t;
/* filemap.c */
extern void remove_inode_page(struct page *);
{
unsigned long grow;
+ /*
+ * vma->vm_start/vm_end cannot change under us because the caller is required
+ * to hold the mmap_sem in write mode. We need to get the spinlock only
+ * before relocating the vma range ourselves.
+ */
address &= PAGE_MASK;
grow = (vma->vm_start - address) >> PAGE_SHIFT;
if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur ||
*/
spinlock_t lock;
unsigned long free_pages;
- unsigned long inactive_clean_pages;
- unsigned long inactive_dirty_pages;
unsigned long pages_min, pages_low, pages_high;
+ int need_balance;
/*
* free areas of different sizes
*/
- struct list_head inactive_clean_list;
free_area_t free_area[MAX_ORDER];
/*
typedef struct pglist_data {
zone_t node_zones[MAX_NR_ZONES];
zonelist_t node_zonelists[GFP_ZONEMASK+1];
+ int nr_zones;
struct page *node_mem_map;
unsigned long *valid_addr_bitmap;
struct bootmem_data *bdata;
extern int numnodes;
extern pg_data_t *pgdat_list;
-#define memclass(pgzone, tzone) (((pgzone)->zone_pgdat == (tzone)->zone_pgdat) \
- && ((pgzone) <= (tzone)))
+#define memclass(pgzone, classzone) (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \
+ && ((pgzone) <= (classzone)))
/*
* The following two are not meant for general usage. They are here as
#define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK)
#define page_cache_get(x) get_page(x)
-#define page_cache_free(x) __free_page(x)
#define page_cache_release(x) __free_page(x)
static inline struct page *page_cache_alloc(struct address_space *x)
--- /dev/null
+/*
+ Red Black Trees
+ (C) 1999 Andrea Arcangeli <andrea@suse.de>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ linux/include/linux/rbtree.h
+
+ To use rbtrees you'll have to implement your own insert and search cores.
+ This avoids callbacks and the dramatic performance hit they would cause.
+ I know it's not the cleanest way, but in C (not in C++) this is how you
+ get both performance and genericity...
+
+ An example of insert and search follows here. The search is a plain,
+ normal search over an ordered tree. The insert, however, must be implemented
+ in two steps: first, insert the element in order as a red leaf in the
+ tree, then call the support library function rb_insert_color(). That
+ function does the non-trivial work of rebalancing the rbtree if necessary.
+
+-----------------------------------------------------------------------
+static inline struct page * rb_search_page_cache(struct inode * inode,
+ unsigned long offset)
+{
+ rb_node_t * n = inode->i_rb_page_cache.rb_node;
+ struct page * page;
+
+ while (n)
+ {
+ page = rb_entry(n, struct page, rb_page_cache);
+
+ if (offset < page->offset)
+ n = n->rb_left;
+ else if (offset > page->offset)
+ n = n->rb_right;
+ else
+ return page;
+ }
+ return NULL;
+}
+
+static inline struct page * __rb_insert_page_cache(struct inode * inode,
+ unsigned long offset,
+ rb_node_t * node)
+{
+ rb_node_t ** p = &inode->i_rb_page_cache.rb_node;
+ rb_node_t * parent = NULL;
+ struct page * page;
+
+ while (*p)
+ {
+ parent = *p;
+ page = rb_entry(parent, struct page, rb_page_cache);
+
+ if (offset < page->offset)
+ p = &(*p)->rb_left;
+ else if (offset > page->offset)
+ p = &(*p)->rb_right;
+ else
+ return page;
+ }
+
+ rb_link_node(node, parent, p);
+
+ return NULL;
+}
+
+static inline struct page * rb_insert_page_cache(struct inode * inode,
+ unsigned long offset,
+ rb_node_t * node)
+{
+ struct page * ret;
+ if ((ret = __rb_insert_page_cache(inode, offset, node)))
+ goto out;
+ rb_insert_color(node, &inode->i_rb_page_cache);
+ out:
+ return ret;
+}
+-----------------------------------------------------------------------
+*/
+
+#ifndef _LINUX_RBTREE_H
+#define _LINUX_RBTREE_H
+
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+
+typedef struct rb_node_s
+{
+ struct rb_node_s * rb_parent;
+ int rb_color;
+#define RB_RED 0
+#define RB_BLACK 1
+ struct rb_node_s * rb_right;
+ struct rb_node_s * rb_left;
+}
+rb_node_t;
+
+typedef struct rb_root_s
+{
+ struct rb_node_s * rb_node;
+}
+rb_root_t;
+
+#define RB_ROOT (rb_root_t) { NULL, }
+#define rb_entry(ptr, type, member) \
+ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+extern void rb_insert_color(rb_node_t *, rb_root_t *);
+extern void rb_erase(rb_node_t *, rb_root_t *);
+
+static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link)
+{
+ node->rb_parent = parent;
+ node->rb_color = RB_RED;
+ node->rb_left = node->rb_right = NULL;
+
+ *rb_link = node;
+}
+
+#endif /* _LINUX_RBTREE_H */
#include <linux/types.h>
#include <linux/times.h>
#include <linux/timex.h>
+#include <linux/rbtree.h>
#include <asm/system.h>
#include <asm/semaphore.h>
/* Maximum number of active map areas.. This is a random (large) number */
#define MAX_MAP_COUNT (65536)
-/* Number of map areas at which the AVL tree is activated. This is arbitrary. */
-#define AVL_MIN_MAP_COUNT 32
-
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
- struct vm_area_struct * mmap_avl; /* tree of VMAs */
+ rb_root_t mm_rb;
struct vm_area_struct * mmap_cache; /* last find_vma result */
pgd_t * pgd;
atomic_t mm_users; /* How many users with user space? */
#define INIT_MM(name) \
{ \
- mmap: &init_mmap, \
- mmap_avl: NULL, \
- mmap_cache: NULL, \
+ mm_rb: RB_ROOT, \
pgd: swapper_pg_dir, \
mm_users: ATOMIC_INIT(2), \
mm_count: ATOMIC_INIT(1), \
- map_count: 1, \
mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \
page_table_lock: SPIN_LOCK_UNLOCKED, \
mmlist: LIST_HEAD_INIT(name.mmlist), \
struct task_struct *next_task, *prev_task;
struct mm_struct *active_mm;
+ struct list_head local_pages;
+ unsigned int allocation_order, nr_local_pages;
/* task state */
struct linux_binfmt *binfmt;
#define PF_DUMPCORE 0x00000200 /* dumped core */
#define PF_SIGNALED 0x00000400 /* killed by a signal */
#define PF_MEMALLOC 0x00000800 /* Allocating memory */
+#define PF_FREE_PAGES 0x00002000 /* per process page freeing */
#define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */
extern void *kmalloc(size_t, int);
extern void kfree(const void *);
-extern void kmem_cache_reap(int);
+extern int FASTCALL(kmem_cache_reap(int));
extern int slabinfo_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data);
extern int slabinfo_write_proc(struct file *file, const char *buffer,
#define SWAP_FLAG_PRIO_MASK 0x7fff
#define SWAP_FLAG_PRIO_SHIFT 0
-#define MAX_SWAPFILES 8
+#define MAX_SWAPFILES 32
/*
* Magic header for a swap area. The first part of the union is
extern int nr_swap_pages;
extern unsigned int nr_free_pages(void);
-extern unsigned int nr_inactive_clean_pages(void);
extern unsigned int nr_free_buffer_pages(void);
extern int nr_active_pages;
-extern int nr_inactive_dirty_pages;
+extern int nr_inactive_pages;
extern atomic_t nr_async_pages;
extern struct address_space swapper_space;
extern atomic_t page_cache_size;
struct zone_t;
/* linux/mm/swap.c */
-extern int memory_pressure;
-extern void deactivate_page(struct page *);
-extern void deactivate_page_nolock(struct page *);
-extern void activate_page(struct page *);
-extern void activate_page_nolock(struct page *);
-extern void lru_cache_add(struct page *);
-extern void __lru_cache_del(struct page *);
-extern void lru_cache_del(struct page *);
-extern void recalculate_vm_stats(void);
+extern void FASTCALL(lru_cache_add(struct page *));
+extern void FASTCALL(__lru_cache_del(struct page *));
+extern void FASTCALL(lru_cache_del(struct page *));
+
+extern void FASTCALL(deactivate_page(struct page *));
+extern void FASTCALL(deactivate_page_nolock(struct page *));
+extern void FASTCALL(activate_page(struct page *));
+extern void FASTCALL(activate_page_nolock(struct page *));
+
extern void swap_setup(void);
/* linux/mm/vmscan.c */
-extern struct page * reclaim_page(zone_t *);
extern wait_queue_head_t kswapd_wait;
-extern wait_queue_head_t kreclaimd_wait;
-extern int page_launder(int, int);
-extern int free_shortage(void);
-extern int inactive_shortage(void);
-extern void wakeup_kswapd(void);
-extern int try_to_free_pages(unsigned int gfp_mask);
+extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int));
/* linux/mm/page_io.c */
extern void rw_swap_page(int, struct page *);
extern struct page * read_swap_cache_async(swp_entry_t);
/* linux/mm/oom_kill.c */
-extern int out_of_memory(void);
extern void oom_kill(void);
/*
extern void free_page_and_swap_cache(struct page *page);
/* linux/mm/swapfile.c */
-extern int vm_swap_full(void);
extern unsigned int nr_swapfiles;
extern struct swap_info_struct swap_info[];
extern int is_swap_partition(kdev_t);
extern spinlock_t pagemap_lru_lock;
-extern void FASTCALL(mark_page_accessed(struct page *));
-
-/*
- * Page aging defines.
- * Since we do exponential decay of the page age, we
- * can chose a fairly large maximum.
- */
-#define PAGE_AGE_START 2
-#define PAGE_AGE_ADV 3
-#define PAGE_AGE_MAX 64
-
/*
* List add/del helper macros. These must be called
* with the pagemap_lru_lock held!
*/
-#define DEBUG_ADD_PAGE \
- if (PageActive(page) || PageInactiveDirty(page) || \
- PageInactiveClean(page)) BUG();
-
-#define ZERO_PAGE_BUG \
- if (page_count(page) == 0) BUG();
-
-#define add_page_to_active_list(page) { \
- DEBUG_ADD_PAGE \
- ZERO_PAGE_BUG \
- page->age = 0; \
- ClearPageReferenced(page); \
- SetPageActive(page); \
- list_add(&(page)->lru, &active_list); \
- nr_active_pages++; \
-}
-
-#define add_page_to_inactive_dirty_list(page) { \
- DEBUG_ADD_PAGE \
- ZERO_PAGE_BUG \
- SetPageInactiveDirty(page); \
- list_add(&(page)->lru, &inactive_dirty_list); \
- nr_inactive_dirty_pages++; \
- page->zone->inactive_dirty_pages++; \
-}
-
-#define add_page_to_inactive_clean_list(page) { \
- DEBUG_ADD_PAGE \
- ZERO_PAGE_BUG \
- SetPageInactiveClean(page); \
- list_add(&(page)->lru, &page->zone->inactive_clean_list); \
- page->zone->inactive_clean_pages++; \
-}
-
-#define del_page_from_active_list(page) { \
- list_del(&(page)->lru); \
- ClearPageActive(page); \
- nr_active_pages--; \
- DEBUG_ADD_PAGE \
- ZERO_PAGE_BUG \
-}
-
-#define del_page_from_inactive_dirty_list(page) { \
- list_del(&(page)->lru); \
- ClearPageInactiveDirty(page); \
- nr_inactive_dirty_pages--; \
- page->zone->inactive_dirty_pages--; \
- DEBUG_ADD_PAGE \
- ZERO_PAGE_BUG \
-}
-
-#define del_page_from_inactive_clean_list(page) { \
- list_del(&(page)->lru); \
- ClearPageInactiveClean(page); \
- page->zone->inactive_clean_pages--; \
- DEBUG_ADD_PAGE \
- ZERO_PAGE_BUG \
-}
-
-/*
- * In mm/swap.c::recalculate_vm_stats(), we substract
- * inactive_target from memory_pressure every second.
- * This means that memory_pressure is smoothed over
- * 64 (1 << INACTIVE_SHIFT) seconds.
- */
-#define INACTIVE_SHIFT 6
-#define inactive_target min_t(unsigned long, \
- (memory_pressure >> INACTIVE_SHIFT), \
- (num_physpages / 4))
+#define DEBUG_LRU_PAGE(page) \
+do { \
+ if (PageActive(page)) \
+ BUG(); \
+ if (PageInactive(page)) \
+ BUG(); \
+ if (page_count(page) == 0) \
+ BUG(); \
+} while (0)
+
+#define add_page_to_active_list(page) \
+do { \
+ DEBUG_LRU_PAGE(page); \
+ SetPageActive(page); \
+ list_add(&(page)->lru, &active_list); \
+ nr_active_pages++; \
+} while (0)
+
+#define add_page_to_inactive_list(page) \
+do { \
+ DEBUG_LRU_PAGE(page); \
+ SetPageInactive(page); \
+ list_add(&(page)->lru, &inactive_list); \
+ nr_inactive_pages++; \
+} while (0)
+
+#define del_page_from_active_list(page) \
+do { \
+ list_del(&(page)->lru); \
+ ClearPageActive(page); \
+ nr_active_pages--; \
+ DEBUG_LRU_PAGE(page); \
+} while (0)
+
+#define del_page_from_inactive_list(page) \
+do { \
+ list_del(&(page)->lru); \
+ ClearPageInactive(page); \
+ nr_inactive_pages--; \
+ DEBUG_LRU_PAGE(page); \
+} while (0)
/*
* Ugly ugly ugly HACK to make sure the inactive lists
#include <linux/major.h>
#endif
-#define page_ramdisk(page) \
- (page->buffers && (MAJOR(page->buffers->b_dev) == RAMDISK_MAJOR))
-
extern spinlock_t swaplock;
#define swap_list_lock() spin_lock(&swaplock)
#ifndef _LINUX_SWAPCTL_H
#define _LINUX_SWAPCTL_H
-#include <asm/page.h>
-#include <linux/fs.h>
-
-typedef struct buffer_mem_v1
-{
- unsigned int min_percent;
- unsigned int borrow_percent;
- unsigned int max_percent;
-} buffer_mem_v1;
-typedef buffer_mem_v1 buffer_mem_t;
-extern buffer_mem_t buffer_mem;
-extern buffer_mem_t page_cache;
-
-typedef struct freepages_v1
-{
- unsigned int min;
- unsigned int low;
- unsigned int high;
-} freepages_v1;
-typedef freepages_v1 freepages_t;
-extern freepages_t freepages;
-
typedef struct pager_daemon_v1
{
unsigned int tries_base;
unsigned long data;
void (*function)(unsigned long);
};
+typedef struct timer_list timer_t;
extern void add_timer(struct timer_list * timer);
extern int del_timer(struct timer_list * timer);
O_TARGET := kernel.o
-export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o exec_domain.o
+export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o exec_domain.o printk.o
obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
module.o exit.o itimer.o info.o time.o softirq.o resource.o \
{
unsigned long flags;
- wq_write_lock_irqsave(&q->lock, flags);
wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ wq_write_lock_irqsave(&q->lock, flags);
__add_wait_queue(q, wait);
wq_write_unlock_irqrestore(&q->lock, flags);
}
{
unsigned long flags;
- wq_write_lock_irqsave(&q->lock, flags);
wait->flags |= WQ_FLAG_EXCLUSIVE;
+ wq_write_lock_irqsave(&q->lock, flags);
__add_wait_queue_tail(q, wait);
wq_write_unlock_irqrestore(&q->lock, flags);
}
* value: the thread structures can take up at most half
* of memory.
*/
- max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 2;
+ max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 16;
init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
flush_cache_mm(current->mm);
mm->locked_vm = 0;
mm->mmap = NULL;
- mm->mmap_avl = NULL;
mm->mmap_cache = NULL;
mm->map_count = 0;
mm->rss = 0;
goto fail_nomem;
}
retval = 0;
- if (mm->map_count >= AVL_MIN_MAP_COUNT)
- build_mmap_avl(mm);
+ build_mmap_rb(mm);
fail_nomem:
flush_tlb_mm(current->mm);
p->lock_depth = -1; /* -1 = no lock */
p->start_time = jiffies;
+ INIT_LIST_HEAD(&p->local_pages);
+
retval = -ENOMEM;
/* copy all the process information */
if (copy_files(clone_flags, p))
EXPORT_SYMBOL(generic_file_read);
EXPORT_SYMBOL(do_generic_file_read);
EXPORT_SYMBOL(generic_file_write);
+EXPORT_SYMBOL(generic_direct_IO);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_ro_fops);
EXPORT_SYMBOL(generic_buffer_fdatasync);
EXPORT_SYMBOL(tty_std_termios);
/* block device driver support */
-EXPORT_SYMBOL(block_read);
-EXPORT_SYMBOL(block_write);
EXPORT_SYMBOL(blksize_size);
EXPORT_SYMBOL(hardsect_size);
EXPORT_SYMBOL(blk_size);
EXPORT_SYMBOL(tty_flip_buffer_push);
EXPORT_SYMBOL(tty_get_baud_rate);
EXPORT_SYMBOL(do_SAK);
-EXPORT_SYMBOL(console_print);
-EXPORT_SYMBOL(console_loglevel);
/* filesystem registration */
EXPORT_SYMBOL(register_filesystem);
/* misc */
EXPORT_SYMBOL(panic);
-EXPORT_SYMBOL(printk);
EXPORT_SYMBOL(sprintf);
EXPORT_SYMBOL(snprintf);
EXPORT_SYMBOL(sscanf);
EXPORT_SYMBOL(sys_tz);
EXPORT_SYMBOL(file_fsync);
EXPORT_SYMBOL(fsync_inode_buffers);
+EXPORT_SYMBOL(fsync_inode_data_buffers);
EXPORT_SYMBOL(clear_inode);
EXPORT_SYMBOL(nr_async_pages);
EXPORT_SYMBOL(___strtok);
/* binfmt_aout */
EXPORT_SYMBOL(get_write_access);
-/* dynamic registering of consoles */
-EXPORT_SYMBOL(register_console);
-EXPORT_SYMBOL(unregister_console);
-
/* time */
EXPORT_SYMBOL(get_fast_time);
#include <linux/interrupt.h>
asmlinkage void sys_sync(void); /* it's really int */
-extern void unblank_console(void);
int panic_timeout;
* panic - halt the system
* @fmt: The text string to print
*
- * Display a message, then unblank the console and perform
- * cleanups. Functions in the panic notifier list are called
- * after the filesystem cache is flushed (when possible).
+ * Display a message, then perform cleanups. Functions in the panic
+ * notifier list are called after the filesystem cache is flushed (when possible).
*
* This function never returns.
*/
unsigned long caller = (unsigned long) __builtin_return_address(0);
#endif
+ bust_spinlocks(1);
va_start(args, fmt);
vsprintf(buf, fmt, args);
va_end(args);
printk(KERN_EMERG "In idle task - not syncing\n");
else
sys_sync();
-
- unblank_console();
+ bust_spinlocks(0);
#ifdef CONFIG_SMP
smp_send_stop();
* Modified for sysctl support, 1/8/97, Chris Horn.
* Fixed SMP synchronization, 08/08/99, Manfred Spraul
* manfreds@colorfullife.com
+ * Rewrote bits to get rid of console_lock
+ * 01Mar01 Andrew Morton <andrewm@uow.edu.au>
*/
#include <linux/mm.h>
#include <linux/smp_lock.h>
#include <linux/console.h>
#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/interrupt.h> /* For in_interrupt() */
#include <asm/uaccess.h>
-#define LOG_BUF_LEN (16384)
+#define LOG_BUF_LEN (16384) /* This must be a power of two */
#define LOG_BUF_MASK (LOG_BUF_LEN-1)
-static char buf[1024];
-
/* printk's without a loglevel use this.. */
#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */
#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
-unsigned long log_size;
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* Keep together for sysctl support */
int minimum_console_loglevel = MINIMUM_CONSOLE_LOGLEVEL;
int default_console_loglevel = DEFAULT_CONSOLE_LOGLEVEL;
-spinlock_t console_lock = SPIN_LOCK_UNLOCKED;
+int oops_in_progress;
+/*
+ * console_sem protects the console_drivers list, and also
+ * provides serialisation for access to the entire console
+ * driver system.
+ */
+static DECLARE_MUTEX(console_sem);
struct console *console_drivers;
+
+/*
+ * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
+ * It is also used in interesting ways to provide interlocking in
+ * release_console_sem().
+ */
+static spinlock_t logbuf_lock = SPIN_LOCK_UNLOCKED;
+
static char log_buf[LOG_BUF_LEN];
-static unsigned long log_start;
-static unsigned long logged_chars;
+#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
+
+/*
+ * The indices into log_buf are not constrained to LOG_BUF_LEN - they
+ * must be masked before subscripting
+ */
+static unsigned long log_start; /* Index into log_buf: next char to be read by syslog() */
+static unsigned long con_start; /* Index into log_buf: next char to be sent to consoles */
+static unsigned long log_end; /* Index into log_buf: most-recently-written-char + 1 */
+static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */
+
struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
static int preferred_console = -1;
+/* Flag: console code may call schedule() */
+static int console_may_schedule;
+
/*
* Setup a list of consoles. Called from init/main.c
*/
* 6 -- Disable printk's to console
* 7 -- Enable printk's to console
* 8 -- Set level of messages printed to console
+ * 9 -- Return number of unread characters in the log buffer
*/
int do_syslog(int type, char * buf, int len)
{
error = verify_area(VERIFY_WRITE,buf,len);
if (error)
goto out;
- error = wait_event_interruptible(log_wait, log_size);
+ error = wait_event_interruptible(log_wait, (log_start - log_end));
if (error)
goto out;
i = 0;
- spin_lock_irq(&console_lock);
- while (log_size && i < len) {
- c = log_buf[log_start & LOG_BUF_MASK];
+ spin_lock_irq(&logbuf_lock);
+ while ((log_start != log_end) && i < len) {
+ c = LOG_BUF(log_start);
log_start++;
- log_size--;
- spin_unlock_irq(&console_lock);
+ spin_unlock_irq(&logbuf_lock);
__put_user(c,buf);
buf++;
i++;
- spin_lock_irq(&console_lock);
+ spin_lock_irq(&logbuf_lock);
}
- spin_unlock_irq(&console_lock);
+ spin_unlock_irq(&logbuf_lock);
error = i;
break;
case 4: /* Read/clear last kernel messages */
count = len;
if (count > LOG_BUF_LEN)
count = LOG_BUF_LEN;
- spin_lock_irq(&console_lock);
+ spin_lock_irq(&logbuf_lock);
if (count > logged_chars)
count = logged_chars;
if (do_clear)
logged_chars = 0;
- limit = log_start + log_size;
+ limit = log_end;
/*
* __put_user() could sleep, and while we sleep
* printk() could overwrite the messages
*/
for(i=0;i < count;i++) {
j = limit-1-i;
- if (j+LOG_BUF_LEN < log_start+log_size)
+ if (j+LOG_BUF_LEN < log_end)
break;
- c = log_buf[ j & LOG_BUF_MASK ];
- spin_unlock_irq(&console_lock);
+ c = LOG_BUF(j);
+ spin_unlock_irq(&logbuf_lock);
__put_user(c,&buf[count-1-i]);
- spin_lock_irq(&console_lock);
+ spin_lock_irq(&logbuf_lock);
}
- spin_unlock_irq(&console_lock);
+ spin_unlock_irq(&logbuf_lock);
error = i;
if(i != count) {
int offset = count-error;
break;
case 5: /* Clear ring buffer */
- spin_lock_irq(&console_lock);
+ spin_lock_irq(&logbuf_lock);
logged_chars = 0;
- spin_unlock_irq(&console_lock);
+ spin_unlock_irq(&logbuf_lock);
break;
case 6: /* Disable logging to console */
- spin_lock_irq(&console_lock);
+ spin_lock_irq(&logbuf_lock);
console_loglevel = minimum_console_loglevel;
- spin_unlock_irq(&console_lock);
+ spin_unlock_irq(&logbuf_lock);
break;
case 7: /* Enable logging to console */
- spin_lock_irq(&console_lock);
+ spin_lock_irq(&logbuf_lock);
console_loglevel = default_console_loglevel;
- spin_unlock_irq(&console_lock);
+ spin_unlock_irq(&logbuf_lock);
break;
- case 8:
+ case 8: /* Set level of messages printed to console */
error = -EINVAL;
if (len < 1 || len > 8)
goto out;
if (len < minimum_console_loglevel)
len = minimum_console_loglevel;
- spin_lock_irq(&console_lock);
+ spin_lock_irq(&logbuf_lock);
console_loglevel = len;
- spin_unlock_irq(&console_lock);
+ spin_unlock_irq(&logbuf_lock);
error = 0;
break;
+ case 9: /* Number of chars in the log buffer */
+ spin_lock_irq(&logbuf_lock);
+ error = log_end - log_start;
+ spin_unlock_irq(&logbuf_lock);
+ break;
default:
error = -EINVAL;
break;
return do_syslog(type, buf, len);
}
-asmlinkage int printk(const char *fmt, ...)
+/*
+ * Call the console drivers on a range of log_buf
+ */
+static void __call_console_drivers(unsigned long start, unsigned long end)
{
- va_list args;
- int i;
- char *msg, *p, *buf_end;
- int line_feed;
- static signed char msg_level = -1;
- long flags;
+ struct console *con;
- spin_lock_irqsave(&console_lock, flags);
- va_start(args, fmt);
- i = vsprintf(buf + 3, fmt, args); /* hopefully i < sizeof(buf)-4 */
- buf_end = buf + 3 + i;
- va_end(args);
- for (p = buf + 3; p < buf_end; p++) {
- msg = p;
- if (msg_level < 0) {
- if (
- p[0] != '<' ||
- p[1] < '0' ||
- p[1] > '7' ||
- p[2] != '>'
- ) {
- p -= 3;
- p[0] = '<';
- p[1] = default_message_loglevel + '0';
- p[2] = '>';
- } else
- msg += 3;
- msg_level = p[1] - '0';
+ for (con = console_drivers; con; con = con->next) {
+ if ((con->flags & CON_ENABLED) && con->write)
+ con->write(con, &LOG_BUF(start), end - start);
+ }
+}
+
+/*
+ * Write out chars from start to end - 1 inclusive
+ */
+static void _call_console_drivers(unsigned long start, unsigned long end, int msg_log_level)
+{
+ if (msg_log_level < console_loglevel && console_drivers && start != end) {
+ if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
+ /* wrapped write */
+ __call_console_drivers(start & LOG_BUF_MASK, LOG_BUF_LEN);
+ __call_console_drivers(0, end & LOG_BUF_MASK);
+ } else {
+ __call_console_drivers(start, end);
+ }
+ }
+}
+
+/*
+ * Call the console drivers, asking them to write out
+ * log_buf[start] to log_buf[end - 1].
+ * The console_sem must be held.
+ */
+static void call_console_drivers(unsigned long start, unsigned long end)
+{
+ unsigned long cur_index, start_print;
+ static int msg_level = -1;
+
+ if (((long)(start - end)) > 0)
+ BUG();
+
+ cur_index = start;
+ start_print = start;
+ while (cur_index != end) {
+ if ( msg_level < 0 &&
+ ((end - cur_index) > 2) &&
+ LOG_BUF(cur_index + 0) == '<' &&
+ LOG_BUF(cur_index + 1) >= '0' &&
+ LOG_BUF(cur_index + 1) <= '7' &&
+ LOG_BUF(cur_index + 2) == '>')
+ {
+ msg_level = LOG_BUF(cur_index + 1) - '0';
+ cur_index += 3;
+ start_print = cur_index;
}
- line_feed = 0;
- for (; p < buf_end; p++) {
- log_buf[(log_start+log_size) & LOG_BUF_MASK] = *p;
- if (log_size < LOG_BUF_LEN)
- log_size++;
- else
- log_start++;
-
- logged_chars++;
- if (*p == '\n') {
- line_feed = 1;
+ while (cur_index != end) {
+ char c = LOG_BUF(cur_index);
+ cur_index++;
+
+ if (c == '\n') {
+ if (msg_level < 0) {
+ /*
+ * printk() has already given us loglevel tags in
+ * the buffer. This code is here in case the
+ * log buffer has wrapped right round and scribbled
+ * on those tags
+ */
+ msg_level = default_message_loglevel;
+ }
+ _call_console_drivers(start_print, cur_index, msg_level);
+ msg_level = -1;
+ start_print = cur_index;
break;
}
}
- if (msg_level < console_loglevel && console_drivers) {
- struct console *c = console_drivers;
- while(c) {
- if ((c->flags & CON_ENABLED) && c->write)
- c->write(c, msg, p - msg + line_feed);
- c = c->next;
+ }
+ _call_console_drivers(start_print, end, msg_level);
+}
+
+static void emit_log_char(char c)
+{
+ LOG_BUF(log_end) = c;
+ log_end++;
+ if (log_end - log_start > LOG_BUF_LEN)
+ log_start = log_end - LOG_BUF_LEN;
+ if (log_end - con_start > LOG_BUF_LEN)
+ con_start = log_end - LOG_BUF_LEN;
+ if (logged_chars < LOG_BUF_LEN)
+ logged_chars++;
+}
+
+/*
+ * This is printk. It can be called from any context. We want it to work.
+ *
+ * We try to grab the console_sem. If we succeed, it's easy - we log the output and
+ * call the console drivers. If we fail to get the semaphore we place the output
+ * into the log buffer and return. The current holder of the console_sem will
+ * notice the new output in release_console_sem() and will send it to the
+ * consoles before releasing the semaphore.
+ *
+ * One effect of this deferred printing is that code which calls printk() and
+ * then changes console_loglevel may break. This is because console_loglevel
+ * is inspected when the actual printing occurs.
+ */
+asmlinkage int printk(const char *fmt, ...)
+{
+ va_list args;
+ unsigned long flags;
+ int printed_len;
+ char *p;
+ static char printk_buf[1024];
+ static int log_level_unknown = 1;
+
+ if (oops_in_progress) {
+ /* If a crash is occurring, make sure we can't deadlock */
+ spin_lock_init(&logbuf_lock);
+ /* And make sure that we print immediately */
+ init_MUTEX(&console_sem);
+ }
+
+ /* This stops the holder of console_sem just where we want him */
+ spin_lock_irqsave(&logbuf_lock, flags);
+
+ /* Emit the output into the temporary buffer */
+ va_start(args, fmt);
+ printed_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args);
+ va_end(args);
+
+ /*
+ * Copy the output into log_buf. If the caller didn't provide
+ * appropriate log level tags, we insert them here
+ */
+ for (p = printk_buf; *p; p++) {
+ if (log_level_unknown) {
+ if (p[0] != '<' || p[1] < '0' || p[1] > '7' || p[2] != '>') {
+ emit_log_char('<');
+ emit_log_char(default_message_loglevel + '0');
+ emit_log_char('>');
}
+ log_level_unknown = 0;
}
- if (line_feed)
- msg_level = -1;
+ emit_log_char(*p);
+ if (*p == '\n')
+ log_level_unknown = 1;
+ }
+
+ if (!down_trylock(&console_sem)) {
+ /*
+ * We own the drivers. We can drop the spinlock and let
+ * release_console_sem() print the text
+ */
+ spin_unlock_irqrestore(&logbuf_lock, flags);
+ console_may_schedule = 0;
+ release_console_sem();
+ } else {
+ /*
+ * Someone else owns the drivers. We drop the spinlock, which
+ * allows the semaphore holder to proceed and to call the
+ * console drivers with the output which we just produced.
+ */
+ spin_unlock_irqrestore(&logbuf_lock, flags);
}
- spin_unlock_irqrestore(&console_lock, flags);
- wake_up_interruptible(&log_wait);
- return i;
+ return printed_len;
}
+EXPORT_SYMBOL(printk);
-void console_print(const char *s)
+/**
+ * acquire_console_sem - lock the console system for exclusive use.
+ *
+ * Acquires a semaphore which guarantees that the caller has
+ * exclusive access to the console system and the console_drivers list.
+ *
+ * Can sleep, returns nothing.
+ */
+void acquire_console_sem(void)
+{
+ if (in_interrupt())
+ BUG();
+ down(&console_sem);
+ console_may_schedule = 1;
+}
+EXPORT_SYMBOL(acquire_console_sem);
+
+/**
+ * release_console_sem - unlock the console system
+ *
+ * Releases the semaphore which the caller holds on the console system
+ * and the console driver list.
+ *
+ * While the semaphore was held, console output may have been buffered
+ * by printk(). If this is the case, release_console_sem() emits
+ * the output prior to releasing the semaphore.
+ *
+ * If there is output waiting for klogd, we wake it up.
+ *
+ * release_console_sem() may be called from any context.
+ */
+void release_console_sem(void)
{
- struct console *c;
unsigned long flags;
- int len = strlen(s);
-
- spin_lock_irqsave(&console_lock, flags);
- c = console_drivers;
- while(c) {
- if ((c->flags & CON_ENABLED) && c->write)
- c->write(c, s, len);
- c = c->next;
+ unsigned long _con_start, _log_end;
+ unsigned long must_wake_klogd = 0;
+
+ for ( ; ; ) {
+ spin_lock_irqsave(&logbuf_lock, flags);
+ must_wake_klogd |= log_start - log_end;
+ if (con_start == log_end)
+ break; /* Nothing to print */
+ _con_start = con_start;
+ _log_end = log_end;
+ con_start = log_end; /* Flush */
+ spin_unlock_irqrestore(&logbuf_lock, flags);
+ call_console_drivers(_con_start, _log_end);
}
- spin_unlock_irqrestore(&console_lock, flags);
+ console_may_schedule = 0;
+ up(&console_sem);
+ spin_unlock_irqrestore(&logbuf_lock, flags);
+ if (must_wake_klogd && !oops_in_progress)
+ wake_up_interruptible(&log_wait);
}
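/*
 * A minimal sketch (hypothetical caller, not part of this patch) of the
 * intended pairing: take the console semaphore, walk console_drivers,
 * release. Output printk()ed while the semaphore is held is buffered and
 * flushed by release_console_sem().
 */
static void example_list_consoles(void)
{
	struct console *con;

	acquire_console_sem();		/* may sleep; never call from interrupt context */
	for (con = console_drivers; con; con = con->next)
		if (con->flags & CON_ENABLED)
			printk(KERN_INFO "console %s%d enabled\n", con->name, con->index);
	release_console_sem();
}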
-void unblank_console(void)
+/**
+ * console_conditional_schedule - yield the CPU if required
+ *
+ * If the console code is currently allowed to sleep, and
+ * if this CPU should yield the CPU to another task, do
+ * so here.
+ *
+ * Must be called within acquire_console_sem().
+ */
+void console_conditional_schedule(void)
{
- struct console *c;
- unsigned long flags;
-
- spin_lock_irqsave(&console_lock, flags);
- c = console_drivers;
- while(c) {
- if ((c->flags & CON_ENABLED) && c->unblank)
- c->unblank();
- c = c->next;
+ if (console_may_schedule && current->need_resched) {
+ set_current_state(TASK_RUNNING);
+ schedule();
}
- spin_unlock_irqrestore(&console_lock, flags);
}
+void console_print(const char *s)
+{
+ printk(KERN_EMERG "%s", s);
+}
+EXPORT_SYMBOL(console_print);
+
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
*/
void register_console(struct console * console)
{
- int i, j,len;
- int p;
- char buf[16];
- signed char msg_level = -1;
- char *q;
+ int i;
unsigned long flags;
/*
* Put this console in the list - keep the
* preferred driver at the head of the list.
*/
- spin_lock_irqsave(&console_lock, flags);
+ acquire_console_sem();
if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
console->next = console_drivers;
console_drivers = console;
console->next = console_drivers->next;
console_drivers->next = console;
}
- if ((console->flags & CON_PRINTBUFFER) == 0)
- goto done;
- /*
- * Print out buffered log messages.
- */
- p = log_start & LOG_BUF_MASK;
-
- for (i=0,j=0; i < log_size; i++) {
- buf[j++] = log_buf[p];
- p = (p+1) & LOG_BUF_MASK;
- if (buf[j-1] != '\n' && i < log_size - 1 && j < sizeof(buf)-1)
- continue;
- buf[j] = 0;
- q = buf;
- len = j;
- if (msg_level < 0) {
- if(buf[0] == '<' &&
- buf[1] >= '0' &&
- buf[1] <= '7' &&
- buf[2] == '>') {
- msg_level = buf[1] - '0';
- q = buf + 3;
- len -= 3;
- } else
- {
- msg_level = default_message_loglevel;
- }
- }
- if (msg_level < console_loglevel)
- console->write(console, q, len);
- if (buf[j-1] == '\n')
- msg_level = -1;
- j = 0;
+ if (console->flags & CON_PRINTBUFFER) {
+ /*
+ * release_console_sem() will print out the buffered messages for us.
+ */
+ spin_lock_irqsave(&logbuf_lock, flags);
+ con_start = log_start;
+ spin_unlock_irqrestore(&logbuf_lock, flags);
}
-done:
- spin_unlock_irqrestore(&console_lock, flags);
+ release_console_sem();
}
-
+EXPORT_SYMBOL(register_console);
int unregister_console(struct console * console)
{
struct console *a,*b;
- unsigned long flags;
int res = 1;
- spin_lock_irqsave(&console_lock, flags);
+ acquire_console_sem();
if (console_drivers == console) {
console_drivers=console->next;
res = 0;
- } else
- {
+ } else {
for (a=console_drivers->next, b=console_drivers ;
a; b=a, a=b->next) {
if (a == console) {
preferred_console = -1;
- spin_unlock_irqrestore(&console_lock, flags);
+ release_console_sem();
return res;
}
+EXPORT_SYMBOL(unregister_console);
-/*
- * Write a message to a certain tty, not just the console. This is used for
- * messages that need to be redirected to a specific tty.
+/**
+ * tty_write_message - write a message to a certain tty, not just the console.
+ *
+ * This is used for messages that need to be redirected to a specific tty.
* We don't put it into the syslog queue right now maybe in the future if
* really needed.
*/
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/completion.h>
+#include <linux/prefetch.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
*/
oldest_idle = (cycles_t) -1;
target_tsk = NULL;
- max_prio = 1;
+ max_prio = 0;
for (i = 0; i < smp_num_cpus; i++) {
cpu = cpu_logical_map(i);
struct task_struct *tsk;
tsk = cpu_curr(this_cpu);
- if (preemption_goodness(tsk, p, this_cpu) > 1)
+ if (preemption_goodness(tsk, p, this_cpu) > 0)
tsk->need_resched = 1;
#endif
}
struct list_head *tmp;
int this_cpu, c;
+ spin_lock_prefetch(&runqueue_lock);
+
if (!current->active_mm) BUG();
need_resched_back:
prev = current;
else
printk(" (NOTLB)\n");
-#if defined(CONFIG_X86) || defined(CONFIG_SPARC64) || defined(CONFIG_ARM)
+#if defined(CONFIG_X86) || defined(CONFIG_SPARC64) || defined(CONFIG_ARM) || defined(CONFIG_ALPHA)
/* This is very useful, but only works on ARM, x86 and sparc64 right now */
{
extern void show_trace_task(struct task_struct *tsk);
printk(" task PC stack pid father child younger older\n");
#endif
read_lock(&tasklist_lock);
- for_each_task(p)
+ for_each_task(p) {
+ /*
+ * reset the NMI-timeout; listing all tasks on a slow
+ * console might take a lot of time:
+ */
+ touch_nmi_watchdog();
show_task(p);
+ }
read_unlock(&tasklist_lock);
}
{
t->sigpending = 1;
- if (t->state & TASK_INTERRUPTIBLE) {
- wake_up_process(t);
- return;
- }
-
#ifdef CONFIG_SMP
/*
* If the task is running on a different CPU
smp_send_reschedule(t->processor);
spin_unlock(&runqueue_lock);
#endif /* CONFIG_SMP */
+
+ if (t->state & TASK_INTERRUPTIBLE) {
+ wake_up_process(t);
+ return;
+ }
}
static int deliver_signal(int sig, struct siginfo *info, struct task_struct *t)
ret = deliver_signal(sig, info, t);
out:
spin_unlock_irqrestore(&t->sigmask_lock, flags);
- if ((t->state & TASK_INTERRUPTIBLE) && signal_pending(t))
- wake_up_process(t);
out_nolock:
#if DEBUG_SIG
printk(" %d -> %d\n", signal_pending(t), ret);
};
static ctl_table vm_table[] = {
- {VM_FREEPG, "freepages",
- &freepages, sizeof(freepages_t), 0444, NULL, &proc_dointvec},
{VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL,
&proc_dointvec_minmax, &sysctl_intvec, NULL,
&bdflush_min, &bdflush_max},
{VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory,
sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
- {VM_BUFFERMEM, "buffermem",
- &buffer_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
- {VM_PAGECACHE, "pagecache",
- &page_cache, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
{VM_PAGERDAEMON, "kswapd",
&pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
{VM_PGT_CACHE, "pagetable_cache",
export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o
-obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o
+obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o bust_spinlocks.o rbtree.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
--- /dev/null
+/*
+ * lib/bust_spinlocks.c
+ *
+ * Provides a minimal bust_spinlocks for architectures which don't have one of their own.
+ *
+ * bust_spinlocks() clears any spinlocks which would prevent oops, die(), BUG()
+ * and panic() information from reaching the user.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/tty.h>
+#include <linux/wait.h>
+#include <linux/vt_kern.h>
+
+extern spinlock_t timerlist_lock;
+
+void bust_spinlocks(int yes)
+{
+ spin_lock_init(&timerlist_lock);
+ if (yes) {
+ oops_in_progress = 1;
+ } else {
+ int loglevel_save = console_loglevel;
+#ifdef CONFIG_VT
+ unblank_screen();
+#endif
+ oops_in_progress = 0;
+ /*
+ * OK, the message is on the console. Now we call printk()
+ * without oops_in_progress set so that printk() will give klogd
+ * and the blanked console a poke. Hold onto your hats...
+ */
+ console_loglevel = 15; /* NMI oopser may have shut the console up */
+ printk(" ");
+ console_loglevel = loglevel_save;
+ }
+}
+
+
--- /dev/null
+/*
+ Red Black Trees
+ (C) 1999 Andrea Arcangeli <andrea@suse.de>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ linux/lib/rbtree.c
+*/
+
+#include <linux/rbtree.h>
+
+static void __rb_rotate_left(rb_node_t * node, rb_root_t * root)
+{
+ rb_node_t * right = node->rb_right;
+
+ if ((node->rb_right = right->rb_left))
+ right->rb_left->rb_parent = node;
+ right->rb_left = node;
+
+ if ((right->rb_parent = node->rb_parent))
+ {
+ if (node == node->rb_parent->rb_left)
+ node->rb_parent->rb_left = right;
+ else
+ node->rb_parent->rb_right = right;
+ }
+ else
+ root->rb_node = right;
+ node->rb_parent = right;
+}
+
+static void __rb_rotate_right(rb_node_t * node, rb_root_t * root)
+{
+ rb_node_t * left = node->rb_left;
+
+ if ((node->rb_left = left->rb_right))
+ left->rb_right->rb_parent = node;
+ left->rb_right = node;
+
+ if ((left->rb_parent = node->rb_parent))
+ {
+ if (node == node->rb_parent->rb_right)
+ node->rb_parent->rb_right = left;
+ else
+ node->rb_parent->rb_left = left;
+ }
+ else
+ root->rb_node = left;
+ node->rb_parent = left;
+}
+
+void rb_insert_color(rb_node_t * node, rb_root_t * root)
+{
+ rb_node_t * parent, * gparent;
+
+ while ((parent = node->rb_parent) && parent->rb_color == RB_RED)
+ {
+ gparent = parent->rb_parent;
+
+ if (parent == gparent->rb_left)
+ {
+ {
+ register rb_node_t * uncle = gparent->rb_right;
+ if (uncle && uncle->rb_color == RB_RED)
+ {
+ uncle->rb_color = RB_BLACK;
+ parent->rb_color = RB_BLACK;
+ gparent->rb_color = RB_RED;
+ node = gparent;
+ continue;
+ }
+ }
+
+ if (parent->rb_right == node)
+ {
+ register rb_node_t * tmp;
+ __rb_rotate_left(parent, root);
+ tmp = parent;
+ parent = node;
+ node = tmp;
+ }
+
+ parent->rb_color = RB_BLACK;
+ gparent->rb_color = RB_RED;
+ __rb_rotate_right(gparent, root);
+ } else {
+ {
+ register rb_node_t * uncle = gparent->rb_left;
+ if (uncle && uncle->rb_color == RB_RED)
+ {
+ uncle->rb_color = RB_BLACK;
+ parent->rb_color = RB_BLACK;
+ gparent->rb_color = RB_RED;
+ node = gparent;
+ continue;
+ }
+ }
+
+ if (parent->rb_left == node)
+ {
+ register rb_node_t * tmp;
+ __rb_rotate_right(parent, root);
+ tmp = parent;
+ parent = node;
+ node = tmp;
+ }
+
+ parent->rb_color = RB_BLACK;
+ gparent->rb_color = RB_RED;
+ __rb_rotate_left(gparent, root);
+ }
+ }
+
+ root->rb_node->rb_color = RB_BLACK;
+}
+
+static void __rb_erase_color(rb_node_t * node, rb_node_t * parent,
+ rb_root_t * root)
+{
+ rb_node_t * other;
+
+ while ((!node || node->rb_color == RB_BLACK) && node != root->rb_node)
+ {
+ if (parent->rb_left == node)
+ {
+ other = parent->rb_right;
+ if (other->rb_color == RB_RED)
+ {
+ other->rb_color = RB_BLACK;
+ parent->rb_color = RB_RED;
+ __rb_rotate_left(parent, root);
+ other = parent->rb_right;
+ }
+ if ((!other->rb_left ||
+ other->rb_left->rb_color == RB_BLACK)
+ && (!other->rb_right ||
+ other->rb_right->rb_color == RB_BLACK))
+ {
+ other->rb_color = RB_RED;
+ node = parent;
+ parent = node->rb_parent;
+ }
+ else
+ {
+ if (!other->rb_right ||
+ other->rb_right->rb_color == RB_BLACK)
+ {
+ register rb_node_t * o_left;
+ if ((o_left = other->rb_left))
+ o_left->rb_color = RB_BLACK;
+ other->rb_color = RB_RED;
+ __rb_rotate_right(other, root);
+ other = parent->rb_right;
+ }
+ other->rb_color = parent->rb_color;
+ parent->rb_color = RB_BLACK;
+ if (other->rb_right)
+ other->rb_right->rb_color = RB_BLACK;
+ __rb_rotate_left(parent, root);
+ node = root->rb_node;
+ break;
+ }
+ }
+ else
+ {
+ other = parent->rb_left;
+ if (other->rb_color == RB_RED)
+ {
+ other->rb_color = RB_BLACK;
+ parent->rb_color = RB_RED;
+ __rb_rotate_right(parent, root);
+ other = parent->rb_left;
+ }
+ if ((!other->rb_left ||
+ other->rb_left->rb_color == RB_BLACK)
+ && (!other->rb_right ||
+ other->rb_right->rb_color == RB_BLACK))
+ {
+ other->rb_color = RB_RED;
+ node = parent;
+ parent = node->rb_parent;
+ }
+ else
+ {
+ if (!other->rb_left ||
+ other->rb_left->rb_color == RB_BLACK)
+ {
+ register rb_node_t * o_right;
+ if ((o_right = other->rb_right))
+ o_right->rb_color = RB_BLACK;
+ other->rb_color = RB_RED;
+ __rb_rotate_left(other, root);
+ other = parent->rb_left;
+ }
+ other->rb_color = parent->rb_color;
+ parent->rb_color = RB_BLACK;
+ if (other->rb_left)
+ other->rb_left->rb_color = RB_BLACK;
+ __rb_rotate_right(parent, root);
+ node = root->rb_node;
+ break;
+ }
+ }
+ }
+ if (node)
+ node->rb_color = RB_BLACK;
+}
+
+void rb_erase(rb_node_t * node, rb_root_t * root)
+{
+ rb_node_t * child, * parent;
+ int color;
+
+ if (!node->rb_left)
+ child = node->rb_right;
+ else if (!node->rb_right)
+ child = node->rb_left;
+ else
+ {
+ rb_node_t * old = node, * left;
+
+ node = node->rb_right;
+ while ((left = node->rb_left))
+ node = left;
+ child = node->rb_right;
+ parent = node->rb_parent;
+ color = node->rb_color;
+
+ if (child)
+ child->rb_parent = parent;
+ if (parent)
+ {
+ if (parent->rb_left == node)
+ parent->rb_left = child;
+ else
+ parent->rb_right = child;
+ }
+ else
+ root->rb_node = child;
+
+ if (node->rb_parent == old)
+ parent = node;
+ node->rb_parent = old->rb_parent;
+ node->rb_color = old->rb_color;
+ node->rb_right = old->rb_right;
+ node->rb_left = old->rb_left;
+
+ if (old->rb_parent)
+ {
+ if (old->rb_parent->rb_left == old)
+ old->rb_parent->rb_left = node;
+ else
+ old->rb_parent->rb_right = node;
+ } else
+ root->rb_node = node;
+
+ old->rb_left->rb_parent = node;
+ if (old->rb_right)
+ old->rb_right->rb_parent = node;
+ goto color;
+ }
+
+ parent = node->rb_parent;
+ color = node->rb_color;
+
+ if (child)
+ child->rb_parent = parent;
+ if (parent)
+ {
+ if (parent->rb_left == node)
+ parent->rb_left = child;
+ else
+ parent->rb_right = child;
+ }
+ else
+ root->rb_node = child;
+
+ color:
+ if (color == RB_BLACK)
+ __rb_erase_color(child, parent, root);
+}
#include <linux/swapctl.h>
#include <linux/init.h>
#include <linux/mm.h>
+#include <linux/iobuf.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
unsigned int page_hash_bits;
struct page **page_hash_table;
-spinlock_t __cacheline_aligned pagecache_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t pagecache_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
/*
* NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
* the pagemap_lru_lock held.
*/
-spinlock_t __cacheline_aligned pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t pagemap_lru_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
#define CLUSTER_PAGES (1 << page_cluster)
#define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
}
-static inline void truncate_complete_page(struct page *page)
+static void truncate_complete_page(struct page *page)
{
/* Leave it on the LRU if it gets converted into anonymous buffers */
if (!page->buffers || block_flushpage(page, 0))
{
struct list_head *curr;
struct page * page;
+ int unlocked = 0;
- curr = head->next;
+ restart:
+ curr = head->prev;
while (curr != head) {
unsigned long offset;
/* Is one of the pages to truncate? */
if ((offset >= start) || (*partial && (offset + 1) == start)) {
- list_del(head);
- list_add(head, curr);
- if (TryLockPage(page)) {
- page_cache_get(page);
- spin_unlock(&pagecache_lock);
- wait_on_page(page);
- goto out_restart;
- }
+ int failed;
+
page_cache_get(page);
+ failed = TryLockPage(page);
+
+ list_del(head);
+ if (!failed)
+ /* Restart after this page */
+ list_add_tail(head, curr);
+ else
+ /* Restart on this page */
+ list_add(head, curr);
+
spin_unlock(&pagecache_lock);
+ unlocked = 1;
- if (*partial && (offset + 1) == start) {
- truncate_partial_page(page, *partial);
- *partial = 0;
- } else
- truncate_complete_page(page);
+ if (!failed) {
+ if (*partial && (offset + 1) == start) {
+ truncate_partial_page(page, *partial);
+ *partial = 0;
+ } else
+ truncate_complete_page(page);
- UnlockPage(page);
- goto out_restart;
+ UnlockPage(page);
+ } else
+ wait_on_page(page);
+
+ page_cache_release(page);
+
+ if (current->need_resched) {
+ __set_current_state(TASK_RUNNING);
+ schedule();
+ }
+
+ spin_lock(&pagecache_lock);
+ goto restart;
}
- curr = curr->next;
+ curr = curr->prev;
}
- return 0;
-out_restart:
- page_cache_release(page);
- spin_lock(&pagecache_lock);
- return 1;
+ return unlocked;
}
{
unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
- int complete;
+ int unlocked;
spin_lock(&pagecache_lock);
do {
- complete = 1;
- while (truncate_list_pages(&mapping->clean_pages, start, &partial))
- complete = 0;
- while (truncate_list_pages(&mapping->dirty_pages, start, &partial))
- complete = 0;
- while (truncate_list_pages(&mapping->locked_pages, start, &partial))
- complete = 0;
- } while (!complete);
+ unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
+ unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
+ unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
+ } while (unlocked);
/* Traversed all three lists without dropping the lock */
spin_unlock(&pagecache_lock);
}
+static inline int invalidate_this_page2(struct page * page,
+ struct list_head * curr,
+ struct list_head * head)
+{
+ int unlocked = 1;
+
+ /*
+ * The page is locked and we hold the pagecache_lock as well
+ * so both page_count(page) and page->buffers stay constant here.
+ */
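+ /*
+ * A reference count of one (plus one if buffers are attached)
+ * means the page cache holds the only references, so the page
+ * can be dropped completely.
+ */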
+ if (page_count(page) == 1 + !!page->buffers) {
+ /* Restart after this page */
+ list_del(head);
+ list_add_tail(head, curr);
+
+ page_cache_get(page);
+ spin_unlock(&pagecache_lock);
+ truncate_complete_page(page);
+ } else {
+ if (page->buffers) {
+ /* Restart after this page */
+ list_del(head);
+ list_add_tail(head, curr);
+
+ page_cache_get(page);
+ spin_unlock(&pagecache_lock);
+ block_invalidate_page(page);
+ } else
+ unlocked = 0;
+
+ ClearPageDirty(page);
+ ClearPageUptodate(page);
+ }
+
+ return unlocked;
+}
+
+static int FASTCALL(invalidate_list_pages2(struct list_head *));
+static int invalidate_list_pages2(struct list_head *head)
+{
+ struct list_head *curr;
+ struct page * page;
+ int unlocked = 0;
+
+ restart:
+ curr = head->prev;
+ while (curr != head) {
+ page = list_entry(curr, struct page, list);
+
+ if (!TryLockPage(page)) {
+ int __unlocked;
+
+ __unlocked = invalidate_this_page2(page, curr, head);
+ UnlockPage(page);
+ unlocked |= __unlocked;
+ if (!__unlocked) {
+ curr = curr->prev;
+ continue;
+ }
+ } else {
+ /* Restart on this page */
+ list_del(head);
+ list_add(head, curr);
+
+ page_cache_get(page);
+ spin_unlock(&pagecache_lock);
+ unlocked = 1;
+ wait_on_page(page);
+ }
+
+ page_cache_release(page);
+ if (current->need_resched) {
+ __set_current_state(TASK_RUNNING);
+ schedule();
+ }
+
+ spin_lock(&pagecache_lock);
+ goto restart;
+ }
+ return unlocked;
+}
+
+/**
+ * invalidate_inode_pages2 - Invalidate all the pages of an address_space,
+ * clearing the dirty bits of any page that can't be freed because it is
+ * still mapped.
+ * @mapping: the address_space whose pages we want to invalidate
+ */
+void invalidate_inode_pages2(struct address_space * mapping)
+{
+ int unlocked;
+
+ spin_lock(&pagecache_lock);
+ do {
+ unlocked = invalidate_list_pages2(&mapping->clean_pages);
+ unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
+ unlocked |= invalidate_list_pages2(&mapping->locked_pages);
+ } while (unlocked);
+ spin_unlock(&pagecache_lock);
+}
+
static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
{
goto inside;
if (page->index == offset)
break;
}
+
+ SetPageReferenced(page);
+
not_found:
return page;
}
if (!PageLocked(page))
BUG();
+ page->index = index;
page_cache_get(page);
spin_lock(&pagecache_lock);
- page->index = index;
add_page_to_inode_queue(mapping, page);
add_page_to_hash_queue(page, page_hash(mapping, index));
lru_cache_add(page);
if (PageLocked(page))
BUG();
- flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1) | (1 << PG_checked));
+ flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked);
page->flags = flags | (1 << PG_locked);
page_cache_get(page);
page->index = offset;
* This adds the requested page to the page cache if it isn't already there,
* and schedules an I/O to read in its contents from disk.
*/
-static inline int page_cache_read(struct file * file, unsigned long offset)
+static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
+static int page_cache_read(struct file * file, unsigned long offset)
{
struct inode *inode = file->f_dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
struct page *page;
spin_lock(&pagecache_lock);
- page = __find_page_nolock(mapping, offset, *hash);
+ page = __find_page_nolock(mapping, offset, *hash);
spin_unlock(&pagecache_lock);
if (page)
return 0;
* We arrive here in the unlikely event that someone
* raced with us and added our page to the cache first.
*/
- page_cache_free(page);
+ page_cache_release(page);
return 0;
}
* Read in an entire cluster at once. A cluster is usually a 64k-
* aligned block that includes the page requested in "offset."
*/
+static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset,
+ unsigned long filesize));
static int read_cluster_nonblocking(struct file * file, unsigned long offset,
unsigned long filesize)
{
add_wait_queue(&page->wait, &wait);
do {
- sync_page(page);
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!PageLocked(page))
break;
- run_task_queue(&tq_disk);
+ sync_page(page);
schedule();
} while (PageLocked(page));
tsk->state = TASK_RUNNING;
add_wait_queue_exclusive(&page->wait, &wait);
for (;;) {
- sync_page(page);
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (PageLocked(page)) {
- run_task_queue(&tq_disk);
+ sync_page(page);
schedule();
- continue;
}
if (!TryLockPage(page))
break;
return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}
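+/*
+ * For regular files the length comes from inode->i_size; for block
+ * devices it has to be derived from the size of the device instead.
+ */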
+static inline unsigned long calc_end_index(struct inode * inode)
+{
+ unsigned long end_index;
+
+ if (!S_ISBLK(inode->i_mode))
+ end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ else
+ end_index = buffered_blk_size(inode->i_rdev) >> (PAGE_CACHE_SHIFT - BLOCK_SIZE_BITS);
+
+ return end_index;
+}
+
+static inline loff_t calc_rsize(struct inode * inode)
+{
+ loff_t rsize;
+
+ if (!S_ISBLK(inode->i_mode))
+ rsize = inode->i_size;
+ else
+ rsize = (loff_t) buffered_blk_size(inode->i_rdev) << BLOCK_SIZE_BITS;
+
+ return rsize;
+}
+
static void generic_file_readahead(int reada_ok,
struct file * filp, struct inode * inode,
struct page * page)
{
- unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ unsigned long end_index;
unsigned long index = page->index;
unsigned long max_ahead, ahead;
unsigned long raend;
int max_readahead = get_max_readahead(inode);
+ end_index = calc_end_index(inode);
+
raend = filp->f_raend;
max_ahead = 0;
return;
}
-/*
- * Mark a page as having seen activity.
- *
- * If it was already so marked, move it
- * to the active queue and drop the referenced
- * bit. Otherwise, just mark it for future
- * action..
- */
-void mark_page_accessed(struct page *page)
-{
- if (!PageActive(page) && PageReferenced(page)) {
- activate_page(page);
- ClearPageReferenced(page);
- return;
- }
-
- /* Mark the page referenced, AFTER checking for previous usage.. */
- SetPageReferenced(page);
-}
-
/*
* This is a generic file read routine, and uses the
* inode->i_op->readpage() function for the actual low-level
struct page *page, **hash;
unsigned long end_index, nr, ret;
- end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ end_index = calc_end_index(inode);
+
if (index > end_index)
break;
nr = PAGE_CACHE_SIZE;
if (index == end_index) {
- nr = inode->i_size & ~PAGE_CACHE_MASK;
+ nr = calc_rsize(inode) & ~PAGE_CACHE_MASK;
if (nr <= offset)
break;
}
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
- mark_page_accessed(page);
page_cache_release(page);
if (ret == nr && desc->count)
continue;
*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
filp->f_reada = 1;
if (cached_page)
- page_cache_free(cached_page);
+ page_cache_release(cached_page);
UPDATE_ATIME(inode);
}
+static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
+{
+ ssize_t retval;
+ int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress;
+ struct kiobuf * iobuf;
+ struct inode * inode = filp->f_dentry->d_inode;
+ struct address_space * mapping = inode->i_mapping;
+
+ new_iobuf = 0;
+ iobuf = filp->f_iobuf;
+ if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
+ /*
+ * A parallel read/write is using the preallocated iobuf
+ * so just run slow and allocate a new one.
+ */
+ retval = alloc_kiovec(1, &iobuf);
+ if (retval)
+ goto out;
+ new_iobuf = 1;
+ }
+
+ if (!S_ISBLK(inode->i_mode)) {
+ blocksize = inode->i_sb->s_blocksize;
+ blocksize_bits = inode->i_sb->s_blocksize_bits;
+ } else {
+ blocksize = BUFFERED_BLOCKSIZE;
+ blocksize_bits = BUFFERED_BLOCKSIZE_BITS;
+ }
+ blocksize_mask = blocksize - 1;
+ chunk_size = KIO_MAX_ATOMIC_IO << 10;
+
+ retval = -EINVAL;
+ if ((offset & blocksize_mask) || (count & blocksize_mask))
+ goto out_free;
+ if (!mapping->a_ops->direct_IO)
+ goto out_free;
+
+ /*
+ * Flush only the _data_ to disk; the metadata must remain
+ * completely asynchronous or performance will go to /dev/null.
+ */
+ filemap_fdatasync(mapping);
+ retval = fsync_inode_data_buffers(inode);
+ filemap_fdatawait(mapping);
+ if (retval < 0)
+ goto out_free;
+
+ progress = retval = 0;
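+ /*
+ * Do the I/O in chunks of at most KIO_MAX_ATOMIC_IO kilobytes,
+ * mapping the user buffer into the kiobuf for each chunk.
+ */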
+ while (count > 0) {
+ iosize = count;
+ if (iosize > chunk_size)
+ iosize = chunk_size;
+
+ retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
+ if (retval)
+ break;
+
+ retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize);
+
+ if (rw == READ && retval > 0)
+ mark_dirty_kiobuf(iobuf, retval);
+
+ if (retval >= 0) {
+ count -= retval;
+ buf += retval;
+ progress += retval;
+ }
+
+ unmap_kiobuf(iobuf);
+
+ if (retval != iosize)
+ break;
+ }
+
+ if (progress)
+ retval = progress;
+
+ out_free:
+ if (!new_iobuf)
+ clear_bit(0, &filp->f_iobuf_lock);
+ else
+ free_kiovec(1, &iobuf);
+ out:
+ return retval;
+}
+
int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
char *kaddr;
{
ssize_t retval;
+ if ((ssize_t) count < 0)
+ return -EINVAL;
+
+ if (filp->f_flags & O_DIRECT)
+ goto o_direct;
+
retval = -EFAULT;
if (access_ok(VERIFY_WRITE, buf, count)) {
retval = 0;
retval = desc.error;
}
}
+ out:
return retval;
+
+ o_direct:
+ {
+ loff_t pos = *ppos, size;
+ struct inode * inode = filp->f_dentry->d_inode;
+
+ retval = 0;
+ if (!count)
+ goto out; /* skip atime */
+ size = calc_rsize(inode);
+ if (pos < size) {
+ if (pos + count > size)
+ count = size - pos;
+ retval = generic_file_direct_IO(READ, filp, buf, count, pos);
+ if (retval > 0)
+ *ppos = pos + retval;
+ }
+ UPDATE_ATIME(filp->f_dentry->d_inode);
+ goto out;
+ }
}
static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
struct address_space *mapping = inode->i_mapping;
struct page *page, **hash, *old_page;
unsigned long size, pgoff;
+ loff_t rsize;
pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
* An external ptracer can access pages that normally aren't
* accessible..
*/
- size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ rsize = calc_rsize(inode);
+ size = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if ((pgoff >= size) && (area->vm_mm == current->mm))
return NULL;
return error;
}
-/*
- * Shared mappings need to be able to do the right thing at
- * close/unmap/sync. They will also use the private file as
- * backing-store for swapping..
- */
-static struct vm_operations_struct file_shared_mmap = {
- nopage: filemap_nopage,
-};
-
-/*
- * Private mappings just need to be able to load in the map.
- *
- * (This is actually used for shared mappings as well, if we
- * know they can't ever get write permissions..)
- */
-static struct vm_operations_struct file_private_mmap = {
+static struct vm_operations_struct generic_file_vm_ops = {
nopage: filemap_nopage,
};
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
- struct vm_operations_struct * ops;
struct inode *inode = file->f_dentry->d_inode;
- ops = &file_private_mmap;
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
if (!inode->i_mapping->a_ops->writepage)
return -EINVAL;
- ops = &file_shared_mmap;
}
if (!inode->i_sb || !S_ISREG(inode->i_mode))
return -EACCES;
if (!inode->i_mapping->a_ops->readpage)
return -ENOEXEC;
UPDATE_ATIME(inode);
- vma->vm_ops = ops;
+ vma->vm_ops = &generic_file_vm_ops;
return 0;
}
unsigned long end, int behavior)
{
struct vm_area_struct * n;
+ struct mm_struct * mm = vma->vm_mm;
n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!n)
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
+ lock_vma_mappings(vma);
+ spin_lock(&mm->page_table_lock);
vma->vm_start = end;
- __insert_vm_struct(current->mm, n);
- spin_unlock(&vma->vm_mm->page_table_lock);
+ __insert_vm_struct(mm, n);
+ spin_unlock(&mm->page_table_lock);
unlock_vma_mappings(vma);
return 0;
}
unsigned long start, int behavior)
{
struct vm_area_struct * n;
+ struct mm_struct * mm = vma->vm_mm;
n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!n)
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
+ spin_lock(&mm->page_table_lock);
vma->vm_end = start;
- __insert_vm_struct(current->mm, n);
- spin_unlock(&vma->vm_mm->page_table_lock);
+ __insert_vm_struct(mm, n);
+ spin_unlock(&mm->page_table_lock);
unlock_vma_mappings(vma);
return 0;
}
unsigned long start, unsigned long end, int behavior)
{
struct vm_area_struct * left, * right;
+ struct mm_struct * mm = vma->vm_mm;
left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!left)
vma->vm_ops->open(left);
vma->vm_ops->open(right);
}
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
+ vma->vm_raend = 0;
+ lock_vma_mappings(vma);
+ spin_lock(&mm->page_table_lock);
vma->vm_start = start;
vma->vm_end = end;
setup_read_behavior(vma, behavior);
- vma->vm_raend = 0;
- __insert_vm_struct(current->mm, left);
- __insert_vm_struct(current->mm, right);
- spin_unlock(&vma->vm_mm->page_table_lock);
+ __insert_vm_struct(mm, left);
+ __insert_vm_struct(mm, right);
+ spin_unlock(&mm->page_table_lock);
unlock_vma_mappings(vma);
return 0;
}
long error = -EBADF;
struct file * file;
unsigned long size, rlim_rss;
+ loff_t rsize;
/* Doesn't work if there's no mapped file. */
if (!vma->vm_file)
return error;
file = vma->vm_file;
- size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ rsize = calc_rsize(file->f_dentry->d_inode);
+ size = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
if (end > vma->vm_end)
}
}
if (cached_page)
- page_cache_free(cached_page);
- mark_page_accessed(page);
+ page_cache_release(cached_page);
return page;
}
struct page *cached_page = NULL;
struct page *page = __grab_cache_page(mapping,index,&cached_page);
if (cached_page)
- page_cache_free(cached_page);
+ page_cache_release(cached_page);
return page;
}
* okir@monad.swb.de
*/
ssize_t
-generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
+generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
{
struct inode *inode = file->f_dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
loff_t pos;
struct page *page, *cached_page;
unsigned long written;
- long status;
+ long status = 0;
int err;
unsigned bytes;
- cached_page = NULL;
+ if ((ssize_t) count < 0)
+ return -EINVAL;
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
-
+
+ cached_page = NULL;
+
down(&inode->i_sem);
pos = *ppos;
written = 0;
- if (file->f_flags & O_APPEND)
+ /* FIXME: this is for backwards compatibility with 2.4 */
+ if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
pos = inode->i_size;
/*
* Linus frestrict idea will clean these up nicely..
*/
- if (pos >= inode->i_sb->s_maxbytes)
- {
- if (count || pos > inode->i_sb->s_maxbytes) {
- send_sig(SIGXFSZ, current, 0);
- err = -EFBIG;
+ if (!S_ISBLK(inode->i_mode)) {
+ if (pos >= inode->i_sb->s_maxbytes)
+ {
+ if (count || pos > inode->i_sb->s_maxbytes) {
+ send_sig(SIGXFSZ, current, 0);
+ err = -EFBIG;
+ goto out;
+ }
+ /* zero-length writes at ->s_maxbytes are OK */
+ }
+
+ if (pos + count > inode->i_sb->s_maxbytes)
+ count = inode->i_sb->s_maxbytes - pos;
+ } else {
+ if (is_read_only(inode->i_rdev)) {
+ err = -EPERM;
goto out;
}
- /* zero-length writes at ->s_maxbytes are OK */
- }
+ if (pos >= calc_rsize(inode)) {
+ if (count || pos > calc_rsize(inode)) {
+ /* FIXME: this is for backwards compatibility with 2.4 */
+ err = -ENOSPC;
+ goto out;
+ }
+ /* zero-length writes at blkdev end are OK */
+ }
- if (pos + count > inode->i_sb->s_maxbytes)
- count = inode->i_sb->s_maxbytes - pos;
+ if (pos + count > calc_rsize(inode))
+ count = calc_rsize(inode) - pos;
+ }
- if (count == 0) {
- err = 0;
+ err = 0;
+ if (count == 0)
goto out;
- }
- status = 0;
remove_suid(inode);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
mark_inode_dirty_sync(inode);
- while (count) {
+ if (file->f_flags & O_DIRECT)
+ goto o_direct;
+
+ do {
unsigned long index, offset;
long page_fault;
char *kaddr;
if (status < 0)
break;
- }
+ } while (count);
*ppos = pos;
if (cached_page)
- page_cache_free(cached_page);
+ page_cache_release(cached_page);
/* For now, when the user asks for O_SYNC, we'll actually
* provide O_DSYNC. */
if ((status >= 0) && (file->f_flags & O_SYNC))
- status = generic_osync_inode(inode, 1); /* 1 means datasync */
+ status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
+out_status:
err = written ? written : status;
out:
fail_write:
status = -EFAULT;
goto unlock;
+
+o_direct:
+ written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
+ if (written > 0) {
+ loff_t end = pos + written;
+ if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
+ inode->i_size = end;
+ mark_inode_dirty(inode);
+ }
+ *ppos = end;
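+ /*
+ * The data went straight to disk, so drop any cached pages
+ * that are now stale.
+ */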
+ invalidate_inode_pages2(mapping);
+ }
+ /*
+ * Sync the fs metadata but not the minor inode changes, and
+ * of course not the data, as we did direct DMA for the IO.
+ */
+ if (written >= 0 && file->f_flags & O_SYNC)
+ status = generic_osync_inode(inode, OSYNC_METADATA);
+ goto out_status;
}
void __init page_cache_init(unsigned long mempages)
p_from = from->b_page;
- vfrom = kmap_atomic(p_from, KM_BOUNCE_WRITE);
+ vfrom = kmap_atomic(p_from, KM_USER0);
memcpy(to->b_data, vfrom + bh_offset(from), to->b_size);
- kunmap_atomic(vfrom, KM_BOUNCE_WRITE);
+ kunmap_atomic(vfrom, KM_USER0);
}
static inline void copy_to_high_bh_irq (struct buffer_head *to,
{
pgd_t * page_dir = mm->pgd;
+ spin_lock(&mm->page_table_lock);
page_dir += first;
do {
free_one_pgd(page_dir);
page_dir++;
} while (--nr);
+ spin_unlock(&mm->page_table_lock);
/* keep the page table cache within bounds */
check_pgt_cache();
* free_page() used to be able to clear swap cache
* entries. We may now have to do it manually.
*/
- if (page->mapping) {
- if (pte_dirty(pte))
- set_page_dirty(page);
- if (pte_young(pte))
- mark_page_accessed(page);
- }
+ if (pte_dirty(pte) && page->mapping)
+ set_page_dirty(page);
free_page_and_swap_cache(page);
return 1;
}
break;
/* Recheck swapcachedness once the page is locked */
can_reuse = exclusive_swap_page(old_page);
+#if 1
+ if (can_reuse)
+ delete_from_swap_cache_nolock(old_page);
+#endif
UnlockPage(old_page);
if (!can_reuse)
break;
struct page *page;
swp_entry_t entry = pte_to_swp_entry(orig_pte);
pte_t pte;
+ int ret = 1;
spin_unlock(&mm->page_table_lock);
page = lookup_swap_cache(entry);
*/
return pte_same(*page_table, orig_pte) ? -1 : 1;
}
+
+ /* Had to read the page from swap area: Major fault */
+ ret = 2;
}
/*
swap_free(entry);
if (exclusive_swap_page(page)) {
+#if 0
if (write_access)
pte = pte_mkwrite(pte_mkdirty(pte));
- if (vm_swap_full()) {
- delete_from_swap_cache_nolock(page);
- pte = pte_mkdirty(pte);
- }
+#else
+ delete_from_swap_cache_nolock(page);
+ pte = pte_mkwrite(pte_mkdirty(pte));
+#endif
}
UnlockPage(page);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
- return 1; /* Minor fault */
+ return ret;
}
/*
* Because we dropped the lock, we should re-check the
* entry, as somebody else could have populated it..
*/
- if (pgd_present(*pgd)) {
+ if (!pgd_none(*pgd)) {
pmd_free(new);
goto out;
}
*/
pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
- if (!pmd_present(*pmd)) {
+ if (pmd_none(*pmd)) {
pte_t *new;
/* "fast" allocation can happen without dropping the lock.. */
* Because we dropped the lock, we should re-check the
* entry, as somebody else could have populated it..
*/
- if (pmd_present(*pmd)) {
+ if (!pmd_none(*pmd)) {
pte_free(new);
goto out;
}
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
+ vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
lock_vma_mappings(vma);
spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = end;
__insert_vm_struct(current->mm, n);
spin_unlock(&vma->vm_mm->page_table_lock);
vma->vm_ops->open(left);
vma->vm_ops->open(right);
}
+ vma->vm_raend = 0;
+ vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
lock_vma_mappings(vma);
spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = start;
vma->vm_end = end;
vma->vm_flags = newflags;
- vma->vm_raend = 0;
__insert_vm_struct(current->mm, left);
__insert_vm_struct(current->mm, right);
spin_unlock(&vma->vm_mm->page_table_lock);
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/personality.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
+/*
+ * WARNING: the debugging code uses recursive algorithms, so never enable this
+ * unless you know what you are doing.
+ */
+#undef DEBUG_MM_RB
+
/* description of effects of mapping type and prot in current implementation.
* this is due to the limited x86 page protection hardware. The expected
* behavior is in parens:
#undef _trans
}
+#ifdef DEBUG_MM_RB
+static int browse_rb(rb_node_t * rb_node) {
+ int i = 0;
+ if (rb_node) {
+ i++;
+ i += browse_rb(rb_node->rb_left);
+ i += browse_rb(rb_node->rb_right);
+ }
+ return i;
+}
+
+static void validate_mm(struct mm_struct * mm) {
+ int bug = 0;
+ int i = 0;
+ struct vm_area_struct * tmp = mm->mmap;
+ while (tmp) {
+ tmp = tmp->vm_next;
+ i++;
+ }
+ if (i != mm->map_count)
+ printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
+ i = browse_rb(mm->mm_rb.rb_node);
+ if (i != mm->map_count)
+ printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
+ if (bug)
+ BUG();
+}
+#else
+#define validate_mm(mm) do { } while (0)
+#endif
+
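+/*
+ * Find the first VMA whose vm_end lies above addr.  When no existing
+ * VMA contains addr, also report the VMA preceding addr and the rb-tree
+ * link/parent at which a new VMA covering addr would be linked in.
+ */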
+static struct vm_area_struct * find_vma_prepare(struct mm_struct * mm, unsigned long addr,
+ struct vm_area_struct ** pprev,
+ rb_node_t *** rb_link, rb_node_t ** rb_parent)
+{
+ struct vm_area_struct * vma;
+ rb_node_t ** __rb_link, * __rb_parent, * rb_prev;
+
+ __rb_link = &mm->mm_rb.rb_node;
+ rb_prev = __rb_parent = NULL;
+ vma = NULL;
+
+ while (*__rb_link) {
+ struct vm_area_struct *vma_tmp;
+
+ __rb_parent = *__rb_link;
+ vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
+
+ if (vma_tmp->vm_end > addr) {
+ vma = vma_tmp;
+ if (vma_tmp->vm_start <= addr)
+ return vma;
+ __rb_link = &__rb_parent->rb_left;
+ } else {
+ rb_prev = __rb_parent;
+ __rb_link = &__rb_parent->rb_right;
+ }
+ }
+
+ *pprev = NULL;
+ if (rb_prev)
+ *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
+ *rb_link = __rb_link;
+ *rb_parent = __rb_parent;
+ return vma;
+}
+
+static inline void __vma_link_list(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
+ rb_node_t * rb_parent)
+{
+ if (prev) {
+ vma->vm_next = prev->vm_next;
+ prev->vm_next = vma;
+ } else {
+ mm->mmap = vma;
+ if (rb_parent)
+ vma->vm_next = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
+ else
+ vma->vm_next = NULL;
+ }
+}
+
+static inline void __vma_link_rb(struct mm_struct * mm, struct vm_area_struct * vma,
+ rb_node_t ** rb_link, rb_node_t * rb_parent)
+{
+ rb_link_node(&vma->vm_rb, rb_parent, rb_link);
+ rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+}
+
+static inline void __vma_link_file(struct vm_area_struct * vma)
+{
+ struct file * file;
+
+ file = vma->vm_file;
+ if (file) {
+ struct inode * inode = file->f_dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ struct vm_area_struct **head;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ atomic_dec(&inode->i_writecount);
+
+ head = &mapping->i_mmap;
+ if (vma->vm_flags & VM_SHARED)
+ head = &mapping->i_mmap_shared;
+
+ /* insert vma into inode's share list */
+ if((vma->vm_next_share = *head) != NULL)
+ (*head)->vm_pprev_share = &vma->vm_next_share;
+ *head = vma;
+ vma->vm_pprev_share = head;
+ }
+}
+
+static void __vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
+ rb_node_t ** rb_link, rb_node_t * rb_parent)
+{
+ __vma_link_list(mm, vma, prev, rb_parent);
+ __vma_link_rb(mm, vma, rb_link, rb_parent);
+ __vma_link_file(vma);
+}
+
+static inline void vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
+ rb_node_t ** rb_link, rb_node_t * rb_parent)
+{
+ lock_vma_mappings(vma);
+ spin_lock(&mm->page_table_lock);
+ __vma_link(mm, vma, prev, rb_link, rb_parent);
+ spin_unlock(&mm->page_table_lock);
+ unlock_vma_mappings(vma);
+
+ mm->map_count++;
+ validate_mm(mm);
+}
+
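+/*
+ * Try to extend an adjacent VMA to cover the new [addr, end) range
+ * instead of allocating a fresh one.  Returns 1 if a merge was done,
+ * 0 otherwise.
+ */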
+static int vma_merge(struct mm_struct * mm, struct vm_area_struct * prev,
+ rb_node_t * rb_parent, unsigned long addr, unsigned long end, unsigned long vm_flags)
+{
+ spinlock_t * lock = &mm->page_table_lock;
+ if (!prev) {
+ prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
+ goto merge_next;
+ }
+ if (prev->vm_end == addr && can_vma_merge(prev, vm_flags)) {
+ struct vm_area_struct * next;
+
+ spin_lock(lock);
+ prev->vm_end = end;
+ next = prev->vm_next;
+ if (next && prev->vm_end == next->vm_start && can_vma_merge(next, vm_flags)) {
+ prev->vm_end = next->vm_end;
+ __vma_unlink(mm, next, prev);
+ spin_unlock(lock);
+
+ mm->map_count--;
+ kmem_cache_free(vm_area_cachep, next);
+ return 1;
+ }
+ spin_unlock(lock);
+ return 1;
+ }
+
+ prev = prev->vm_next;
+ if (prev) {
+ merge_next:
+ if (!can_vma_merge(prev, vm_flags))
+ return 0;
+ if (end == prev->vm_start) {
+ spin_lock(lock);
+ prev->vm_start = addr;
+ spin_unlock(lock);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags, unsigned long pgoff)
{
struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma;
+ struct vm_area_struct * vma, * prev;
unsigned int vm_flags;
int correct_wcount = 0;
int error;
+ rb_node_t ** rb_link, * rb_parent;
if (file && (!file->f_op || !file->f_op->mmap))
return -ENODEV;
if ((len = PAGE_ALIGN(len)) == 0)
return addr;
- if (len > TASK_SIZE || addr > TASK_SIZE-len)
+ if (len > TASK_SIZE)
return -EINVAL;
/* offset overflow? */
/* Clear old maps */
error = -ENOMEM;
- if (do_munmap(mm, addr, len))
- return -ENOMEM;
+munmap_back:
+ vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ if (vma && vma->vm_start < addr + len) {
+ if (do_munmap(mm, addr, len))
+ return -ENOMEM;
+ goto munmap_back;
+ }
/* Check against address space limit. */
if ((mm->total_vm << PAGE_SHIFT) + len
return -ENOMEM;
/* Can we just expand an old anonymous mapping? */
- if (addr && !file && !(vm_flags & VM_SHARED)) {
- struct vm_area_struct * vma = find_vma(mm, addr-1);
- if (vma && vma->vm_end == addr && !vma->vm_file &&
- vma->vm_flags == vm_flags) {
- vma->vm_end = addr + len;
+ if (!file && !(vm_flags & VM_SHARED) && rb_parent)
+ if (vma_merge(mm, prev, rb_parent, addr, addr + len, vm_flags))
goto out;
- }
- }
/* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
*/
addr = vma->vm_start;
- insert_vm_struct(mm, vma);
+ vma_link(mm, vma, prev, rb_link, rb_parent);
if (correct_wcount)
atomic_inc(&file->f_dentry->d_inode->i_writecount);
if (len > TASK_SIZE)
return -ENOMEM;
- if (!addr)
- addr = TASK_UNMAPPED_BASE;
- addr = PAGE_ALIGN(addr);
+
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(current->mm, addr);
+ if (TASK_SIZE - len >= addr &&
+ (!vma || addr + len <= vma->vm_start))
+ return addr;
+ }
+ addr = PAGE_ALIGN(TASK_UNMAPPED_BASE);
for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
/* At this point: (!vma || addr < vma->vm_end). */
unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
{
if (flags & MAP_FIXED) {
+ if (addr > TASK_SIZE - len)
+ return -EINVAL;
if (addr & ~PAGE_MASK)
return -EINVAL;
return addr;
return arch_get_unmapped_area(file, addr, len, pgoff, flags);
}
-#define vm_avl_empty (struct vm_area_struct *) NULL
-
-#include "mmap_avl.c"
-
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
{
/* (Cache hit rate is typically around 35%.) */
vma = mm->mmap_cache;
if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
- if (!mm->mmap_avl) {
- /* Go through the linear list. */
- vma = mm->mmap;
- while (vma && vma->vm_end <= addr)
- vma = vma->vm_next;
- } else {
- /* Then go through the AVL tree quickly. */
- struct vm_area_struct * tree = mm->mmap_avl;
- vma = NULL;
- for (;;) {
- if (tree == vm_avl_empty)
+ rb_node_t * rb_node;
+
+ rb_node = mm->mm_rb.rb_node;
+ vma = NULL;
+
+ while (rb_node) {
+ struct vm_area_struct * vma_tmp;
+
+ vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+ if (vma_tmp->vm_end > addr) {
+ vma = vma_tmp;
+ if (vma_tmp->vm_start <= addr)
break;
- if (tree->vm_end > addr) {
- vma = tree;
- if (tree->vm_start <= addr)
- break;
- tree = tree->vm_avl_left;
- } else
- tree = tree->vm_avl_right;
- }
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
}
if (vma)
mm->mmap_cache = vma;
struct vm_area_struct **pprev)
{
if (mm) {
- if (!mm->mmap_avl) {
- /* Go through the linear list. */
- struct vm_area_struct * prev = NULL;
- struct vm_area_struct * vma = mm->mmap;
- while (vma && vma->vm_end <= addr) {
- prev = vma;
- vma = vma->vm_next;
- }
- *pprev = prev;
- return vma;
- } else {
- /* Go through the AVL tree quickly. */
- struct vm_area_struct * vma = NULL;
- struct vm_area_struct * last_turn_right = NULL;
- struct vm_area_struct * prev = NULL;
- struct vm_area_struct * tree = mm->mmap_avl;
- for (;;) {
- if (tree == vm_avl_empty)
+ /* Go through the RB tree quickly. */
+ struct vm_area_struct * vma;
+ rb_node_t * rb_node, * rb_last_right, * rb_prev;
+
+ rb_node = mm->mm_rb.rb_node;
+ rb_last_right = rb_prev = NULL;
+ vma = NULL;
+
+ while (rb_node) {
+ struct vm_area_struct * vma_tmp;
+
+ vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+ if (vma_tmp->vm_end > addr) {
+ vma = vma_tmp;
+ rb_prev = rb_last_right;
+ if (vma_tmp->vm_start <= addr)
break;
- if (tree->vm_end > addr) {
- vma = tree;
- prev = last_turn_right;
- if (tree->vm_start <= addr)
- break;
- tree = tree->vm_avl_left;
- } else {
- last_turn_right = tree;
- tree = tree->vm_avl_right;
- }
+ rb_node = rb_node->rb_left;
+ } else {
+ rb_last_right = rb_node;
+ rb_node = rb_node->rb_right;
}
- if (vma) {
- if (vma->vm_avl_left != vm_avl_empty) {
- prev = vma->vm_avl_left;
- while (prev->vm_avl_right != vm_avl_empty)
- prev = prev->vm_avl_right;
- }
- if ((prev ? prev->vm_next : mm->mmap) != vma)
- printk("find_vma_prev: tree inconsistent with list\n");
- *pprev = prev;
- return vma;
+ }
+ if (vma) {
+ if (vma->vm_rb.rb_left) {
+ rb_prev = vma->vm_rb.rb_left;
+ while (rb_prev->rb_right)
+ rb_prev = rb_prev->rb_right;
}
+ *pprev = NULL;
+ if (rb_prev)
+ *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
+ if ((rb_prev ? (*pprev)->vm_next : mm->mmap) != vma)
+ BUG();
+ return vma;
}
}
*pprev = NULL;
/* Work out to one of the ends. */
if (end == area->vm_end) {
+ /*
+ * Here the area isn't visible to the semaphore-less readers,
+ * so we don't need to update it under the spinlock.
+ */
area->vm_end = addr;
lock_vma_mappings(area);
spin_lock(&mm->page_table_lock);
} else if (addr == area->vm_start) {
area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT;
+ /* same locking considerations as in the case above */
area->vm_start = end;
lock_vma_mappings(area);
spin_lock(&mm->page_table_lock);
*npp = mpnt->vm_next;
mpnt->vm_next = free;
free = mpnt;
- if (mm->mmap_avl)
- avl_remove(mpnt, &mm->mmap_avl);
+ rb_erase(&mpnt->vm_rb, &mm->mm_rb);
}
mm->mmap_cache = NULL; /* Kill the cache. */
spin_unlock(&mm->page_table_lock);
if (file)
atomic_inc(&file->f_dentry->d_inode->i_writecount);
}
+ validate_mm(mm);
/* Release the extra vma struct if it wasn't used */
if (extra)
unsigned long do_brk(unsigned long addr, unsigned long len)
{
struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma;
- unsigned long flags, retval;
+ struct vm_area_struct * vma, * prev;
+ unsigned long flags;
+ rb_node_t ** rb_link, * rb_parent;
len = PAGE_ALIGN(len);
if (!len)
/*
* Clear old maps. this also does some error checking for us
*/
- retval = do_munmap(mm, addr, len);
- if (retval != 0)
- return retval;
+ munmap_back:
+ vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ if (vma && vma->vm_start < addr + len) {
+ if (do_munmap(mm, addr, len))
+ return -ENOMEM;
+ goto munmap_back;
+ }
/* Check against address space limits *after* clearing old maps... */
if ((mm->total_vm << PAGE_SHIFT) + len
MAP_FIXED|MAP_PRIVATE) | mm->def_flags;
flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
-
+
/* Can we just expand an old anonymous mapping? */
- if (addr) {
- struct vm_area_struct * vma = find_vma(mm, addr-1);
- if (vma && vma->vm_end == addr && !vma->vm_file &&
- vma->vm_flags == flags) {
- vma->vm_end = addr + len;
- goto out;
- }
- }
+ if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len, flags))
+ goto out;
/*
* create a vma struct for an anonymous mapping
vma->vm_file = NULL;
vma->vm_private_data = NULL;
- insert_vm_struct(mm, vma);
+ vma_link(mm, vma, prev, rb_link, rb_parent);
out:
mm->total_vm += len >> PAGE_SHIFT;
return addr;
}
-/* Build the AVL tree corresponding to the VMA list. */
-void build_mmap_avl(struct mm_struct * mm)
+/* Build the RB tree corresponding to the VMA list. */
+void build_mmap_rb(struct mm_struct * mm)
{
struct vm_area_struct * vma;
-
- mm->mmap_avl = NULL;
- for (vma = mm->mmap; vma; vma = vma->vm_next)
- avl_insert(vma, &mm->mmap_avl);
+ rb_node_t ** rb_link, * rb_parent;
+
+ mm->mm_rb = RB_ROOT;
+ rb_link = &mm->mm_rb.rb_node;
+ rb_parent = NULL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ __vma_link_rb(mm, vma, rb_link, rb_parent);
+ rb_parent = &vma->vm_rb;
+ rb_link = &rb_parent->rb_right;
+ }
}
/* Release all mmaps. */
release_segments(mm);
spin_lock(&mm->page_table_lock);
mpnt = mm->mmap;
- mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
+ mm->mmap = mm->mmap_cache = NULL;
+ mm->mm_rb = RB_ROOT;
mm->rss = 0;
spin_unlock(&mm->page_table_lock);
mm->total_vm = 0;
/* This is just debugging */
if (mm->map_count)
- printk("exit_mmap: map count is %d\n", mm->map_count);
+ BUG();
clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
}
* and into the inode's i_mmap ring. If vm_file is non-NULL
* then the i_shared_lock must be held here.
*/
-void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
+void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
- struct vm_area_struct **pprev;
- struct file * file;
-
- if (!mm->mmap_avl) {
- pprev = &mm->mmap;
- while (*pprev && (*pprev)->vm_start <= vmp->vm_start)
- pprev = &(*pprev)->vm_next;
- } else {
- struct vm_area_struct *prev, *next;
- avl_insert_neighbours(vmp, &mm->mmap_avl, &prev, &next);
- pprev = (prev ? &prev->vm_next : &mm->mmap);
- if (*pprev != next)
- printk("insert_vm_struct: tree inconsistent with list\n");
- }
- vmp->vm_next = *pprev;
- *pprev = vmp;
+ struct vm_area_struct * __vma, * prev;
+ rb_node_t ** rb_link, * rb_parent;
+ __vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent);
+ if (__vma && __vma->vm_start < vma->vm_end)
+ BUG();
+ __vma_link(mm, vma, prev, rb_link, rb_parent);
mm->map_count++;
- if (mm->map_count >= AVL_MIN_MAP_COUNT && !mm->mmap_avl)
- build_mmap_avl(mm);
-
- file = vmp->vm_file;
- if (file) {
- struct inode * inode = file->f_dentry->d_inode;
- struct address_space *mapping = inode->i_mapping;
- struct vm_area_struct **head;
-
- if (vmp->vm_flags & VM_DENYWRITE)
- atomic_dec(&inode->i_writecount);
-
- head = &mapping->i_mmap;
- if (vmp->vm_flags & VM_SHARED)
- head = &mapping->i_mmap_shared;
-
- /* insert vmp into inode's share list */
- if((vmp->vm_next_share = *head) != NULL)
- (*head)->vm_pprev_share = &vmp->vm_next_share;
- *head = vmp;
- vmp->vm_pprev_share = head;
- }
+ validate_mm(mm);
}
-void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
+void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
- lock_vma_mappings(vmp);
- spin_lock(&current->mm->page_table_lock);
- __insert_vm_struct(mm, vmp);
- spin_unlock(&current->mm->page_table_lock);
- unlock_vma_mappings(vmp);
+ struct vm_area_struct * __vma, * prev;
+ rb_node_t ** rb_link, * rb_parent;
+
+ __vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent);
+ if (__vma && __vma->vm_start < vma->vm_end)
+ BUG();
+ vma_link(mm, vma, prev, rb_link, rb_parent);
+ validate_mm(mm);
}
+++ /dev/null
-/*
- * Searching a VMA in the linear list task->mm->mmap is horribly slow.
- * Use an AVL (Adelson-Velskii and Landis) tree to speed up this search
- * from O(n) to O(log n), where n is the number of VMAs of the task
- * n is typically around 6, but may reach 3000 in some cases: object-oriented
- * databases, persistent store, generational garbage collection (Java, Lisp),
- * ElectricFence.
- * Written by Bruno Haible <haible@ma2s2.mathematik.uni-karlsruhe.de>.
- */
-
-/* We keep the list and tree sorted by address. */
-#define vm_avl_key vm_end
-#define vm_avl_key_t unsigned long /* typeof(vma->avl_key) */
-
-/*
- * task->mm->mmap_avl is the AVL tree corresponding to task->mm->mmap
- * or, more exactly, its root.
- * A vm_area_struct has the following fields:
- * vm_avl_left left son of a tree node
- * vm_avl_right right son of a tree node
- * vm_avl_height 1+max(heightof(left),heightof(right))
- * The empty tree is represented as NULL.
- */
-
-/* Since the trees are balanced, their height will never be large. */
-#define avl_maxheight 41 /* why this? a small exercise */
-#define heightof(tree) ((tree) == vm_avl_empty ? 0 : (tree)->vm_avl_height)
-/*
- * Consistency and balancing rules:
- * 1. tree->vm_avl_height == 1+max(heightof(tree->vm_avl_left),heightof(tree->vm_avl_right))
- * 2. abs( heightof(tree->vm_avl_left) - heightof(tree->vm_avl_right) ) <= 1
- * 3. foreach node in tree->vm_avl_left: node->vm_avl_key <= tree->vm_avl_key,
- * foreach node in tree->vm_avl_right: node->vm_avl_key >= tree->vm_avl_key.
- */
-
-#ifdef DEBUG_AVL
-
-/* Look up the nodes at the left and at the right of a given node. */
-static void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right)
-{
- vm_avl_key_t key = node->vm_avl_key;
-
- *to_the_left = *to_the_right = NULL;
- for (;;) {
- if (tree == vm_avl_empty) {
- printk("avl_neighbours: node not found in the tree\n");
- return;
- }
- if (key == tree->vm_avl_key)
- break;
- if (key < tree->vm_avl_key) {
- *to_the_right = tree;
- tree = tree->vm_avl_left;
- } else {
- *to_the_left = tree;
- tree = tree->vm_avl_right;
- }
- }
- if (tree != node) {
- printk("avl_neighbours: node not exactly found in the tree\n");
- return;
- }
- if (tree->vm_avl_left != vm_avl_empty) {
- struct vm_area_struct * node;
- for (node = tree->vm_avl_left; node->vm_avl_right != vm_avl_empty; node = node->vm_avl_right)
- continue;
- *to_the_left = node;
- }
- if (tree->vm_avl_right != vm_avl_empty) {
- struct vm_area_struct * node;
- for (node = tree->vm_avl_right; node->vm_avl_left != vm_avl_empty; node = node->vm_avl_left)
- continue;
- *to_the_right = node;
- }
- if ((*to_the_left && ((*to_the_left)->vm_next != node)) || (node->vm_next != *to_the_right))
- printk("avl_neighbours: tree inconsistent with list\n");
-}
-
-#endif
-
-/*
- * Rebalance a tree.
- * After inserting or deleting a node of a tree we have a sequence of subtrees
- * nodes[0]..nodes[k-1] such that
- * nodes[0] is the root and nodes[i+1] = nodes[i]->{vm_avl_left|vm_avl_right}.
- */
-static void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count)
-{
- for ( ; count > 0 ; count--) {
- struct vm_area_struct ** nodeplace = *--nodeplaces_ptr;
- struct vm_area_struct * node = *nodeplace;
- struct vm_area_struct * nodeleft = node->vm_avl_left;
- struct vm_area_struct * noderight = node->vm_avl_right;
- int heightleft = heightof(nodeleft);
- int heightright = heightof(noderight);
- if (heightright + 1 < heightleft) {
- /* */
- /* * */
- /* / \ */
- /* n+2 n */
- /* */
- struct vm_area_struct * nodeleftleft = nodeleft->vm_avl_left;
- struct vm_area_struct * nodeleftright = nodeleft->vm_avl_right;
- int heightleftright = heightof(nodeleftright);
- if (heightof(nodeleftleft) >= heightleftright) {
- /* */
- /* * n+2|n+3 */
- /* / \ / \ */
- /* n+2 n --> / n+1|n+2 */
- /* / \ | / \ */
- /* n+1 n|n+1 n+1 n|n+1 n */
- /* */
- node->vm_avl_left = nodeleftright; nodeleft->vm_avl_right = node;
- nodeleft->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightleftright);
- *nodeplace = nodeleft;
- } else {
- /* */
- /* * n+2 */
- /* / \ / \ */
- /* n+2 n --> n+1 n+1 */
- /* / \ / \ / \ */
- /* n n+1 n L R n */
- /* / \ */
- /* L R */
- /* */
- nodeleft->vm_avl_right = nodeleftright->vm_avl_left;
- node->vm_avl_left = nodeleftright->vm_avl_right;
- nodeleftright->vm_avl_left = nodeleft;
- nodeleftright->vm_avl_right = node;
- nodeleft->vm_avl_height = node->vm_avl_height = heightleftright;
- nodeleftright->vm_avl_height = heightleft;
- *nodeplace = nodeleftright;
- }
- }
- else if (heightleft + 1 < heightright) {
- /* similar to the above, just interchange 'left' <--> 'right' */
- struct vm_area_struct * noderightright = noderight->vm_avl_right;
- struct vm_area_struct * noderightleft = noderight->vm_avl_left;
- int heightrightleft = heightof(noderightleft);
- if (heightof(noderightright) >= heightrightleft) {
- node->vm_avl_right = noderightleft; noderight->vm_avl_left = node;
- noderight->vm_avl_height = 1 + (node->vm_avl_height = 1 + heightrightleft);
- *nodeplace = noderight;
- } else {
- noderight->vm_avl_left = noderightleft->vm_avl_right;
- node->vm_avl_right = noderightleft->vm_avl_left;
- noderightleft->vm_avl_right = noderight;
- noderightleft->vm_avl_left = node;
- noderight->vm_avl_height = node->vm_avl_height = heightrightleft;
- noderightleft->vm_avl_height = heightright;
- *nodeplace = noderightleft;
- }
- }
- else {
- int height = (heightleft<heightright ? heightright : heightleft) + 1;
- if (height == node->vm_avl_height)
- break;
- node->vm_avl_height = height;
- }
- }
-}
-
-/* Insert a node into a tree. */
-static inline void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct ** ptree)
-{
- vm_avl_key_t key = new_node->vm_avl_key;
- struct vm_area_struct ** nodeplace = ptree;
- struct vm_area_struct ** stack[avl_maxheight];
- int stack_count = 0;
- struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */
- for (;;) {
- struct vm_area_struct * node = *nodeplace;
- if (node == vm_avl_empty)
- break;
- *stack_ptr++ = nodeplace; stack_count++;
- if (key < node->vm_avl_key)
- nodeplace = &node->vm_avl_left;
- else
- nodeplace = &node->vm_avl_right;
- }
- new_node->vm_avl_left = vm_avl_empty;
- new_node->vm_avl_right = vm_avl_empty;
- new_node->vm_avl_height = 1;
- *nodeplace = new_node;
- avl_rebalance(stack_ptr,stack_count);
-}
-
-/* Insert a node into a tree, and
- * return the node to the left of it and the node to the right of it.
- */
-static inline void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree,
- struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right)
-{
- vm_avl_key_t key = new_node->vm_avl_key;
- struct vm_area_struct ** nodeplace = ptree;
- struct vm_area_struct ** stack[avl_maxheight];
- int stack_count = 0;
- struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */
- *to_the_left = *to_the_right = NULL;
- for (;;) {
- struct vm_area_struct * node = *nodeplace;
- if (node == vm_avl_empty)
- break;
- *stack_ptr++ = nodeplace; stack_count++;
- if (key < node->vm_avl_key) {
- *to_the_right = node;
- nodeplace = &node->vm_avl_left;
- } else {
- *to_the_left = node;
- nodeplace = &node->vm_avl_right;
- }
- }
- new_node->vm_avl_left = vm_avl_empty;
- new_node->vm_avl_right = vm_avl_empty;
- new_node->vm_avl_height = 1;
- *nodeplace = new_node;
- avl_rebalance(stack_ptr,stack_count);
-}
-
-/* Removes a node out of a tree. */
-static void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree)
-{
- vm_avl_key_t key = node_to_delete->vm_avl_key;
- struct vm_area_struct ** nodeplace = ptree;
- struct vm_area_struct ** stack[avl_maxheight];
- int stack_count = 0;
- struct vm_area_struct *** stack_ptr = &stack[0]; /* = &stack[stackcount] */
- struct vm_area_struct ** nodeplace_to_delete;
- for (;;) {
- struct vm_area_struct * node = *nodeplace;
-#ifdef DEBUG_AVL
- if (node == vm_avl_empty) {
- /* what? node_to_delete not found in tree? */
- printk("avl_remove: node to delete not found in tree\n");
- return;
- }
-#endif
- *stack_ptr++ = nodeplace; stack_count++;
- if (key == node->vm_avl_key)
- break;
- if (key < node->vm_avl_key)
- nodeplace = &node->vm_avl_left;
- else
- nodeplace = &node->vm_avl_right;
- }
- nodeplace_to_delete = nodeplace;
- /* Have to remove node_to_delete = *nodeplace_to_delete. */
- if (node_to_delete->vm_avl_left == vm_avl_empty) {
- *nodeplace_to_delete = node_to_delete->vm_avl_right;
- stack_ptr--; stack_count--;
- } else {
- struct vm_area_struct *** stack_ptr_to_delete = stack_ptr;
- struct vm_area_struct ** nodeplace = &node_to_delete->vm_avl_left;
- struct vm_area_struct * node;
- for (;;) {
- node = *nodeplace;
- if (node->vm_avl_right == vm_avl_empty)
- break;
- *stack_ptr++ = nodeplace; stack_count++;
- nodeplace = &node->vm_avl_right;
- }
- *nodeplace = node->vm_avl_left;
- /* node replaces node_to_delete */
- node->vm_avl_left = node_to_delete->vm_avl_left;
- node->vm_avl_right = node_to_delete->vm_avl_right;
- node->vm_avl_height = node_to_delete->vm_avl_height;
- *nodeplace_to_delete = node; /* replace node_to_delete */
- *stack_ptr_to_delete = &node->vm_avl_left; /* replace &node_to_delete->vm_avl_left */
- }
- avl_rebalance(stack_ptr,stack_count);
-}
-
-#ifdef DEBUG_AVL
-
-/* print a list */
-static void printk_list (struct vm_area_struct * vma)
-{
- printk("[");
- while (vma) {
- printk("%08lX-%08lX", vma->vm_start, vma->vm_end);
- vma = vma->vm_next;
- if (!vma)
- break;
- printk(" ");
- }
- printk("]");
-}
-
-/* print a tree */
-static void printk_avl (struct vm_area_struct * tree)
-{
- if (tree != vm_avl_empty) {
- printk("(");
- if (tree->vm_avl_left != vm_avl_empty) {
- printk_avl(tree->vm_avl_left);
- printk("<");
- }
- printk("%08lX-%08lX", tree->vm_start, tree->vm_end);
- if (tree->vm_avl_right != vm_avl_empty) {
- printk(">");
- printk_avl(tree->vm_avl_right);
- }
- printk(")");
- }
-}
-
-static char *avl_check_point = "somewhere";
-
-/* check a tree's consistency and balancing */
-static void avl_checkheights (struct vm_area_struct * tree)
-{
- int h, hl, hr;
-
- if (tree == vm_avl_empty)
- return;
- avl_checkheights(tree->vm_avl_left);
- avl_checkheights(tree->vm_avl_right);
- h = tree->vm_avl_height;
- hl = heightof(tree->vm_avl_left);
- hr = heightof(tree->vm_avl_right);
- if ((h == hl+1) && (hr <= hl) && (hl <= hr+1))
- return;
- if ((h == hr+1) && (hl <= hr) && (hr <= hl+1))
- return;
- printk("%s: avl_checkheights: heights inconsistent\n",avl_check_point);
-}
-
-/* check that all values stored in a tree are < key */
-static void avl_checkleft (struct vm_area_struct * tree, vm_avl_key_t key)
-{
- if (tree == vm_avl_empty)
- return;
- avl_checkleft(tree->vm_avl_left,key);
- avl_checkleft(tree->vm_avl_right,key);
- if (tree->vm_avl_key < key)
- return;
- printk("%s: avl_checkleft: left key %lu >= top key %lu\n",avl_check_point,tree->vm_avl_key,key);
-}
-
-/* check that all values stored in a tree are > key */
-static void avl_checkright (struct vm_area_struct * tree, vm_avl_key_t key)
-{
- if (tree == vm_avl_empty)
- return;
- avl_checkright(tree->vm_avl_left,key);
- avl_checkright(tree->vm_avl_right,key);
- if (tree->vm_avl_key > key)
- return;
- printk("%s: avl_checkright: right key %lu <= top key %lu\n",avl_check_point,tree->vm_avl_key,key);
-}
-
-/* check that all values are properly increasing */
-static void avl_checkorder (struct vm_area_struct * tree)
-{
- if (tree == vm_avl_empty)
- return;
- avl_checkorder(tree->vm_avl_left);
- avl_checkorder(tree->vm_avl_right);
- avl_checkleft(tree->vm_avl_left,tree->vm_avl_key);
- avl_checkright(tree->vm_avl_right,tree->vm_avl_key);
-}
-
-/* all checks */
-static void avl_check (struct task_struct * task, char *caller)
-{
- avl_check_point = caller;
-/* printk("task \"%s\", %s\n",task->comm,caller); */
-/* printk("task \"%s\" list: ",task->comm); printk_list(task->mm->mmap); printk("\n"); */
-/* printk("task \"%s\" tree: ",task->comm); printk_avl(task->mm->mmap_avl); printk("\n"); */
- avl_checkheights(task->mm->mmap_avl);
- avl_checkorder(task->mm->mmap_avl);
-}
-
-#endif
return;
}
-static inline int mprotect_fixup_all(struct vm_area_struct * vma,
+static inline int mprotect_fixup_all(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
int newflags, pgprot_t prot)
{
- spin_lock(&vma->vm_mm->page_table_lock);
+ struct vm_area_struct * prev = *pprev;
+ struct mm_struct * mm = vma->vm_mm;
+
+ if (prev && prev->vm_end == vma->vm_start && can_vma_merge(prev, newflags) &&
+ !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
+ spin_lock(&mm->page_table_lock);
+ prev->vm_end = vma->vm_end;
+ __vma_unlink(mm, vma, prev);
+ spin_unlock(&mm->page_table_lock);
+
+ kmem_cache_free(vm_area_cachep, vma);
+ mm->map_count--;
+
+ return 0;
+ }
+
+ spin_lock(&mm->page_table_lock);
vma->vm_flags = newflags;
vma->vm_page_prot = prot;
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(&mm->page_table_lock);
+
+ *pprev = vma;
+
return 0;
}
-static inline int mprotect_fixup_start(struct vm_area_struct * vma,
+static inline int mprotect_fixup_start(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
unsigned long end,
int newflags, pgprot_t prot)
{
- struct vm_area_struct * n;
+ struct vm_area_struct * n, * prev = *pprev;
+
+ *pprev = vma;
+
+ if (prev && prev->vm_end == vma->vm_start && can_vma_merge(prev, newflags) &&
+ !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
+ spin_lock(&vma->vm_mm->page_table_lock);
+ prev->vm_end = end;
+ vma->vm_start = end;
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return 0;
+ }
n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!n)
return -ENOMEM;
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
+ vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
lock_vma_mappings(vma);
spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = end;
__insert_vm_struct(current->mm, n);
spin_unlock(&vma->vm_mm->page_table_lock);
unlock_vma_mappings(vma);
+
return 0;
}
-static inline int mprotect_fixup_end(struct vm_area_struct * vma,
+static inline int mprotect_fixup_end(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
unsigned long start,
int newflags, pgprot_t prot)
{
__insert_vm_struct(current->mm, n);
spin_unlock(&vma->vm_mm->page_table_lock);
unlock_vma_mappings(vma);
+
+ *pprev = n;
+
return 0;
}
-static inline int mprotect_fixup_middle(struct vm_area_struct * vma,
+static inline int mprotect_fixup_middle(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
unsigned long start, unsigned long end,
int newflags, pgprot_t prot)
{
vma->vm_ops->open(left);
vma->vm_ops->open(right);
}
+ vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
+ vma->vm_raend = 0;
+ vma->vm_page_prot = prot;
lock_vma_mappings(vma);
spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = start;
vma->vm_end = end;
vma->vm_flags = newflags;
- vma->vm_raend = 0;
- vma->vm_page_prot = prot;
__insert_vm_struct(current->mm, left);
__insert_vm_struct(current->mm, right);
spin_unlock(&vma->vm_mm->page_table_lock);
unlock_vma_mappings(vma);
+
+ *pprev = right;
+
return 0;
}
-static int mprotect_fixup(struct vm_area_struct * vma,
+static int mprotect_fixup(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
unsigned long start, unsigned long end, unsigned int newflags)
{
pgprot_t newprot;
int error;
- if (newflags == vma->vm_flags)
+ if (newflags == vma->vm_flags) {
+ *pprev = vma;
return 0;
+ }
newprot = protection_map[newflags & 0xf];
if (start == vma->vm_start) {
if (end == vma->vm_end)
- error = mprotect_fixup_all(vma, newflags, newprot);
+ error = mprotect_fixup_all(vma, pprev, newflags, newprot);
else
- error = mprotect_fixup_start(vma, end, newflags, newprot);
+ error = mprotect_fixup_start(vma, pprev, end, newflags, newprot);
} else if (end == vma->vm_end)
- error = mprotect_fixup_end(vma, start, newflags, newprot);
+ error = mprotect_fixup_end(vma, pprev, start, newflags, newprot);
else
- error = mprotect_fixup_middle(vma, start, end, newflags, newprot);
+ error = mprotect_fixup_middle(vma, pprev, start, end, newflags, newprot);
if (error)
return error;
asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot)
{
unsigned long nstart, end, tmp;
- struct vm_area_struct * vma, * next;
+ struct vm_area_struct * vma, * next, * prev;
int error = -EINVAL;
if (start & ~PAGE_MASK)
if (end == start)
return 0;
- /* XXX: maybe this could be down_read ??? - Rik */
down_write(&current->mm->mmap_sem);
- vma = find_vma(current->mm, start);
+ vma = find_vma_prev(current->mm, start, &prev);
error = -EFAULT;
if (!vma || vma->vm_start > start)
goto out;
for (nstart = start ; ; ) {
unsigned int newflags;
+ int last = 0;
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
newflags = prot | (vma->vm_flags & ~(PROT_READ | PROT_WRITE | PROT_EXEC));
if ((newflags & ~(newflags >> 4)) & 0xf) {
error = -EACCES;
- break;
+ goto out;
}
- if (vma->vm_end >= end) {
- error = mprotect_fixup(vma, nstart, end, newflags);
- break;
+ if (vma->vm_end > end) {
+ error = mprotect_fixup(vma, &prev, nstart, end, newflags);
+ goto out;
}
+ if (vma->vm_end == end)
+ last = 1;
tmp = vma->vm_end;
next = vma->vm_next;
- error = mprotect_fixup(vma, nstart, tmp, newflags);
+ error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
if (error)
+ goto out;
+ if (last)
break;
nstart = tmp;
vma = next;
if (!vma || vma->vm_start != nstart) {
error = -EFAULT;
- break;
+ goto out;
}
}
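+ /*
+ * Finally, if the last fixed-up area is an anonymous private mapping
+ * and the VMA that follows it is adjacent and mergeable, collapse the
+ * two into one.
+ */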
+ if (next && prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags) &&
+ !prev->vm_file && !(prev->vm_flags & VM_SHARED)) {
+ spin_lock(&prev->vm_mm->page_table_lock);
+ prev->vm_end = next->vm_end;
+ __vma_unlink(prev->vm_mm, next, prev);
+ spin_unlock(&prev->vm_mm->page_table_lock);
+
+ kmem_cache_free(vm_area_cachep, next);
+ prev->vm_mm->map_count--;
+ }
out:
up_write(&current->mm->mmap_sem);
return error;
unsigned long addr, unsigned long old_len, unsigned long new_len,
unsigned long new_addr)
{
- struct vm_area_struct * new_vma;
+ struct mm_struct * mm = vma->vm_mm;
+ struct vm_area_struct * new_vma, * next, * prev;
+ int allocated_vma;
+
+ new_vma = NULL;
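+ /*
+ * Before allocating a new vm_area_struct, check whether the VMA
+ * preceding or following new_addr can simply be extended to cover
+ * the moved range.
+ */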
+ next = find_vma_prev(mm, new_addr, &prev);
+ if (next) {
+ if (prev && prev->vm_end == new_addr &&
+ can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
+ spin_lock(&mm->page_table_lock);
+ prev->vm_end = new_addr + new_len;
+ spin_unlock(&mm->page_table_lock);
+ new_vma = prev;
+ if (next != prev->vm_next)
+ BUG();
+ if (prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags)) {
+ spin_lock(&mm->page_table_lock);
+ prev->vm_end = next->vm_end;
+ __vma_unlink(mm, next, prev);
+ spin_unlock(&mm->page_table_lock);
+
+ mm->map_count--;
+ kmem_cache_free(vm_area_cachep, next);
+ }
+ } else if (next->vm_start == new_addr + new_len &&
+ can_vma_merge(next, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
+ spin_lock(&mm->page_table_lock);
+ next->vm_start = new_addr;
+ spin_unlock(&mm->page_table_lock);
+ new_vma = next;
+ }
+ } else {
+ prev = find_vma(mm, new_addr-1);
+ if (prev && prev->vm_end == new_addr &&
+ can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
+ spin_lock(&mm->page_table_lock);
+ prev->vm_end = new_addr + new_len;
+ spin_unlock(&mm->page_table_lock);
+ new_vma = prev;
+ }
+ }
- new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (new_vma) {
- if (!move_page_tables(current->mm, new_addr, addr, old_len)) {
+ allocated_vma = 0;
+ if (!new_vma) {
+ new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!new_vma)
+ goto out;
+ allocated_vma = 1;
+ }
+
+ if (!move_page_tables(current->mm, new_addr, addr, old_len)) {
+ if (allocated_vma) {
*new_vma = *vma;
new_vma->vm_start = new_addr;
new_vma->vm_end = new_addr+new_len;
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
insert_vm_struct(current->mm, new_vma);
- do_munmap(current->mm, addr, old_len);
- current->mm->total_vm += new_len >> PAGE_SHIFT;
- if (new_vma->vm_flags & VM_LOCKED) {
- current->mm->locked_vm += new_len >> PAGE_SHIFT;
- make_pages_present(new_vma->vm_start,
- new_vma->vm_end);
- }
- return new_addr;
}
- kmem_cache_free(vm_area_cachep, new_vma);
+ do_munmap(current->mm, addr, old_len);
+ current->mm->total_vm += new_len >> PAGE_SHIFT;
+ if (new_vma->vm_flags & VM_LOCKED) {
+ current->mm->locked_vm += new_len >> PAGE_SHIFT;
+ make_pages_present(new_vma->vm_start,
+ new_vma->vm_end);
+ }
+ return new_addr;
}
+ if (allocated_vma)
+ kmem_cache_free(vm_area_cachep, new_vma);
+ out:
return -ENOMEM;
}
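
move_vma() now probes both neighbours of the destination range before allocating anything: if the previous vma ends exactly at new_addr (and, once stretched, may be coalesced with the vma that follows), or the next vma starts exactly at new_addr + new_len, and the flags are compatible (anonymous, not VM_SHARED), the existing vma is simply grown under the page_table_lock; only otherwise is a fresh vm_area_struct allocated. The decision order can be summarised with this small self-contained sketch, whose names are invented:

    /* The three outcomes, in the order move_vma() now tries them.
     * Invented names, user-space only. */
    #include <stdio.h>

    struct range { unsigned long start, end; };

    enum place { EXTEND_PREV, EXTEND_NEXT, ALLOC_NEW };

    static enum place place_dest(const struct range *prev, const struct range *next,
                                  unsigned long new_addr, unsigned long new_len,
                                  int prev_ok, int next_ok)
    {
        /* 1) previous vma ends exactly where the new range begins */
        if (prev && prev->end == new_addr && prev_ok)
            return EXTEND_PREV;    /* prev->vm_end = new_addr + new_len */
        /* 2) next vma starts exactly where the new range ends */
        if (next && next->start == new_addr + new_len && next_ok)
            return EXTEND_NEXT;    /* next->vm_start = new_addr         */
        /* 3) otherwise allocate a fresh vma from the slab cache */
        return ALLOC_NEW;
    }

    int main(void)
    {
        struct range prev = { 0x1000, 0x4000 };
        struct range next = { 0x8000, 0x9000 };
        static const char *how[] = { "extend prev", "extend next", "alloc new" };

        printf("%s\n", how[place_dest(&prev, &next, 0x4000, 0x2000, 1, 1)]);
        printf("%s\n", how[place_dest(&prev, &next, 0x6000, 0x2000, 1, 1)]);
        return 0;
    }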
#endif /* !CONFIG_DISCONTIGMEM */
-struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order)
+struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order)
{
#ifdef CONFIG_NUMA
return __alloc_pages(gfp_mask, order, NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK));
memset(pgdat->valid_addr_bitmap, 0, size);
}
-static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,
- unsigned long order)
+static struct page * alloc_pages_pgdat(pg_data_t *pgdat, unsigned int gfp_mask,
+ unsigned int order)
{
return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK));
}
* This can be refined. Currently, tries to do round robin, instead
* should do concentric circle search, starting from current node.
*/
-struct page * _alloc_pages(unsigned int gfp_mask, unsigned long order)
+struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order)
{
struct page *ret = 0;
pg_data_t *start, *temp;
schedule();
return;
}
-
-/**
- * out_of_memory - is the system out of memory?
- *
- * Returns 0 if there is still enough memory left,
- * 1 when we are out of memory (otherwise).
- */
-int out_of_memory(void)
-{
- long cache_mem, limit;
-
- /* Enough free memory? Not OOM. */
- if (nr_free_pages() > freepages.min)
- return 0;
-
- if (nr_free_pages() + nr_inactive_clean_pages() > freepages.low)
- return 0;
-
- /*
- * If the buffer and page cache (excluding swap cache) are over
- * their (/proc tunable) minimum, we're still not OOM. We test
- * this to make sure we don't return OOM when the system simply
- * has a hard time with the cache.
- */
- cache_mem = atomic_read(&page_cache_size);
- cache_mem += atomic_read(&buffermem_pages);
- cache_mem -= swapper_space.nrpages;
- limit = (page_cache.min_percent + buffer_mem.min_percent);
- limit *= num_physpages / 100;
-
- if (cache_mem > limit)
- return 0;
-
- /* Enough swap space left? Not OOM. */
- if (nr_swap_pages > 0)
- return 0;
-
- /* Else... */
- return 1;
-}
int nr_swap_pages;
int nr_active_pages;
-int nr_inactive_dirty_pages;
+int nr_inactive_pages;
+struct list_head inactive_list;
+struct list_head active_list;
pg_data_t *pgdat_list;
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
-static int zone_balance_ratio[MAX_NR_ZONES] = { 32, 128, 128, };
-static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, };
-static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, };
+static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 32, 128, 128, };
+static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
+static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
-struct list_head active_list;
-struct list_head inactive_dirty_list;
/*
* Free_page() adds the page to the free lists. This is optimized for
* fast normal cases (no error jumps taken normally).
* Hint: -mask = 1+~mask
*/
-static void FASTCALL(__free_pages_ok (struct page *page, unsigned long order));
-static void __free_pages_ok (struct page *page, unsigned long order)
+static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
+static void __free_pages_ok (struct page *page, unsigned int order)
{
unsigned long index, page_idx, mask, flags;
free_area_t *area;
BUG();
if (PageActive(page))
BUG();
- if (PageInactiveDirty(page))
+ if (PageInactive(page))
BUG();
- if (PageInactiveClean(page))
+ if (PageDirty(page))
BUG();
- page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
- page->age = PAGE_AGE_START;
-
+ if (current->flags & PF_FREE_PAGES)
+ goto local_freelist;
+ back_local_freelist:
+
zone = page->zone;
mask = (~0UL) << order;
memlist_add_head(&(base + page_idx)->list, &area->free_list);
spin_unlock_irqrestore(&zone->lock, flags);
+ return;
+ local_freelist:
/*
- * We don't want to protect this variable from race conditions
- * since it's nothing important, but we do want to make sure
- * it never gets negative.
+ * This is a little subtle: if the requested allocation
+ * order is greater than zero we'd better keep all the pages
+ * local, since we must deal with fragmentation too and
+ * can't rely on the nr_local_pages information.
*/
- if (memory_pressure > NR_CPUS)
- memory_pressure--;
+ if (current->nr_local_pages && !current->allocation_order)
+ goto back_local_freelist;
+
+ list_add(&page->list, &current->local_pages);
+ page->index = order;
+ current->nr_local_pages++;
}
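
The second exit added to __free_pages_ok() is what makes allocation-time reclaim reliable on SMP: while a task runs the balancing path it carries PF_FREE_PAGES, and every page it frees is parked on current->local_pages (with the order remembered in page->index) instead of returning to the buddy lists, so the pages it just reclaimed cannot be grabbed by another CPU before balance_classzone() picks one. Below is a simplified, single-threaded model of that diversion; the task and page structs are invented stand-ins, not kernel definitions:

    /* Single-threaded, user-space model of the PF_FREE_PAGES diversion.
     * Invented types, not kernel code. */
    #include <stdio.h>

    #define PF_FREE_PAGES 0x1

    struct page { unsigned int order; struct page *next; };

    struct task {
        unsigned int flags;
        unsigned int allocation_order;
        unsigned int nr_local_pages;
        struct page *local_pages;      /* LIFO list, like current->local_pages */
    };

    static void free_page_sketch(struct task *t, struct page *p, unsigned int order)
    {
        if ((t->flags & PF_FREE_PAGES) &&
            /* for order > 0 keep everything local: fragmentation matters */
            (!t->nr_local_pages || t->allocation_order)) {
            p->order = order;          /* page->index = order        */
            p->next = t->local_pages;  /* list_add(): newest first   */
            t->local_pages = p;
            t->nr_local_pages++;
            return;
        }
        printf("order-%u page goes back to the buddy free lists\n", order);
    }

    int main(void)
    {
        struct task t = { PF_FREE_PAGES, 0, 0, NULL };
        struct page p1, p2;

        free_page_sketch(&t, &p1, 0);  /* kept on the local list            */
        free_page_sketch(&t, &p2, 0);  /* order 0 with one page already     */
                                       /* local: handed back to the buddy   */
        printf("pages kept locally: %u\n", t.nr_local_pages);
        return 0;
    }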
#define MARK_USED(index, order, area) \
return page;
}
-static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
-static struct page * rmqueue(zone_t *zone, unsigned long order)
+static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
+static struct page * rmqueue(zone_t *zone, unsigned int order)
{
free_area_t * area = zone->free_area + order;
- unsigned long curr_order = order;
+ unsigned int curr_order = order;
struct list_head *head, *curr;
unsigned long flags;
struct page *page;
index = page - zone->zone_mem_map;
if (curr_order != MAX_ORDER-1)
MARK_USED(index, curr_order, area);
- zone->free_pages -= 1 << order;
+ zone->free_pages -= 1UL << order;
page = expand(zone, page, index, order, curr_order, area);
spin_unlock_irqrestore(&zone->lock, flags);
set_page_count(page, 1);
if (BAD_RANGE(zone,page))
BUG();
- DEBUG_ADD_PAGE
+ DEBUG_LRU_PAGE(page);
return page;
}
curr_order++;
return NULL;
}
-#define PAGES_MIN 0
-#define PAGES_LOW 1
-#define PAGES_HIGH 2
+#ifndef CONFIG_DISCONTIGMEM
+struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
+{
+ return __alloc_pages(gfp_mask, order,
+ contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
+}
+#endif
-/*
- * This function does the dirty work for __alloc_pages
- * and is separated out to keep the code size smaller.
- * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
- */
-static struct page * __alloc_pages_limit(zonelist_t *zonelist,
- unsigned long order, int limit, int direct_reclaim)
+static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
+static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
{
- zone_t **zone = zonelist->zones;
+ struct page * page = NULL;
+ int __freed = 0;
- for (;;) {
- zone_t *z = *(zone++);
- unsigned long water_mark;
+ if (!(gfp_mask & __GFP_WAIT))
+ goto out;
+ if (in_interrupt())
+ BUG();
- if (!z)
- break;
- if (!z->size)
- BUG();
+ current->allocation_order = order;
+ current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
+
+ __freed = try_to_free_pages(classzone, gfp_mask, order);
+
+ current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
+
+ if (current->nr_local_pages) {
+ struct list_head * entry, * local_pages;
+ struct page * tmp;
+ int nr_pages;
+
+ local_pages = &current->local_pages;
+
+ if (__freed) {
+ /* pick from the last inserted so we're lifo */
+ entry = local_pages->next;
+ do {
+ tmp = list_entry(entry, struct page, list);
+ if (tmp->index == order && memclass(tmp->zone, classzone)) {
+ list_del(entry);
+ current->nr_local_pages--;
+ set_page_count(tmp, 1);
+ page = tmp;
+
+ if (page->buffers)
+ BUG();
+ if (page->mapping)
+ BUG();
+ if (!VALID_PAGE(page))
+ BUG();
+ if (PageSwapCache(page))
+ BUG();
+ if (PageLocked(page))
+ BUG();
+ if (PageDecrAfter(page))
+ BUG();
+ if (PageActive(page))
+ BUG();
+ if (PageInactive(page))
+ BUG();
+ if (PageDirty(page))
+ BUG();
- /*
- * We allocate if the number of free + inactive_clean
- * pages is above the watermark.
- */
- switch (limit) {
- default:
- case PAGES_MIN:
- water_mark = z->pages_min;
- break;
- case PAGES_LOW:
- water_mark = z->pages_low;
- break;
- case PAGES_HIGH:
- water_mark = z->pages_high;
+ break;
+ }
+ } while ((entry = entry->next) != local_pages);
}
- if (z->free_pages + z->inactive_clean_pages >= water_mark) {
- struct page *page = NULL;
- /* If possible, reclaim a page directly. */
- if (direct_reclaim)
- page = reclaim_page(z);
- /* If that fails, fall back to rmqueue. */
- if (!page)
- page = rmqueue(z, order);
- if (page)
- return page;
+ nr_pages = current->nr_local_pages;
+ /* free in reverse order so that the global order will be lifo */
+ while ((entry = local_pages->prev) != local_pages) {
+ list_del(entry);
+ tmp = list_entry(entry, struct page, list);
+ __free_pages_ok(tmp, tmp->index);
+ if (!nr_pages--)
+ BUG();
}
+ current->nr_local_pages = 0;
}
-
- /* Found nothing. */
- return NULL;
+ out:
+ *freed = __freed;
+ return page;
}
-#ifndef CONFIG_DISCONTIGMEM
-struct page *_alloc_pages(unsigned int gfp_mask, unsigned long order)
+static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
{
- return __alloc_pages(gfp_mask, order,
- contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
+ long free = zone->free_pages - (1UL << order);
+ return free >= 0 ? free : 0;
}
-#endif
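
zone_free_pages() is the helper every watermark comparison now goes through: it subtracts the 1 << order pages being requested before comparing against pages_low/pages_min/pages_high, so a high-order allocation is judged by what the zone would look like after it succeeded. With invented numbers:

    /* Order-aware watermark test, invented numbers. */
    #include <stdio.h>

    static unsigned long zone_free_after(long free_pages, unsigned int order)
    {
        long free = free_pages - (1UL << order);
        return free >= 0 ? free : 0;
    }

    int main(void)
    {
        long pages_low = 50;

        /* 60 free pages clear pages_low for a single page ... */
        printf("order 0: %s\n",
               zone_free_after(60, 0) > pages_low ? "allocate" : "fall through");
        /* ... but not for a 16-page (order-4) block.          */
        printf("order 4: %s\n",
               zone_free_after(60, 4) > pages_low ? "allocate" : "fall through");
        return 0;
    }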
/*
* This is the 'heart' of the zoned buddy allocator:
*/
-struct page * __alloc_pages(unsigned int gfp_mask, unsigned long order, zonelist_t *zonelist)
+struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
{
- zone_t **zone;
- int direct_reclaim = 0;
+ zone_t **zone, * classzone;
struct page * page;
+ int freed;
- /*
- * Allocations put pressure on the VM subsystem.
- */
- memory_pressure++;
+ zone = zonelist->zones;
+ classzone = *zone;
+ for (;;) {
+ zone_t *z = *(zone++);
+ if (!z)
+ break;
- /*
- * (If anyone calls gfp from interrupts nonatomically then it
- * will sooner or later tripped up by a schedule().)
- *
- * We are falling back to lower-level zones if allocation
- * in a higher zone fails.
- */
+ if (zone_free_pages(z, order) > z->pages_low) {
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ }
- /*
- * Can we take pages directly from the inactive_clean
- * list?
- */
- if (order == 0 && (gfp_mask & __GFP_WAIT))
- direct_reclaim = 1;
+ classzone->need_balance = 1;
+ mb();
+ if (waitqueue_active(&kswapd_wait))
+ wake_up_interruptible(&kswapd_wait);
-try_again:
- /*
- * First, see if we have any zones with lots of free memory.
- *
- * We allocate free memory first because it doesn't contain
- * any data ... DUH!
- */
zone = zonelist->zones;
for (;;) {
zone_t *z = *(zone++);
if (!z)
break;
- if (!z->size)
- BUG();
- if (z->free_pages >= z->pages_low) {
+ if (zone_free_pages(z, order) > (gfp_mask & __GFP_HIGH ? z->pages_min / 2 : z->pages_min)) {
page = rmqueue(z, order);
if (page)
return page;
- } else if (z->free_pages < z->pages_min &&
- waitqueue_active(&kreclaimd_wait)) {
- wake_up_interruptible(&kreclaimd_wait);
}
}
- /*
- * Try to allocate a page from a zone with a HIGH
- * amount of free + inactive_clean pages.
- *
- * If there is a lot of activity, inactive_target
- * will be high and we'll have a good chance of
- * finding a page using the HIGH limit.
- */
- page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
- if (page)
- return page;
+ /* here we're in the low-on-memory slow path */
- /*
- * Then try to allocate a page from a zone with more
- * than zone->pages_low free + inactive_clean pages.
- *
- * When the working set is very large and VM activity
- * is low, we're most likely to have our allocation
- * succeed here.
- */
- page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
- if (page)
- return page;
+ if (current->flags & PF_MEMALLOC) {
+ zone = zonelist->zones;
+ for (;;) {
+ zone_t *z = *(zone++);
+ if (!z)
+ break;
- /*
- * OK, none of the zones on our zonelist has lots
- * of pages free.
- *
- * We wake up kswapd, in the hope that kswapd will
- * resolve this situation before memory gets tight.
- *
- * We also yield the CPU, because that:
- * - gives kswapd a chance to do something
- * - slows down allocations, in particular the
- * allocations from the fast allocator that's
- * causing the problems ...
- * - ... which minimises the impact the "bad guys"
- * have on the rest of the system
- * - if we don't have __GFP_IO set, kswapd may be
- * able to free some memory we can't free ourselves
- */
- wakeup_kswapd();
- if (gfp_mask & __GFP_WAIT) {
- __set_current_state(TASK_RUNNING);
- current->policy |= SCHED_YIELD;
- schedule();
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ return NULL;
}
- /*
- * After waking up kswapd, we try to allocate a page
- * from any zone which isn't critical yet.
- *
- * Kswapd should, in most situations, bring the situation
- * back to normal in no time.
- */
- page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
+ page = balance_classzone(classzone, gfp_mask, order, &freed);
if (page)
return page;
- /*
- * Damn, we didn't succeed.
- *
- * This can be due to 2 reasons:
- * - we're doing a higher-order allocation
- * --> move pages to the free list until we succeed
- * - we're /really/ tight on memory
- * --> try to free pages ourselves with page_launder
- */
- if (!(current->flags & PF_MEMALLOC)) {
- /*
- * Are we dealing with a higher order allocation?
- *
- * Move pages from the inactive_clean to the free list
- * in the hope of creating a large, physically contiguous
- * piece of free memory.
- */
- if (order > 0 && (gfp_mask & __GFP_WAIT)) {
- zone = zonelist->zones;
- /* First, clean some dirty pages. */
- current->flags |= PF_MEMALLOC;
- page_launder(gfp_mask, 1);
- current->flags &= ~PF_MEMALLOC;
- for (;;) {
- zone_t *z = *(zone++);
- if (!z)
- break;
- if (!z->size)
- continue;
- while (z->inactive_clean_pages) {
- struct page * page;
- /* Move one page to the free list. */
- page = reclaim_page(z);
- if (!page)
- break;
- __free_page(page);
- /* Try if the allocation succeeds. */
- page = rmqueue(z, order);
- if (page)
- return page;
- }
- }
- }
- /*
- * When we arrive here, we are really tight on memory.
- * Since kswapd didn't succeed in freeing pages for us,
- * we try to help it.
- *
- * Single page allocs loop until the allocation succeeds.
- * Multi-page allocs can fail due to memory fragmentation;
- * in that case we bail out to prevent infinite loops and
- * hanging device drivers ...
- *
- * Another issue are GFP_NOFS allocations; because they
- * do not have __GFP_FS set it's possible we cannot make
- * any progress freeing pages, in that case it's better
- * to give up than to deadlock the kernel looping here.
- */
- if (gfp_mask & __GFP_WAIT) {
- if (!order || free_shortage()) {
- int progress = try_to_free_pages(gfp_mask);
- if (progress || (gfp_mask & __GFP_FS))
- goto try_again;
- /*
- * Fail in case no progress was made and the
- * allocation may not be able to block on IO.
- */
- return NULL;
- }
- }
- }
-
- /*
- * Final phase: allocate anything we can!
- *
- * Higher order allocations, GFP_ATOMIC allocations and
- * recursive allocations (PF_MEMALLOC) end up here.
- *
- * Only recursive allocations can use the very last pages
- * in the system, otherwise it would be just too easy to
- * deadlock the system...
- */
zone = zonelist->zones;
- for (;;) {
- zone_t *z = *(zone++);
- struct page * page = NULL;
- if (!z)
- break;
- if (!z->size)
- BUG();
+ if (__builtin_expect(freed, 1)) {
+ for (;;) {
+ zone_t *z = *(zone++);
+ if (!z)
+ break;
- /*
- * SUBTLE: direct_reclaim is only possible if the task
- * becomes PF_MEMALLOC while looping above. This will
- * happen when the OOM killer selects this task for
- * instant execution...
- */
- if (direct_reclaim) {
- page = reclaim_page(z);
- if (page)
- return page;
+ if (zone_free_pages(z, order) > (gfp_mask & __GFP_HIGH ? z->pages_min / 2 : z->pages_min)) {
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
}
+ } else {
+ for (;;) {
+ zone_t *z = *(zone++);
+ if (!z)
+ break;
- /* XXX: is pages_min/4 a good amount to reserve for this? */
- if (z->free_pages < z->pages_min / 4 &&
- !(current->flags & PF_MEMALLOC))
- continue;
- page = rmqueue(z, order);
- if (page)
- return page;
+ if (zone_free_pages(z, order) > z->pages_high) {
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ }
}
- /* No luck.. */
- printk(KERN_ERR "__alloc_pages: %lu-order allocation failed (gfp=0x%x/%i).\n",
- order, gfp_mask, !!(current->flags & PF_MEMALLOC));
+ printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i) from %p\n",
+ order, gfp_mask, !!(current->flags & PF_MEMALLOC), __builtin_return_address(0));
return NULL;
}
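
Seen from a distance, the rewritten __alloc_pages() is a cascade of zonelist walks with a progressively lower bar: pages_low on the fast path; after marking the classzone and waking kswapd, pages_min (halved for __GFP_HIGH); any page at all for PF_MEMALLOC callers; and after balance_classzone(), pages_min again if pages were freed or pages_high if not, before the final printk. The cascade is condensed into the following user-space sketch; the zone values and helper names are invented:

    /* Condensed sketch of the watermark cascade.  Invented values and names. */
    #include <stdio.h>

    enum bar { BAR_LOW, BAR_MIN, BAR_HIGH };

    struct zone_sk { const char *name; long free, min, low, high; };

    static int try_zones(struct zone_sk *z, int n, unsigned int order,
                         enum bar bar, int gfp_high)
    {
        int i;

        for (i = 0; i < n; i++) {
            long limit = bar == BAR_LOW  ? z[i].low :
                         bar == BAR_HIGH ? z[i].high :
                         gfp_high ? z[i].min / 2 : z[i].min;
            if (z[i].free - (1L << order) > limit) {
                printf("allocated from %s\n", z[i].name);
                return 1;
            }
        }
        return 0;
    }

    int main(void)
    {
        struct zone_sk zones[] = {
            { "HighMem", 30, 20, 40, 60 },
            { "Normal",  35, 20, 40, 60 },
        };
        unsigned int order = 0;

        if (try_zones(zones, 2, order, BAR_LOW, 0))  /* fast path */
            return 0;
        /* the real code marks classzone->need_balance and wakes kswapd here */
        if (try_zones(zones, 2, order, BAR_MIN, 0))  /* slow path */
            return 0;
        /* then: PF_MEMALLOC callers take any page they can get; everyone
         * else calls balance_classzone() and retries against MIN or HIGH */
        printf("would call balance_classzone() and retry\n");
        return 1;
    }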
/*
* Common helper functions.
*/
-unsigned long __get_free_pages(int gfp_mask, unsigned long order)
+unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
{
struct page * page;
return (unsigned long) page_address(page);
}
-unsigned long get_zeroed_page(int gfp_mask)
+unsigned long get_zeroed_page(unsigned int gfp_mask)
{
struct page * page;
return 0;
}
-void __free_pages(struct page *page, unsigned long order)
+void __free_pages(struct page *page, unsigned int order)
{
if (!PageReserved(page) && put_page_testzero(page))
__free_pages_ok(page, order);
}
-void free_pages(unsigned long addr, unsigned long order)
+void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0)
__free_pages(virt_to_page(addr), order);
return sum;
}
-/*
- * Total amount of inactive_clean (allocatable) RAM:
- */
-unsigned int nr_inactive_clean_pages (void)
-{
- unsigned int sum;
- zone_t *zone;
- pg_data_t *pgdat = pgdat_list;
-
- sum = 0;
- while (pgdat) {
- for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
- sum += zone->inactive_clean_pages;
- pgdat = pgdat->node_next;
- }
- return sum;
-}
-
/*
* Amount of free RAM allocatable as buffer memory:
*/
unsigned int nr_free_buffer_pages (void)
{
+ pg_data_t *pgdat = pgdat_list;
unsigned int sum = 0;
zonelist_t *zonelist;
zone_t **zonep, *zone;
- zonelist = contig_page_data.node_zonelists + (GFP_NOFS & GFP_ZONEMASK);
- zonep = zonelist->zones;
+ do {
+ zonelist = pgdat->node_zonelists + __GFP_HIGHMEM;
+ zonep = zonelist->zones;
- for (zone = *zonep++; zone; zone = *zonep++) {
- unsigned int pages = zone->free_pages +
- zone->inactive_clean_pages +
- zone->inactive_dirty_pages;
+ for (zone = *zonep++; zone; zone = *zonep++)
+ sum += zone->free_pages;
- /* Allow the buffer cache to fill up at least "pages_high" pages */
- if (pages < zone->pages_high)
- pages = zone->pages_high;
- sum += pages;
- }
+ pgdat = pgdat->node_next;
+ } while (pgdat);
- return sum;
+ return sum + nr_active_pages + nr_inactive_pages;
}
#if CONFIG_HIGHMEM
*/
void show_free_areas_core(pg_data_t *pgdat)
{
- unsigned long order;
+ unsigned int order;
unsigned type;
printk("Free pages: %6dkB (%6dkB HighMem)\n",
nr_free_pages() << (PAGE_SHIFT-10),
nr_free_highpages() << (PAGE_SHIFT-10));
- printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n",
- nr_active_pages,
- nr_inactive_dirty_pages,
- nr_inactive_clean_pages(),
- nr_free_pages(),
- freepages.min,
- freepages.low,
- freepages.high);
+ printk("( Active: %d, inactive: %d, free: %d )\n",
+ nr_active_pages,
+ nr_inactive_pages,
+ nr_free_pages());
for (type = 0; type < MAX_NR_ZONES; type++) {
struct list_head *head, *curr;
printk("On node %d totalpages: %lu\n", nid, realtotalpages);
- memlist_init(&active_list);
- memlist_init(&inactive_dirty_list);
+ INIT_LIST_HEAD(&active_list);
+ INIT_LIST_HEAD(&inactive_list);
/*
* Some architectures (with lots of mem and discontiguous memory
pgdat->node_size = totalpages;
pgdat->node_start_paddr = zone_start_paddr;
pgdat->node_start_mapnr = (lmem_map - mem_map);
+ pgdat->nr_zones = 0;
/*
* Initially all pages are reserved - free ones are freed
zone->lock = SPIN_LOCK_UNLOCKED;
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
- zone->inactive_clean_pages = 0;
- zone->inactive_dirty_pages = 0;
- memlist_init(&zone->inactive_clean_list);
if (!size)
continue;
+ pgdat->nr_zones = j+1;
+
mask = (realsize / zone_balance_ratio[j]);
if (mask < zone_balance_min[j])
mask = zone_balance_min[j];
zone->pages_min = mask;
zone->pages_low = mask*2;
zone->pages_high = mask*3;
- /*
- * Add these free targets to the global free target;
- * we have to be SURE that freepages.high is higher
- * than SUM [zone->pages_min] for all zones, otherwise
- * we may have bad bad problems.
- *
- * This means we cannot make the freepages array writable
- * in /proc, but have to add a separate extra_free_target
- * for people who require it to catch load spikes in eg.
- * gigabit ethernet routing...
- */
- freepages.min += mask;
- freepages.low += mask*2;
- freepages.high += mask*3;
+
zone->zone_mem_map = mem_map + offset;
zone->zone_start_mapnr = offset;
zone->zone_start_paddr = zone_start_paddr;
swap_free(*entry);
*entry = (swp_entry_t) {0};
delete_from_swap_cache_nolock(page);
- flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1));
+ flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1);
page->flags = flags | (1 << PG_dirty);
add_to_page_cache_locked(page, mapping, idx);
info->swapped--;
break;
slabp = list_entry(cachep->slabs_free.prev, slab_t, list);
+#if DEBUG
if (slabp->inuse)
BUG();
+#endif
list_del(&slabp->list);
spin_unlock_irq(&cachep->spinlock);
static inline void kmem_cache_alloc_head(kmem_cache_t *cachep, int flags)
{
-#if DEBUG
if (flags & SLAB_DMA) {
if (!(cachep->gfpflags & GFP_DMA))
BUG();
if (cachep->gfpflags & GFP_DMA)
BUG();
}
-#endif
}
static inline void * kmem_cache_alloc_one_tail (kmem_cache_t *cachep,
- slab_t *slabp, int partial)
+ slab_t *slabp)
{
void *objp;
objp = slabp->s_mem + slabp->free*cachep->objsize;
slabp->free=slab_bufctl(slabp)[slabp->free];
- if (slabp->free == BUFCTL_END) {
+ if (__builtin_expect(slabp->free == BUFCTL_END, 0)) {
list_del(&slabp->list);
list_add(&slabp->list, &cachep->slabs_full);
- } else {
- if (!partial) {
- list_del(&slabp->list);
- list_add(&slabp->list, &cachep->slabs_partial);
- }
}
#if DEBUG
if (cachep->flags & SLAB_POISON)
*/
#define kmem_cache_alloc_one(cachep) \
({ \
- slab_t *slabp; \
- struct list_head * slab_freelist; \
- int partial = 1; \
+ struct list_head * slabs_partial, * entry; \
+ slab_t *slabp; \
\
- slab_freelist = &(cachep)->slabs_partial; \
- if (list_empty(slab_freelist)) { \
- partial = 0; \
- slab_freelist = &(cachep)->slabs_free; \
- if (list_empty(slab_freelist)) \
+ slabs_partial = &(cachep)->slabs_partial; \
+ entry = slabs_partial->next; \
+ if (__builtin_expect(entry == slabs_partial, 0)) { \
+ struct list_head * slabs_free; \
+ slabs_free = &(cachep)->slabs_free; \
+ entry = slabs_free->next; \
+ if (__builtin_expect(entry == slabs_free, 0)) \
goto alloc_new_slab; \
+ list_del(entry); \
+ list_add(entry, slabs_partial); \
} \
\
- slabp = list_entry(slab_freelist->next, slab_t, list); \
- kmem_cache_alloc_one_tail(cachep, slabp, partial); \
+ slabp = list_entry(entry, slab_t, list); \
+ kmem_cache_alloc_one_tail(cachep, slabp); \
})
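
The slab allocation fast path no longer carries a "partial" flag around: it always looks at slabs_partial first, and only when that list is empty does it pull a slab off slabs_free and immediately move it onto slabs_partial, so kmem_cache_alloc_one_tail() is left with a single list transition to handle (partial to full). A condensed model of that pick, using invented list and slab types:

    /* Partial-first slab pick, condensed.  Invented types, user space only. */
    #include <stdio.h>
    #include <stddef.h>

    struct slab_sk { int inuse, num; struct slab_sk *next; };
    struct cache_sk { struct slab_sk *partial, *free, *full; };

    static struct slab_sk *pick_slab(struct cache_sk *c)
    {
        struct slab_sk *s = c->partial;

        if (!s) {                      /* slabs_partial empty ...           */
            s = c->free;
            if (!s)
                return NULL;           /* "goto alloc_new_slab"             */
            c->free = s->next;         /* ... so promote a free slab        */
            s->next = c->partial;      /* straight onto slabs_partial       */
            c->partial = s;
        }
        return s;
    }

    static void alloc_obj(struct cache_sk *c)
    {
        struct slab_sk *s = pick_slab(c);

        if (!s) {
            printf("grow the cache\n");
            return;
        }
        if (++s->inuse == s->num) {    /* became full: the only move left,  */
            c->partial = s->next;      /* done by the tail helper upstream  */
            s->next = c->full;
            c->full = s;
        }
        printf("object from slab with %d/%d objects in use\n", s->inuse, s->num);
    }

    int main(void)
    {
        struct slab_sk s = { 0, 2, NULL };
        struct cache_sk c = { NULL, &s, NULL };

        alloc_obj(&c);   /* free -> partial, 1/2 used */
        alloc_obj(&c);   /* partial -> full, 2/2 used */
        alloc_obj(&c);   /* both lists empty: grow    */
        return 0;
    }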
#ifdef CONFIG_SMP
{
int batchcount = cachep->batchcount;
cpucache_t* cc = cc_data(cachep);
- struct list_head * slab_freelist;
- int partial;
- slab_t *slabp;
spin_lock(&cachep->spinlock);
while (batchcount--) {
+ struct list_head * slabs_partial, * entry;
+ slab_t *slabp;
/* Get slab alloc is to come from. */
- slab_freelist = &(cachep)->slabs_partial;
- partial = 1;
- if (list_empty(slab_freelist)) {
- partial = 0;
- slab_freelist = &(cachep)->slabs_free;
- if (list_empty(slab_freelist))
+ slabs_partial = &(cachep)->slabs_partial;
+ entry = slabs_partial->next;
+ if (__builtin_expect(entry == slabs_partial, 0)) {
+ struct list_head * slabs_free;
+ slabs_free = &(cachep)->slabs_free;
+ entry = slabs_free->next;
+ if (__builtin_expect(entry == slabs_free, 0))
break;
+ list_del(entry);
+ list_add(entry, slabs_partial);
}
- slabp = list_entry(slab_freelist->next, slab_t, list);
+ slabp = list_entry(entry, slab_t, list);
cc_entry(cc)[cc->avail++] =
- kmem_cache_alloc_one_tail(cachep, slabp, partial);
+ kmem_cache_alloc_one_tail(cachep, slabp);
}
spin_unlock(&cachep->spinlock);
STATS_DEC_ACTIVE(cachep);
/* fixup slab chains */
- if (!--slabp->inuse)
- goto moveslab_free;
- if (slabp->inuse + 1 == cachep->num)
- goto moveslab_partial;
- return;
-
-moveslab_partial:
- /* Was full. */
- list_del(&slabp->list);
- list_add(&slabp->list, &cachep->slabs_partial);
- return;
-
-moveslab_free:
- /* Was partial, now empty. */
- list_del(&slabp->list);
- list_add(&slabp->list, &cachep->slabs_free);
- return;
+ {
+ int inuse = slabp->inuse;
+ if (__builtin_expect(!--slabp->inuse, 0)) {
+ /* Was partial or full, now empty. */
+ list_del(&slabp->list);
+ list_add(&slabp->list, &cachep->slabs_free);
+ } else if (__builtin_expect(inuse == cachep->num, 0)) {
+ /* Was full. */
+ list_del(&slabp->list);
+ list_add(&slabp->list, &cachep->slabs_partial);
+ }
+ }
}
#ifdef CONFIG_SMP
*
* Called from do_try_to_free_pages() and __alloc_pages()
*/
-void kmem_cache_reap (int gfp_mask)
+int kmem_cache_reap (int gfp_mask)
{
slab_t *slabp;
kmem_cache_t *searchp;
unsigned int best_pages;
unsigned int best_len;
unsigned int scan;
+ int ret = 0;
if (gfp_mask & __GFP_WAIT)
down(&cache_chain_sem);
else
if (down_trylock(&cache_chain_sem))
- return;
+ return 0;
scan = REAP_SCANLEN;
best_len = 0;
p = searchp->slabs_free.next;
while (p != &searchp->slabs_free) {
slabp = list_entry(p, slab_t, list);
+#if DEBUG
if (slabp->inuse)
BUG();
+#endif
full_free++;
p = p->next;
}
if (p == &best_cachep->slabs_free)
break;
slabp = list_entry(p,slab_t,list);
+#if DEBUG
if (slabp->inuse)
BUG();
+#endif
list_del(&slabp->list);
STATS_INC_REAPED(best_cachep);
spin_lock_irq(&best_cachep->spinlock);
}
spin_unlock_irq(&best_cachep->spinlock);
+ ret = scan * (1 << best_cachep->gfporder);
out:
up(&cache_chain_sem);
- return;
+ return ret;
}
#ifdef CONFIG_PROC_FS
#include <asm/uaccess.h> /* for copy_to/from_user */
#include <asm/pgtable.h>
-/*
- * We identify three levels of free memory. We never let free mem
- * fall below the freepages.min except for atomic allocations. We
- * start background swapping if we fall below freepages.high free
- * pages, and we begin intensive swapping below freepages.low.
- *
- * Actual initialization is done in mm/page_alloc.c
- */
-freepages_t freepages = {
- 0, /* freepages.min */
- 0, /* freepages.low */
- 0 /* freepages.high */
-};
-
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
-/*
- * This variable contains the amount of page steals the system
- * is doing, averaged over a minute. We use this to determine how
- * many inactive pages we should have.
- *
- * In reclaim_page and __alloc_pages: memory_pressure++
- * In __free_pages_ok: memory_pressure--
- * In recalculate_vm_stats the value is decayed (once a second)
- */
-int memory_pressure;
-
/* We track the number of pages currently being asynchronously swapped
out, so that we don't try to swap TOO many pages out at once */
atomic_t nr_async_pages = ATOMIC_INIT(0);
-buffer_mem_t buffer_mem = {
- 2, /* minimum percent buffer */
- 10, /* borrow percent buffer */
- 60 /* maximum percent buffer */
-};
-
-buffer_mem_t page_cache = {
- 2, /* minimum percent page cache */
- 15, /* borrow percent page cache */
- 75 /* maximum */
-};
-
pager_daemon_t pager_daemon = {
512, /* base number for calculating the number of tries */
SWAP_CLUSTER_MAX, /* minimum number of tries */
*/
void deactivate_page_nolock(struct page * page)
{
- /*
- * One for the cache, one for the extra reference the
- * caller has and (maybe) one for the buffers.
- *
- * This isn't perfect, but works for just about everything.
- * Besides, as long as we don't move unfreeable pages to the
- * inactive_clean list it doesn't need to be perfect...
- */
- int maxcount = (page->buffers ? 3 : 2);
- page->age = 0;
- ClearPageReferenced(page);
-
- /*
- * Don't touch it if it's not on the active list.
- * (some pages aren't on any list at all)
- */
- if (PageActive(page) && page_count(page) <= maxcount && !page_ramdisk(page)) {
+ if (PageActive(page)) {
del_page_from_active_list(page);
- add_page_to_inactive_dirty_list(page);
+ add_page_to_inactive_list(page);
}
}
*/
void activate_page_nolock(struct page * page)
{
- if (PageInactiveDirty(page)) {
- del_page_from_inactive_dirty_list(page);
+ if (PageInactive(page)) {
+ del_page_from_inactive_list(page);
add_page_to_active_list(page);
- } else if (PageInactiveClean(page)) {
- del_page_from_inactive_clean_list(page);
- add_page_to_active_list(page);
- } else {
- /*
- * The page was not on any list, so we take care
- * not to do anything.
- */
}
-
- /* Make sure the page gets a fair chance at staying active. */
- if (page->age < PAGE_AGE_START)
- page->age = PAGE_AGE_START;
}
void activate_page(struct page * page)
*/
void lru_cache_add(struct page * page)
{
- spin_lock(&pagemap_lru_lock);
if (!PageLocked(page))
BUG();
- add_page_to_inactive_dirty_list(page);
- page->age = 0;
+ spin_lock(&pagemap_lru_lock);
+ add_page_to_inactive_list(page);
spin_unlock(&pagemap_lru_lock);
}
{
if (PageActive(page)) {
del_page_from_active_list(page);
- } else if (PageInactiveDirty(page)) {
- del_page_from_inactive_dirty_list(page);
- } else if (PageInactiveClean(page)) {
- del_page_from_inactive_clean_list(page);
- } else {
+ } else if (PageInactive(page)) {
+ del_page_from_inactive_list(page);
+ } else
printk("VM: __lru_cache_del, found unknown page ?!\n");
- }
- DEBUG_ADD_PAGE
+ DEBUG_LRU_PAGE(page);
}
/**
spin_unlock(&pagemap_lru_lock);
}
-/**
- * recalculate_vm_stats - recalculate VM statistics
- *
- * This function should be called once a second to recalculate
- * some useful statistics the VM subsystem uses to determine
- * its behaviour.
- */
-void recalculate_vm_stats(void)
-{
- /*
- * Substract one second worth of memory_pressure from
- * memory_pressure.
- */
- memory_pressure -= (memory_pressure >> INACTIVE_SHIFT);
-}
-
/*
* Perform any setup for the swap system
*/
*/
static int swap_writepage(struct page *page)
{
- /* One for the page cache, one for this user, one for page->buffers */
- if (page_count(page) > 2 + !!page->buffers)
- goto in_use;
- if (swap_count(page) > 1)
- goto in_use;
-
- delete_from_swap_cache_nolock(page);
- UnlockPage(page);
- return 0;
-
-in_use:
rw_swap_page(WRITE, page);
return 0;
}
BUG();
/* clear PG_dirty so a subsequent set_page_dirty takes effect */
- flags = page->flags & ~((1 << PG_error) | (1 << PG_dirty) | (1 << PG_arch_1));
+ flags = page->flags & ~(1 << PG_error | 1 << PG_dirty | 1 << PG_arch_1 | 1 << PG_referenced);
page->flags = flags | (1 << PG_uptodate);
- page->age = PAGE_AGE_START;
add_to_page_cache_locked(page, &swapper_space, entry.val);
}
struct swap_info_struct swap_info[MAX_SWAPFILES];
-/*
- * When swap space gets filled up, we will set this flag.
- * This will make do_swap_page(), in the page fault path,
- * free swap entries on swapin so we'll reclaim swap space
- * in order to be able to swap something out.
- *
- * At the moment we start reclaiming when swap usage goes
- * over 80% of swap space.
- *
- * XXX: Random numbers, fixme.
- */
-#define SWAP_FULL_PCT 80
-int vm_swap_full (void)
-{
- int swap_used = total_swap_pages - nr_swap_pages;
-
- return swap_used * 100 > total_swap_pages * SWAP_FULL_PCT;
-}
-
#define SWAPFILE_CLUSTER 256
static inline int scan_swap_map(struct swap_info_struct *si, unsigned short count)
lock_page(page);
if (PageSwapCache(page))
delete_from_swap_cache_nolock(page);
- SetPageDirty(page);
UnlockPage(page);
flush_page_to_ram(page);
mmput(start_mm);
start_mm = new_start_mm;
}
+ ClearPageDirty(page);
page_cache_release(page);
/*
int ret;
dir = pgd_offset_k(address);
- flush_cache_all();
spin_lock(&init_mm.page_table_lock);
do {
pmd_t *pmd;
ret = 0;
} while (address && (address < end));
spin_unlock(&init_mm.page_table_lock);
- flush_tlb_all();
return ret;
}
*/
#define DEF_PRIORITY (6)
-static inline void age_page_up(struct page *page)
-{
- unsigned age = page->age + PAGE_AGE_ADV;
- if (age > PAGE_AGE_MAX)
- age = PAGE_AGE_MAX;
- page->age = age;
-}
-
-static inline void age_page_down(struct page * page)
-{
- page->age /= 2;
-}
-
/*
* The swap-out function returns 1 if it successfully
* scanned all the pages it was asked to (`count').
* doesn't count as having freed a page.
*/
-/*
- * Estimate whether a zone has enough inactive or free pages..
- */
-static unsigned int zone_inactive_plenty(zone_t *zone)
-{
- unsigned int inactive;
-
- if (!zone->size)
- return 0;
-
- inactive = zone->inactive_dirty_pages;
- inactive += zone->inactive_clean_pages;
- inactive += zone->free_pages;
-
- return (inactive > (zone->size / 3));
-}
-
-static unsigned int zone_free_plenty(zone_t *zone)
-{
- unsigned int free;
-
- free = zone->free_pages;
- free += zone->inactive_clean_pages;
-
- return free > zone->pages_high*2;
-}
-
/* mm->page_table_lock is held. mmap_sem is not held */
-static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
+static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
{
pte_t pte;
swp_entry_t entry;
- /*
- * If we are doing a zone-specific scan, do not
- * touch pages from zones which don't have a
- * shortage.
- */
- if (zone_inactive_plenty(page->zone))
- return;
-
/* Don't look at this pte if it's been accessed recently. */
if (ptep_test_and_clear_young(page_table)) {
- mark_page_accessed(page);
- return;
+ flush_tlb_page(vma, address);
+ SetPageReferenced(page);
+ return 0;
}
+ if (!memclass(page->zone, classzone))
+ return 0;
+
if (TryLockPage(page))
- return;
+ return 0;
/* From this point on, the odds are that we're going to
* nuke this pte, so read and clear the pte. This hook
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
mm->rss--;
- if (!PageReferenced(page))
- deactivate_page(page);
UnlockPage(page);
- page_cache_release(page);
- return;
+ {
+ int freeable = page_count(page) - !!page->buffers <= 2;
+ if (freeable)
+ deactivate_page(page);
+ page_cache_release(page);
+ return freeable;
+ }
}
/*
out_unlock_restore:
set_pte(page_table, pte);
UnlockPage(page);
- return;
+ return 0;
}
/* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
+static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
pte_t * pte;
unsigned long pmd_end;
struct page *page = pte_page(*pte);
if (VALID_PAGE(page) && !PageReserved(page)) {
- try_to_swap_out(mm, vma, address, pte, page);
- if (!--count)
+ count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
+ if (!count) {
+ address += PAGE_SIZE;
break;
+ }
}
}
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
- mm->swap_address = address + PAGE_SIZE;
+ mm->swap_address = address;
return count;
}
/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
+static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
pmd_t * pmd;
unsigned long pgd_end;
end = pgd_end;
do {
- count = swap_out_pmd(mm, vma, pmd, address, end, count);
+ count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
if (!count)
break;
address = (address + PMD_SIZE) & PMD_MASK;
}
/* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
+static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
{
pgd_t *pgdir;
unsigned long end;
if (address >= end)
BUG();
do {
- count = swap_out_pgd(mm, vma, pgdir, address, end, count);
+ count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
if (!count)
break;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
return count;
}
+/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
+struct mm_struct *swap_mm = &init_mm;
+
/*
* Returns non-zero if we scanned all `count' pages
*/
-static int swap_out_mm(struct mm_struct * mm, int count)
+static inline int swap_out_mm(struct mm_struct * mm, int count, int * race, zone_t * classzone)
{
unsigned long address;
struct vm_area_struct* vma;
- if (!count)
- return 1;
- /*
- * Go through process' page directory.
- */
-
/*
* Find the proper vm-area after freezing the vma chain
* and ptes.
*/
spin_lock(&mm->page_table_lock);
+ *race = 1;
+ if (swap_mm != mm)
+ goto out_unlock;
+ *race = 0;
address = mm->swap_address;
vma = find_vma(mm, address);
if (vma) {
address = vma->vm_start;
for (;;) {
- count = swap_out_vma(mm, vma, address, count);
+ count = swap_out_vma(mm, vma, address, count, classzone);
if (!count)
goto out_unlock;
vma = vma->vm_next;
/* Reset to 0 when we reach the end of address space */
mm->swap_address = 0;
+ spin_lock(&mmlist_lock);
+ swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
+ spin_unlock(&mmlist_lock);
+
out_unlock:
spin_unlock(&mm->page_table_lock);
- return !count;
-}
-
-#define SWAP_MM_SHIFT 4
-#define SWAP_SHIFT 5
-#define SWAP_MIN 8
-static inline int swap_amount(struct mm_struct *mm)
-{
- int nr = mm->rss >> SWAP_SHIFT;
- if (nr < SWAP_MIN) {
- nr = SWAP_MIN;
- if (nr > mm->rss)
- nr = mm->rss;
- }
- return nr;
+ return count;
}
-/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
-struct mm_struct *swap_mm = &init_mm;
-
-static void swap_out(unsigned int priority, int gfp_mask)
+static int FASTCALL(swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
+static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
{
- int counter;
- int retval = 0;
- struct mm_struct *mm = current->mm;
-
- /* Always start by trying to penalize the process that is allocating memory */
- if (mm)
- retval = swap_out_mm(mm, swap_amount(mm));
+ int counter, race;
+ struct mm_struct *mm;
/* Then, look at the other mm's */
- counter = (mmlist_nr << SWAP_MM_SHIFT) >> priority;
+ counter = mmlist_nr / priority;
do {
+ if (current->need_resched)
+ schedule();
+
spin_lock(&mmlist_lock);
mm = swap_mm;
if (mm == &init_mm) {
mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
if (mm == &init_mm)
goto empty;
+ swap_mm = mm;
}
- /* Set pointer for next call to next in the list */
- swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
/* Make sure the mm doesn't disappear when we drop the lock.. */
atomic_inc(&mm->mm_users);
spin_unlock(&mmlist_lock);
- /* Walk about 6% of the address space each time */
- retval |= swap_out_mm(mm, swap_amount(mm));
+ nr_pages = swap_out_mm(mm, nr_pages, &race, classzone);
+
mmput(mm);
- } while (--counter >= 0);
- return;
+
+ if (!nr_pages)
+ return 1;
+ } while (race || --counter >= 0);
+
+ return 0;
empty:
spin_unlock(&mmlist_lock);
+ return 0;
}
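
swap_out() no longer starts by penalising the caller's own mm: it walks the global mmlist from the shared swap_mm cursor, and swap_out_mm() re-checks swap_mm != mm under the page_table_lock so that two reclaimers racing on the same mm retry with the next one rather than scanning it twice; the cursor only advances once an mm's address space has been scanned to the end. Below is a rough, single-threaded model of the cursor behaviour, with invented names and no locking:

    /* Rough model of the swap_mm cursor.  Invented names, no locking. */
    #include <stdio.h>

    struct mm_sk { const char *name; int pages_left; struct mm_sk *next; };

    static struct mm_sk *swap_mm_cursor;     /* plays the role of swap_mm */

    /* returns how many pages are still wanted (0 == goal reached) */
    static int swap_out_mm_sketch(struct mm_sk *mm, int want)
    {
        int take = mm->pages_left < want ? mm->pages_left : want;

        mm->pages_left -= take;
        if (mm->pages_left == 0)             /* scanned to the end: */
            swap_mm_cursor = mm->next;       /* advance the cursor  */
        return want - take;
    }

    int main(void)
    {
        struct mm_sk b = { "mm B", 5, NULL };
        struct mm_sk a = { "mm A", 2, &b };
        int want = 4;

        b.next = &a;                         /* circular mmlist */
        swap_mm_cursor = &a;
        while (want) {
            struct mm_sk *mm = swap_mm_cursor;

            printf("scanning %s\n", mm->name);
            want = swap_out_mm_sketch(mm, want);
        }
        printf("cursor now points at %s\n", swap_mm_cursor->name);
        return 0;
    }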
-
-/**
- * reclaim_page - reclaims one page from the inactive_clean list
- * @zone: reclaim a page from this zone
- *
- * The pages on the inactive_clean can be instantly reclaimed.
- * The tests look impressive, but most of the time we'll grab
- * the first page of the list and exit successfully.
- */
-struct page * reclaim_page(zone_t * zone)
+static int FASTCALL(shrink_cache(struct list_head * lru, int * max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask));
+static int shrink_cache(struct list_head * lru, int * max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask)
{
- struct page * page = NULL;
- struct list_head * page_lru;
- int maxscan;
+ LIST_HEAD(active_local_lru);
+ LIST_HEAD(inactive_local_lru);
+ struct list_head * entry;
+ int __max_scan = *max_scan;
- /*
- * We only need the pagemap_lru_lock if we don't reclaim the page,
- * but we have to grab the pagecache_lock before the pagemap_lru_lock
- * to avoid deadlocks and most of the time we'll succeed anyway.
- */
- spin_lock(&pagecache_lock);
spin_lock(&pagemap_lru_lock);
- maxscan = zone->inactive_clean_pages;
- while ((page_lru = zone->inactive_clean_list.prev) !=
- &zone->inactive_clean_list && maxscan--) {
- page = list_entry(page_lru, struct page, lru);
-
- /* Wrong page on list?! (list corruption, should not happen) */
- if (!PageInactiveClean(page)) {
- printk("VM: reclaim_page, wrong page on list.\n");
- list_del(page_lru);
- page->zone->inactive_clean_pages--;
- continue;
- }
-
- /* Page is referenced? Clear and move to the head of the list.. */
- if (PageTestandClearReferenced(page)) {
- list_del(page_lru);
- list_add(page_lru, &zone->inactive_clean_list);
- }
-
- /* The page is dirty, or locked, move to inactive_dirty list. */
- if (page->buffers || PageDirty(page) || TryLockPage(page)) {
- del_page_from_inactive_clean_list(page);
- add_page_to_inactive_dirty_list(page);
- continue;
- }
+ while (__max_scan && (entry = lru->prev) != lru) {
+ struct page * page;
- /* Page is in use? Move it to the active list. */
- if (page_count(page) > 1) {
- UnlockPage(page);
- del_page_from_inactive_clean_list(page);
- add_page_to_active_list(page);
+ if (__builtin_expect(current->need_resched, 0)) {
+ spin_unlock(&pagemap_lru_lock);
+ schedule();
+ spin_lock(&pagemap_lru_lock);
continue;
}
- /* OK, remove the page from the caches. */
- if (PageSwapCache(page)) {
- __delete_from_swap_cache(page);
- goto found_page;
- }
+ page = list_entry(entry, struct page, lru);
- if (page->mapping) {
- __remove_inode_page(page);
- goto found_page;
- }
+ if (__builtin_expect(!PageInactive(page) && !PageActive(page), 0))
+ BUG();
- /* We should never ever get here. */
- printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
- list_del(page_lru);
- zone->inactive_clean_pages--;
- UnlockPage(page);
- }
- /* Reset page pointer, maybe we encountered an unfreeable page. */
- page = NULL;
- goto out;
-
-found_page:
- memory_pressure++;
- del_page_from_inactive_clean_list(page);
- UnlockPage(page);
- page->age = PAGE_AGE_START;
- if (page_count(page) != 1)
- printk("VM: reclaim_page, found page with count %d!\n",
- page_count(page));
-out:
- spin_unlock(&pagemap_lru_lock);
- spin_unlock(&pagecache_lock);
- return page;
-}
-
-/**
- * page_launder - clean dirty inactive pages, move to inactive_clean list
- * @gfp_mask: what operations we are allowed to do
- * @sync: are we allowed to do synchronous IO in emergencies ?
- *
- * When this function is called, we are most likely low on free +
- * inactive_clean pages. Since we want to refill those pages as
- * soon as possible, we'll make two loops over the inactive list,
- * one to move the already cleaned pages to the inactive_clean lists
- * and one to (often asynchronously) clean the dirty inactive pages.
- *
- * In situations where kswapd cannot keep up, user processes will
- * end up calling this function. Since the user process needs to
- * have a page before it can continue with its allocation, we'll
- * do synchronous page flushing in that case.
- *
- * This code used to be heavily inspired by the FreeBSD source code.
- * Thanks go out to Matthew Dillon.
- */
-#define CAN_DO_FS (gfp_mask & __GFP_FS)
-int page_launder(int gfp_mask, int sync)
-{
- int maxscan, cleaned_pages;
- struct list_head * page_lru;
- struct page * page;
-
- cleaned_pages = 0;
-
- /* Will we wait on IO? */
- if (!sync)
- gfp_mask &= ~__GFP_WAIT;
-
- spin_lock(&pagemap_lru_lock);
- maxscan = nr_inactive_dirty_pages >> DEF_PRIORITY;
- while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
- maxscan-- > 0) {
- page = list_entry(page_lru, struct page, lru);
-
- /* Wrong page on list?! (list corruption, should not happen) */
- if (!PageInactiveDirty(page)) {
- printk("VM: page_launder, wrong page on list.\n");
- list_del(page_lru);
- nr_inactive_dirty_pages--;
- page->zone->inactive_dirty_pages--;
+ if (PageTestandClearReferenced(page)) {
+ if (PageInactive(page)) {
+ del_page_from_inactive_list(page);
+ add_page_to_active_list(page);
+ } else if (PageActive(page)) {
+ list_del(entry);
+ list_add(entry, &active_list);
+ } else
+ BUG();
continue;
}
- /* Page is referenced? Clear and move to the head of the list.. */
- if (PageTestandClearReferenced(page)) {
- list_del(page_lru);
- list_add(page_lru, &inactive_dirty_list);
- }
+ deactivate_page_nolock(page);
+ list_del(entry);
+ list_add_tail(entry, &inactive_local_lru);
- /* Page is in use? Move it to the active list. */
- if ((!page->buffers && page_count(page) > 1) || page_ramdisk(page)) {
- del_page_from_inactive_dirty_list(page);
- add_page_to_active_list(page);
+ if (__builtin_expect(!memclass(page->zone, classzone), 0))
continue;
- }
- /*
- * If this zone has plenty of pages free,
- * don't spend time on cleaning it.
- */
- if (zone_free_plenty(page->zone)) {
- list_del(page_lru);
- list_add(page_lru, &inactive_dirty_list);
+ __max_scan--;
+
+ /* Racy check to avoid trylocking when not worthwhile */
+ if (!page->buffers && page_count(page) != 1) {
+ activate_page_nolock(page);
+ list_del(entry);
+ list_add_tail(entry, &active_local_lru);
continue;
}
* The page is locked. IO in progress?
* Move it to the back of the list.
*/
- if (TryLockPage(page)) {
- list_del(page_lru);
- list_add(page_lru, &inactive_dirty_list);
+ if (__builtin_expect(TryLockPage(page), 0))
continue;
- }
- /*
- * Dirty swap-cache page? Write it out if
- * last copy..
- */
- if (PageDirty(page)) {
+ if (PageDirty(page) && is_page_cache_freeable(page)) {
+ /*
+ * It is not critical to write the page out only when
+ * it is unmapped, because any direct writer
+ * like O_DIRECT would set the PG_dirty bitflag
+ * on the physical page after having successfully
+ * pinned it and after the I/O to the page has finished,
+ * so direct writes to the page cannot get lost.
+ */
int (*writepage)(struct page *);
- /* Can a page get here without page->mapping? */
- if (!page->mapping)
- goto page_active;
writepage = page->mapping->a_ops->writepage;
- if (!writepage)
- goto page_active;
+ if (gfp_mask & __GFP_FS && writepage) {
+ spin_unlock(&pagemap_lru_lock);
- /* Can't do it? Move it to the back of the list */
- if (!CAN_DO_FS) {
- list_del(page_lru);
- list_add(page_lru, &inactive_dirty_list);
- UnlockPage(page);
+ ClearPageDirty(page);
+ writepage(page);
+
+ spin_lock(&pagemap_lru_lock);
continue;
}
-
- /* OK, do a physical asynchronous write to swap. */
- ClearPageDirty(page);
- page_cache_get(page);
- spin_unlock(&pagemap_lru_lock);
-
- writepage(page);
- page_cache_release(page);
-
- /* And re-start the thing.. */
- spin_lock(&pagemap_lru_lock);
- continue;
}
/*
* If the page has buffers, try to free the buffer mappings
- * associated with this page. If we succeed we either free
- * the page (in case it was a buffercache only page) or we
- * move the page to the inactive_clean list.
- *
- * On the first round, we should free all previously cleaned
- * buffer pages
+ * associated with this page. If we succeed we try to free
+ * the page as well.
*/
if (page->buffers) {
- int clearedbuf;
- int freed_page = 0;
+ spin_unlock(&pagemap_lru_lock);
- /*
- * Since we might be doing disk IO, we have to
- * drop the spinlock and take an extra reference
- * on the page so it doesn't go away from under us.
- */
- del_page_from_inactive_dirty_list(page);
+ /* avoid freeing a locked page */
page_cache_get(page);
- spin_unlock(&pagemap_lru_lock);
- /* Try to free the page buffers. */
- clearedbuf = try_to_free_buffers(page, gfp_mask);
+ if (try_to_free_buffers(page, gfp_mask)) {
+ if (!page->mapping) {
+ UnlockPage(page);
- /*
- * Re-take the spinlock. Note that we cannot
- * unlock the page yet since we're still
- * accessing the page_struct here...
- */
- spin_lock(&pagemap_lru_lock);
+ /*
+ * Account that we successfully freed
+ * a page of buffer cache.
+ */
+ atomic_dec(&buffermem_pages);
- /* The buffers were not freed. */
- if (!clearedbuf) {
- add_page_to_inactive_dirty_list(page);
+ spin_lock(&pagemap_lru_lock);
+ __lru_cache_del(page);
- /* The page was only in the buffer cache. */
- } else if (!page->mapping) {
- atomic_dec(&buffermem_pages);
- freed_page = 1;
- cleaned_pages++;
+ /* effectively free the page here */
+ page_cache_release(page);
- /* The page has more users besides the cache and us. */
- } else if (page_count(page) > 2) {
- add_page_to_active_list(page);
+ if (--nr_pages)
+ continue;
+ break;
+ } else {
+ /*
+ * The page is still in the pagecache, so undo the
+ * reference taken before try_to_free_buffers: we're
+ * not finished yet and can now try the next step.
+ */
+ page_cache_release(page);
+
+ spin_lock(&pagemap_lru_lock);
+ }
+ } else {
+ /* failed to drop the buffers so stop here */
+ UnlockPage(page);
+ page_cache_release(page);
- /* OK, we "created" a freeable page. */
- } else /* page->mapping && page_count(page) == 2 */ {
- add_page_to_inactive_clean_list(page);
- cleaned_pages++;
+ spin_lock(&pagemap_lru_lock);
+ continue;
}
-
- /*
- * Unlock the page and drop the extra reference.
- * We can only do it here because we are accessing
- * the page struct above.
- */
- UnlockPage(page);
- page_cache_release(page);
-
- continue;
- } else if (page->mapping && !PageDirty(page)) {
- /*
- * If a page had an extra reference in
- * deactivate_page(), we will find it here.
- * Now the page is really freeable, so we
- * move it to the inactive_clean list.
- */
- del_page_from_inactive_dirty_list(page);
- add_page_to_inactive_clean_list(page);
- UnlockPage(page);
- cleaned_pages++;
- } else {
-page_active:
- /*
- * OK, we don't know what to do with the page.
- * It's no use keeping it here, so we move it to
- * the active list.
- */
- del_page_from_inactive_dirty_list(page);
- add_page_to_active_list(page);
- UnlockPage(page);
}
- }
- spin_unlock(&pagemap_lru_lock);
- /* Return the number of pages moved to the inactive_clean list. */
- return cleaned_pages;
-}
+ if (__builtin_expect(!page->mapping, 0))
+ BUG();
-/**
- * refill_inactive_scan - scan the active list and find pages to deactivate
- * @priority: the priority at which to scan
- *
- * This function will scan a portion of the active list to find
- * unused pages, those pages will then be moved to the inactive list.
- */
-static int refill_inactive_scan(unsigned int priority)
-{
- struct list_head * page_lru;
- struct page * page;
- int maxscan = nr_active_pages >> priority;
- int page_active = 0;
- int nr_deactivated = 0;
+ if (__builtin_expect(!spin_trylock(&pagecache_lock), 0)) {
+ /* we hold the page lock so the page cannot go away from under us */
+ spin_unlock(&pagemap_lru_lock);
- /* Take the lock while messing with the list... */
- spin_lock(&pagemap_lru_lock);
- while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
- page = list_entry(page_lru, struct page, lru);
-
- /* Wrong page on list?! (list corruption, should not happen) */
- if (!PageActive(page)) {
- printk("VM: refill_inactive, wrong page on list.\n");
- list_del(page_lru);
- nr_active_pages--;
- continue;
+ spin_lock(&pagecache_lock);
+ spin_lock(&pagemap_lru_lock);
}
/*
- * Do not deactivate pages from zones which
- * have plenty inactive pages.
+ * This is the non-racy check: it is critical to check
+ * PageDirty _after_ we've made sure the page is freeable
+ * and so not in use by anybody.
*/
-
- if (zone_inactive_plenty(page->zone)) {
- page_active = 1;
- goto skip_page;
+ if (!is_page_cache_freeable(page) || PageDirty(page)) {
+ spin_unlock(&pagecache_lock);
+ UnlockPage(page);
+ continue;
}
- /* Do aging on the pages. */
- if (PageTestandClearReferenced(page)) {
- age_page_up(page);
- page_active = 1;
- } else {
- age_page_down(page);
- /*
- * Since we don't hold a reference on the page
- * ourselves, we have to do our test a bit more
- * strict then deactivate_page(). This is needed
- * since otherwise the system could hang shuffling
- * unfreeable pages from the active list to the
- * inactive_dirty list and back again...
- *
- * SUBTLE: we can have buffer pages with count 1.
- */
- if (page_count(page) <= (page->buffers ? 2 : 1)) {
- deactivate_page_nolock(page);
- page_active = 0;
- } else {
- page_active = 1;
- }
- }
- /*
- * If the page is still on the active list, move it
- * to the other end of the list. Otherwise we exit if
- * we have done enough work.
- */
- if (page_active || PageActive(page)) {
-skip_page:
- list_del(page_lru);
- list_add(page_lru, &active_list);
- } else {
- nr_deactivated++;
- }
+ /* point of no return */
+ if (__builtin_expect(!PageSwapCache(page), 1))
+ __remove_inode_page(page);
+ else
+ __delete_from_swap_cache(page);
+ spin_unlock(&pagecache_lock);
+
+ __lru_cache_del(page);
+
+ UnlockPage(page);
+
+ /* effectively free the page here */
+ page_cache_release(page);
+
+ if (--nr_pages)
+ continue;
+ break;
}
+
+ list_splice(&inactive_local_lru, &inactive_list);
+ list_splice(&active_local_lru, &active_list);
spin_unlock(&pagemap_lru_lock);
- return nr_deactivated;
+ *max_scan = __max_scan;
+ return nr_pages;
}
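
shrink_cache() folds reclaim_page() and page_launder() into a single walk over the tail of one LRU list: referenced or obviously busy pages go back to the active list, dirty-but-freeable pages are written back when __GFP_FS allows it, buffer pages go through try_to_free_buffers(), and clean unmapped pages are dropped straight out of the page or swap cache until nr_pages reaches zero or max_scan runs out. The per-page decision order, loosely, as a self-contained sketch with invented field and helper names:

    /* Per-page decision order in shrink_cache(), loosely.  Invented names. */
    #include <stdio.h>

    enum verdict { REACTIVATE, SKIP, WRITEBACK, DROP_BUFFERS, FREE_IT };

    struct page_sk {
        int referenced, in_classzone, has_buffers, busy, locked, dirty, fs_allowed;
    };

    static enum verdict classify(const struct page_sk *p)
    {
        if (p->referenced)                   /* back to the active list        */
            return REACTIVATE;
        if (!p->in_classzone)                /* wrong zone for this allocation */
            return SKIP;
        if (!p->has_buffers && p->busy)      /* still mapped or in use         */
            return REACTIVATE;
        if (p->locked)                       /* I/O in progress: leave it      */
            return SKIP;
        if (p->dirty)                        /* freeable but dirty             */
            return p->fs_allowed ? WRITEBACK : SKIP;
        if (p->has_buffers)                  /* try_to_free_buffers() first    */
            return DROP_BUFFERS;
        return FREE_IT;                      /* drop from the page/swap cache  */
    }

    int main(void)
    {
        struct page_sk clean = { 0, 1, 0, 0, 0, 0, 1 };
        struct page_sk dirty = { 0, 1, 0, 0, 0, 1, 0 };

        printf("clean unmapped page: %s\n",
               classify(&clean) == FREE_IT ? "freed" : "kept");
        printf("dirty page, no __GFP_FS: %s\n",
               classify(&dirty) == SKIP ? "left for later" : "freed");
        return 0;
    }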
-/*
- * Check if there are zones with a severe shortage of free pages,
- * or if all zones have a minor shortage.
- */
-int free_shortage(void)
+static int FASTCALL(shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
+static int shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
{
- pg_data_t *pgdat;
- unsigned int global_free = 0;
- unsigned int global_target = freepages.high;
+ int max_scan = (nr_inactive_pages + nr_active_pages / priority) / priority;
- /* Are we low on free pages anywhere? */
- pgdat = pgdat_list;
- do {
- int i;
- for(i = 0; i < MAX_NR_ZONES; i++) {
- zone_t *zone = pgdat->node_zones+ i;
- unsigned int free;
-
- if (!zone->size)
- continue;
+ nr_pages -= kmem_cache_reap(gfp_mask);
+ if (nr_pages <= 0)
+ return 0;
- free = zone->free_pages;
- free += zone->inactive_clean_pages;
+ nr_pages = shrink_cache(&inactive_list, &max_scan, nr_pages, classzone, gfp_mask);
+ if (nr_pages <= 0)
+ return 0;
- /* Local shortage? */
- if (free < zone->pages_low)
- return 1;
+ nr_pages = shrink_cache(&active_list, &max_scan, nr_pages, classzone, gfp_mask);
+ if (nr_pages <= 0)
+ return 0;
- global_free += free;
- }
- pgdat = pgdat->node_next;
- } while (pgdat);
+ shrink_dcache_memory(priority, gfp_mask);
+ shrink_icache_memory(priority, gfp_mask);
- /* Global shortage? */
- return global_free < global_target;
+ return nr_pages;
}
-/*
- * Are we low on inactive pages globally or in any zone?
- */
-int inactive_shortage(void)
+int try_to_free_pages(zone_t * classzone, unsigned int gfp_mask, unsigned int order)
{
- pg_data_t *pgdat;
- unsigned int global_target = freepages.high + inactive_target;
- unsigned int global_inactive = 0;
+ int priority = DEF_PRIORITY;
- pgdat = pgdat_list;
do {
- int i;
- for(i = 0; i < MAX_NR_ZONES; i++) {
- zone_t *zone = pgdat->node_zones + i;
- unsigned int inactive;
+ int nr_pages = SWAP_CLUSTER_MAX;
+ nr_pages = shrink_caches(priority, classzone, gfp_mask, nr_pages);
+ if (nr_pages <= 0)
+ return 1;
- if (!zone->size)
- continue;
+ swap_out(priority, classzone, gfp_mask, SWAP_CLUSTER_MAX);
+ } while (--priority);
- inactive = zone->inactive_dirty_pages;
- inactive += zone->inactive_clean_pages;
- inactive += zone->free_pages;
+ return 0;
+}
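
try_to_free_pages() now retries the whole shrink pass with an ever smaller priority value (and therefore a larger scan budget), calling swap_out() between attempts, until either SWAP_CLUSTER_MAX pages have been freed or the priority runs out. The retry policy is modelled standalone below; shrink_stub() is invented for the example and simply pretends that each tighter pass frees a few more pages.

#include <stdio.h>

#define DEF_PRIORITY     6
#define SWAP_CLUSTER_MAX 32

/* Stand-in for shrink_caches(): tighter priority == more pages freed. */
static int shrink_stub(int priority, int nr_pages)
{
        return nr_pages - (DEF_PRIORITY - priority + 1) * 8;
}

int main(void)
{
        int priority = DEF_PRIORITY;

        do {
                int nr_pages = shrink_stub(priority, SWAP_CLUSTER_MAX);

                printf("priority %d: still need %d pages\n",
                       priority, nr_pages > 0 ? nr_pages : 0);
                if (nr_pages <= 0) {
                        printf("success\n");
                        return 0;
                }
                /* the real code calls swap_out() here before retrying */
        } while (--priority);

        printf("failed: all priorities exhausted\n");
        return 0;
}
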
- /* Local shortage? */
- if (inactive < zone->pages_high)
- return 1;
+DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
- global_inactive += inactive;
- }
- pgdat = pgdat->node_next;
- } while (pgdat);
+static int check_classzone_need_balance(zone_t * classzone)
+{
+ zone_t * first_classzone;
- /* Global shortage? */
- return global_inactive < global_target;
+ first_classzone = classzone->zone_pgdat->node_zones;
+ while (classzone >= first_classzone) {
+ if (classzone->free_pages > classzone->pages_high)
+ return 0;
+ classzone--;
+ }
+ return 1;
}
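
check_classzone_need_balance() relies on the zones of a node being laid out in one array: it walks from the allocation's classzone down to the lowest zone and reports "balanced" as soon as any of them sits above its pages_high watermark. A standalone model of that walk, with invented zone values:

#include <stdio.h>

struct zone { unsigned long free_pages, pages_high; };

static int need_balance(struct zone *zones, int classzone_idx)
{
        int i;

        for (i = classzone_idx; i >= 0; i--)
                if (zones[i].free_pages > zones[i].pages_high)
                        return 0;       /* some usable zone is still fine */
        return 1;                       /* every zone is at or below high */
}

int main(void)
{
        /* DMA, NORMAL, HIGHMEM -- the counts are made up */
        struct zone zones[3] = {
                { .free_pages = 50,  .pages_high = 64  },
                { .free_pages = 900, .pages_high = 256 },
                { .free_pages = 10,  .pages_high = 128 },
        };

        printf("HIGHMEM allocation: need balance? %d\n", need_balance(zones, 2));
        printf("DMA allocation:     need balance? %d\n", need_balance(zones, 0));
        return 0;
}
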
-/*
- * Loop until we are no longer under an inactive or free
- * shortage. Return 1 on success, 0 if we failed to get
- * there even after "maxtry" loops.
- */
-#define INACTIVE_SHORTAGE 1
-#define FREE_SHORTAGE 2
-#define GENERAL_SHORTAGE 4
-static int do_try_to_free_pages(unsigned int gfp_mask, int user)
+static int kswapd_balance_pgdat(pg_data_t * pgdat)
{
- int shortage = 0;
- int maxtry;
+ int need_more_balance = 0, i;
+ zone_t * zone;
- /* Always walk at least the active queue when called */
- refill_inactive_scan(DEF_PRIORITY);
+ for (i = pgdat->nr_zones-1; i >= 0; i--) {
+ zone = pgdat->node_zones + i;
+ if (current->need_resched)
+ schedule();
+ if (!zone->need_balance)
+ continue;
+ if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
+ zone->need_balance = 0;
+ continue;
+ }
+ if (check_classzone_need_balance(zone))
+ need_more_balance = 1;
+ else
+ zone->need_balance = 0;
+ }
- maxtry = 1 << DEF_PRIORITY;
- do {
- /*
- * If needed, we move pages from the active list
- * to the inactive list.
- */
- if (shortage & INACTIVE_SHORTAGE) {
- /* Walk the VM space for a bit.. */
- swap_out(DEF_PRIORITY, gfp_mask);
+ return need_more_balance;
+}
- /* ..and refill the inactive list */
- refill_inactive_scan(DEF_PRIORITY);
- }
+static void kswapd_balance(void)
+{
+ int need_more_balance;
+ pg_data_t * pgdat;
- /*
- * If we're low on free pages, move pages from the
- * inactive_dirty list to the inactive_clean list.
- *
- * Usually bdflush will have pre-cleaned the pages
- * before we get around to moving them to the other
- * list, so this is a relatively cheap operation.
- */
- if (shortage & FREE_SHORTAGE)
- page_launder(gfp_mask, user);
+ do {
+ need_more_balance = 0;
+ pgdat = pgdat_list;
+ do
+ need_more_balance |= kswapd_balance_pgdat(pgdat);
+ while ((pgdat = pgdat->node_next));
+ } while (need_more_balance);
+}
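
kswapd_balance() keeps sweeping every node, and within each node every zone whose need_balance flag is set, until one full sweep finds nothing left to do. Its control flow, reduced to a standalone model in which reclaim() stands in for try_to_free_pages() plus check_classzone_need_balance() and simply needs two passes per zone:

#include <stdio.h>

#define NR_NODES 2
#define NR_ZONES 3

static int need_balance[NR_NODES][NR_ZONES] = {
        { 1, 0, 1 },
        { 0, 1, 0 },
};

/* Pretend each flagged zone needs two reclaim passes to be balanced. */
static int passes_left[NR_NODES][NR_ZONES] = {
        { 2, 0, 2 },
        { 0, 2, 0 },
};

static int reclaim(int node, int zone)
{
        return --passes_left[node][zone] <= 0;   /* 1 == balanced now */
}

int main(void)
{
        int more, node, zone, sweeps = 0;

        do {
                more = 0;
                for (node = 0; node < NR_NODES; node++)
                        for (zone = NR_ZONES - 1; zone >= 0; zone--) {
                                if (!need_balance[node][zone])
                                        continue;
                                if (reclaim(node, zone))
                                        need_balance[node][zone] = 0;
                                else
                                        more = 1;
                        }
                sweeps++;
        } while (more);

        printf("balanced after %d sweeps\n", sweeps);
        return 0;
}
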
- /*
- * Reclaim unused slab cache if we were short on memory.
- */
- if (shortage & GENERAL_SHORTAGE) {
- shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
- shrink_icache_memory(DEF_PRIORITY, gfp_mask);
+static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
+{
+ zone_t * zone;
+ int i;
- kmem_cache_reap(gfp_mask);
- }
+ for (i = pgdat->nr_zones-1; i >= 0; i--) {
+ zone = pgdat->node_zones + i;
+ if (!zone->need_balance)
+ continue;
+ return 0;
+ }
- if (current->need_resched) {
- __set_current_state(TASK_RUNNING);
- schedule();
- }
+ return 1;
+}
- shortage = 0;
- if (inactive_shortage())
- shortage |= INACTIVE_SHORTAGE | GENERAL_SHORTAGE;
- if (free_shortage())
- shortage |= FREE_SHORTAGE | GENERAL_SHORTAGE;
+static int kswapd_can_sleep(void)
+{
+ pg_data_t * pgdat;
- if (--maxtry <= 0)
- break;
- } while (shortage);
+ pgdat = pgdat_list;
+ do {
+ if (kswapd_can_sleep_pgdat(pgdat))
+ continue;
+ return 0;
+ } while ((pgdat = pgdat->node_next));
- /* Return success if we're not "totally short" */
- return shortage != (FREE_SHORTAGE | INACTIVE_SHORTAGE | GENERAL_SHORTAGE);
+ return 1;
}
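
The sleep gate is the mirror image of the balance loop: kswapd may only go back to sleep when no zone in any node still has need_balance set. A trivial standalone version of that check, with an invented need_balance table:

#include <stdio.h>

#define NR_NODES 2
#define NR_ZONES 3

/* 1 == that zone was flagged by the page allocator */
static const int need_balance[NR_NODES][NR_ZONES] = {
        { 0, 0, 0 },
        { 0, 1, 0 },
};

static int can_sleep(void)
{
        int node, zone;

        for (node = 0; node < NR_NODES; node++)
                for (zone = 0; zone < NR_ZONES; zone++)
                        if (need_balance[node][zone])
                                return 0;   /* work pending somewhere */
        return 1;                           /* every zone is happy */
}

int main(void)
{
        printf("kswapd may sleep: %d\n", can_sleep());
        return 0;
}
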
-DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
-DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
-
/*
* The background pageout daemon, started as a kernel thread
* from the init process.
int kswapd(void *unused)
{
struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
daemonize();
strcpy(tsk->comm, "kswapd");
* Kswapd main loop.
*/
for (;;) {
- static long recalc = 0;
-
- /* Once a second ... */
- if (time_after(jiffies, recalc + HZ)) {
- recalc = jiffies;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kswapd_wait, &wait);
- /* Recalculate VM statistics. */
- recalculate_vm_stats();
- }
-
- if (!do_try_to_free_pages(GFP_KSWAPD, 1)) {
- if (out_of_memory())
- oom_kill();
- continue;
- }
-
- run_task_queue(&tq_disk);
- interruptible_sleep_on_timeout(&kswapd_wait, HZ);
- }
-}
-
-void wakeup_kswapd(void)
-{
- if (waitqueue_active(&kswapd_wait))
- wake_up_interruptible(&kswapd_wait);
-}
-
-/*
- * Called by non-kswapd processes when they want more
- * memory but are unable to sleep on kswapd because
- * they might be holding some IO locks ...
- */
-int try_to_free_pages(unsigned int gfp_mask)
-{
- int ret = 1;
-
- if (gfp_mask & __GFP_WAIT) {
- current->flags |= PF_MEMALLOC;
- ret = do_try_to_free_pages(gfp_mask, 1);
- current->flags &= ~PF_MEMALLOC;
- }
-
- return ret;
-}
-
-DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
-/*
- * Kreclaimd will move pages from the inactive_clean list to the
- * free list, in order to keep atomic allocations possible under
- * all circumstances.
- */
-int kreclaimd(void *unused)
-{
- struct task_struct *tsk = current;
- pg_data_t *pgdat;
-
- daemonize();
- strcpy(tsk->comm, "kreclaimd");
- sigfillset(&tsk->blocked);
- current->flags |= PF_MEMALLOC;
-
- while (1) {
+ mb();
+ if (kswapd_can_sleep())
+ schedule();
- /*
- * We sleep until someone wakes us up from
- * page_alloc.c::__alloc_pages().
- */
- interruptible_sleep_on(&kreclaimd_wait);
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kswapd_wait, &wait);
/*
- * Move some pages from the inactive_clean lists to
- * the free lists, if it is needed.
+ * If we actually get into a low-memory situation,
+ * the processes needing more memory will wake us
+ * up on a more timely basis.
*/
- pgdat = pgdat_list;
- do {
- int i;
- for(i = 0; i < MAX_NR_ZONES; i++) {
- zone_t *zone = pgdat->node_zones + i;
- if (!zone->size)
- continue;
-
- while (zone->free_pages < zone->pages_low) {
- struct page * page;
- page = reclaim_page(zone);
- if (!page)
- break;
- __free_page(page);
- }
- }
- pgdat = pgdat->node_next;
- } while (pgdat);
+ kswapd_balance();
+ run_task_queue(&tq_disk);
}
}
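
The main loop uses the classic sleep/wake handshake: queue the task on kswapd_wait and mark it TASK_INTERRUPTIBLE before re-checking the condition (with mb() ordering the two), so a wakeup from the page allocator cannot slip in between the check and the schedule(). A rough userspace analogue follows, using a mutex and condition variable to close the same race that the task state and memory barrier close in the kernel.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wait_queue = PTHREAD_COND_INITIALIZER;
static int need_balance;                 /* "some zone needs work" flag */

static void *kswapd_like(void *unused)
{
        (void)unused;
        pthread_mutex_lock(&lock);
        /* Analogue of: set TASK_INTERRUPTIBLE, mb(), re-check, schedule(). */
        while (!need_balance)
                pthread_cond_wait(&wait_queue, &lock);
        need_balance = 0;
        pthread_mutex_unlock(&lock);
        printf("kswapd-like thread: woken, balancing\n");
        return NULL;
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, kswapd_like, NULL);
        sleep(1);                        /* let the thread go to sleep */

        /* Analogue of: set zone->need_balance, then wake_up(&kswapd_wait). */
        pthread_mutex_lock(&lock);
        need_balance = 1;
        pthread_cond_signal(&wait_queue);
        pthread_mutex_unlock(&lock);

        pthread_join(tid, NULL);
        return 0;
}
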
-
static int __init kswapd_init(void)
{
- printk("Starting kswapd v1.8\n");
+ printk("Starting kswapd\n");
swap_setup();
kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
- kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
return 0;
}
dev_hold(skb->dev);
__skb_queue_tail(&queue->input_pkt_queue,skb);
/* Runs from irqs or BH's, no need to wake BH */
- __cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
+ cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
local_irq_restore(flags);
#ifndef OFFLINE_SAMPLE
get_sample_stats(this_cpu);
local_irq_disable();
netdev_rx_stat[this_cpu].time_squeeze++;
/* This already runs in BH context, no need to wake up BH's */
- __cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
+ cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
local_irq_enable();
NET_PROFILE_LEAVE(softnet_process);
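
Both networking hunks switch from the raw helper that only sets the softirq's pending bit to the wrapper that also makes sure the softirq will actually run soon, for instance by waking the softirq daemon. The exact wakeup condition sketched below (skip the wakeup when already in interrupt/BH context) is an assumption about this tree, not something visible in the hunks, and the names raw_raise, raise and wake_worker are invented for the illustration.

#include <stdio.h>

static unsigned long softirq_pending_bits;   /* per-CPU in the kernel        */
static int in_interrupt_context;             /* stand-in for in_interrupt()  */

static void wake_worker(void)
{
        printf("waking the softirq daemon\n");
}

/* Analogue of the raw helper: just mark the softirq pending. */
static void raw_raise(int nr)
{
        softirq_pending_bits |= 1UL << nr;
}

/* Analogue of the wrapper: mark it pending and, when not called from
 * interrupt/BH context (assumed condition), kick the worker so the
 * softirq is not left pending indefinitely. */
static void raise(int nr)
{
        raw_raise(nr);
        if (!in_interrupt_context)
                wake_worker();
}

int main(void)
{
        in_interrupt_context = 1;
        raise(3);          /* from irq/BH: bit set, softirq runs on return */
        in_interrupt_context = 0;
        raise(3);          /* from process context: also wakes the worker */
        return 0;
}
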
msg->msg_flags|=MSG_OOB;
if(len>0) {
- if (!(flags & MSG_PEEK) && !(flags & MSG_TRUNC))
+ if (!(flags & MSG_TRUNC))
err = memcpy_toiovec(msg->msg_iov, &c, 1);
len = 1;
} else
struct ipv6hdr *hdr = skb->nh.ipv6h;
struct inet6_skb_parm *opt =(struct inet6_skb_parm*)skb->cb;
- if (ipv6_devconf.forwarding == 0 && opt->srcrt == 0)
+ if (ipv6_devconf.forwarding == 0)
goto error;
skb->ip_summed = CHECKSUM_NONE;