--- diff/Documentation/binfmt_misc.txt	2003-10-09 09:47:33.000000000 +0100
+++ source/Documentation/binfmt_misc.txt	2004-04-21 10:46:51.329778800 +0100
@@ -15,7 +15,7 @@ First you must mount binfmt_misc:
 	mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc
 
 To actually register a new binary type, you have to set up a string looking like
-:name:type:offset:magic:mask:interpreter: (where you can choose the ':' upon
+:name:type:offset:magic:mask:interpreter:flags (where you can choose the ':' upon
 your needs) and echo it to /proc/sys/fs/binfmt_misc/register.
 Here is what the fields mean:
 - 'name' is an identifier string. A new /proc file will be created with this
@@ -34,6 +34,28 @@ Here is what the fields mean:
    The mask is anded with the byte sequence of the file.
 - 'interpreter' is the program that should be invoked with the binary as first
   argument (specify the full path)
+- 'flags' is an optional field that controls several aspects of the invocation
+  of the interpreter. It is a string of capital letters, each of which controls
+  a certain aspect. The following flags are supported -
+     'P' - preserve-argv[0].  Legacy behavior of binfmt_misc is to overwrite the
+           original argv[0] with the full path to the binary.  When this flag is
+           included, binfmt_misc will add an argument to the argument vector for
+           this purpose, thus preserving the original argv[0].
+     'O' - open-binary.  Legacy behavior of binfmt_misc is to pass the full path
+           of the binary to the interpreter as an argument.  When this flag is
+           included, binfmt_misc will open the file for reading and pass its
+           descriptor as an argument, instead of the full path, thus allowing
+           the interpreter to execute non-readable binaries.  This feature should
+           be used with care - the interpreter has to be trusted not to emit
+           the contents of the non-readable binary.
+     'C' - credentials.  Currently, the behavior of binfmt_misc is to calculate
+           the credentials and security token of the new process according to
+           the interpreter.  When this flag is included, these attributes are
+           calculated according to the binary.  It also implies the 'O' flag.
+           This feature should be used with care as the interpreter
+           will run with root permissions when a setuid binary owned by root
+           is run with binfmt_misc.
+
 
 There are some restrictions:
  - the whole register string may not exceed 255 characters
@@ -83,9 +105,9 @@ If you want to pass special arguments to
 write a wrapper script for it. See Documentation/java.txt for an
 example.
 
-Your interpreter should NOT look in the PATH for the filename; the
-kernel passes it the full filename to use.  Using the PATH can cause
-unexpected behaviour and be a security hazard.
+Your interpreter should NOT look in the PATH for the filename; the kernel
+passes it the full filename (or the file descriptor) to use.  Using $PATH can
+cause unexpected behaviour and can be a security hazard.
 
 There is a web page about binfmt_misc at
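As an illustration of the new 'flags' field above, a trivial interpreter that
just dumps its argument vector makes the effect of 'P' (and 'O') visible.
This is an illustrative sketch, not part of the patch; the registration name
and interpreter path are invented for the example:

#include <stdio.h>

/* Dumps its argument vector.  With 'P' set, binfmt_misc passes the binary
 * (as a path, or as a file descriptor when 'O' is also set) in argv[1] and
 * the preserved original argv[0] after it; without 'P', the original
 * argv[0] is overwritten with the full path to the binary. */
int main(int argc, char *argv[])
{
	int i;

	for (i = 0; i < argc; i++)
		printf("argv[%d] = %s\n", i, argv[i]);
	return 0;
}

A hypothetical registration using the new field could then look like
:show:E::shw::/usr/local/bin/argv-dump:P (everything except the trailing
flags field works exactly as before).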
--- diff/Documentation/kernel-parameters.txt	2004-04-05 12:57:06.000000000 +0100
+++ source/Documentation/kernel-parameters.txt	2004-04-21 10:46:51.345776368 +0100
@@ -879,8 +879,8 @@ running once the system is up.
 	psmouse.rate=	[HW,MOUSE] Set desired mouse report rate, in reports
 			per second.
 	psmouse.resetafter=
-			[HW,MOUSE] Try to reset Synaptics Touchpad after so many
-			bad packets (0 = never).
+			[HW,MOUSE] Try to reset the device after so many bad packets
+			(0 = never).
 	psmouse.resolution=
 			[HW,MOUSE] Set desired mouse resolution, in dpi.
 	psmouse.smartscroll=

--- diff/MAINTAINERS	2004-04-21 10:45:33.327636928 +0100
+++ source/MAINTAINERS	2004-04-21 10:46:51.802706904 +0100
@@ -1202,6 +1202,12 @@ W: http://sf.net/projects/kernel-janitor
 W:	http://developer.osdl.org/rddunlap/kj-patches/
 S:	Maintained
 
+KGDB FOR I386 PLATFORM
+P:	George Anzinger
+M:	george@mvista.com
+L:	linux-net@vger.kernel.org
+S:	Supported
+
 KERNEL NFSD
 P:	Neil Brown
 M:	neilb@cse.unsw.edu.au

--- diff/Makefile	2004-04-21 10:45:33.329636624 +0100
+++ source/Makefile	2004-04-21 10:46:51.803706752 +0100
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 6
-EXTRAVERSION =-rc2
+EXTRAVERSION =-rc2-mm1
 NAME=Zonked Quokka
 
 # *DOCUMENTATION*
@@ -461,6 +461,7 @@ endif
 
 ifdef CONFIG_DEBUG_INFO
 CFLAGS		+= -g
+AFLAGS		+= -g
 endif
 
 # warn about C99 declaration after statement
@@ -715,8 +716,12 @@ modules: $(vmlinux-dirs) $(if $(KBUILD_B
 	@echo '  Building modules, stage 2.';
 	$(Q)$(MAKE) -rR -f $(srctree)/scripts/Makefile.modpost

-# Install modules
+# Target to prepare building external modules
+.PHONY: modules_prepare
+modules_prepare: prepare-all scripts
+
+# Target to install modules
 .PHONY: modules_install
 modules_install: _modinst_ _modinst_post

--- diff/arch/alpha/kernel/process.c	2004-04-21 10:45:33.334635864 +0100
+++ source/arch/alpha/kernel/process.c	2004-04-21 10:46:51.087815584 +0100
@@ -510,12 +510,6 @@ thread_saved_pc(task_t *t)
 	return 0;
 }
 
-/*
- * These bracket the sleeping functions..
- */
-#define first_sched	((unsigned long) scheduling_functions_start_here)
-#define last_sched	((unsigned long) scheduling_functions_end_here)
-
 unsigned long
 get_wchan(struct task_struct *p)
 {
@@ -534,7 +528,7 @@ get_wchan(struct task_struct *p)
 	 */
 	pc = thread_saved_pc(p);
-	if (pc >= first_sched && pc < last_sched) {
+	if (in_sched_functions(pc)) {
 		schedule_frame = ((unsigned long *)p->thread_info->pcb.ksp)[6];
 		return ((unsigned long *)schedule_frame)[12];
 	}

--- diff/arch/alpha/kernel/setup.c	2004-04-05 12:57:06.000000000 +0100
+++ source/arch/alpha/kernel/setup.c	2004-04-21 10:46:51.110812088 +0100
@@ -122,7 +122,6 @@ static void get_sysnames(unsigned long,
 static void determine_cpu_caches (unsigned int);
 
 static char command_line[COMMAND_LINE_SIZE];
-char saved_command_line[COMMAND_LINE_SIZE];
 
 /*
  * The format of "screen_info" is strange, and due to early
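The get_wchan() conversion above (repeated for arm, arm26, h8300 and i386
below) replaces the per-architecture first_sched/last_sched macros with a
call to one common helper.  A sketch of what that helper looks like, assuming
the __sched_text_start/__sched_text_end linker symbols that the cris hunk
below still references directly:

extern char __sched_text_start[], __sched_text_end[];

/* Sketch of the generic helper the per-arch macros are folded into;
 * true when the saved PC lies inside the scheduler's text section. */
int in_sched_functions(unsigned long addr)
{
	return addr >= (unsigned long)__sched_text_start &&
	       addr <  (unsigned long)__sched_text_end;
}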
--- diff/arch/arm/kernel/process.c	2004-04-21 10:45:33.351633280 +0100
+++ source/arch/arm/kernel/process.c	2004-04-21 10:46:51.115811328 +0100
@@ -411,12 +411,6 @@ pid_t kernel_thread(int (*fn)(void *), v
 	return do_fork(flags|CLONE_VM|CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
 }
 
-/*
- * These bracket the sleeping functions..
- */
-#define first_sched	((unsigned long) scheduling_functions_start_here)
-#define last_sched	((unsigned long) scheduling_functions_end_here)
-
 unsigned long get_wchan(struct task_struct *p)
 {
 	unsigned long fp, lr;
@@ -431,7 +425,7 @@ unsigned long get_wchan(struct task_stru
 		if (fp < stack_page || fp > 4092+stack_page)
 			return 0;
 		lr = pc_pointer (((unsigned long *)fp)[-1]);
-		if (lr < first_sched || lr > last_sched)
+		if (!in_sched_functions(lr))
 			return lr;
 		fp = *(unsigned long *) (fp - 12);
 	} while (count ++ < 16);

--- diff/arch/arm/kernel/setup.c	2004-03-11 10:20:19.000000000 +0000
+++ source/arch/arm/kernel/setup.c	2004-04-21 10:46:51.116811176 +0100
@@ -81,7 +81,6 @@ struct cpu_cache_fns cpu_cache;
 unsigned char aux_device_present;
 char elf_platform[ELF_PLATFORM_SIZE];
 
-char saved_command_line[COMMAND_LINE_SIZE];
 unsigned long phys_initrd_start __initdata = 0;
 unsigned long phys_initrd_size __initdata = 0;

--- diff/arch/arm/lib/findbit.S	2002-10-16 04:27:54.000000000 +0100
+++ source/arch/arm/lib/findbit.S	2004-04-21 10:46:51.116811176 +0100
@@ -51,6 +51,39 @@ ENTRY(_find_next_zero_bit_le)
 		add	r2, r2, #1		@ align bit pointer
 		b	2b			@ loop for next bit
 
+/*
+ * Purpose  : Find a 'one' bit
+ * Prototype: int find_first_bit(const unsigned long *addr, unsigned int maxbit);
+ */
+ENTRY(_find_first_bit_le)
+		teq	r1, #0
+		beq	3f
+		mov	r2, #0
+1:		ldrb	r3, [r0, r2, lsr #3]
+		movs	r3, r3
+		bne	.found			@ any now set - found a set bit
+		add	r2, r2, #8		@ next bit pointer
+2:		cmp	r2, r1			@ any more?
+		blo	1b
+3:		mov	r0, r1			@ no free bits
+		RETINSTR(mov,pc,lr)
+
+/*
+ * Purpose  : Find next 'one' bit
+ * Prototype: int find_next_bit(void *addr, unsigned int maxbit, int offset)
+ */
+ENTRY(_find_next_bit_le)
+		teq	r1, #0
+		beq	2b
+		ands	ip, r2, #7
+		beq	1b			@ If new byte, goto old routine
+		ldrb	r3, [r0, r2, lsr #3]
+		movs	r3, r3, lsr ip		@ shift off unused bits
+		bne	.found
+		orr	r2, r2, #7		@ if zero, then no bits here
+		add	r2, r2, #1		@ align bit pointer
+		b	2b			@ loop for next bit
+
 #ifdef __ARMEB__
 
 ENTRY(_find_first_zero_bit_be)
@@ -78,6 +111,30 @@ ENTRY(_find_next_zero_bit_be)
 		addeq	r2, r2, #1		@ align bit pointer
 		beq	2b			@ loop for next bit
 
+ENTRY(_find_first_bit_be)
+		teq	r1, #0
+		beq	3f
+		mov	r2, #0
+1:		eor	r3, r2, #0x18		@ big endian byte ordering
+		ldrb	r3, [r0, r3, lsr #3]
+		movs	r3, r3
+		bne	.found			@ any now set - found a set bit
+		add	r2, r2, #8		@ next bit pointer
+2:		cmp	r2, r1			@ any more?
+		blo	1b
+3:		mov	r0, r1			@ no free bits
+		RETINSTR(mov,pc,lr)
+
+ENTRY(_find_next_bit_be)
+		ands	ip, r2, #7
+		beq	1b			@ If new byte, goto old routine
+		eor	r3, r2, #0x18		@ big endian byte ordering
+		ldrb	r3, [r0, r3, lsr #3]
+		movs	r3, r3, lsr ip		@ shift off unused bits
+		orreq	r2, r2, #7		@ if zero, then no bits here
+		addeq	r2, r2, #1		@ align bit pointer
+		beq	2b			@ loop for next bit
+
 #endif
 
 /*
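The new ARM routines above implement the generic bit-search interface.  As a
reference for the intended semantics, here is a plain-C sketch (illustrative
only, not the kernel's optimized generic version):

#include <limits.h>

#define BITS_PER_LONG (sizeof(long) * CHAR_BIT)

/* Returns the index of the first set bit at or above 'offset',
 * or 'maxbit' if no bit is set below 'maxbit'. */
unsigned int find_next_bit(const unsigned long *addr,
			   unsigned int maxbit, unsigned int offset)
{
	unsigned int i;

	for (i = offset; i < maxbit; i++)
		if (addr[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
			return i;
	return maxbit;
}

unsigned int find_first_bit(const unsigned long *addr, unsigned int maxbit)
{
	return find_next_bit(addr, maxbit, 0);
}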
--- diff/arch/arm/mm/mm-armv.c	2004-04-21 10:45:33.372630088 +0100
+++ source/arch/arm/mm/mm-armv.c	2004-04-21 10:46:51.117811024 +0100
@@ -18,7 +18,6 @@
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -231,7 +230,7 @@ void free_pgd_slow(pgd_t *pgd)
 	pte = pmd_page(*pmd);
 	pmd_clear(pmd);
-	pgtable_remove_rmap(pte);
+	dec_page_state(nr_page_table_pages);
 	pte_free(pte);
 	pmd_free(pmd);
 free:

--- diff/arch/arm26/kernel/process.c	2004-04-21 10:45:33.379629024 +0100
+++ source/arch/arm26/kernel/process.c	2004-04-21 10:46:51.110812088 +0100
@@ -397,12 +397,6 @@ pid_t kernel_thread(int (*fn)(void *), v
 	return __ret;
 }
 
-/*
- * These bracket the sleeping functions..
- */
-#define first_sched	((unsigned long) scheduling_functions_start_here)
-#define last_sched	((unsigned long) scheduling_functions_end_here)
-
 unsigned long get_wchan(struct task_struct *p)
 {
 	unsigned long fp, lr;
@@ -417,7 +411,7 @@ unsigned long get_wchan(struct task_stru
 		if (fp < stack_page || fp > 4092+stack_page)
 			return 0;
 		lr = pc_pointer (((unsigned long *)fp)[-1]);
-		if (lr < first_sched || lr > last_sched)
+		if (!in_sched_functions(lr))
 			return lr;
 		fp = *(unsigned long *) (fp - 12);
 	} while (count ++ < 16);

--- diff/arch/arm26/kernel/setup.c	2003-09-30 15:46:11.000000000 +0100
+++ source/arch/arm26/kernel/setup.c	2004-04-21 10:46:51.115811328 +0100
@@ -76,7 +76,6 @@ struct processor processor;
 unsigned char aux_device_present;
 char elf_platform[ELF_PLATFORM_SIZE];
 
-char saved_command_line[COMMAND_LINE_SIZE];
 unsigned long phys_initrd_start __initdata = 0;
 unsigned long phys_initrd_size __initdata = 0;

--- diff/arch/cris/arch-v10/kernel/process.c	2004-04-21 10:45:33.391627200 +0100
+++ source/arch/cris/arch-v10/kernel/process.c	2004-04-21 10:46:51.117811024 +0100
@@ -217,8 +217,8 @@ asmlinkage int sys_execve(const char *fn
  * These bracket the sleeping functions..
  */
 
-#define first_sched	((unsigned long) scheduling_functions_start_here)
-#define last_sched	((unsigned long) scheduling_functions_end_here)
+#define first_sched	((unsigned long)__sched_text_start)
+#define last_sched	((unsigned long)__sched_text_end)
 
 unsigned long get_wchan(struct task_struct *p)
 {

--- diff/arch/cris/kernel/setup.c	2003-07-11 09:39:50.000000000 +0100
+++ source/arch/cris/kernel/setup.c	2004-04-21 10:46:51.117811024 +0100
@@ -28,10 +28,7 @@ unsigned char aux_device_present;
 extern int root_mountflags;
 extern char _etext, _edata, _end;
 
-#define COMMAND_LINE_SIZE 256
-
 static char command_line[COMMAND_LINE_SIZE] = { 0, };
-char saved_command_line[COMMAND_LINE_SIZE];
 
 extern const unsigned long text_start, edata; /* set by the linker script */
 extern unsigned long dram_start, dram_end;

--- diff/arch/h8300/kernel/process.c	2004-04-21 10:45:33.408624616 +0100
+++ source/arch/h8300/kernel/process.c	2004-04-21 10:46:51.118810872 +0100
@@ -261,12 +261,6 @@ out:
 	return error;
 }
 
-/*
- * These bracket the sleeping functions..
- */
-#define first_sched	((unsigned long) scheduling_functions_start_here)
-#define last_sched	((unsigned long) scheduling_functions_end_here)
-
 unsigned long thread_saved_pc(struct task_struct *tsk)
 {
 	return ((struct pt_regs *)tsk->thread.esp0)->pc;
@@ -287,7 +281,7 @@ unsigned long get_wchan(struct task_stru
 		    fp >= 8184+stack_page)
 			return 0;
 		pc = ((unsigned long *)fp)[1];
-		if (pc < first_sched || pc >= last_sched)
+		if (!in_sched_functions(pc))
 			return pc;
 		fp = *(unsigned long *) fp;
 	} while (count++ < 16);

--- diff/arch/h8300/kernel/setup.c	2004-04-21 10:45:33.410624312 +0100
+++ source/arch/h8300/kernel/setup.c	2004-04-21 10:46:51.118810872 +0100
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
@@ -60,8 +61,7 @@ unsigned long memory_end;
 
 struct task_struct *_current_task;
 
-char command_line[512];
-char saved_command_line[512];
+char command_line[COMMAND_LINE_SIZE];
 
 extern int _stext, _etext, _sdata, _edata, _sbss, _ebss, _end;
 extern int _ramstart, _ramend;
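The per-arch saved_command_line definitions removed above all defer to one
shared definition.  Assuming it lands in common code (e.g. init/main.c; this
is an assumption about where this series consolidates it), the shared piece
is simply:

/* assumed single shared definition in common code, replacing the
 * per-architecture copies removed above */
char saved_command_line[COMMAND_LINE_SIZE];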
--- diff/arch/i386/Kconfig	2004-04-21 10:45:33.419622944 +0100
+++ source/arch/i386/Kconfig	2004-04-21 10:46:51.119810720 +0100
@@ -479,6 +479,16 @@ config NR_CPUS
 	  This is purely to save memory - each supported CPU adds
 	  approximately eight kilobytes to the kernel image.
 
+config SCHED_SMT
+	bool "SMT (Hyperthreading) scheduler support"
+	depends on SMP
+	default n
+	help
+	  SMT scheduler support improves the CPU scheduler's decision making
+	  when dealing with Intel Pentium 4 chips with HyperThreading at a
+	  cost of slightly increased overhead in some places. If unsure say
+	  N here.
+
 config PREEMPT
 	bool "Preemptible Kernel"
 	help
@@ -709,7 +719,7 @@ config X86_PAE
 
 # Common NUMA Features
 config NUMA
-	bool "Numa Memory Allocation Support"
+	bool "Numa Memory Allocation and Scheduler Support"
 	depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI))
 	default n if X86_PC
 	default y if (X86_NUMAQ || X86_SUMMIT)
@@ -1245,6 +1255,15 @@ config DEBUG_PAGEALLOC
 	  This results in a large slowdown, but helps to find certain types
 	  of memory corruptions.
 
+config SPINLINE
+	bool "Spinlock inlining"
+	depends on DEBUG_KERNEL
+	help
+	  This will change spinlocks from out of line to inline, making them
+	  account cost to the callers in readprofile, rather than the lock
+	  itself (as ".text.lock.filename"). This can be helpful for finding
+	  the callers of locks.
+
 config DEBUG_HIGHMEM
 	bool "Highmem debugging"
 	depends on DEBUG_KERNEL && HIGHMEM
@@ -1261,12 +1280,194 @@
 	  Say Y here only if you plan to use gdb to debug the kernel.
 	  If you don't debug the kernel, you can say N.
 
+config LOCKMETER
+	bool "Kernel lock metering"
+	depends on SMP
+	help
+	  Say Y to enable kernel lock metering, which adds overhead to SMP locks,
+	  but allows you to see various statistics using the lockstat command.
+
 config DEBUG_SPINLOCK_SLEEP
 	bool "Sleep-inside-spinlock checking"
 	help
 	  If you say Y here, various routines which may sleep will become very
 	  noisy if they are called with a spinlock held.
 
+config KGDB
+	bool "Include kgdb kernel debugger"
+	depends on DEBUG_KERNEL
+	help
+	  If you say Y here, the system will be compiled with the debug
+	  option (-g) and a debugging stub will be included in the
+	  kernel.  This stub communicates with gdb on another (host)
+	  computer via a serial port.  The host computer should have
+	  access to the kernel binary file (vmlinux) and a serial port
+	  that is connected to the target machine.  Gdb can be made to
+	  configure the serial port or you can use stty and setserial to
+	  do this.  See the 'target' command in gdb.  This option also
+	  configures in the ability to request a breakpoint early in the
+	  boot process.  To request the breakpoint just include 'kgdb'
+	  as a boot option when booting the target machine.  The system
+	  will then break as soon as it looks at the boot options.  This
+	  option also installs a breakpoint in panic and sends any
+	  kernel faults to the debugger.  For more information see the
+	  Documentation/i386/kgdb/kgdb.txt file.
+
+choice
+	depends on KGDB
+	prompt "Debug serial port BAUD"
+	default KGDB_115200BAUD
+	help
+	  Gdb and the kernel stub need to agree on the baud rate to be
+	  used.  Some systems (x86 family at this writing) allow this to
+	  be configured.
+
+config KGDB_9600BAUD
+	bool "9600"
+
+config KGDB_19200BAUD
+	bool "19200"
+
+config KGDB_38400BAUD
+	bool "38400"
+
+config KGDB_57600BAUD
+	bool "57600"
+
+config KGDB_115200BAUD
+	bool "115200"
+endchoice
+
+config KGDB_PORT
+	hex "hex I/O port address of the debug serial port"
+	depends on KGDB
+	default 3f8
+	help
+	  Some systems (x86 family at this writing) allow the port
+	  address to be configured.  The number entered is assumed to be
+	  hex, don't put 0x in front of it.  The standard addresses are:
+	  COM1 3f8, irq 4 and COM2 2f8, irq 3.  Setserial /dev/ttySx
+	  will tell you what you have.  It is good to test the serial
+	  connection with a live system before trying to debug.
+
+config KGDB_IRQ
+	int "IRQ of the debug serial port"
+	depends on KGDB
+	default 4
+	help
+	  This is the irq for the debug port.  If everything is working
+	  correctly and the kernel has interrupts on, a control-C to the
+	  port should cause a break into the kernel debug stub.
+
+config DEBUG_INFO
+	bool
+	depends on KGDB
+	default y
+
+config KGDB_MORE
+	bool "Add any additional compile options"
+	depends on KGDB
+	default n
+	help
+	  Saying yes here turns on the ability to enter additional
+	  compile options.
+
+config KGDB_OPTIONS
+	depends on KGDB_MORE
+	string "Additional compile arguments"
+	default "-O1"
+	help
+	  This option allows you to enter additional compile options for
+	  the whole kernel compile.  Each platform will have a default
+	  that seems right for it.  For example on PPC "-ggdb -O1", and
+	  for i386 "-O1".  Note that by configuring KGDB "-g" is already
+	  turned on.  In addition, on i386 platforms
+	  "-fomit-frame-pointer" is deleted from the standard compile
+	  options.
+
+config NO_KGDB_CPUS
+	int "Number of CPUs"
+	depends on KGDB && SMP
+	default NR_CPUS
+	help
+	  This option sets the number of cpus for kgdb ONLY.  It is used
+	  to prune some internal structures so they look "nice" when
+	  displayed with gdb.  This is to overcome possibly larger
+	  numbers that may have been entered above.  Enter the real
+	  number to get nice clean kgdb_info displays.
+
+config KGDB_TS
+	bool "Enable kgdb time stamp macros?"
+	depends on KGDB
+	default n
+	help
+	  Kgdb event macros allow you to instrument your code with calls
+	  to the kgdb event recording function.  The event log may be
+	  examined with gdb at a break point.  Turning on this
+	  capability also allows you to choose how many events to
+	  keep.  Kgdb always keeps the latest events.
+
+choice
+	depends on KGDB_TS
+	prompt "Max number of time stamps to save?"
+	default KGDB_TS_128
+
+config KGDB_TS_64
+	bool "64"
+
+config KGDB_TS_128
+	bool "128"
+
+config KGDB_TS_256
+	bool "256"
+
+config KGDB_TS_512
+	bool "512"
+
+config KGDB_TS_1024
+	bool "1024"
+
+endchoice
+
+config STACK_OVERFLOW_TEST
+	bool "Turn on kernel stack overflow testing?"
+	depends on KGDB
+	default n
+	help
+	  This option enables code in the front line interrupt handlers
+	  to check for kernel stack overflow on interrupts and system
+	  calls.  This is part of the kgdb code on x86 systems.
+
+config KGDB_CONSOLE
+	bool "Enable serial console thru kgdb port"
+	depends on KGDB
+	default n
+	help
+	  This option enables the command line "console=kgdb" option.
+	  When the system is booted with this option in the command line
+	  all kernel printk output is sent to gdb (as well as to other
+	  consoles).  For this to work gdb must be connected.  For this
+	  reason, this command line option will generate a breakpoint if
+	  gdb has not yet connected.  After the gdb continue command is
+	  given all pent up console output will be printed by gdb on the
+	  host machine.  Neither this option, nor KGDB require the
+	  serial driver to be configured.
+
+config KGDB_SYSRQ
+	bool "Turn on SysRq 'G' command to do a break?"
+	depends on KGDB
+	default y
+	help
+	  This option includes an option in the SysRq code that allows
+	  you to enter SysRq G which generates a breakpoint to the KGDB
+	  stub.  This will work if the keyboard is alive and can
+	  interrupt the system.  Because of constraints on when the
+	  serial port interrupt can be enabled, this code may allow you
+	  to interrupt the system before the serial port control C is
+	  available.  Just say yes here.
+
 config FRAME_POINTER
 	bool "Compile the kernel with frame pointers"
 	help
@@ -1284,6 +1485,19 @@ config 4KSTACKS
 	  on the VM subsystem for higher order allocations. This option
 	  will also use IRQ stacks to compensate for the reduced stackspace.
 
+config SCHEDSTATS
+	bool "Collect scheduler statistics"
+	depends on PROC_FS
+	default y
+	help
+	  If you say Y here, additional code will be inserted into the
+	  scheduler and related routines to collect statistics about
+	  scheduler behavior and provide them in /proc/schedstat.  These
+	  stats may be useful for both tuning and debugging the scheduler.
+	  If you aren't debugging the scheduler or trying to tune a specific
+	  application, you can say N to avoid the very slight overhead
+	  this adds.
+
 config X86_FIND_SMP_CONFIG
 	bool
 	depends on X86_LOCAL_APIC || X86_VOYAGER

--- diff/arch/i386/Makefile	2004-04-21 10:45:33.429621424 +0100
+++ source/arch/i386/Makefile	2004-04-21 10:46:51.182801144 +0100
@@ -19,7 +19,7 @@ LDFLAGS		:= -m elf_i386
 OBJCOPYFLAGS	:= -O binary -R .note -R .comment -S
 LDFLAGS_vmlinux :=
 
-CFLAGS += -pipe
+CFLAGS += -pipe -msoft-float
 
 # prevent gcc from keeping the stack 16 byte aligned
 CFLAGS += $(call check_gcc,-mpreferred-stack-boundary=2,)
@@ -97,6 +97,9 @@ mcore-$(CONFIG_X86_ES7000)	:= mach-es700
 # default subarch .h files
 mflags-y += -Iinclude/asm-i386/mach-default
 
+mflags-$(CONFIG_KGDB) += -gdwarf-2
+mflags-$(CONFIG_KGDB_MORE) += $(shell echo $(CONFIG_KGDB_OPTIONS) | sed -e 's/"//g')
+
 head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
 
 libs-y 					+= arch/i386/lib/

--- diff/arch/i386/kernel/Makefile	2004-04-21 10:45:33.429621424 +0100
+++ source/arch/i386/kernel/Makefile	2004-04-21 10:46:51.146806616 +0100
@@ -14,6 +14,7 @@ obj-y += timers/
 obj-$(CONFIG_ACPI_BOOT)		+= acpi/
 obj-$(CONFIG_X86_BIOS_REBOOT)	+= reboot.o
 obj-$(CONFIG_MCA)		+= mca.o
+obj-$(CONFIG_KGDB)		+= kgdb_stub.o
 obj-$(CONFIG_X86_MSR)		+= msr.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
 obj-$(CONFIG_MICROCODE)		+= microcode.o
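The SCHEDSTATS option above exposes its counters through /proc/schedstat.  A
minimal user-space reader, as an illustrative sketch (the field layout is
version-specific, so this just dumps the file as-is):

#include <stdio.h>

/* Dump the raw scheduler statistics exposed by CONFIG_SCHEDSTATS. */
int main(void)
{
	char buf[4096];
	size_t n;
	FILE *f = fopen("/proc/schedstat", "r");

	if (!f) {
		perror("/proc/schedstat");
		return 1;
	}
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		fwrite(buf, 1, n, stdout);
	fclose(f);
	return 0;
}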
--- diff/arch/i386/kernel/acpi/boot.c	2004-04-05 12:57:06.000000000 +0100
+++ source/arch/i386/kernel/acpi/boot.c	2004-04-21 10:46:51.129809200 +0100
@@ -28,7 +28,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
@@ -653,9 +653,6 @@ acpi_parse_madt_ioapic_entries(void)
 		return count;
 	}
 
-	/* Build a default routing table for legacy (ISA) interrupts. */
-	mp_config_acpi_legacy_irqs();
-
 	count = acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, NR_IRQ_VECTORS);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n");
@@ -670,6 +667,9 @@ acpi_parse_madt_ioapic_entries(void)
 	if (!acpi_sci_override_gsi)
 		acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
 
+	/* Fill in identity legacy mappings where no override */
+	mp_config_acpi_legacy_irqs();
+
 	count = acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, NR_IRQ_VECTORS);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");

--- diff/arch/i386/kernel/apic.c	2004-03-11 10:20:20.000000000 +0000
+++ source/arch/i386/kernel/apic.c	2004-04-21 10:46:51.131808896 +0100
@@ -31,7 +31,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 

--- diff/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c	2004-04-21 10:45:33.439619904 +0100
+++ source/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c	2004-04-21 10:46:51.131808896 +0100
@@ -57,8 +57,7 @@ static int cpufreq_p4_setdc(unsigned int
 	u32 l, h;
 	cpumask_t cpus_allowed, affected_cpu_map;
 	struct cpufreq_freqs freqs;
-	int hyperthreading = 0;
-	int sibling = 0;
+	int j;
 
 	if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV))
@@ -68,13 +67,10 @@ static int cpufreq_p4_setdc(unsigned int
 	cpus_allowed = current->cpus_allowed;
 
 	/* only run on CPU to be set, or on its sibling */
-	affected_cpu_map = cpumask_of_cpu(cpu);
-#ifdef CONFIG_X86_HT
-	hyperthreading = ((cpu_has_ht) && (smp_num_siblings == 2));
-	if (hyperthreading) {
-		sibling = cpu_sibling_map[cpu];
-		cpu_set(sibling, affected_cpu_map);
-	}
+#ifdef CONFIG_SMP
+	affected_cpu_map = cpu_sibling_map[cpu];
+#else
+	affected_cpu_map = cpumask_of_cpu(cpu);
 #endif
 	set_cpus_allowed(current, affected_cpu_map);
 	BUG_ON(!cpu_isset(smp_processor_id(), affected_cpu_map));
@@ -97,11 +93,11 @@ static int cpufreq_p4_setdc(unsigned int
 	/* notifiers */
 	freqs.old = stock_freq * l / 8;
 	freqs.new = stock_freq * newstate / 8;
-	freqs.cpu = cpu;
-	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-	if (hyperthreading) {
-		freqs.cpu = sibling;
-		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+	for_each_cpu(j) {
+		if (cpu_isset(j, affected_cpu_map)) {
+			freqs.cpu = j;
+			cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+		}
 	}
 
 	rdmsr(MSR_IA32_THERM_STATUS, l, h);
@@ -132,10 +128,11 @@ static int cpufreq_p4_setdc(unsigned int
 	set_cpus_allowed(current, cpus_allowed);
 
 	/* notifiers */
-	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-	if (hyperthreading) {
-		freqs.cpu = cpu;
-		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+	for_each_cpu(j) {
+		if (cpu_isset(j, affected_cpu_map)) {
+			freqs.cpu = j;
+			cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+		}
 	}
 
 	return 0;

--- diff/arch/i386/kernel/cpu/cpufreq/powernow-k7.c	2004-04-21 10:45:33.440619752 +0100
+++ source/arch/i386/kernel/cpu/cpufreq/powernow-k7.c	2004-04-21 10:46:51.132808744 +0100
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -27,6 +28,11 @@
 #include 
 #include 
 
+#ifdef CONFIG_ACPI_PROCESSOR
+#include 
+#include 
+#endif
+
 #include "powernow-k7.h"
 
 #define DEBUG
@@ -57,6 +63,17 @@ struct pst_s {
 	u8 numpstates;
 };
 
+#ifdef CONFIG_ACPI_PROCESSOR
+union powernow_acpi_control_t {
+	struct {
+		unsigned long fid:5,
+			vid:5,
+			sgtc:20,
+			res1:2;
+	} bits;
+	unsigned long val;
+};
+#endif
 
 /* divide by 1000 to get VID. */
 static int mobile_vid_table[32] = {
@@ -74,6 +91,12 @@ static int fid_codes[32] = {
 	150, 225, 160, 165, 170, 180, -1, -1,
 };
 
+/* This parameter is used in order to force ACPI instead of legacy method for
+ * configuration purpose.
+ */
+
+static int powernow_acpi_force;
+
 static struct cpufreq_frequency_table *powernow_table;
 
 static unsigned int can_scale_bus;
@@ -85,6 +108,14 @@ static unsigned int fsb;
 static unsigned int latency;
 static char have_a0;
 
+static int check_fsb(unsigned int fsbspeed)
+{
+	int delta;
+	unsigned int f = fsb / 1000;
+
+	delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
+	return (delta < 5);
+}
 
 static int check_powernow(void)
 {
@@ -140,7 +171,8 @@ static int check_powernow(void)
 
 static int get_ranges (unsigned char *pst)
 {
-	unsigned int j, speed;
+	unsigned int j;
+	unsigned int speed;
 	u8 fid, vid;
 
 	powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) * (number_scales + 1)), GFP_KERNEL);
@@ -151,12 +183,12 @@ static int get_ranges (unsigned char *ps
 	for (j=0 ; j < number_scales; j++) {
 		fid = *pst++;
 
-		powernow_table[j].frequency = fsb * fid_codes[fid] * 100;
+		powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
 		powernow_table[j].index = fid; /* lower 8 bits */
 
-		speed = fsb * (fid_codes[fid]/10);
+		speed = powernow_table[j].frequency;
+
 		if ((fid_codes[fid] % 10)==5) {
-			speed += fsb/2;
 #if defined(CONFIG_ACPI_PROCESSOR) || defined(CONFIG_ACPI_PROCESSOR_MODULE)
 			if (have_a0 == 1)
 				powernow_table[j].frequency = CPUFREQ_ENTRY_INVALID;
@@ -164,7 +196,7 @@ static int get_ranges (unsigned char *ps
 		}
 
 		dprintk (KERN_INFO PFX "   FID: 0x%x (%d.%dx [%dMHz])\t", fid,
-			fid_codes[fid] / 10, fid_codes[fid] % 10, speed);
+			fid_codes[fid] / 10, fid_codes[fid] % 10, speed/1000);
 
 		if (speed < minimum_speed)
 			minimum_speed = speed;
@@ -176,8 +208,6 @@ static int get_ranges (unsigned char *ps
 		dprintk ("VID: 0x%x (%d.%03dV)\n", vid, mobile_vid_table[vid]/1000,
 			 mobile_vid_table[vid]%1000);
 	}
-	dprintk ("\n");
-
 	powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
 	powernow_table[number_scales].index = 0;
 
@@ -234,7 +264,8 @@ static void change_speed (unsigned int i
 	rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
 	cfid = fidvidstatus.bits.CFID;
 
-	freqs.old = fsb * fid_codes[cfid] * 100;
+	freqs.old = fsb * fid_codes[cfid] / 10;
+
 	freqs.new = powernow_table[index].frequency;
 
 	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
@@ -262,19 +293,136 @@ static void change_speed (unsigned int i
 }
 
+#ifdef CONFIG_ACPI_PROCESSOR
+
+struct acpi_processor_performance *acpi_processor_perf;
+
+static int powernow_acpi_init(void)
+{
+	int i;
+	int retval = 0;
+	union powernow_acpi_control_t pc;
+
+	if (acpi_processor_perf != NULL && powernow_table != NULL) {
+		retval = -EINVAL;
+		goto err0;
+	}
+
+	acpi_processor_perf = kmalloc(sizeof(struct acpi_processor_performance),
+				      GFP_KERNEL);
+
+	if (!acpi_processor_perf) {
+		retval = -ENOMEM;
+		goto err0;
+	}
+
+	memset(acpi_processor_perf, 0, sizeof(struct acpi_processor_performance));
+
+	if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
+		retval = -EIO;
+		goto err1;
+	}
+
+	if (acpi_processor_perf->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) {
+		retval = -ENODEV;
+		goto err2;
+	}
+
+	if (acpi_processor_perf->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) {
+		retval = -ENODEV;
+		goto err2;
+	}
+
+	number_scales = acpi_processor_perf->state_count;
+
+	if (number_scales < 2) {
+		retval = -ENODEV;
+		goto err2;
+	}
+
+	powernow_table = kmalloc((number_scales + 1) * (sizeof(struct cpufreq_frequency_table)), GFP_KERNEL);
+	if (!powernow_table) {
+		retval = -ENOMEM;
+		goto err2;
+	}
+
+	memset(powernow_table, 0, ((number_scales + 1) * sizeof(struct cpufreq_frequency_table)));
+
+	pc.val = (unsigned long) acpi_processor_perf->states[0].control;
+	for (i = 0; i < number_scales; i++) {
+		u8 fid, vid;
+		unsigned int speed;
+
+		pc.val = (unsigned long) acpi_processor_perf->states[i].control;
+		dprintk (KERN_INFO PFX "acpi:  P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
+			i,
+			(u32) acpi_processor_perf->states[i].core_frequency,
+			(u32) acpi_processor_perf->states[i].power,
+			(u32) acpi_processor_perf->states[i].transition_latency,
+			(u32) acpi_processor_perf->states[i].control,
+			pc.bits.sgtc);
+
+		vid = pc.bits.vid;
+		fid = pc.bits.fid;
+
+		powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
+		powernow_table[i].index = fid; /* lower 8 bits */
+		powernow_table[i].index |= (vid << 8); /* upper 8 bits */
+
+		speed = powernow_table[i].frequency;
+
+		if ((fid_codes[fid] % 10)==5) {
+			if (have_a0 == 1)
+				powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
+		}
+
+		dprintk (KERN_INFO PFX "   FID: 0x%x (%d.%dx [%dMHz])\t", fid,
+			fid_codes[fid] / 10, fid_codes[fid] % 10, speed/1000);
+		dprintk ("VID: 0x%x (%d.%03dV)\n", vid, mobile_vid_table[vid]/1000,
+			 mobile_vid_table[vid]%1000);
+
+		if (latency < pc.bits.sgtc)
+			latency = pc.bits.sgtc;
+
+		if (speed < minimum_speed)
+			minimum_speed = speed;
+		if (speed > maximum_speed)
+			maximum_speed = speed;
+	}
+
+	powernow_table[i].frequency = CPUFREQ_TABLE_END;
+	powernow_table[i].index = 0;
+
+	return 0;
+
+err2:
+	acpi_processor_unregister_performance(acpi_processor_perf, 0);
+err1:
+	kfree(acpi_processor_perf);
+err0:
+	printk(KERN_WARNING PFX "ACPI perflib cannot be used on this platform\n");
+	acpi_processor_perf = NULL;
+	return retval;
+}
+#else
+static int powernow_acpi_init(void)
+{
+	printk(KERN_INFO PFX "no support for ACPI processor found."
+	       "  Please recompile your kernel with ACPI processor\n");
+	return -EINVAL;
+}
+#endif
+
 static int powernow_decode_bios (int maxfid, int startvid)
 {
 	struct psb_s *psb;
 	struct pst_s *pst;
-	struct cpuinfo_x86 *c = cpu_data;
 	unsigned int i, j;
 	unsigned char *p;
 	unsigned int etuple;
 	unsigned int ret;
 
 	etuple = cpuid_eax(0x80000001);
-	etuple &= 0xf00;
-	etuple |= (c->x86_model<<4)|(c->x86_mask);
 
 	for (i=0xC0000; i < 0xffff0 ; i+=16) {
@@ -305,7 +453,6 @@ static int powernow_decode_bios (int max
 			}
 
 			dprintk (KERN_INFO PFX "Settling Time: %d microseconds.\n", psb->settlingtime);
 			dprintk (KERN_INFO PFX "Has %d PST tables. (Only dumping ones relevant to this CPU).\n", psb->numpst);
-			latency *= 100;	/* SGTC needs to be in units of 10ns */
 
 			p += sizeof (struct psb_s);
 
@@ -315,7 +462,8 @@ static int powernow_decode_bios (int max
 				pst = (struct pst_s *) p;
 				number_scales = pst->numpstates;
 
-				if ((etuple == pst->cpuid) && (maxfid==pst->maxfid) && (startvid==pst->startvid))
+				if ((etuple == pst->cpuid) && check_fsb(pst->fsbspeed) &&
+				    (maxfid==pst->maxfid) && (startvid==pst->startvid))
 				{
 					dprintk (KERN_INFO PFX "PST:%d (@%p)\n", i, pst);
 					dprintk (KERN_INFO PFX " cpuid: 0x%x\t", pst->cpuid);
@@ -323,7 +471,6 @@ static int powernow_decode_bios (int max
 					dprintk ("maxFID: 0x%x\t", pst->maxfid);
 					dprintk ("startvid: 0x%x\n", pst->startvid);
 
-					fsb = pst->fsbspeed;
 					ret = get_ranges ((char *) pst + sizeof (struct pst_s));
 					return ret;
 
@@ -335,7 +482,7 @@ static int powernow_decode_bios (int max
 			}
 			printk (KERN_INFO PFX "No PST tables match this cpuid (0x%x)\n", etuple);
 			printk (KERN_INFO PFX "This is indicative of a broken BIOS.\n");
-			printk (KERN_INFO PFX "See http://www.codemonkey.org.uk/projects/cpufreq/powernow-k7.shtml\n");
+
 			return -EINVAL;
 		}
 		p++;
@@ -365,6 +512,33 @@ static int powernow_verify (struct cpufr
 	return cpufreq_frequency_table_verify(policy, powernow_table);
 }
 
+/*
+ * We use the fact that the bus frequency is somehow
+ * a multiple of 100000/3 khz, then we compute sgtc according
+ * to this multiple.
+ * That way, we match more how AMD thinks all of that work.
+ * We will then get the same kind of behaviour already tested under
+ * the "well-known" other OS.
+ */
+static int __init fixup_sgtc(void)
+{
+	unsigned int sgtc;
+	unsigned int m;
+
+	m = fsb / 3333;
+	if ((m % 10) >= 5)
+		m += 5;
+
+	m /= 10;
+
+	sgtc = 100 * m * latency;
+	sgtc = sgtc / 3;
+	if (sgtc > 0xfffff) {
+		printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
+		sgtc = 0xfffff;
+	}
+	return sgtc;
+}
 
 static int __init powernow_cpu_init (struct cpufreq_policy *policy)
 {
@@ -376,18 +550,45 @@ static int __init powernow_cpu_init (str
 
 	rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
 
-	result = powernow_decode_bios(fidvidstatus.bits.MFID, fidvidstatus.bits.SVID);
+	/* A K7 with powernow technology is set to max frequency by BIOS */
+	fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
+	if (!fsb) {
+		printk(KERN_WARNING PFX "can not determine bus frequency\n");
+		return -EINVAL;
+	}
+	dprintk(KERN_INFO PFX "FSB: %3d.%03d MHz\n", fsb/1000, fsb%1000);
+
+	if ((dmi_broken & BROKEN_CPUFREQ) || powernow_acpi_force) {
+		printk (KERN_INFO PFX "PSB/PST known to be broken.  Trying ACPI instead\n");
+		result = powernow_acpi_init();
+	} else {
+		result = powernow_decode_bios(fidvidstatus.bits.MFID, fidvidstatus.bits.SVID);
+		if (result) {
+			printk (KERN_INFO PFX "Trying ACPI perflib\n");
+			maximum_speed = 0;
+			minimum_speed = -1;
+			latency = 0;
+			result = powernow_acpi_init();
+			if (result) {
+				printk (KERN_INFO PFX "ACPI and legacy methods failed\n");
+				printk (KERN_INFO PFX "See http://www.codemonkey.org.uk/projects/cpufreq/powernow-k7.shtml\n");
+			}
+		} else {
+			/* SGTC use the bus clock as timer */
+			latency = fixup_sgtc();
+			printk(KERN_INFO PFX "SGTC: %d\n", latency);
+		}
+	}
+
 	if (result)
 		return result;
 
 	printk (KERN_INFO PFX "Minimum speed %d MHz.  Maximum speed %d MHz.\n",
-				minimum_speed, maximum_speed);
+				minimum_speed/1000, maximum_speed/1000);
 
 	policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
 
-	/* latency is in 10 ns (look for SGTC above) for each VID
-	 * and FID transition, so multiply that value with 20 */
-	policy->cpuinfo.transition_latency = latency * 20;
+	policy->cpuinfo.transition_latency = 20 * latency / fsb;
 
 	policy->cur = maximum_speed;
 
@@ -418,10 +619,6 @@ static struct cpufreq_driver powernow_dr
 
 static int __init powernow_init (void)
 {
-	if (dmi_broken & BROKEN_CPUFREQ) {
-		printk (KERN_INFO PFX "Disabled at boot time by DMI,\n");
-		return -ENODEV;
-	}
 	if (check_powernow()==0)
 		return -ENODEV;
 	return cpufreq_register_driver(&powernow_driver);
@@ -430,15 +627,25 @@ static int __init powernow_init (void)
 
 static void __exit powernow_exit (void)
 {
+#ifdef CONFIG_ACPI_PROCESSOR
+	if (acpi_processor_perf) {
+		acpi_processor_unregister_performance(acpi_processor_perf, 0);
+		kfree(acpi_processor_perf);
+	}
+#endif
 	cpufreq_unregister_driver(&powernow_driver);
 	if (powernow_table)
 		kfree(powernow_table);
 }
 
+module_param(powernow_acpi_force, int, 0444);
+
+MODULE_PARM_DESC(powernow_acpi_force, "Force ACPI to be used");
+
 MODULE_AUTHOR ("Dave Jones ");
 MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors.");
 MODULE_LICENSE ("GPL");
 
-module_init(powernow_init);
+late_initcall(powernow_init);
 module_exit(powernow_exit);

--- diff/arch/i386/kernel/cpu/cpufreq/powernow-k8.c	2004-04-21 10:45:33.442619448 +0100
+++ source/arch/i386/kernel/cpu/cpufreq/powernow-k8.c	2004-04-21 10:46:51.133808592 +0100
@@ -32,7 +32,7 @@
 #include 
 #include 
 
-#ifdef CONFIG_X86_POWERNOW_K8_ACPI
+#ifdef CONFIG_ACPI_PROCESSOR
 #include 
 #include 
 #endif
@@ -269,7 +269,7 @@ static int core_voltage_pre_transition(s
 
 	dprintk(KERN_DEBUG PFX
 		"ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, reqvid 0x%x, rvo 0x%x\n",
-		smp_processor_id();
+		smp_processor_id(),
 		data->currfid, data->currvid, reqvid, data->rvo);
 
 	while (data->currvid > reqvid) {
@@ -666,7 +666,7 @@ static int find_psb_table(struct powerno
 	return -ENODEV;
 }
 
-#ifdef CONFIG_X86_POWERNOW_K8_ACPI
+#ifdef CONFIG_ACPI_PROCESSOR
 static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index)
 {
 	if (!data->acpi_data.state_count)

--- diff/arch/i386/kernel/cpu/cpufreq/powernow-k8.h	2004-04-21 10:45:33.442619448 +0100
+++ source/arch/i386/kernel/cpu/cpufreq/powernow-k8.h	2004-04-21 10:46:51.133808592 +0100
@@ -29,7 +29,7 @@ struct powernow_k8_data {
 	 * frequency is in kHz */
 	struct cpufreq_frequency_table  *powernow_table;
 
-#ifdef CONFIG_X86_POWERNOW_K8_ACPI
+#ifdef CONFIG_ACPI_PROCESSOR
 	/* the acpi table needs to be kept. it's only available if ACPI was
 	 * used to determine valid frequency/vid/fid states */
 	struct acpi_processor_performance acpi_data;

--- diff/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c	2004-04-21 10:45:33.443619296 +0100
+++ source/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c	2004-04-21 10:46:51.134808440 +0100
@@ -195,21 +195,6 @@ static int centrino_cpu_init_table(struc
 	struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu];
 	struct cpu_model *model;
 
-	if (!cpu_has(cpu, X86_FEATURE_EST))
-		return -ENODEV;
-
-	/* Only Intel Pentium M stepping 5 for now - add new CPUs as
-	   they appear after making sure they use PERF_CTL in the same
-	   way. */
-	if (cpu->x86_vendor != X86_VENDOR_INTEL ||
-	    cpu->x86 != 6 ||
-	    cpu->x86_model != 9 ||
-	    cpu->x86_mask != 5) {
-		printk(KERN_INFO PFX "found unsupported CPU with Enhanced SpeedStep: "
-		       "send /proc/cpuinfo to " MAINTAINER "\n");
-		return -ENODEV;
-	}
-
 	for(model = models; model->model_name != NULL; model++)
 		if (strcmp(cpu->x86_model_id, model->model_name) == 0)
 			break;
@@ -361,6 +346,7 @@ static inline int centrino_cpu_init_acpi
 
 static int centrino_cpu_init(struct cpufreq_policy *policy)
 {
+	struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu];
 	unsigned freq;
 	unsigned l, h;
 	int ret;
@@ -368,6 +354,21 @@ static int centrino_cpu_init(struct cpuf
 	if (policy->cpu != 0)
 		return -ENODEV;
 
+	if (!cpu_has(cpu, X86_FEATURE_EST))
+		return -ENODEV;
+
+	/* Only Intel Pentium M stepping 5 for now - add new CPUs as
+	   they appear after making sure they use PERF_CTL in the same
+	   way. */
+	if (cpu->x86_vendor != X86_VENDOR_INTEL ||
+	    cpu->x86 != 6 ||
+	    cpu->x86_model != 9 ||
+	    cpu->x86_mask != 5) {
+		printk(KERN_INFO PFX "found unsupported CPU with Enhanced SpeedStep: "
+		       "send /proc/cpuinfo to " MAINTAINER "\n");
+		return -ENODEV;
+	}
+
 	if (centrino_cpu_init_acpi(policy)) {
 		if (centrino_cpu_init_table(policy)) {
 			return -ENODEV;

--- diff/arch/i386/kernel/dmi_scan.c	2004-04-21 10:45:33.460616712 +0100
+++ source/arch/i386/kernel/dmi_scan.c	2004-04-21 10:46:51.135808288 +0100
@@ -360,6 +360,22 @@ static __init int fix_broken_hp_bios_irq
 }
 
 /*
+ * Work around broken Acer TravelMate 360 Notebooks which assign Cardbus to
+ * IRQ 11 even though it is actually wired to IRQ 10
+ */
+static __init int fix_acer_tm360_irqrouting(struct dmi_blacklist *d)
+{
+#ifdef CONFIG_PCI
+	extern int acer_tm360_irqrouting;
+	if (acer_tm360_irqrouting == 0)
+	{
+		acer_tm360_irqrouting = 1;
+		printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
+	}
+#endif
+	return 0;
+}
+/*
  * Check for clue free BIOS implementations who use
  * the following QA technique
 
@@ -413,30 +429,6 @@ static __init int swab_apm_power_in_minu
 }
 
 /*
- * The Intel 440GX hall of shame.
- *
- * On many (all we have checked) of these boxes the $PIRQ table is wrong.
- * The MP1.4 table is right however and so SMP kernels tend to work.
- */
-
-static __init int broken_pirq(struct dmi_blacklist *d)
-{
-
-	printk(KERN_INFO " *** Possibly defective BIOS detected (irqtable)\n");
-	printk(KERN_INFO " *** Many BIOSes matching this signature have incorrect IRQ routing tables.\n");
-	printk(KERN_INFO " *** If you see IRQ problems, in particular SCSI resets and hangs at boot\n");
-	printk(KERN_INFO " *** contact your hardware vendor and ask about updates.\n");
-	printk(KERN_INFO " *** Building an SMP kernel may evade the bug some of the time.\n");
-#ifdef CONFIG_X86_IO_APIC
-	{
-		extern int skip_ioapic_setup;
-		skip_ioapic_setup = 0;
-	}
-#endif
-	return 0;
-}
-
-/*
  * ASUS K7V-RM has broken ACPI table defining sleep modes
  */
 
@@ -815,52 +807,6 @@ static __initdata struct dmi_blacklist d
 			NO_MATCH, NO_MATCH
 			} },
 
-	/* Problem Intel 440GX bioses */
-
-	{ broken_pirq, "SABR1 Bios", {	/* Bad $PIR */
-			MATCH(DMI_BIOS_VENDOR, "Intel Corporation"),
-			MATCH(DMI_BIOS_VERSION,"SABR1"),
-			NO_MATCH, NO_MATCH
-			} },
-	{ broken_pirq, "l44GX Bios", {	/* Bad $PIR */
-			MATCH(DMI_BIOS_VENDOR, "Intel Corporation"),
-			MATCH(DMI_BIOS_VERSION,"L440GX0.86B.0094.P10"),
-			NO_MATCH, NO_MATCH
-			} },
-	{ broken_pirq, "l44GX Bios", {	/* Bad $PIR */
-			MATCH(DMI_BIOS_VENDOR, "Intel Corporation"),
-			MATCH(DMI_BIOS_VERSION,"L440GX0.86B.0115.P12"),
-			NO_MATCH, NO_MATCH
-			} },
-	{ broken_pirq, "l44GX Bios", {	/* Bad $PIR */
-			MATCH(DMI_BIOS_VENDOR, "Intel Corporation"),
-			MATCH(DMI_BIOS_VERSION,"L440GX0.86B.0120.P12"),
-			NO_MATCH, NO_MATCH
-			} },
-	{ broken_pirq, "l44GX Bios", {	/* Bad $PIR */
-			MATCH(DMI_BIOS_VENDOR, "Intel Corporation"),
-			MATCH(DMI_BIOS_VERSION,"L440GX0.86B.0125.P13"),
-			NO_MATCH, NO_MATCH
-			} },
-	{ broken_pirq, "l44GX Bios", {	/* Bad $PIR */
-			MATCH(DMI_BIOS_VENDOR, "Intel Corporation"),
-			MATCH(DMI_BIOS_VERSION,"L440GX0.86B.0066.P07.9906041405"),
-			NO_MATCH, NO_MATCH
-			} },
-
-	{ broken_pirq, "IBM xseries 370", {	/* Bad $PIR */
-			MATCH(DMI_BIOS_VENDOR, "IBM"),
-			MATCH(DMI_BIOS_VERSION,"MMKT33AUS"),
-			NO_MATCH, NO_MATCH
-			} },
-
-	/* Intel in disguise - In this case they can't hide and they don't run
-	   too well either... */
-	{ broken_pirq, "Dell PowerEdge 8450", {	/* Bad $PIR */
-			MATCH(DMI_PRODUCT_NAME, "Dell PowerEdge 8450"),
-			NO_MATCH, NO_MATCH, NO_MATCH
-			} },
-
 	{ broken_acpi_Sx, "ASUS K7V-RM", {		/* Bad ACPI Sx table */
 			MATCH(DMI_BIOS_VERSION,"ASUS K7V-RM ACPI BIOS Revision 1003A"),
 			MATCH(DMI_BOARD_NAME, ""),
@@ -894,6 +840,13 @@ static __initdata struct dmi_blacklist d
 			MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
 			MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736")
 			} },
+
+	{ fix_acer_tm360_irqrouting, "Acer TravelMate 36x Laptop", {
+			MATCH(DMI_SYS_VENDOR, "Acer"),
+			MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
+			NO_MATCH, NO_MATCH
+			} },
+
 
 	/*
@@ -1032,6 +985,12 @@ static __initdata struct dmi_blacklist d
 			MATCH(DMI_BIOS_VERSION, "ASUS A7V ACPI BIOS Revision 1007"),
 			NO_MATCH }},
 
+	{ disable_acpi_pci, "Acer TravelMate 36x Laptop", {
+			MATCH(DMI_SYS_VENDOR, "Acer"),
+			MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
+			NO_MATCH, NO_MATCH
+			} },
+
 #endif
 
 	{ NULL, }

--- diff/arch/i386/kernel/efi.c	2004-02-09 10:36:07.000000000 +0000
+++ source/arch/i386/kernel/efi.c	2004-04-21 10:46:51.136808136 +0100
@@ -37,7 +37,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 #define EFI_DEBUG	0

--- diff/arch/i386/kernel/entry.S	2004-04-21 10:45:33.461616560 +0100
+++ source/arch/i386/kernel/entry.S	2004-04-21 10:46:51.136808136 +0100
@@ -48,6 +48,18 @@
 #include 
 #include 
 #include 
 #include "irq_vectors.h"
+	/* We do not recover from a stack overflow, but at least
+	 * we know it happened and should be able to track it down.
+	 */
+#ifdef CONFIG_STACK_OVERFLOW_TEST
+#define STACK_OVERFLOW_TEST \
+	testl $(THREAD_SIZE - 512),%esp; \
+	jnz   10f; \
+	call  stack_overflow; \
+10:
+#else
+#define STACK_OVERFLOW_TEST
+#endif
 
 #define nr_syscalls ((syscall_table_size)/4)
 
@@ -100,7 +112,8 @@ TSS_ESP0_OFFSET = (4 - 0x200)
 	pushl %ebx; \
 	movl $(__USER_DS), %edx; \
 	movl %edx, %ds; \
-	movl %edx, %es;
+	movl %edx, %es; \
+	STACK_OVERFLOW_TEST
 
 #define RESTORE_INT_REGS \
 	popl %ebx; \
@@ -300,6 +313,19 @@ syscall_exit:
 	testw $_TIF_ALLWORK_MASK, %cx	# current->work
 	jne syscall_exit_work
 restore_all:
+#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS
+	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
+	movb CS(%esp), %al
+	testl $(VM_MASK | 3), %eax
+	jz resume_kernelX		# returning to kernel or vm86-space
+
+	cmpl $0,TI_PRE_COUNT(%ebx)	# non-zero preempt_count ?
+	jz resume_kernelX
+
+	int $3
+
+resume_kernelX:
+#endif
 	RESTORE_ALL
 
 # perform work that needs to be done immediately before resumption
@@ -882,9 +908,9 @@ ENTRY(sys_call_table)
 	.long sys_utimes
 	.long sys_fadvise64_64
 	.long sys_ni_syscall	/* sys_vserver */
-	.long sys_ni_syscall	/* sys_mbind */
-	.long sys_ni_syscall	/* 275 sys_get_mempolicy */
-	.long sys_ni_syscall	/* sys_set_mempolicy */
+	.long sys_mbind
+	.long sys_get_mempolicy
+	.long sys_set_mempolicy
 	.long sys_mq_open
 	.long sys_mq_unlink
 	.long sys_mq_timedsend

--- diff/arch/i386/kernel/i386_ksyms.c	2004-04-05 12:57:06.000000000 +0100
+++ source/arch/i386/kernel/i386_ksyms.c	2004-04-21 10:46:51.140807528 +0100
@@ -29,7 +29,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 

--- diff/arch/i386/kernel/io_apic.c	2004-04-21 10:45:33.467615648 +0100
+++ source/arch/i386/kernel/io_apic.c	2004-04-21 10:46:51.141807376 +0100
@@ -317,8 +317,7 @@ struct irq_cpu_info {
 
 #define IRQ_ALLOWED(cpu, allowed_mask)	cpu_isset(cpu, allowed_mask)
 
-#define CPU_TO_PACKAGEINDEX(i) \
-		((physical_balance && i > cpu_sibling_map[i]) ? cpu_sibling_map[i] : i)
+#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
 
 #define MAX_BALANCED_IRQ_INTERVAL	(5*HZ)
 #define MIN_BALANCED_IRQ_INTERVAL	(HZ/2)
@@ -401,6 +400,7 @@ static void do_irq_balance(void)
 	unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
 	unsigned long move_this_load = 0;
 	int max_loaded = 0, min_loaded = 0;
+	int load;
 	unsigned long useful_load_threshold = balanced_irq_interval + 10;
 	int selected_irq;
 	int tmp_loaded, first_attempt = 1;
@@ -452,7 +452,7 @@ static void do_irq_balance(void)
 	for (i = 0; i < NR_CPUS; i++) {
 		if (!cpu_online(i))
 			continue;
-		if (physical_balance && i > cpu_sibling_map[i])
+		if (i != CPU_TO_PACKAGEINDEX(i))
 			continue;
 		if (min_cpu_irq > CPU_IRQ(i)) {
 			min_cpu_irq = CPU_IRQ(i);
@@ -471,7 +471,7 @@ tryanothercpu:
 	for (i = 0; i < NR_CPUS; i++) {
 		if (!cpu_online(i))
 			continue;
-		if (physical_balance && i > cpu_sibling_map[i])
+		if (i != CPU_TO_PACKAGEINDEX(i))
 			continue;
 		if (max_cpu_irq <= CPU_IRQ(i))
 			continue;
@@ -551,9 +551,14 @@ tryanotherirq:
 	 * We seek the least loaded sibling by making the comparison
 	 * (A+B)/2 vs B
 	 */
-	if (physical_balance && (CPU_IRQ(min_loaded) >> 1) >
-					CPU_IRQ(cpu_sibling_map[min_loaded]))
-		min_loaded = cpu_sibling_map[min_loaded];
+	load = CPU_IRQ(min_loaded) >> 1;
+	for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
+		if (load > CPU_IRQ(j)) {
+			/* This won't change cpu_sibling_map[min_loaded] */
+			load = CPU_IRQ(j);
+			min_loaded = j;
+		}
+	}
 
 	cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]);
 	target_cpu_mask = cpumask_of_cpu(min_loaded);
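The balancing code above now walks cpu_sibling_map[], which becomes a
cpumask_t per CPU (see the smpboot.c hunks below), using first_cpu() and
for_each_cpu_mask().  A plain-C sketch of that iteration pattern, using an
unsigned long in place of the kernel's cpumask_t for illustration:

#include <stdio.h>

#define NR_CPUS 8

/* Lowest set bit, i.e. what first_cpu() returns for a cpumask. */
static int first_cpu(unsigned long mask)
{
	int i;

	for (i = 0; i < NR_CPUS; i++)
		if (mask & (1UL << i))
			return i;
	return NR_CPUS;
}

int main(void)
{
	unsigned long sibling_map = 0x0c;	/* CPUs 2 and 3 are siblings */
	int j;

	printf("package index: %d\n", first_cpu(sibling_map));
	for (j = 0; j < NR_CPUS; j++)		/* for_each_cpu_mask(j, map) */
		if (sibling_map & (1UL << j))
			printf("sibling: %d\n", j);
	return 0;
}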
--- diff/arch/i386/kernel/irq.c	2004-04-21 10:45:33.467615648 +0100
+++ source/arch/i386/kernel/irq.c	2004-04-21 10:46:51.142807224 +0100
@@ -41,7 +41,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -570,6 +569,8 @@ out:
 
 	irq_exit();
 
+	kgdb_process_breakpoint();
+
 	return 1;
 }

--- diff/arch/i386/kernel/mpparse.c	2004-04-05 12:57:06.000000000 +0100
+++ source/arch/i386/kernel/mpparse.c	2004-04-21 10:46:51.155805248 +0100
@@ -28,7 +28,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
@@ -929,8 +928,6 @@ void __init mp_override_legacy_irq (
 	u32			gsi)
 {
 	struct mpc_config_intsrc intsrc;
-	int			i = 0;
-	int			found = 0;
 	int			ioapic = -1;
 	int			pin = -1;
@@ -963,23 +960,9 @@ void __init mp_override_legacy_irq (
 		(intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
 		intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
 
-	/*
-	 * If an existing [IOAPIC.PIN -> IRQ] routing entry exists we override it.
-	 * Otherwise create a new entry (e.g. gsi == 2).
-	 */
-	for (i = 0; i < mp_irq_entries; i++) {
-		if ((mp_irqs[i].mpc_srcbus == intsrc.mpc_srcbus)
-			&& (mp_irqs[i].mpc_srcbusirq == intsrc.mpc_srcbusirq)) {
-			mp_irqs[i] = intsrc;
-			found = 1;
-			break;
-		}
-	}
-	if (!found) {
-		mp_irqs[mp_irq_entries] = intsrc;
-		if (++mp_irq_entries == MAX_IRQ_SOURCES)
-			panic("Max # of irq sources exceeded!\n");
-	}
+	mp_irqs[mp_irq_entries] = intsrc;
+	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+		panic("Max # of irq sources exceeded!\n");
 
 	return;
 }
@@ -1010,13 +993,20 @@ void __init mp_config_acpi_legacy_irqs (
 	intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
 
 	/*
-	 * Use the default configuration for the IRQs 0-15.  These may be
+	 * Use the default configuration for the IRQs 0-15.  Unless
 	 * overriden by (MADT) interrupt source override entries.
 	 */
 	for (i = 0; i < 16; i++) {
+		int idx;
+
+		for (idx = 0; idx < mp_irq_entries; idx++)
+			if (mp_irqs[idx].mpc_srcbus == MP_ISA_BUS &&
+			    (mp_irqs[idx].mpc_srcbusirq == i ||
+			     mp_irqs[idx].mpc_dstirq == i))
+				break;
 
-		if (i == 2)
-			continue;	/* Don't connect IRQ2 */
+		if (idx != mp_irq_entries)
+			continue;	/* IRQ already used */
 
 		intsrc.mpc_irqtype = mp_INT;
 		intsrc.mpc_srcbusirq = i;		   /* Identity mapped */

--- diff/arch/i386/kernel/nmi.c	2004-04-21 10:45:33.468615496 +0100
+++ source/arch/i386/kernel/nmi.c	2004-04-21 10:46:51.155805248 +0100
@@ -31,9 +31,19 @@
 #include 
 #include 
 
+#ifdef CONFIG_KGDB
+#include 
+#ifdef CONFIG_SMP
+unsigned int nmi_watchdog = NMI_IO_APIC;
+#else
+unsigned int nmi_watchdog = NMI_LOCAL_APIC;
+#endif
+#else
 unsigned int nmi_watchdog = NMI_NONE;
+#endif
 static unsigned int nmi_hz = HZ;
-unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
+static unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
+static unsigned int nmi_p4_cccr_val;
 extern void show_registers(struct pt_regs *regs);
 
 /* nmi_active:
@@ -66,7 +76,8 @@ int nmi_active;
 #define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
 #define P4_ESCR_OS		(1<<3)
 #define P4_ESCR_USR		(1<<2)
-#define P4_CCCR_OVF_PMI		(1<<26)
+#define P4_CCCR_OVF_PMI0	(1<<26)
+#define P4_CCCR_OVF_PMI1	(1<<27)
 #define P4_CCCR_THRESHOLD(N)	((N)<<20)
 #define P4_CCCR_COMPLEMENT	(1<<19)
 #define P4_CCCR_COMPARE		(1<<18)
@@ -79,7 +90,7 @@ int nmi_active;
 #define MSR_P4_IQ_COUNTER0	0x30C
 #define P4_NMI_CRU_ESCR0	(P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
 #define P4_NMI_IQ_CCCR0	\
-	(P4_CCCR_OVF_PMI|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT|	\
+	(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT|	\
 	 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
 
 int __init check_nmi_watchdog (void)
@@ -322,6 +333,11 @@ static int setup_p4_watchdog(void)
 		return 0;
 
 	nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
+	nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
+#ifdef CONFIG_SMP
+	if (smp_num_siblings == 2)
+		nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
+#endif
 
 	if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
 		clear_msr_range(0x3F1, 2);
@@ -339,7 +355,7 @@ static int setup_p4_watchdog(void)
 	Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
 	wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
+	wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
 	return 1;
 }
@@ -408,6 +424,9 @@ void touch_nmi_watchdog (void)
 	for (i = 0; i < NR_CPUS; i++)
 		alert_counter[i] = 0;
 }
+#ifdef CONFIG_KGDB
+int tune_watchdog = 5*HZ;
+#endif
 
 void nmi_watchdog_tick (struct pt_regs * regs)
 {
@@ -421,12 +440,24 @@ void nmi_watchdog_tick (struct pt_regs *
 
 	sum = irq_stat[cpu].apic_timer_irqs;
 
+#ifdef CONFIG_KGDB
+	if (! in_kgdb(regs) && last_irq_sums[cpu] == sum ) {
+
+#else
 	if (last_irq_sums[cpu] == sum) {
+#endif
 		/*
 		 * Ayiee, looks like this CPU is stuck ...
 		 * wait a few IRQs (5 seconds) before doing the oops ...
 		 */
 		alert_counter[cpu]++;
+#ifdef CONFIG_KGDB
+		if (alert_counter[cpu] == tune_watchdog) {
+			kgdb_handle_exception(2, SIGPWR, 0, regs);
+			last_irq_sums[cpu] = sum;
+			alert_counter[cpu] = 0;
+		}
+#endif
 		if (alert_counter[cpu] == 5*nmi_hz) {
 			spin_lock(&nmi_print_lock);
 			/*
@@ -455,7 +486,7 @@ void nmi_watchdog_tick (struct pt_regs *
 			 * - LVTPC is masked on interrupt and must be
 			 *   unmasked by the LVTPC handler.
 			 */
-			wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
+			wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
 			apic_write(APIC_LVTPC, APIC_DM_NMI);
 		}
 		else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) {

--- diff/arch/i386/kernel/process.c	2004-04-21 10:45:33.473614736 +0100
+++ source/arch/i386/kernel/process.c	2004-04-21 10:46:51.156805096 +0100
@@ -629,11 +629,6 @@ out:
 	return error;
 }
 
-/*
- * These bracket the sleeping functions..
- */
-#define first_sched	((unsigned long) scheduling_functions_start_here)
-#define last_sched	((unsigned long) scheduling_functions_end_here)
 #define top_esp                (THREAD_SIZE - sizeof(unsigned long))
 #define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
 
@@ -654,14 +649,12 @@ unsigned long get_wchan(struct task_stru
 		if (ebp < stack_page || ebp > top_ebp+stack_page)
 			return 0;
 		eip = *(unsigned long *) (ebp+4);
-		if (eip < first_sched || eip >= last_sched)
+		if (!in_sched_functions(eip))
 			return eip;
 		ebp = *(unsigned long *) ebp;
 	} while (count++ < 16);
 	return 0;
 }
-#undef last_sched
-#undef first_sched
 
 /*
  * sys_alloc_thread_area: get a yet unused TLS descriptor index.

--- diff/arch/i386/kernel/setup.c	2004-04-21 10:45:33.483613216 +0100
+++ source/arch/i386/kernel/setup.c	2004-04-21 10:46:51.156805096 +0100
@@ -128,7 +128,6 @@ unsigned long saved_videomode;
 #define RAMDISK_LOAD_FLAG		0x4000
 
 static char command_line[COMMAND_LINE_SIZE];
-       char saved_command_line[COMMAND_LINE_SIZE];
 
 unsigned char __initdata boot_params[PARAM_SIZE];

--- diff/arch/i386/kernel/smp.c	2004-03-11 10:20:20.000000000 +0000
+++ source/arch/i386/kernel/smp.c	2004-04-21 10:46:51.167803424 +0100
@@ -21,7 +21,6 @@
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -466,7 +465,17 @@ void flush_tlb_all(void)
 {
 	on_each_cpu(do_flush_tlb_all, 0, 1, 1);
 }
-
+#ifdef CONFIG_KGDB
+/*
+ * By using the NMI code instead of a vector we just sneak thru the
+ * word generator coming out with just what we want.  AND it does
+ * not matter if clustered_apic_mode is set or not.
+ */
+void smp_send_nmi_allbutself(void)
+{
+	send_IPI_allbutself(APIC_DM_NMI);
+}
+#endif
 /*
  * this function sends a 'reschedule' IPI to another CPU.
  * it goes straight through and wastes no time serializing

--- diff/arch/i386/kernel/smpboot.c	2004-04-21 10:45:33.484613064 +0100
+++ source/arch/i386/kernel/smpboot.c	2004-04-21 10:46:51.157804944 +0100
@@ -39,6 +39,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -46,7 +47,6 @@
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -936,7 +936,7 @@ static int boot_cpu_logical_apicid;
 /* Where the IO area was mapped on multiquad, always 0 otherwise */
 void *xquad_portio;
 
-int cpu_sibling_map[NR_CPUS] __cacheline_aligned;
+cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
 
 static void __init smp_boot_cpus(unsigned int max_cpus)
 {
@@ -955,6 +955,8 @@ static void __init smp_boot_cpus(unsigne
 
 	current_thread_info()->cpu = 0;
 	smp_tune_scheduling();
+	cpus_clear(cpu_sibling_map[0]);
+	cpu_set(0, cpu_sibling_map[0]);
 
 	/*
 	 * If we couldn't find an SMP configuration at boot time,
@@ -1081,34 +1083,39 @@ static void __init smp_boot_cpus(unsigne
 	Dprintk("Boot done.\n");
 
 	/*
-	 * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so
-	 * that we can tell the sibling CPU efficiently.
+	 * construct cpu_sibling_map[], so that we can tell sibling CPUs
+	 * efficiently.
 	 */
-	if (cpu_has_ht && smp_num_siblings > 1) {
-		for (cpu = 0; cpu < NR_CPUS; cpu++)
-			cpu_sibling_map[cpu] = NO_PROC_ID;
-
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			int 	i;
-			if (!cpu_isset(cpu, cpu_callout_map))
-				continue;
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		cpus_clear(cpu_sibling_map[cpu]);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		int siblings = 0;
+		int i;
+		if (!cpu_isset(cpu, cpu_callout_map))
+			continue;
+		if (smp_num_siblings > 1) {
 			for (i = 0; i < NR_CPUS; i++) {
-				if (i == cpu || !cpu_isset(i, cpu_callout_map))
+				if (!cpu_isset(i, cpu_callout_map))
 					continue;
 				if (phys_proc_id[cpu] == phys_proc_id[i]) {
-					cpu_sibling_map[cpu] = i;
-					printk("cpu_sibling_map[%d] = %d\n", cpu, cpu_sibling_map[cpu]);
-					break;
+					siblings++;
+					cpu_set(i, cpu_sibling_map[cpu]);
 				}
 			}
-			if (cpu_sibling_map[cpu] == NO_PROC_ID) {
-				smp_num_siblings = 1;
-				printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu);
-			}
+		} else {
+			siblings++;
+			cpu_set(cpu, cpu_sibling_map[cpu]);
 		}
+
+		if (siblings != smp_num_siblings)
+			printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings);
 	}
 
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		check_nmi_watchdog();
+
 	smpboot_setup_io_apic();
 
 	setup_boot_APIC_clock();
@@ -1120,6 +1127,209 @@ static void __init smp_boot_cpus(unsigne
 		synchronize_tsc_bp();
 }
 
+#ifdef CONFIG_SCHED_SMT
+#ifdef CONFIG_NUMA
+static struct sched_group sched_group_cpus[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
+static struct sched_group sched_group_nodes[MAX_NUMNODES];
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+__init void arch_init_sched_domains(void)
+{
+	int i;
+	struct sched_group *first = NULL, *last = NULL;
+
+	/* Set up domains */
+	for_each_cpu(i) {
+		struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
+		struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+		struct sched_domain *node_domain = &per_cpu(node_domains, i);
+		int node = cpu_to_node(i);
+		cpumask_t nodemask = node_to_cpumask(node);
+
+		*cpu_domain = SD_SIBLING_INIT;
+		cpu_domain->span = cpu_sibling_map[i];
+		cpu_domain->parent = phys_domain;
+		cpu_domain->groups = &sched_group_cpus[i];
+
+		*phys_domain = SD_CPU_INIT;
+		phys_domain->span = nodemask;
+		phys_domain->parent = node_domain;
+		phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
+
+		*node_domain = SD_NODE_INIT;
+		node_domain->span = cpu_possible_map;
+		node_domain->groups = &sched_group_nodes[cpu_to_node(i)];
+	}
+
+	/* Set up CPU (sibling) groups */
+	for_each_cpu(i) {
+		struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
+		int j;
+		first = last = NULL;
+
+		if (i != first_cpu(cpu_domain->span))
+			continue;
+
+		for_each_cpu_mask(j, cpu_domain->span) {
+			struct sched_group *cpu = &sched_group_cpus[j];
+
+			cpu->cpumask = CPU_MASK_NONE;
+			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
+
+			if (!first)
+				first = cpu;
+			if (last)
+				last->next = cpu;
+			last = cpu;
+		}
+		last->next = first;
+	}
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		int j;
+		cpumask_t nodemask;
+		struct sched_group *node = &sched_group_nodes[i];
+		cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map);
+
+		if (cpus_empty(nodemask))
+			continue;
+
+		first = last = NULL;
+		/* Set up physical groups */
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j);
+			struct sched_group *cpu = &sched_group_phys[j];
+
+			if (j != first_cpu(cpu_domain->span))
+				continue;
+
+			cpu->cpumask = cpu_domain->span;
+			/*
+			 * Make each extra sibling increase power by 10% of
+			 * the basic CPU. This is very arbitrary.
+			 */
+			cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
+			node->cpu_power += cpu->cpu_power;
+
+			if (!first)
+				first = cpu;
+			if (last)
+				last->next = cpu;
+			last = cpu;
+		}
+		last->next = first;
+	}
+
+	/* Set up nodes */
+	first = last = NULL;
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		struct sched_group *cpu = &sched_group_nodes[i];
+		cpumask_t nodemask;
+		cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map);
+
+		if (cpus_empty(nodemask))
+			continue;
+
+		cpu->cpumask = nodemask;
+		/* ->cpu_power already setup */
+
+		if (!first)
+			first = cpu;
+		if (last)
+			last->next = cpu;
+		last = cpu;
+	}
+	last->next = first;
+
+	mb();
+	for_each_cpu(i) {
+		struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
+		cpu_attach_domain(cpu_domain, i);
+	}
+}
+#else /* !CONFIG_NUMA */
+static struct sched_group sched_group_cpus[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+__init void arch_init_sched_domains(void)
+{
+	int i;
+	struct sched_group *first = NULL, *last = NULL;
+
+	/* Set up domains */
+	for_each_cpu(i) {
+		struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
+		struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+
+		*cpu_domain = SD_SIBLING_INIT;
+		cpu_domain->span = cpu_sibling_map[i];
+		cpu_domain->parent = phys_domain;
+		cpu_domain->groups = &sched_group_cpus[i];
+
+		*phys_domain = SD_CPU_INIT;
+		phys_domain->span = cpu_possible_map;
+		phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
+	}
+
+	/* Set up CPU (sibling) groups */
+	for_each_cpu(i) {
+		struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
+		int j;
+		first = last = NULL;
+
+		if (i != first_cpu(cpu_domain->span))
+			continue;
+
+		for_each_cpu_mask(j, cpu_domain->span) {
+			struct sched_group *cpu = &sched_group_cpus[j];
+
+			cpus_clear(cpu->cpumask);
+			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
+
+			if (!first)
+				first = cpu;
+			if (last)
+				last->next = cpu;
+			last = cpu;
+		}
+		last->next = first;
+	}
+
+	first = last = NULL;
+	/* Set up physical groups */
+	for_each_cpu(i) {
+		struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
+		struct sched_group *cpu = &sched_group_phys[i];
+
+		if (i != first_cpu(cpu_domain->span))
+			continue;
+
+		cpu->cpumask = cpu_domain->span;
+		/* See SMT+NUMA setup for comment */
+		cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
+
+		if (!first)
+			first = cpu;
+		if (last)
+			last->next = cpu;
+		last = cpu;
+	}
+	last->next = first;
+
+	mb();
+	for_each_cpu(i) {
+		struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
+		cpu_attach_domain(cpu_domain, i);
+	}
+}
+#endif /* CONFIG_NUMA */
+#endif /* CONFIG_SCHED_SMT */
+
 /* These are wrappers to interface to the new boot process. Someone
    who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
 void __init smp_prepare_cpus(unsigned int max_cpus)

--- diff/arch/i386/kernel/traps.c	2004-04-21 10:45:33.487612608 +0100
+++ source/arch/i386/kernel/traps.c	2004-04-21 10:46:51.168803272 +0100
@@ -47,7 +47,6 @@
 #include 
 #include 
-#include 
 #include 
 #include 
@@ -92,6 +91,40 @@ asmlinkage void alignment_check(void);
 asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void machine_check(void);
 
+#ifdef CONFIG_KGDB
+extern void sysenter_entry(void);
+#include 
+#include 
+void set_intr_gate(unsigned int n, void *addr);
+static void set_intr_usr_gate(unsigned int n, void *addr);
+/*
+ * Should be able to call this breakpoint() very early in
+ * bring up.  Just hard code the call where needed.
+ * The breakpoint() code is here because set_?_gate() functions
+ * are local (static) to trap.c.  They need be done only once,
+ * but it does not hurt to do them over.
+ */
+void breakpoint(void)
+{
+	set_intr_usr_gate(3,&int3);	/* disable ints on trap */
+	set_intr_gate(1,&debug);
+	set_intr_gate(14,&page_fault);
+
+	BREAKPOINT;
+}
+#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after)		\
+	{								\
+		if (!user_mode(regs))					\
+		{							\
+			kgdb_handle_exception(trapnr, signr, error_code, regs); \
+			after;						\
+		} else if ((trapnr == 3) && (regs->eflags & 0x200)) local_irq_enable(); \
+	}
+#else
+#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after)
+#endif
+
 static int kstack_depth_to_print = 24;
 
 void show_trace(struct task_struct *task, unsigned long * stack)
@@ -184,7 +217,7 @@ void show_registers(struct pt_regs *regs
 		ss = regs->xss & 0xffff;
 	}
 	print_modules();
-	printk("CPU:    %d\nEIP:    %04x:[<%08lx>]    %s\nEFLAGS: %08lx"
+	printk("CPU:    %d\nEIP:    %04x:[<%08lx>]    %s VLI\nEFLAGS: %08lx"
 			"   (%s) \n",
 		smp_processor_id(), 0xffff & regs->xcs, regs->eip,
 		print_tainted(), regs->eflags, UTS_RELEASE);
@@ -202,23 +235,25 @@ void show_registers(struct pt_regs *regs
 	 * time of the fault..
 	 */
 	if (in_kernel) {
+		u8 *eip;
 
 		printk("\nStack: ");
 		show_stack(NULL, (unsigned long*)esp);
 
 		printk("Code: ");
-		if(regs->eip < PAGE_OFFSET)
-			goto bad;
 
-		for(i=0;i<20;i++)
-		{
+		eip = (u8 *)regs->eip - 43;
+		for (i = 0; i < 64; i++, eip++) {
 			unsigned char c;
-			if(__get_user(c, &((unsigned char*)regs->eip)[i])) {
-bad:
+
+			if (eip < (u8 *)PAGE_OFFSET || __get_user(c, eip)) {
 				printk(" Bad EIP value.");
 				break;
 			}
-			printk("%02x ", c);
+			if (eip == (u8 *)regs->eip)
+				printk("<%02x> ", c);
+			else
+				printk("%02x ", c);
 		}
 	}
 	printk("\n");
@@ -286,6 +321,15 @@ void die(const char * str, struct pt_reg
 #endif
 	if (nl)
 		printk("\n");
+#ifdef CONFIG_KGDB
+	/* This is about the only place we want to go to kgdb even if in
+	 * user mode.  But we must go in via a trap so within kgdb we will
+	 * always be in kernel mode.
+ */ + if (user_mode(regs)) + BREAKPOINT; +#endif + CHK_REMOTE_DEBUG(0,SIGTRAP,err,regs,) show_registers(regs); bust_spinlocks(0); spin_unlock_irq(&die_lock); @@ -355,6 +399,7 @@ static inline void do_trap(int trapnr, i #define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,)\ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -372,7 +417,9 @@ asmlinkage void do_##name(struct pt_regs #define DO_VM86_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr, signr, error_code,regs, return)\ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + return; \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -419,8 +466,10 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) + if (!fixup_exception(regs)){ + CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,) die("general protection fault", regs, error_code); + } } static void mem_parity_error(unsigned char reason, struct pt_regs * regs) @@ -582,8 +631,18 @@ asmlinkage void do_debug(struct pt_regs * allowing programs to debug themselves without the ptrace() * interface. */ +#ifdef CONFIG_KGDB + /* + * I think this is the only "real" case of a TF in the kernel + * that really belongs to user space. Others are + * "Ours all ours!" + */ + if (((regs->xcs & 3) == 0) && ((void *)regs->eip == sysenter_entry)) + goto clear_TF_reenable; +#else if ((regs->xcs & 3) == 0) goto clear_TF_reenable; +#endif if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) goto clear_TF; } @@ -595,6 +654,17 @@ asmlinkage void do_debug(struct pt_regs info.si_errno = 0; info.si_code = TRAP_BRKPT; +#ifdef CONFIG_KGDB + /* + * If this is a kernel mode trap, we need to reset db7 to allow us + * to continue sanely ALSO skip the signal delivery + */ + if ((regs->xcs & 3) == 0) + goto clear_dr7; + + /* if not kernel, allow ints but only if they were on */ + if ( regs->eflags & 0x200) local_irq_enable(); +#endif /* If this is a kernel mode trap, save the user PC on entry to * the kernel, that's what the debugger can make sense of. 
*/ @@ -609,6 +679,7 @@ clear_dr7: __asm__("movl %0,%%db7" : /* no output */ : "r" (0)); + CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) return; debug_vm86: @@ -857,6 +928,12 @@ static void __init set_call_gate(void *a { _set_gate(a,12,3,addr,__KERNEL_CS); } +#ifdef CONFIG_KGDB +void set_intr_usr_gate(unsigned int n, void *addr) +{ + _set_gate(idt_table+n,14,3,addr,__KERNEL_CS); +} +#endif static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) { @@ -879,7 +956,11 @@ void __init trap_init(void) set_trap_gate(0,&divide_error); set_intr_gate(1,&debug); set_intr_gate(2,&nmi); +#ifndef CONFIG_KGDB set_system_gate(3,&int3); /* int3-5 can be called from all */ +#else + set_intr_usr_gate(3,&int3); /* int3-5 can be called from all */ +#endif set_system_gate(4,&overflow); set_system_gate(5,&bounds); set_trap_gate(6,&invalid_op); --- diff/arch/i386/kernel/vm86.c 2004-03-11 10:20:20.000000000 +0000 +++ source/arch/i386/kernel/vm86.c 2004-04-21 10:46:51.170802968 +0100 @@ -44,7 +44,6 @@ #include #include -#include #include #include #include --- diff/arch/i386/lib/Makefile 2004-04-05 12:57:06.000000000 +0100 +++ source/arch/i386/lib/Makefile 2004-04-21 10:46:51.171802816 +0100 @@ -9,3 +9,4 @@ lib-y = checksum.o delay.o \ lib-$(CONFIG_X86_USE_3DNOW) += mmx.o lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o +lib-$(CONFIG_KGDB) += kgdb_serial.o --- diff/arch/i386/lib/dec_and_lock.c 2003-10-09 09:47:16.000000000 +0100 +++ source/arch/i386/lib/dec_and_lock.c 2004-04-21 10:46:51.170802968 +0100 @@ -10,6 +10,7 @@ #include #include +#ifndef ATOMIC_DEC_AND_LOCK int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { int counter; @@ -38,3 +39,5 @@ slow_path: spin_unlock(lock); return 0; } +#endif + --- diff/arch/i386/mach-default/setup.c 2002-12-30 10:17:12.000000000 +0000 +++ source/arch/i386/mach-default/setup.c 2004-04-21 10:46:51.171802816 +0100 @@ -7,6 +7,7 @@ #include #include #include +#include #include /** @@ -43,7 +44,8 @@ void __init intr_init_hook(void) apic_intr_init(); #endif - setup_irq(2, &irq2); + if (!acpi_ioapic) + setup_irq(2, &irq2); } /** --- diff/arch/i386/mach-es7000/es7000.c 2004-02-09 10:36:07.000000000 +0000 +++ source/arch/i386/mach-es7000/es7000.c 2004-04-21 10:46:51.172802664 +0100 @@ -83,6 +83,7 @@ parse_unisys_oem (char *oemptr, int oem_ host = (struct mip_reg *)val; host_reg = __va(host); val = MIP_RD_LO(mi->mip_reg); + mip_port = MIP_PORT(mi->mip_info); mip_addr = val; mip = (struct mip_reg *)val; mip_reg = __va(mip); --- diff/arch/i386/mach-es7000/es7000.h 2003-06-30 10:07:28.000000000 +0100 +++ source/arch/i386/mach-es7000/es7000.h 2004-04-21 10:46:51.172802664 +0100 @@ -30,6 +30,7 @@ #define MIP_BUSY 1 #define MIP_SPIN 0xf0000 #define MIP_VALID 0x0100000000000000 +#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff) #define MIP_RD_LO(VALUE) (VALUE & 0xffffffff) --- diff/arch/i386/mach-es7000/setup.c 2003-06-30 10:07:28.000000000 +0100 +++ source/arch/i386/mach-es7000/setup.c 2004-04-21 10:46:51.173802512 +0100 @@ -7,6 +7,7 @@ #include #include #include +#include #include /** @@ -17,8 +18,7 @@ * the "ordinary" interrupt call gates. For legacy reasons, the ISA * interrupts should be initialised here if the machine emulates a PC * in any way.
- **/ -void __init pre_intr_init_hook(void) + **/void __init pre_intr_init_hook(void) { init_ISA_irqs(); } @@ -43,7 +43,8 @@ void __init intr_init_hook(void) apic_intr_init(); #endif - setup_irq(2, &irq2); + if (!acpi_ioapic) + setup_irq(2, &irq2); } /** --- diff/arch/i386/mach-visws/traps.c 2003-02-26 16:01:06.000000000 +0000 +++ source/arch/i386/mach-visws/traps.c 2004-04-21 10:46:51.173802512 +0100 @@ -8,7 +8,6 @@ #include #include -#include #include #include #include "cobalt.h" --- diff/arch/i386/mach-voyager/setup.c 2003-01-02 10:43:02.000000000 +0000 +++ source/arch/i386/mach-voyager/setup.c 2004-04-21 10:46:51.173802512 +0100 @@ -6,6 +6,7 @@ #include #include #include +#include #include void __init pre_intr_init_hook(void) @@ -24,7 +25,8 @@ void __init intr_init_hook(void) smp_intr_init(); #endif - setup_irq(2, &irq2); + if (!acpi_ioapic) + setup_irq(2, &irq2); } void __init pre_setup_arch_hook(void) --- diff/arch/i386/mach-voyager/voyager_basic.c 2003-10-09 09:47:33.000000000 +0100 +++ source/arch/i386/mach-voyager/voyager_basic.c 2004-04-21 10:46:51.180801448 +0100 @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include --- diff/arch/i386/mach-voyager/voyager_smp.c 2004-04-21 10:45:33.500610632 +0100 +++ source/arch/i386/mach-voyager/voyager_smp.c 2004-04-21 10:46:51.181801296 +0100 @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include --- diff/arch/i386/mach-voyager/voyager_thread.c 2004-04-21 10:45:33.500610632 +0100 +++ source/arch/i386/mach-voyager/voyager_thread.c 2004-04-21 10:46:51.181801296 +0100 @@ -28,7 +28,6 @@ #include #include #include -#include #include #include --- diff/arch/i386/mm/fault.c 2003-12-19 09:51:11.000000000 +0000 +++ source/arch/i386/mm/fault.c 2004-04-21 10:46:51.187800384 +0100 @@ -24,7 +24,6 @@ #include #include -#include #include #include @@ -403,6 +402,12 @@ no_context: * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. 
*/ +#ifdef CONFIG_KGDB + if (!user_mode(regs)){ + kgdb_handle_exception(14,SIGBUS, error_code, regs); + return; + } +#endif bust_spinlocks(1); --- diff/arch/i386/mm/hugetlbpage.c 2004-04-21 10:45:33.510609112 +0100 +++ source/arch/i386/mm/hugetlbpage.c 2004-04-21 10:46:51.188800232 +0100 @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -210,19 +209,18 @@ void unmap_hugepage_range(struct vm_area { struct mm_struct *mm = vma->vm_mm; unsigned long address; - pte_t *pte; + pte_t pte; struct page *page; BUG_ON(start & (HPAGE_SIZE - 1)); BUG_ON(end & (HPAGE_SIZE - 1)); for (address = start; address < end; address += HPAGE_SIZE) { - pte = huge_pte_offset(mm, address); - if (pte_none(*pte)) + pte = ptep_get_and_clear(huge_pte_offset(mm, address)); + if (pte_none(pte)) continue; - page = pte_page(*pte); + page = pte_page(pte); huge_page_release(page); - pte_clear(pte); } mm->rss -= (end - start) >> PAGE_SHIFT; flush_tlb_range(vma, start, end); --- diff/arch/i386/mm/init.c 2004-04-21 10:45:33.516608200 +0100 +++ source/arch/i386/mm/init.c 2004-04-21 10:46:51.188800232 +0100 @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include --- diff/arch/i386/mm/ioremap.c 2003-11-25 15:24:57.000000000 +0000 +++ source/arch/i386/mm/ioremap.c 2004-04-21 10:46:51.194799320 +0100 @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include --- diff/arch/i386/oprofile/op_model_p4.c 2003-08-26 10:00:51.000000000 +0100 +++ source/arch/i386/oprofile/op_model_p4.c 2004-04-21 10:46:51.202798104 +0100 @@ -382,11 +382,8 @@ static struct p4_event_binding p4_events static unsigned int get_stagger(void) { #ifdef CONFIG_SMP - int cpu; - if (smp_num_siblings > 1) { - cpu = smp_processor_id(); - return (cpu_sibling_map[cpu] > cpu) ? 0 : 1; - } + int cpu = smp_processor_id(); + return (cpu != first_cpu(cpu_sibling_map[cpu])); #endif return 0; } --- diff/arch/i386/pci/irq.c 2004-04-21 10:45:33.531605920 +0100 +++ source/arch/i386/pci/irq.c 2004-04-21 10:46:51.202798104 +0100 @@ -23,6 +23,7 @@ #define PIRQ_VERSION 0x0100 int broken_hp_bios_irq9; +int acer_tm360_irqrouting; static struct irq_routing_table *pirq_table; @@ -453,15 +454,12 @@ static int pirq_bios_set(struct pci_dev static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { -#if 0 /* Let's see what chip this is supposed to be ... */ - /* We must not touch 440GX even if we have tables. 440GX has - different IRQ routing weirdness */ + /* 440GX has a proprietary PIRQ router -- don't use it */ if ( pci_find_device(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0, NULL) || pci_find_device(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2, NULL)) return 0; -#endif switch(device) { @@ -749,6 +747,13 @@ static int pcibios_lookup_irq(struct pci r->set(pirq_router_dev, dev, pirq, 11); } + /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */ + if (acer_tm360_irqrouting && pirq == 0x63 && dev->irq == 11) { + dev->irq = 10; + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 10); + r->set(pirq_router_dev, dev, pirq, 10); + } + /* * Find the best IRQ to assign: use the one * reported by the device if possible. --- diff/arch/ia64/Kconfig 2004-04-21 10:45:33.532605768 +0100 +++ source/arch/ia64/Kconfig 2004-04-21 10:46:51.205797648 +0100 @@ -444,6 +444,19 @@ config MAGIC_SYSRQ keys are documented in . Don't say Y unless you really know what this hack does. 
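A note on the cpu_sibling_map rework this patch makes above: the map changes from one sibling ID per CPU (with NO_PROC_ID as the "no sibling" sentinel) to a cpumask_t holding every sibling, and consumers such as oprofile's get_stagger() now simply ask whether a CPU is the first bit set in its own sibling mask. The sketch below is not part of the patch; it models that logic in plain user-space C, with a 64-bit integer standing in for cpumask_t, a hand-rolled first_cpu(), and an assumed example package layout.

    #include <stdint.h>
    #include <stdio.h>

    #define NR_CPUS 8

    /* One sibling mask per CPU, modelling cpumask_t cpu_sibling_map[]. */
    static uint64_t cpu_sibling_map[NR_CPUS];

    /* Mimic the kernel's first_cpu(): index of the lowest set bit. */
    static int first_cpu(uint64_t mask)
    {
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
            if (mask & (1ULL << cpu))
                return cpu;
        return NR_CPUS;  /* empty mask */
    }

    int main(void)
    {
        /* Assumed layout: four packages with two HT siblings each. */
        int phys_proc_id[NR_CPUS] = { 0, 0, 1, 1, 2, 2, 3, 3 };
        int cpu, i;

        /* Build the masks the way the reworked smp_boot_cpus() does:
         * every CPU sharing a physical package ID is a sibling. */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
            for (i = 0; i < NR_CPUS; i++)
                if (phys_proc_id[cpu] == phys_proc_id[i])
                    cpu_sibling_map[cpu] |= 1ULL << i;

        /* get_stagger()-style test: 0 for the first sibling in a
         * package, 1 for the others. */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
            printf("cpu%d: stagger %d\n", cpu,
                   cpu != first_cpu(cpu_sibling_map[cpu]));
        return 0;
    }

Unlike the old "is my sibling's ID greater than mine" comparison, the first_cpu() test keeps working when a package exposes more than two siblings, which is also why the reworked smp_boot_cpus() counts siblings and warns on a mismatch instead of assuming exactly one.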
+config SCHEDSTATS + bool "Collect scheduler statistics" + depends on PROC_FS + default y + help + If you say Y here, additional code will be inserted into the + scheduler and related routines to collect statistics about + scheduler behavior and provide them in /proc/schedstat. These + stats may be useful for both tuning and debugging the scheduler. + If you aren't debugging the scheduler or trying to tune a specific + application, you can say N to avoid the very slight overhead + this adds. + config DEBUG_SLAB bool "Debug memory allocations" depends on DEBUG_KERNEL @@ -493,6 +506,13 @@ config DEBUG_INFO Say Y here only if you plan to use gdb to debug the kernel. If you don't debug the kernel, you can say N. +config LOCKMETER + bool "Kernel lock metering" + depends on SMP + help + Say Y to enable kernel lock metering, which adds overhead to SMP locks, + but allows you to see various statistics using the lockstat command. + config SYSVIPC_COMPAT bool depends on COMPAT && SYSVIPC --- diff/arch/ia64/ia32/binfmt_elf32.c 2004-04-21 10:45:33.537605008 +0100 +++ source/arch/ia64/ia32/binfmt_elf32.c 2004-04-21 10:46:51.203797952 +0100 @@ -104,6 +104,7 @@ ia64_elf32_init (struct pt_regs *regs) vma->vm_pgoff = 0; vma->vm_file = NULL; vma->vm_private_data = NULL; + mpol_set_vma_default(vma); down_write(&current->mm->mmap_sem); { insert_vm_struct(current->mm, vma); @@ -190,6 +191,7 @@ ia32_setup_arg_pages (struct linux_binpr mpnt->vm_pgoff = 0; mpnt->vm_file = NULL; mpnt->vm_private_data = 0; + mpol_set_vma_default(mpnt); insert_vm_struct(current->mm, mpnt); current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; } --- diff/arch/ia64/ia32/ia32_entry.S 2004-04-05 12:57:06.000000000 +0100 +++ source/arch/ia64/ia32/ia32_entry.S 2004-04-21 10:46:51.203797952 +0100 @@ -350,11 +350,11 @@ ia32_syscall_table: data8 sys_setfsgid /* 16-bit version */ data8 sys_llseek /* 140 */ data8 sys32_getdents - data8 sys32_select + data8 compat_sys_select data8 sys_flock data8 sys32_msync - data8 sys32_readv /* 145 */ - data8 sys32_writev + data8 compat_sys_readv /* 145 */ + data8 compat_sys_writev data8 sys_getsid data8 sys_fdatasync data8 sys32_sysctl --- diff/arch/ia64/ia32/sys_ia32.c 2004-04-21 10:45:33.539604704 +0100 +++ source/arch/ia64/ia32/sys_ia32.c 2004-04-21 10:46:51.205797648 +0100 @@ -90,58 +90,17 @@ extern unsigned long arch_get_unmapped_a /* XXX make per-mm: */ static DECLARE_MUTEX(ia32_mmap_sem); -static int -nargs (unsigned int arg, char **ap) -{ - unsigned int addr; - int n, err; - - if (!arg) - return 0; - - n = 0; - do { - err = get_user(addr, (unsigned int *)A(arg)); - if (err) - return err; - if (ap) - *ap++ = (char *) A(addr); - arg += sizeof(unsigned int); - n++; - } while (addr); - return n - 1; -} - asmlinkage long -sys32_execve (char *filename, unsigned int argv, unsigned int envp, - struct pt_regs *regs) +sys32_execve (char *name, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs *regs) { + long error; + char *filename; unsigned long old_map_base, old_task_size, tssd; - char **av, **ae; - int na, ne, len; - long r; - - na = nargs(argv, NULL); - if (na < 0) - return na; - ne = nargs(envp, NULL); - if (ne < 0) - return ne; - len = (na + ne + 2) * sizeof(*av); - av = kmalloc(len, GFP_KERNEL); - if (!av) - return -ENOMEM; - ae = av + na + 1; - av[na] = NULL; - ae[ne] = NULL; - - r = nargs(argv, av); - if (r < 0) - goto out; - r = nargs(envp, ae); - if (r < 0) - goto out; + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error;
old_map_base = current->thread.map_base; old_task_size = current->thread.task_size; @@ -153,19 +112,18 @@ sys32_execve (char *filename, unsigned i ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob); ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1); - set_fs(KERNEL_DS); - r = sys_execve(filename, av, ae, regs); - if (r < 0) { + error = compat_do_execve(filename, argv, envp, regs); + putname(filename); + + if (error < 0) { /* oops, execve failed, switch back to old values... */ ia64_set_kr(IA64_KR_IO_BASE, IA32_IOBASE); ia64_set_kr(IA64_KR_TSSD, tssd); current->thread.map_base = old_map_base; current->thread.task_size = old_task_size; - set_fs(USER_DS); /* establish new task-size as the address-limit */ } - out: - kfree(av); - return r; + + return error; } int cp_compat_stat(struct kstat *stat, struct compat_stat *ubuf) @@ -818,110 +776,6 @@ out: return error; } -/* - * We can actually return ERESTARTSYS instead of EINTR, but I'd - * like to be certain this leads to no problems. So I return - * EINTR just for safety. - * - * Update: ERESTARTSYS breaks at least the xview clock binary, so - * I'm trying ERESTARTNOHAND which restart only when you want to. - */ -#define MAX_SELECT_SECONDS \ - ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) -#define ROUND_UP_TIME(x,y) (((x)+(y)-1)/(y)) - -asmlinkage long -sys32_select (int n, fd_set *inp, fd_set *outp, fd_set *exp, struct compat_timeval *tvp32) -{ - fd_set_bits fds; - char *bits; - long timeout; - int ret, size; - - timeout = MAX_SCHEDULE_TIMEOUT; - if (tvp32) { - time_t sec, usec; - - ret = -EFAULT; - if (get_user(sec, &tvp32->tv_sec) || get_user(usec, &tvp32->tv_usec)) - goto out_nofds; - - ret = -EINVAL; - if (sec < 0 || usec < 0) - goto out_nofds; - - if ((unsigned long) sec < MAX_SELECT_SECONDS) { - timeout = ROUND_UP_TIME(usec, 1000000/HZ); - timeout += sec * (unsigned long) HZ; - } - } - - ret = -EINVAL; - if (n < 0) - goto out_nofds; - - if (n > current->files->max_fdset) - n = current->files->max_fdset; - - /* - * We need 6 bitmaps (in/out/ex for both incoming and outgoing), - * since we used fdset we need to allocate memory in units of - * long-words. 
- */ - ret = -ENOMEM; - size = FDS_BYTES(n); - bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); - - if ((ret = get_fd_set(n, inp, fds.in)) || - (ret = get_fd_set(n, outp, fds.out)) || - (ret = get_fd_set(n, exp, fds.ex))) - goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); - - ret = do_select(n, &fds, &timeout); - - if (tvp32 && !(current->personality & STICKY_TIMEOUTS)) { - time_t sec = 0, usec = 0; - if (timeout) { - sec = timeout / HZ; - usec = timeout % HZ; - usec *= (1000000/HZ); - } - if (put_user(sec, &tvp32->tv_sec) || put_user(usec, &tvp32->tv_usec)) { - ret = -EFAULT; - goto out; - } - } - - if (ret < 0) - goto out; - if (!ret) { - ret = -ERESTARTNOHAND; - if (signal_pending(current)) - goto out; - ret = 0; - } - - set_fd_set(n, inp, fds.res_in); - set_fd_set(n, outp, fds.res_out); - set_fd_set(n, exp, fds.res_ex); - -out: - kfree(bits); -out_nofds: - return ret; -} - struct sel_arg_struct { unsigned int n; unsigned int inp; @@ -937,87 +791,8 @@ sys32_old_select (struct sel_arg_struct if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - return sys32_select(a.n, (fd_set *) A(a.inp), (fd_set *) A(a.outp), (fd_set *) A(a.exp), - (struct compat_timeval *) A(a.tvp)); -} - -static struct iovec * -get_compat_iovec (struct compat_iovec *iov32, struct iovec *iov_buf, u32 count, int type) -{ - u32 i, buf, len; - struct iovec *ivp, *iov; - - /* Get the "struct iovec" from user memory */ - - if (!count) - return 0; - if (verify_area(VERIFY_READ, iov32, sizeof(struct compat_iovec)*count)) - return NULL; - if (count > UIO_MAXIOV) - return NULL; - if (count > UIO_FASTIOV) { - iov = kmalloc(count*sizeof(struct iovec), GFP_KERNEL); - if (!iov) - return NULL; - } else - iov = iov_buf; - - ivp = iov; - for (i = 0; i < count; i++) { - if (__get_user(len, &iov32->iov_len) || __get_user(buf, &iov32->iov_base)) { - if (iov != iov_buf) - kfree(iov); - return NULL; - } - if (verify_area(type, (void *)A(buf), len)) { - if (iov != iov_buf) - kfree(iov); - return((struct iovec *)0); - } - ivp->iov_base = (void *)A(buf); - ivp->iov_len = (__kernel_size_t) len; - iov32++; - ivp++; - } - return iov; -} - -asmlinkage long -sys32_readv (int fd, struct compat_iovec *vector, u32 count) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov; - long ret; - mm_segment_t old_fs = get_fs(); - - iov = get_compat_iovec(vector, iovstack, count, VERIFY_WRITE); - if (!iov) - return -EFAULT; - set_fs(KERNEL_DS); - ret = sys_readv(fd, iov, count); - set_fs(old_fs); - if (iov != iovstack) - kfree(iov); - return ret; -} - -asmlinkage long -sys32_writev (int fd, struct compat_iovec *vector, u32 count) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov; - long ret; - mm_segment_t old_fs = get_fs(); - - iov = get_compat_iovec(vector, iovstack, count, VERIFY_READ); - if (!iov) - return -EFAULT; - set_fs(KERNEL_DS); - ret = sys_writev(fd, iov, count); - set_fs(old_fs); - if (iov != iovstack) - kfree(iov); - return ret; + return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); } #define SEMOP 1 @@ -2425,176 +2200,6 @@ sys32_setresgid(compat_gid_t rgid, compa return sys_setresgid(srgid, segid, ssgid); } -/* Stuff for 
NFS server syscalls... */ -struct nfsctl_svc32 { - u16 svc32_port; - s32 svc32_nthreads; -}; - -struct nfsctl_client32 { - s8 cl32_ident[NFSCLNT_IDMAX+1]; - s32 cl32_naddr; - struct in_addr cl32_addrlist[NFSCLNT_ADDRMAX]; - s32 cl32_fhkeytype; - s32 cl32_fhkeylen; - u8 cl32_fhkey[NFSCLNT_KEYMAX]; -}; - -struct nfsctl_export32 { - s8 ex32_client[NFSCLNT_IDMAX+1]; - s8 ex32_path[NFS_MAXPATHLEN+1]; - compat_dev_t ex32_dev; - compat_ino_t ex32_ino; - s32 ex32_flags; - compat_uid_t ex32_anon_uid; - compat_gid_t ex32_anon_gid; -}; - -struct nfsctl_arg32 { - s32 ca32_version; /* safeguard */ - union { - struct nfsctl_svc32 u32_svc; - struct nfsctl_client32 u32_client; - struct nfsctl_export32 u32_export; - u32 u32_debug; - } u; -#define ca32_svc u.u32_svc -#define ca32_client u.u32_client -#define ca32_export u.u32_export -#define ca32_debug u.u32_debug -}; - -union nfsctl_res32 { - struct knfs_fh cr32_getfh; - u32 cr32_debug; -}; - -static int -nfs_svc32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= __get_user(karg->ca_svc.svc_port, &arg32->ca32_svc.svc32_port); - err |= __get_user(karg->ca_svc.svc_nthreads, - &arg32->ca32_svc.svc32_nthreads); - return err; -} - -static int -nfs_clnt32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_client.cl_ident[0], - &arg32->ca32_client.cl32_ident[0], - NFSCLNT_IDMAX); - err |= __get_user(karg->ca_client.cl_naddr, - &arg32->ca32_client.cl32_naddr); - err |= copy_from_user(&karg->ca_client.cl_addrlist[0], - &arg32->ca32_client.cl32_addrlist[0], - (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)); - err |= __get_user(karg->ca_client.cl_fhkeytype, - &arg32->ca32_client.cl32_fhkeytype); - err |= __get_user(karg->ca_client.cl_fhkeylen, - &arg32->ca32_client.cl32_fhkeylen); - err |= copy_from_user(&karg->ca_client.cl_fhkey[0], - &arg32->ca32_client.cl32_fhkey[0], - NFSCLNT_KEYMAX); - return err; -} - -static int -nfs_exp32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_export.ex_client[0], - &arg32->ca32_export.ex32_client[0], - NFSCLNT_IDMAX); - err |= copy_from_user(&karg->ca_export.ex_path[0], - &arg32->ca32_export.ex32_path[0], - NFS_MAXPATHLEN); - err |= __get_user(karg->ca_export.ex_dev, - &arg32->ca32_export.ex32_dev); - err |= __get_user(karg->ca_export.ex_ino, - &arg32->ca32_export.ex32_ino); - err |= __get_user(karg->ca_export.ex_flags, - &arg32->ca32_export.ex32_flags); - err |= __get_user(karg->ca_export.ex_anon_uid, - &arg32->ca32_export.ex32_anon_uid); - err |= __get_user(karg->ca_export.ex_anon_gid, - &arg32->ca32_export.ex32_anon_gid); - return err; -} - -static int -nfs_getfh32_res_trans(union nfsctl_res *kres, union nfsctl_res32 *res32) -{ - int err; - - err = copy_to_user(&res32->cr32_getfh, - &kres->cr_getfh, - sizeof(res32->cr32_getfh)); - err |= __put_user(kres->cr_debug, &res32->cr32_debug); - return err; -} - -int asmlinkage -sys32_nfsservctl(int cmd, struct nfsctl_arg32 *arg32, union nfsctl_res32 *res32) -{ - struct nfsctl_arg *karg = NULL; - union nfsctl_res *kres = NULL; - mm_segment_t oldfs; - int err; - - karg = kmalloc(sizeof(*karg), GFP_USER); - if(!karg) - return -ENOMEM; - if(res32) { - kres = kmalloc(sizeof(*kres), GFP_USER); - if(!kres) { - kfree(karg); - return -ENOMEM; - } - } - switch(cmd) { - case 
NFSCTL_SVC: - err = nfs_svc32_trans(karg, arg32); - break; - case NFSCTL_ADDCLIENT: - err = nfs_clnt32_trans(karg, arg32); - break; - case NFSCTL_DELCLIENT: - err = nfs_clnt32_trans(karg, arg32); - break; - case NFSCTL_EXPORT: - err = nfs_exp32_trans(karg, arg32); - break; - default: - err = -EINVAL; - break; - } - if(err) - goto done; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_nfsservctl(cmd, karg, kres); - set_fs(oldfs); - - if(!err && cmd == NFSCTL_GETFS) - err = nfs_getfh32_res_trans(kres, res32); - -done: - if(karg) - kfree(karg); - if(kres) - kfree(kres); - return err; -} - /* Handle adjtimex compatibility. */ struct timex32 { --- diff/arch/ia64/kernel/entry.S 2004-02-09 10:36:07.000000000 +0000 +++ source/arch/ia64/kernel/entry.S 2004-04-21 10:46:51.206797496 +0100 @@ -1501,9 +1501,9 @@ sys_call_table: data8 sys_clock_nanosleep data8 sys_fstatfs64 data8 sys_statfs64 - data8 sys_ni_syscall - data8 sys_ni_syscall // 1260 - data8 sys_ni_syscall + data8 sys_mbind + data8 sys_get_mempolicy // 1260 + data8 sys_set_mempolicy data8 sys_ni_syscall data8 sys_ni_syscall data8 sys_ni_syscall --- diff/arch/ia64/kernel/perfmon.c 2004-04-21 10:45:33.546603640 +0100 +++ source/arch/ia64/kernel/perfmon.c 2004-04-21 10:46:51.209797040 +0100 @@ -2308,6 +2308,7 @@ pfm_smpl_buffer_alloc(struct task_struct vma->vm_ops = NULL; vma->vm_pgoff = 0; vma->vm_file = NULL; + mpol_set_vma_default(vma); vma->vm_private_data = NULL; /* --- diff/arch/ia64/kernel/process.c 2004-04-21 10:45:33.547603488 +0100 +++ source/arch/ia64/kernel/process.c 2004-04-21 10:46:51.209797040 +0100 @@ -657,11 +657,6 @@ get_wchan (struct task_struct *p) struct unw_frame_info info; unsigned long ip; int count = 0; - /* - * These bracket the sleeping functions.. - */ -# define first_sched ((unsigned long) scheduling_functions_start_here) -# define last_sched ((unsigned long) scheduling_functions_end_here) /* * Note: p may not be a blocked task (it could be current or @@ -676,12 +671,10 @@ get_wchan (struct task_struct *p) if (unw_unwind(&info) < 0) return 0; unw_get_ip(&info, &ip); - if (ip < first_sched || ip >= last_sched) + if (!in_sched_functions(ip)) return ip; } while (count++ < 16); return 0; -# undef first_sched -# undef last_sched } void --- diff/arch/ia64/kernel/setup.c 2004-04-05 12:57:06.000000000 +0100 +++ source/arch/ia64/kernel/setup.c 2004-04-21 10:46:51.210796888 +0100 @@ -88,10 +88,6 @@ unsigned char aux_device_present = 0xaa; unsigned long ia64_max_iommu_merge_mask = ~0UL; EXPORT_SYMBOL(ia64_max_iommu_merge_mask); -#define COMMAND_LINE_SIZE 512 - -char saved_command_line[COMMAND_LINE_SIZE]; /* used in proc filesystem */ - /* * We use a special marker for the end of memory and it uses the extra (+1) slot */ --- diff/arch/ia64/lib/dec_and_lock.c 2004-01-19 10:22:55.000000000 +0000 +++ source/arch/ia64/lib/dec_and_lock.c 2004-04-21 10:46:51.210796888 +0100 @@ -13,6 +13,7 @@ #include #include +#ifndef CONFIG_LOCKMETER /* * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock. 
Both of these * operations have to be done atomically, so that the count doesn't drop to zero without @@ -40,3 +41,4 @@ atomic_dec_and_lock (atomic_t *refcount, } EXPORT_SYMBOL(atomic_dec_and_lock); +#endif --- diff/arch/ia64/mm/init.c 2004-04-05 12:57:06.000000000 +0100 +++ source/arch/ia64/mm/init.c 2004-04-21 10:46:51.211796736 +0100 @@ -131,6 +131,7 @@ ia64_init_addr_space (void) vma->vm_pgoff = 0; vma->vm_file = NULL; vma->vm_private_data = NULL; + mpol_set_vma_default(vma); insert_vm_struct(current->mm, vma); } @@ -143,6 +144,7 @@ ia64_init_addr_space (void) vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED; + mpol_set_vma_default(vma); insert_vm_struct(current->mm, vma); } } --- diff/arch/m68k/atari/stram.c 2004-04-21 10:45:33.558601816 +0100 +++ source/arch/m68k/atari/stram.c 2004-04-21 10:46:51.211796736 +0100 @@ -752,7 +752,7 @@ static int unswap_by_read(unsigned short /* Get a page for the entry, using the existing swap cache page if there is one. Otherwise, get a clean page and read the swap into it. */ - page = read_swap_cache_async(entry); + page = read_swap_cache_async(entry, NULL, 0); if (!page) { swap_free(entry); return -ENOMEM; --- diff/arch/m68k/kernel/process.c 2004-04-21 10:45:33.560601512 +0100 +++ source/arch/m68k/kernel/process.c 2004-04-21 10:46:51.212796584 +0100 @@ -67,8 +67,7 @@ unsigned long thread_saved_pc(struct tas { struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp; /* Check whether the thread is blocked in resume() */ - if (sw->retpc > (unsigned long)scheduling_functions_start_here && - sw->retpc < (unsigned long)scheduling_functions_end_here) + if (in_sched_functions(sw->retpc)) return ((unsigned long *)sw->a6)[1]; else return sw->retpc; @@ -382,12 +381,6 @@ out: return error; } -/* - * These bracket the sleeping functions.. - */ -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long fp, pc; @@ -403,7 +396,7 @@ unsigned long get_wchan(struct task_stru fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - if (pc < first_sched || pc >= last_sched) + if (!in_sched_functions(pc)) return pc; fp = *(unsigned long *) fp; } while (count++ < 16); --- diff/arch/m68k/kernel/setup.c 2004-04-21 10:45:33.561601360 +0100 +++ source/arch/m68k/kernel/setup.c 2004-04-21 10:46:51.212796584 +0100 @@ -62,7 +62,6 @@ struct mem_info m68k_memory[NUM_MEMINFO] static struct mem_info m68k_ramdisk; static char m68k_command_line[CL_SIZE]; -char saved_command_line[CL_SIZE]; char m68k_debug_device[6] = ""; --- diff/arch/m68k/q40/config.c 2004-04-05 12:57:06.000000000 +0100 +++ source/arch/m68k/q40/config.c 2004-04-21 10:46:51.214796280 +0100 @@ -64,7 +64,6 @@ void q40_set_vectors (void); extern void q40_mksound(unsigned int /*freq*/, unsigned int /*ticks*/ ); -extern char *saved_command_line; extern char m68k_debug_device[]; static void q40_mem_console_write(struct console *co, const char *b, unsigned int count); --- diff/arch/m68knommu/kernel/process.c 2004-04-21 10:45:33.574599384 +0100 +++ source/arch/m68knommu/kernel/process.c 2004-04-21 10:46:51.213796432 +0100 @@ -404,12 +404,6 @@ out: return error; } -/* - * These bracket the sleeping functions.. 
- */ -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long fp, pc; @@ -425,7 +419,7 @@ unsigned long get_wchan(struct task_stru fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - if (pc < first_sched || pc >= last_sched) + if (!in_sched_functions(pc)) return pc; fp = *(unsigned long *) fp; } while (count++ < 16); @@ -440,8 +434,7 @@ unsigned long thread_saved_pc(struct tas struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp; /* Check whether the thread is blocked in resume() */ - if (sw->retpc > (unsigned long)scheduling_functions_start_here && - sw->retpc < (unsigned long)scheduling_functions_end_here) + if (in_sched_functions(sw->retpc)) return ((unsigned long *)sw->a6)[1]; else return sw->retpc; --- diff/arch/m68knommu/kernel/setup.c 2004-04-21 10:45:33.575599232 +0100 +++ source/arch/m68knommu/kernel/setup.c 2004-04-21 10:46:51.213796432 +0100 @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -44,8 +45,7 @@ unsigned long rom_length; unsigned long memory_start; unsigned long memory_end; -char command_line[512]; -char saved_command_line[512]; +char command_line[COMMAND_LINE_SIZE]; /* setup some dummy routines */ static void dummy_waitbut(void) --- diff/arch/mips/kernel/linux32.c 2004-04-21 10:45:33.645588592 +0100 +++ source/arch/mips/kernel/linux32.c 2004-04-21 10:46:51.215796128 +0100 @@ -142,228 +142,6 @@ asmlinkage int sys_ftruncate64(unsigned } /* - * count32() counts the number of arguments/envelopes - */ -static int count32(u32 * argv, int max) -{ - int i = 0; - - if (argv != NULL) { - for (;;) { - u32 p; int error; - - error = get_user(p,argv); - if (error) - return error; - if (!p) - break; - argv++; - if (++i > max) - return -E2BIG; - } - } - return i; -} - - -/* - * 'copy_strings32()' copies argument/envelope strings from user - * memory to free pages in kernel mem. These are in a format ready - * to be put directly into the top of new user memory. - */ -int copy_strings32(int argc, u32 * argv, struct linux_binprm *bprm) -{ - struct page *kmapped_page = NULL; - char *kaddr = NULL; - int ret; - - while (argc-- > 0) { - u32 str; - int len; - unsigned long pos; - - if (get_user(str, argv+argc) || !str || - !(len = strnlen_user((char *)A(str), bprm->p))) { - ret = -EFAULT; - goto out; - } - - if (bprm->p < len) { - ret = -E2BIG; - goto out; - } - - bprm->p -= len; - /* XXX: add architecture specific overflow check here. 
*/ - - pos = bprm->p; - while (len > 0) { - int i, new, err; - int offset, bytes_to_copy; - struct page *page; - - offset = pos % PAGE_SIZE; - i = pos/PAGE_SIZE; - page = bprm->page[i]; - new = 0; - if (!page) { - page = alloc_page(GFP_HIGHUSER); - bprm->page[i] = page; - if (!page) { - ret = -ENOMEM; - goto out; - } - new = 1; - } - - if (page != kmapped_page) { - if (kmapped_page) - kunmap(kmapped_page); - kmapped_page = page; - kaddr = kmap(kmapped_page); - } - if (new && offset) - memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; - if (bytes_to_copy > len) { - bytes_to_copy = len; - if (new) - memset(kaddr+offset+len, 0, - PAGE_SIZE-offset-len); - } - err = copy_from_user(kaddr + offset, (char *)A(str), - bytes_to_copy); - if (err) { - ret = -EFAULT; - goto out; - } - - pos += bytes_to_copy; - str += bytes_to_copy; - len -= bytes_to_copy; - } - } - ret = 0; -out: - if (kmapped_page) - kunmap(kmapped_page); - return ret; -} - -#ifdef CONFIG_MMU - -#define free_arg_pages(bprm) do { } while (0) - -#else - -static inline void free_arg_pages(struct linux_binprm *bprm) -{ - int i; - - for (i = 0; i < MAX_ARG_PAGES; i++) { - if (bprm->page[i]) - __free_page(bprm->page[i]); - bprm->page[i] = NULL; - } -} - -#endif /* CONFIG_MMU */ - -/* - * sys32_execve() executes a new program. - */ -static inline int -do_execve32(char * filename, u32 * argv, u32 * envp, struct pt_regs * regs) -{ - struct linux_binprm bprm; - struct file * file; - int retval; - - sched_balance_exec(); - - file = open_exec(filename); - - retval = PTR_ERR(file); - if (IS_ERR(file)) - return retval; - - bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - memset(bprm.page, 0, MAX_ARG_PAGES * sizeof(bprm.page[0])); - - bprm.file = file; - bprm.filename = filename; - bprm.interp = filename; - bprm.sh_bang = 0; - bprm.loader = 0; - bprm.exec = 0; - bprm.security = NULL; - bprm.mm = mm_alloc(); - retval = -ENOMEM; - if (!bprm.mm) - goto out_file; - - retval = init_new_context(current, bprm.mm); - if (retval < 0) - goto out_mm; - - bprm.argc = count32(argv, bprm.p / sizeof(u32)); - if ((retval = bprm.argc) < 0) - goto out_mm; - - bprm.envc = count32(envp, bprm.p / sizeof(u32)); - if ((retval = bprm.envc) < 0) - goto out_mm; - - retval = security_bprm_alloc(&bprm); - if (retval) - goto out; - - retval = prepare_binprm(&bprm); - if (retval < 0) - goto out; - - retval = copy_strings_kernel(1, &bprm.filename, &bprm); - if (retval < 0) - goto out; - - bprm.exec = bprm.p; - retval = copy_strings32(bprm.envc, envp, &bprm); - if (retval < 0) - goto out; - - retval = copy_strings32(bprm.argc, argv, &bprm); - if (retval < 0) - goto out; - - retval = search_binary_handler(&bprm, regs); - if (retval >= 0) { - free_arg_pages(&bprm); - - /* execve success */ - security_bprm_free(&bprm); - return retval; - } - -out: - /* Something went wrong, return the inode and free the argument pages*/ - free_arg_pages(&bprm); - - if (bprm.security) - security_bprm_free(&bprm); - -out_mm: - if (bprm.mm) - mmdrop(bprm.mm); - -out_file: - if (bprm.file) { - allow_write_access(bprm.file); - fput(bprm.file); - } - return retval; -} - -/* * sys_execve() executes a new program. 
*/ asmlinkage int sys32_execve(nabi_no_regargs struct pt_regs regs) @@ -371,12 +149,12 @@ asmlinkage int sys32_execve(nabi_no_rega int error; char * filename; - filename = getname((char *) (long)regs.regs[4]); + filename = getname(compat_ptr(regs.regs[4])); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; - error = do_execve32(filename, (u32 *) (long)regs.regs[5], - (u32 *) (long)regs.regs[6], &regs); + error = compat_do_execve(filename, compat_ptr(regs.regs[5]), + compat_ptr(regs.regs[6]), &regs); putname(filename); out: @@ -671,150 +449,6 @@ asmlinkage int sys32_llseek(unsigned int return sys_llseek(fd, offset_high, offset_low, result, origin); } -typedef ssize_t (*IO_fn_t)(struct file *, char *, size_t, loff_t *); - -static long -do_readv_writev32(int type, struct file *file, const struct compat_iovec *vector, - u32 count) -{ - unsigned long tot_len; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov=iovstack, *ivp; - struct inode *inode; - long retval, i; - IO_fn_t fn; - - /* First get the "struct iovec" from user memory and - * verify all the pointers - */ - if (!count) - return 0; - if(verify_area(VERIFY_READ, vector, sizeof(struct compat_iovec)*count)) - return -EFAULT; - if (count > UIO_MAXIOV) - return -EINVAL; - if (count > UIO_FASTIOV) { - iov = kmalloc(count*sizeof(struct iovec), GFP_KERNEL); - if (!iov) - return -ENOMEM; - } - - tot_len = 0; - i = count; - ivp = iov; - while (i > 0) { - u32 len; - u32 buf; - - __get_user(len, &vector->iov_len); - __get_user(buf, &vector->iov_base); - tot_len += len; - ivp->iov_base = (void *)A(buf); - ivp->iov_len = (__kernel_size_t) len; - vector++; - ivp++; - i--; - } - - inode = file->f_dentry->d_inode; - /* VERIFY_WRITE actually means a read, as we write to user space */ - retval = locks_verify_area((type == VERIFY_WRITE - ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE), - inode, file, file->f_pos, tot_len); - if (retval) { - if (iov != iovstack) - kfree(iov); - return retval; - } - - /* Then do the actual IO.
Note that sockets need to be handled - * specially as they have atomicity guarantees and can handle - * iovec's natively - */ -#ifdef CONFIG_NET - if (inode->i_sock) { - int err; - err = sock_readv_writev(type, inode, file, iov, count, tot_len); - if (iov != iovstack) - kfree(iov); - return err; - } -#endif - - if (!file->f_op) { - if (iov != iovstack) - kfree(iov); - return -EINVAL; - } - /* VERIFY_WRITE actually means a read, as we write to user space */ - fn = file->f_op->read; - if (type == VERIFY_READ) - fn = (IO_fn_t) file->f_op->write; - ivp = iov; - while (count > 0) { - void * base; - int len, nr; - - base = ivp->iov_base; - len = ivp->iov_len; - ivp++; - count--; - nr = fn(file, base, len, &file->f_pos); - if (nr < 0) { - if (retval) - break; - retval = nr; - break; - } - retval += nr; - if (nr != len) - break; - } - if (iov != iovstack) - kfree(iov); - - return retval; -} - -asmlinkage long -sys32_readv(int fd, struct compat_iovec *vector, u32 count) -{ - struct file *file; - ssize_t ret; - - ret = -EBADF; - file = fget(fd); - if (!file) - goto bad_file; - if (file->f_op && (file->f_mode & FMODE_READ) && - (file->f_op->readv || file->f_op->read)) - ret = do_readv_writev32(VERIFY_WRITE, file, vector, count); - - fput(file); - -bad_file: - return ret; -} - -asmlinkage long -sys32_writev(int fd, struct compat_iovec *vector, u32 count) -{ - struct file *file; - ssize_t ret; - - ret = -EBADF; - file = fget(fd); - if(!file) - goto bad_file; - if (file->f_op && (file->f_mode & FMODE_WRITE) && - (file->f_op->writev || file->f_op->write)) - ret = do_readv_writev32(VERIFY_READ, file, vector, count); - fput(file); - -bad_file: - return ret; -} - /* From the Single Unix Spec: pread & pwrite act like lseek to pos + op + lseek back to original location. They fail just like lseek does on non-seekable files. */ @@ -885,167 +519,6 @@ out: bad_file: return ret; } -/* - * Ooo, nasty. We need here to frob 32-bit unsigned longs to - * 64-bit unsigned longs. - */ - -static inline int -get_fd_set32(unsigned long n, unsigned long *fdset, u32 *ufdset) -{ - if (ufdset) { - unsigned long odd; - - if (verify_area(VERIFY_WRITE, ufdset, n*sizeof(u32))) - return -EFAULT; - - odd = n & 1UL; - n &= ~1UL; - while (n) { - unsigned long h, l; - __get_user(l, ufdset); - __get_user(h, ufdset+1); - ufdset += 2; - *fdset++ = h << 32 | l; - n -= 2; - } - if (odd) - __get_user(*fdset, ufdset); - } else { - /* Tricky, must clear full unsigned long in the - * kernel fdset at the end, this makes sure that - * actually happens. - */ - memset(fdset, 0, ((n + 1) & ~1)*sizeof(u32)); - } - return 0; -} - -static inline void -set_fd_set32(unsigned long n, u32 *ufdset, unsigned long *fdset) -{ - unsigned long odd; - - if (!ufdset) - return; - - odd = n & 1UL; - n &= ~1UL; - while (n) { - unsigned long h, l; - l = *fdset++; - h = l >> 32; - __put_user(l, ufdset); - __put_user(h, ufdset+1); - ufdset += 2; - n -= 2; - } - if (odd) - __put_user(*fdset, ufdset); -} - -/* - * We can actually return ERESTARTSYS instead of EINTR, but I'd - * like to be certain this leads to no problems. So I return - * EINTR just for safety. - * - * Update: ERESTARTSYS breaks at least the xview clock binary, so - * I'm trying ERESTARTNOHAND which restart only when you want to. 
- */ -#define MAX_SELECT_SECONDS \ - ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) - -asmlinkage int sys32_select(int n, u32 *inp, u32 *outp, u32 *exp, struct compat_timeval *tvp) -{ - fd_set_bits fds; - char *bits; - unsigned long nn; - long timeout; - int ret, size; - - timeout = MAX_SCHEDULE_TIMEOUT; - if (tvp) { - time_t sec, usec; - - if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp))) - || (ret = __get_user(sec, &tvp->tv_sec)) - || (ret = __get_user(usec, &tvp->tv_usec))) - goto out_nofds; - - ret = -EINVAL; - if(sec < 0 || usec < 0) - goto out_nofds; - - if ((unsigned long) sec < MAX_SELECT_SECONDS) { - timeout = (usec + 1000000/HZ - 1) / (1000000/HZ); - timeout += sec * (unsigned long) HZ; - } - } - - ret = -EINVAL; - if (n < 0) - goto out_nofds; - if (n > current->files->max_fdset) - n = current->files->max_fdset; - - /* - * We need 6 bitmaps (in/out/ex for both incoming and outgoing), - * since we used fdset we need to allocate memory in units of - * long-words. - */ - ret = -ENOMEM; - size = FDS_BYTES(n); - bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); - - nn = (n + 8*sizeof(u32) - 1) / (8*sizeof(u32)); - if ((ret = get_fd_set32(nn, fds.in, inp)) || - (ret = get_fd_set32(nn, fds.out, outp)) || - (ret = get_fd_set32(nn, fds.ex, exp))) - goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); - - ret = do_select(n, &fds, &timeout); - - if (tvp && !(current->personality & STICKY_TIMEOUTS)) { - time_t sec = 0, usec = 0; - if (timeout) { - sec = timeout / HZ; - usec = timeout % HZ; - usec *= (1000000/HZ); - } - put_user(sec, &tvp->tv_sec); - put_user(usec, &tvp->tv_usec); - } - - if (ret < 0) - goto out; - if (!ret) { - ret = -ERESTARTNOHAND; - if (signal_pending(current)) - goto out; - ret = 0; - } - - set_fd_set32(nn, inp, fds.res_in); - set_fd_set32(nn, outp, fds.res_out); - set_fd_set32(nn, exp, fds.res_ex); - -out: - kfree(bits); -out_nofds: - return ret; -} - asmlinkage int sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec *interval) --- diff/arch/mips/kernel/process.c 2004-04-21 10:45:33.647588288 +0100 +++ source/arch/mips/kernel/process.c 2004-04-21 10:46:51.215796128 +0100 @@ -280,12 +280,6 @@ unsigned long thread_saved_pc(struct tas return ((unsigned long *)t->reg29)[schedule_frame.pc_offset]; } -/* - * These bracket the sleeping functions.. - */ -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - /* get_wchan - a maintenance nightmare^W^Wpain in the ass ... 
*/ unsigned long get_wchan(struct task_struct *p) { @@ -297,7 +291,7 @@ unsigned long get_wchan(struct task_stru if (!mips_frame_info_initialized) return 0; pc = thread_saved_pc(p); - if (pc < first_sched || pc >= last_sched) + if (!in_sched_functions(pc)) goto out; if (pc >= (unsigned long) sleep_on_timeout) @@ -331,7 +325,7 @@ schedule_timeout_caller: */ pc = ((unsigned long *)frame)[schedule_timeout_frame.pc_offset]; - if (pc >= first_sched && pc < last_sched) { + if (in_sched_functions(pc)) { /* schedule_timeout called by [interruptible_]sleep_on_timeout */ frame = ((unsigned long *)frame)[schedule_timeout_frame.frame_offset]; pc = ((unsigned long *)frame)[sleep_on_timeout_frame.pc_offset]; --- diff/arch/mips/kernel/scall64-n32.S 2004-04-21 10:45:33.648588136 +0100 +++ source/arch/mips/kernel/scall64-n32.S 2004-04-21 10:46:51.216795976 +0100 @@ -128,11 +128,11 @@ EXPORT(sysn32_call_table) PTR compat_sys_ioctl /* 6015 */ PTR sys_pread64 PTR sys_pwrite64 - PTR sys32_readv - PTR sys32_writev + PTR compat_sys_readv + PTR compat_sys_writev PTR sys_access /* 6020 */ PTR sys_pipe - PTR sys32_select + PTR compat_sys_select PTR sys_sched_yield PTR sys_mremap PTR sys_msync /* 6025 */ --- diff/arch/mips/kernel/scall64-o32.S 2004-03-11 10:20:21.000000000 +0000 +++ source/arch/mips/kernel/scall64-o32.S 2004-04-21 10:46:51.216795976 +0100 @@ -397,11 +397,11 @@ out: jr ra sys sys_setfsgid 1 sys sys32_llseek 5 /* 4140 */ sys sys32_getdents 3 - sys sys32_select 5 + sys compat_sys_select 5 sys sys_flock 2 sys sys_msync 3 - sys sys32_readv 3 /* 4145 */ - sys sys32_writev 3 + sys compat_sys_readv 3 /* 4145 */ + sys compat_sys_writev 3 sys sys_cacheflush 3 sys sys_cachectl 3 sys sys_sysmips 4 --- diff/arch/mips/kernel/setup.c 2004-03-11 10:20:21.000000000 +0000 +++ source/arch/mips/kernel/setup.c 2004-04-21 10:46:51.217795824 +0100 @@ -71,7 +71,6 @@ EXPORT_SYMBOL(mips_machgroup); struct boot_mem_map boot_mem_map; static char command_line[CL_SIZE]; - char saved_command_line[CL_SIZE]; char arcs_cmdline[CL_SIZE]=CONFIG_CMDLINE; /* --- diff/arch/parisc/kernel/setup.c 2003-10-09 09:47:33.000000000 +0100 +++ source/arch/parisc/kernel/setup.c 2004-04-21 10:46:51.217795824 +0100 @@ -45,8 +45,6 @@ #include #include -#define COMMAND_LINE_SIZE 1024 -char saved_command_line[COMMAND_LINE_SIZE]; char command_line[COMMAND_LINE_SIZE]; /* Intended for ccio/sba/cpu statistics under /proc/bus/{runway|gsc} */ --- diff/arch/parisc/kernel/sys_parisc32.c 2004-04-21 10:45:33.697580688 +0100 +++ source/arch/parisc/kernel/sys_parisc32.c 2004-04-21 10:46:51.218795672 +0100 @@ -65,189 +65,6 @@ #endif /* - * count32() counts the number of arguments/envelopes. It is basically - * a copy of count() from fs/exec.c, except that it works - * with 32 bit argv and envp pointers. - */ - -static int count32(u32 *argv, int max) -{ - int i = 0; - - if (argv != NULL) { - for (;;) { - u32 p; - int error; - - error = get_user(p,argv); - if (error) - return error; - if (!p) - break; - argv++; - if(++i > max) - return -E2BIG; - } - } - return i; -} - - -/* - * copy_strings32() is basically a copy of copy_strings() from fs/exec.c - * except that it works with 32 bit argv and envp pointers. 
- */ - - -static int copy_strings32(int argc, u32 *argv, struct linux_binprm *bprm) -{ - while (argc-- > 0) { - u32 str; - int len; - unsigned long pos; - - if (get_user(str, argv + argc) || - !str || - !(len = strnlen_user((char *)compat_ptr(str), bprm->p))) - return -EFAULT; - - if (bprm->p < len) - return -E2BIG; - - bprm->p -= len; - - pos = bprm->p; - while (len > 0) { - char *kaddr; - int i, new, err; - struct page *page; - int offset, bytes_to_copy; - - offset = pos % PAGE_SIZE; - i = pos/PAGE_SIZE; - page = bprm->page[i]; - new = 0; - if (!page) { - page = alloc_page(GFP_HIGHUSER); - bprm->page[i] = page; - if (!page) - return -ENOMEM; - new = 1; - } - kaddr = (char *)kmap(page); - - if (new && offset) - memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; - if (bytes_to_copy > len) { - bytes_to_copy = len; - if (new) - memset(kaddr+offset+len, 0, PAGE_SIZE-offset-len); - } - err = copy_from_user(kaddr + offset, (char *)compat_ptr(str), bytes_to_copy); - flush_dcache_page(page); - kunmap(page); - - if (err) - return -EFAULT; - - pos += bytes_to_copy; - str += bytes_to_copy; - len -= bytes_to_copy; - } - } - return 0; -} - -/* - * do_execve32() is mostly a copy of do_execve(), with the exception - * that it processes 32 bit argv and envp pointers. - */ - -static inline int -do_execve32(char * filename, u32 * argv, u32 * envp, struct pt_regs * regs) -{ - struct linux_binprm bprm; - struct file *file; - int retval; - int i; - - file = open_exec(filename); - - retval = PTR_ERR(file); - if (IS_ERR(file)) - return retval; - - bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0])); - - DBG(("do_execve32(%s, %p, %p, %p)\n", filename, argv, envp, regs)); - - bprm.file = file; - bprm.filename = filename; - bprm.interp = filename; - bprm.sh_bang = 0; - bprm.loader = 0; - bprm.exec = 0; - - bprm.mm = mm_alloc(); - retval = -ENOMEM; - if (!bprm.mm) - goto out_file; - - retval = init_new_context(current, bprm.mm); - if (retval < 0) - goto out_mm; - - if ((bprm.argc = count32(argv, bprm.p / sizeof(u32))) < 0) - goto out_mm; - - if ((bprm.envc = count32(envp, bprm.p / sizeof(u32))) < 0) - goto out_mm; - - retval = prepare_binprm(&bprm); - if (retval < 0) - goto out; - - retval = copy_strings_kernel(1, &bprm.filename, &bprm); - if (retval < 0) - goto out; - - bprm.exec = bprm.p; - retval = copy_strings32(bprm.envc, envp, &bprm); - if (retval < 0) - goto out; - - retval = copy_strings32(bprm.argc, argv, &bprm); - if (retval < 0) - goto out; - - retval = search_binary_handler(&bprm,regs); - if (retval >= 0) - /* execve success */ - return retval; - -out: - /* Something went wrong, return the inode and free the argument pages*/ - for (i = 0; i < MAX_ARG_PAGES; i++) { - struct page *page = bprm.page[i]; - if (page) - __free_page(page); - } - -out_mm: - mmdrop(bprm.mm); - -out_file: - if (bprm.file) { - allow_write_access(bprm.file); - fput(bprm.file); - } - - return retval; -} - -/* * sys32_execve() executes a new program. 
*/ @@ -261,8 +78,8 @@ asmlinkage int sys32_execve(struct pt_re error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; - error = do_execve32(filename, (u32 *) regs->gr[25], - (u32 *) regs->gr[24], regs); + error = compat_do_execve(filename, compat_ptr(regs->gr[25]), + compat_ptr(regs->gr[24]), regs); if (error == 0) current->ptrace &= ~PT_DTRACE; putname(filename); @@ -609,149 +426,6 @@ out: return error; } -/* readv/writev stolen from mips64 */ -typedef ssize_t (*IO_fn_t)(struct file *, char *, size_t, loff_t *); - -static long -do_readv_writev32(int type, struct file *file, const struct compat_iovec *vector, - u32 count) -{ - unsigned long tot_len; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov=iovstack, *ivp; - struct inode *inode; - long retval, i; - IO_fn_t fn; - - /* First get the "struct iovec" from user memory and - * verify all the pointers - */ - if (!count) - return 0; - if(verify_area(VERIFY_READ, vector, sizeof(struct compat_iovec)*count)) - return -EFAULT; - if (count > UIO_MAXIOV) - return -EINVAL; - if (count > UIO_FASTIOV) { - iov = kmalloc(count*sizeof(struct iovec), GFP_KERNEL); - if (!iov) - return -ENOMEM; - } - - tot_len = 0; - i = count; - ivp = iov; - while (i > 0) { - u32 len; - u32 buf; - - __get_user(len, &vector->iov_len); - __get_user(buf, &vector->iov_base); - tot_len += len; - ivp->iov_base = compat_ptr(buf); - ivp->iov_len = (compat_size_t) len; - vector++; - ivp++; - i--; - } - - inode = file->f_dentry->d_inode; - /* VERIFY_WRITE actually means a read, as we write to user space */ - retval = locks_verify_area((type == VERIFY_WRITE - ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE), - inode, file, file->f_pos, tot_len); - if (retval) { - if (iov != iovstack) - kfree(iov); - return retval; - } - - /* Then do the actual IO. Note that sockets need to be handled - * specially as they have atomicity guarantees and can handle - * iovec's natively - */ - if (inode->i_sock) { - int err; - err = sock_readv_writev(type, inode, file, iov, count, tot_len); - if (iov != iovstack) - kfree(iov); - return err; - } - - if (!file->f_op) { - if (iov != iovstack) - kfree(iov); - return -EINVAL; - } - /* VERIFY_WRITE actually means a read, as we write to user space */ - fn = file->f_op->read; - if (type == VERIFY_READ) - fn = (IO_fn_t) file->f_op->write; - ivp = iov; - while (count > 0) { - void * base; - int len, nr; - - base = ivp->iov_base; - len = ivp->iov_len; - ivp++; - count--; - nr = fn(file, base, len, &file->f_pos); - if (nr < 0) { - if (retval) - break; - retval = nr; - break; - } - retval += nr; - if (nr != len) - break; - } - if (iov != iovstack) - kfree(iov); - - return retval; -} - -asmlinkage long -sys32_readv(int fd, struct compat_iovec *vector, u32 count) -{ - struct file *file; - ssize_t ret; - - ret = -EBADF; - file = fget(fd); - if (!file) - goto bad_file; - if (file->f_op && (file->f_mode & FMODE_READ) && - (file->f_op->readv || file->f_op->read)) - ret = do_readv_writev32(VERIFY_WRITE, file, vector, count); - - fput(file); - -bad_file: - return ret; -} - -asmlinkage long -sys32_writev(int fd, struct compat_iovec *vector, u32 count) -{ - struct file *file; - ssize_t ret; - - ret = -EBADF; - file = fget(fd); - if(!file) - goto bad_file; - if (file->f_op && (file->f_mode & FMODE_WRITE) && - (file->f_op->writev || file->f_op->write)) - ret = do_readv_writev32(VERIFY_READ, file, vector, count); - fput(file); - -bad_file: - return ret; -} - /*** copied from mips64 ***/ /* * Ooo, nasty. 
We need here to frob 32-bit unsigned longs to @@ -814,126 +488,6 @@ set_fd_set32(unsigned long n, u32 *ufdse __put_user(*fdset, ufdset); } -/*** This is a virtual copy of sys_select from fs/select.c and probably - *** should be compared to it from time to time - ***/ -static inline void *select_bits_alloc(int size) -{ - return kmalloc(6 * size, GFP_KERNEL); -} - -static inline void select_bits_free(void *bits, int size) -{ - kfree(bits); -} - -/* - * We can actually return ERESTARTSYS instead of EINTR, but I'd - * like to be certain this leads to no problems. So I return - * EINTR just for safety. - * - * Update: ERESTARTSYS breaks at least the xview clock binary, so - * I'm trying ERESTARTNOHAND which restart only when you want to. - */ -#define MAX_SELECT_SECONDS \ - ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) -#define DIVIDE_ROUND_UP(x,y) (((x)+(y)-1)/(y)) - -asmlinkage long -sys32_select(int n, u32 *inp, u32 *outp, u32 *exp, struct compat_timeval *tvp) -{ - fd_set_bits fds; - char *bits; - long timeout; - int ret, size, err; - - timeout = MAX_SCHEDULE_TIMEOUT; - if (tvp) { - struct compat_timeval tv32; - time_t sec, usec; - - if ((ret = copy_from_user(&tv32, tvp, sizeof tv32))) - goto out_nofds; - - sec = tv32.tv_sec; - usec = tv32.tv_usec; - - ret = -EINVAL; - if (sec < 0 || usec < 0) - goto out_nofds; - - if ((unsigned long) sec < MAX_SELECT_SECONDS) { - timeout = DIVIDE_ROUND_UP(usec, 1000000/HZ); - timeout += sec * (unsigned long) HZ; - } - } - - ret = -EINVAL; - if (n < 0) - goto out_nofds; - - if (n > current->files->max_fdset) - n = current->files->max_fdset; - - /* - * We need 6 bitmaps (in/out/ex for both incoming and outgoing), - * since we used fdset we need to allocate memory in units of - * long-words. - */ - ret = -ENOMEM; - size = FDS_BYTES(n); - bits = select_bits_alloc(size); - if (!bits) - goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); - - if ((ret = get_fd_set32(n, inp, fds.in)) || - (ret = get_fd_set32(n, outp, fds.out)) || - (ret = get_fd_set32(n, exp, fds.ex))) - goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); - - ret = do_select(n, &fds, &timeout); - - if (tvp && !(current->personality & STICKY_TIMEOUTS)) { - time_t sec = 0, usec = 0; - if (timeout) { - sec = timeout / HZ; - usec = timeout % HZ; - usec *= (1000000/HZ); - } - err = put_user(sec, &tvp->tv_sec); - err |= __put_user(usec, &tvp->tv_usec); - if (err) - ret = -EFAULT; - } - - if (ret < 0) - goto out; - if (!ret) { - ret = -ERESTARTNOHAND; - if (signal_pending(current)) - goto out; - ret = 0; - } - - set_fd_set32(n, inp, fds.res_in); - set_fd_set32(n, outp, fds.res_out); - set_fd_set32(n, exp, fds.res_ex); - -out: - select_bits_free(bits, size); -out_nofds: - return ret; -} - struct msgbuf32 { int mtype; char mtext[1]; @@ -991,7 +545,6 @@ asmlinkage long sys32_msgrcv(int msqid, return err; } - asmlinkage int sys32_sendfile(int out_fd, int in_fd, compat_off_t *offset, s32 count) { mm_segment_t old_fs = get_fs(); @@ -1011,94 +564,6 @@ asmlinkage int sys32_sendfile(int out_fd return ret; } -/* EXPORT/UNEXPORT */ -struct nfsctl_export32 { - char ex_client[NFSCLNT_IDMAX+1]; - char ex_path[NFS_MAXPATHLEN+1]; - __kernel_old_dev_t ex_dev; - compat_ino_t ex_ino; - int ex_flags; - __kernel_uid_t ex_anon_uid; - 
__kernel_gid_t ex_anon_gid; -}; - -struct nfsctl_arg32 { - int ca_version; /* safeguard */ - /* wide kernel places this union on 8-byte boundary, narrow on 4 */ - union { - struct nfsctl_svc u_svc; - struct nfsctl_client u_client; - struct nfsctl_export32 u_export; - struct nfsctl_fdparm u_getfd; - struct nfsctl_fsparm u_getfs; - } u; -}; - -asmlinkage int sys32_nfsservctl(int cmd, void *argp, void *resp) -{ - int ret, tmp; - struct nfsctl_arg32 n32; - struct nfsctl_arg n; - - ret = copy_from_user(&n, argp, sizeof n.ca_version); - if (ret != 0) - return ret; - - /* adjust argp to point at the union inside the user's n32 struct */ - tmp = (unsigned long)&n32.u - (unsigned long)&n32; - argp = (void *)((unsigned long)argp + tmp); - switch(cmd) { - case NFSCTL_SVC: - ret = copy_from_user(&n.u, argp, sizeof n.u.u_svc); - break; - - case NFSCTL_ADDCLIENT: - case NFSCTL_DELCLIENT: - ret = copy_from_user(&n.u, argp, sizeof n.u.u_client); - break; - - case NFSCTL_GETFD: - ret = copy_from_user(&n.u, argp, sizeof n.u.u_getfd); - break; - - case NFSCTL_GETFS: - ret = copy_from_user(&n.u, argp, sizeof n.u.u_getfs); - break; - - case NFSCTL_UNEXPORT: /* nfsctl_export */ - case NFSCTL_EXPORT: /* nfsctl_export */ - ret = copy_from_user(&n32.u, argp, sizeof n32.u.u_export); -#undef CP -#define CP(x) n.u.u_export.ex_##x = n32.u.u_export.ex_##x - memcpy(n.u.u_export.ex_client, n32.u.u_export.ex_client, sizeof n32.u.u_export.ex_client); - memcpy(n.u.u_export.ex_path, n32.u.u_export.ex_path, sizeof n32.u.u_export.ex_path); - CP(dev); - CP(ino); - CP(flags); - CP(anon_uid); - CP(anon_gid); - break; - - default: - /* lockd probes for some other values (0x10000); - * so don't BUG() */ - ret = -EINVAL; - break; - } - - if (ret == 0) { - unsigned char rbuf[NFS_FHSIZE + sizeof (struct knfsd_fh)]; - KERNEL_SYSCALL(ret, sys_nfsservctl, cmd, &n, &rbuf); - if (cmd == NFSCTL_GETFD) { - ret = copy_to_user(resp, rbuf, NFS_FHSIZE); - } else if (cmd == NFSCTL_GETFS) { - ret = copy_to_user(resp, rbuf, sizeof (struct knfsd_fh)); - } - } - - return ret; -} - typedef long __kernel_loff_t32; /* move this to asm/posix_types.h? */ asmlinkage int sys32_sendfile64(int out_fd, int in_fd, __kernel_loff_t32 *offset, s32 count) --- diff/arch/parisc/kernel/syscall_table.S 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/parisc/kernel/syscall_table.S 2004-04-21 10:46:51.217795824 +0100 @@ -232,12 +232,12 @@ ENTRY_DIFF(getdents) /* it is POSSIBLE that select will be OK because even though fd_set * contains longs, the macros and sizes are clever. */ - ENTRY_DIFF(select) + ENTRY_COMP(select) ENTRY_SAME(flock) ENTRY_SAME(msync) /* struct iovec contains pointers */ - ENTRY_DIFF(readv) /* 145 */ - ENTRY_DIFF(writev) + ENTRY_COMP(readv) /* 145 */ + ENTRY_COMP(writev) ENTRY_SAME(getsid) ENTRY_SAME(fdatasync) /* struct __sysctl_args is a mess */ @@ -266,7 +266,7 @@ ENTRY_SAME(ni_syscall) /* query_module */ ENTRY_SAME(poll) /* structs contain pointers and an in_addr... */ - ENTRY_DIFF(nfsservctl) + ENTRY_COMP(nfsservctl) ENTRY_SAME(setresgid) /* 170 */ ENTRY_SAME(getresgid) ENTRY_SAME(prctl) --- diff/arch/ppc/Kconfig 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/ppc/Kconfig 2004-04-21 10:46:51.232793544 +0100 @@ -1209,6 +1209,19 @@ config DEBUG_INFO debug the kernel. If you don't debug the kernel, you can say N. 
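A note on the parisc conversion above: the ENTRY_COMP switches retire that architecture's private 32-bit emulation, including the get_fd_set32()/set_fd_set32() "frobbing" helpers deleted earlier, in favour of the shared compat_sys_* implementations. The conversion those helpers performed is easy to show in isolation: a 32-bit task's fd_set is an array of 32-bit longs, a 64-bit kernel's fd_set an array of 64-bit longs, so pairs of user words are merged on the way in and split again on the way out. A minimal, self-contained C sketch of the inbound direction (fdset32_to_64 is a hypothetical name for illustration, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Merge pairs of 32-bit fd_set words into 64-bit words: the first
 * user word fills the low half, the second the high half, matching
 * what the removed get_fd_set32() helpers did. */
static void fdset32_to_64(uint64_t *dst, const uint32_t *src,
			  unsigned int nwords32)
{
	unsigned int i;

	for (i = 0; i + 1 < nwords32; i += 2)
		*dst++ = (uint64_t)src[i + 1] << 32 | src[i];
	if (nwords32 & 1)
		*dst = src[i];	/* odd trailing word, zero-extended */
}

int main(void)
{
	uint32_t user_words[3] = { 0x00000001, 0x80000000, 0x0000ffff };
	uint64_t kernel_words[2];

	fdset32_to_64(kernel_words, user_words, 3);
	printf("%#018llx %#018llx\n",
	       (unsigned long long)kernel_words[0],
	       (unsigned long long)kernel_words[1]);
	return 0;
}

Centralising this loop in generic compat code is the point of the change; each 64-bit architecture previously carried its own, subtly different copy.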
+config SCHEDSTATS + bool "Collect scheduler statistics" + depends on PROC_FS + default y + help + If you say Y here, additional code will be inserted into the + scheduler and related routines to collect statistics about + scheduler behavior and provide them in /proc/schedstat. These + stats may be useful for both tuning and debugging the scheduler. + If you aren't debugging the scheduler or trying to tune a specific + application, you can say N to avoid the very slight overhead + this adds. + config BOOTX_TEXT bool "Support for early boot text console (BootX or OpenFirmware only)" depends PPC_OF --- diff/arch/ppc/kernel/process.c 2004-04-21 10:45:33.752572328 +0100 +++ source/arch/ppc/kernel/process.c 2004-04-21 10:46:51.232793544 +0100 @@ -658,12 +658,6 @@ void __init ll_puts(const char *s) } #endif -/* - * These bracket the sleeping functions.. - */ -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long ip, sp; @@ -678,7 +672,7 @@ unsigned long get_wchan(struct task_stru return 0; if (count > 0) { ip = *(unsigned long *)(sp + 4); - if (ip < first_sched || ip >= last_sched) + if (!in_sched_functions(ip)) return ip; } } while (count++ < 16); --- diff/arch/ppc/kernel/setup.c 2004-03-11 10:20:22.000000000 +0000 +++ source/arch/ppc/kernel/setup.c 2004-04-21 10:46:51.239792480 +0100 @@ -53,7 +53,6 @@ extern void ppc6xx_idle(void); extern void power4_idle(void); extern boot_infos_t *boot_infos; -char saved_command_line[COMMAND_LINE_SIZE]; unsigned char aux_device_present; struct ide_machdep_calls ppc_ide_md; char *sysmap; --- diff/arch/ppc/mm/pgtable.c 2004-04-21 10:45:33.758571416 +0100 +++ source/arch/ppc/mm/pgtable.c 2004-04-21 10:46:51.240792328 +0100 @@ -86,9 +86,14 @@ pte_t *pte_alloc_one_kernel(struct mm_st extern int mem_init_done; extern void *early_get_page(void); - if (mem_init_done) + if (mem_init_done) { pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); - else + if (pte) { + struct page *ptepage = virt_to_page(pte); + ptepage->mapping = (void *) mm; + ptepage->index = address & PMD_MASK; + } + } else pte = (pte_t *)early_get_page(); if (pte) clear_page(pte); @@ -97,7 +102,7 @@ pte_t *pte_alloc_one_kernel(struct mm_st struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *pte; + struct page *ptepage; #ifdef CONFIG_HIGHPTE int flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_REPEAT; @@ -105,10 +110,13 @@ struct page *pte_alloc_one(struct mm_str int flags = GFP_KERNEL | __GFP_REPEAT; #endif - pte = alloc_pages(flags, 0); - if (pte) - clear_highpage(pte); - return pte; + ptepage = alloc_pages(flags, 0); + if (ptepage) { + ptepage->mapping = (void *) mm; + ptepage->index = address & PMD_MASK; + clear_highpage(ptepage); + } + return ptepage; } void pte_free_kernel(pte_t *pte) @@ -116,15 +124,17 @@ void pte_free_kernel(pte_t *pte) #ifdef CONFIG_SMP hash_page_sync(); #endif + virt_to_page(pte)->mapping = NULL; free_page((unsigned long)pte); } -void pte_free(struct page *pte) +void pte_free(struct page *ptepage) { #ifdef CONFIG_SMP hash_page_sync(); #endif - __free_page(pte); + ptepage->mapping = NULL; + __free_page(ptepage); } #ifndef CONFIG_44x --- diff/arch/ppc/platforms/lopec_setup.c 2003-09-30 15:46:12.000000000 +0100 +++ source/arch/ppc/platforms/lopec_setup.c 2004-04-21 10:46:51.243791872 +0100 @@ -33,7 +33,6 @@ #include #include -extern char saved_command_line[]; extern void lopec_find_bridges(void);
/* --- diff/arch/ppc/platforms/pmac_setup.c 2004-02-18 08:54:08.000000000 +0000 +++ source/arch/ppc/platforms/pmac_setup.c 2004-04-21 10:46:51.251790656 +0100 @@ -103,8 +103,6 @@ int has_l2cache = 0; static int current_root_goodness = -1; -extern char saved_command_line[]; - extern int pmac_newworld; #define DEFAULT_ROOT_DEVICE Root_SDA1 /* sda1 - slightly silly choice */ --- diff/arch/ppc/platforms/pplus.c 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/ppc/platforms/pplus.c 2004-04-21 10:46:51.259789440 +0100 @@ -48,8 +48,6 @@ TODC_ALLOC(); -extern char saved_command_line[]; - extern void pplus_setup_hose(void); extern void pplus_set_VIA_IDE_native(void); --- diff/arch/ppc/platforms/prep_setup.c 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/ppc/platforms/prep_setup.c 2004-04-21 10:46:51.261789136 +0100 @@ -76,7 +76,6 @@ extern void rs_nvram_write_val(int addr, extern void ibm_prep_init(void); extern void prep_find_bridges(void); -extern char saved_command_line[]; int _prep_type; --- diff/arch/ppc64/Kconfig 2004-04-21 10:45:33.765570352 +0100 +++ source/arch/ppc64/Kconfig 2004-04-21 10:46:51.219795520 +0100 @@ -173,6 +173,15 @@ config NUMA bool "NUMA support" depends on DISCONTIGMEM +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on SMP + default n + help + SMT scheduler support improves the CPU scheduler's decision making + when dealing with POWER5 CPUs at a cost of slightly increased + overhead in some places. If unsure, say N here. + config PREEMPT bool "Preemptible Kernel" depends on BROKEN @@ -394,7 +403,20 @@ config DEBUG_INFO debugging info resulting in a larger kernel image. Say Y here only if you plan to use gdb to debug the kernel. If you don't debug the kernel, you can say N. - + +config SCHEDSTATS + bool "Collect scheduler statistics" + depends on PROC_FS + default y + help + If you say Y here, additional code will be inserted into the + scheduler and related routines to collect statistics about + scheduler behavior and provide them in /proc/schedstat. These + stats may be useful for both tuning and debugging the scheduler. + If you aren't debugging the scheduler or trying to tune a specific + application, you can say N to avoid the very slight overhead + this adds.
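As a usage note for the option above: /proc/schedstat is plain text and its field layout varies between kernel versions, so the most portable consumer simply dumps the raw lines rather than decoding them. A minimal userspace sketch, assuming only that the file exists when SCHEDSTATS=y:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/schedstat", "r");
	char line[512];

	if (!f) {
		perror("/proc/schedstat");
		return 1;
	}
	/* One line per CPU/runqueue statistic; print them verbatim. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}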
+ endmenu source "security/Kconfig" --- diff/arch/ppc64/kernel/head.S 2004-04-21 10:45:33.771569440 +0100 +++ source/arch/ppc64/kernel/head.S 2004-04-21 10:46:51.220795368 +0100 @@ -35,6 +35,7 @@ #include #include #include +#include #ifdef CONFIG_PPC_ISERIES #define DO_SOFT_DISABLE @@ -2285,4 +2286,4 @@ stab_array: */ .globl cmd_line cmd_line: - .space 512 /* COMMAND_LINE_SIZE */ + .space COMMAND_LINE_SIZE --- diff/arch/ppc64/kernel/misc.S 2004-04-21 10:45:33.775568832 +0100 +++ source/arch/ppc64/kernel/misc.S 2004-04-21 10:46:51.221795216 +0100 @@ -717,8 +717,8 @@ _GLOBAL(sys_call_table32) .llong .ppc32_select .llong .sys_flock .llong .sys_msync - .llong .sys32_readv /* 145 */ - .llong .sys32_writev + .llong .compat_sys_readv /* 145 */ + .llong .compat_sys_writev .llong .sys32_getsid .llong .sys_fdatasync .llong .sys32_sysctl @@ -740,7 +740,7 @@ _GLOBAL(sys_call_table32) .llong .sys_getresuid /* 165 */ .llong .sys_ni_syscall /* old query_module syscall */ .llong .sys_poll - .llong .sys32_nfsservctl + .llong .compat_sys_nfsservctl .llong .sys_setresgid .llong .sys_getresgid /* 170 */ .llong .sys32_prctl --- diff/arch/ppc64/kernel/process.c 2004-04-21 10:45:33.781567920 +0100 +++ source/arch/ppc64/kernel/process.c 2004-04-21 10:46:51.221795216 +0100 @@ -472,12 +472,6 @@ static inline int validate_sp(unsigned l return 1; } -/* - * These bracket the sleeping functions.. - */ -#define first_sched (*(unsigned long *)scheduling_functions_start_here) -#define last_sched (*(unsigned long *)scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long ip, sp; @@ -496,7 +490,7 @@ unsigned long get_wchan(struct task_stru return 0; if (count > 0) { ip = *(unsigned long *)(sp + 16); - if (ip < first_sched || ip >= last_sched) + if (!in_sched_functions(ip)) return ip; } } while (count++ < 16); --- diff/arch/ppc64/kernel/setup.c 2004-04-21 10:45:33.786567160 +0100 +++ source/arch/ppc64/kernel/setup.c 2004-04-21 10:46:51.222795064 +0100 @@ -82,7 +82,6 @@ unsigned long decr_overclock_proc0_set = int powersave_nap; -char saved_command_line[COMMAND_LINE_SIZE]; unsigned char aux_device_present; void parse_cmd_line(unsigned long r3, unsigned long r4, unsigned long r5, --- diff/arch/ppc64/kernel/smp.c 2004-04-21 10:45:33.788566856 +0100 +++ source/arch/ppc64/kernel/smp.c 2004-04-21 10:46:51.223794912 +0100 @@ -834,11 +834,6 @@ void __init smp_prepare_cpus(unsigned in paca[boot_cpuid].prof_counter = 1; paca[boot_cpuid].prof_multiplier = 1; - /* - * XXX very rough. 
- */ - cache_decay_ticks = HZ/100; - #ifndef CONFIG_PPC_ISERIES paca[boot_cpuid].next_jiffy_update_tb = tb_last_stamp = get_tb(); @@ -996,3 +991,218 @@ void __init smp_cpus_done(unsigned int m set_cpus_allowed(current, old_mask); } + +#ifdef CONFIG_SCHED_SMT +#ifdef CONFIG_NUMA +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static DEFINE_PER_CPU(struct sched_domain, node_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first = NULL, *last = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + int node = cpu_to_node(i); + cpumask_t nodemask = node_to_cpumask(node); + cpumask_t my_cpumask = cpumask_of_cpu(i); + cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1); + + *cpu_domain = SD_SIBLING_INIT; + if (cur_cpu_spec->cpu_features & CPU_FTR_SMT) + cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask); + else + cpu_domain->span = my_cpumask; + cpu_domain->parent = phys_domain; + cpu_domain->groups = &sched_group_cpus[i]; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = nodemask; + phys_domain->parent = node_domain; + phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; + + *node_domain = SD_NODE_INIT; + node_domain->span = cpu_possible_map; + node_domain->groups = &sched_group_nodes[node]; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + int j; + first = last = NULL; + + if (i != first_cpu(cpu_domain->span)) + continue; + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + } + + for (i = 0; i < MAX_NUMNODES; i++) { + int j; + cpumask_t nodemask; + struct sched_group *node = &sched_group_nodes[i]; + cpumask_t node_cpumask = node_to_cpumask(i); + cpus_and(nodemask, node_cpumask, cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + first = last = NULL; + /* Set up physical groups */ + for_each_cpu_mask(j, nodemask) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j); + struct sched_group *cpu = &sched_group_phys[j]; + + if (j != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* + * Make each extra sibling increase power by 10% of + * the basic CPU. This is very arbitrary. 
+ */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + node->cpu_power += cpu->cpu_power; + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + } + + /* Set up nodes */ + first = last = NULL; + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *cpu = &sched_group_nodes[i]; + cpumask_t nodemask; + cpumask_t node_cpumask = node_to_cpumask(i); + cpus_and(nodemask, node_cpumask, cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + cpu->cpumask = nodemask; + /* ->cpu_power already setup */ + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_domain, i); + } +} +#else /* !CONFIG_NUMA */ +static struct sched_group sched_group_cpus[NR_CPUS]; +static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first = NULL, *last = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_domain *phys_domain = &per_cpu(phys_domains, i); + cpumask_t my_cpumask = cpumask_of_cpu(i); + cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1); + + *cpu_domain = SD_SIBLING_INIT; + if (cur_cpu_spec->cpu_features & CPU_FTR_SMT) + cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask); + else + cpu_domain->span = my_cpumask; + cpu_domain->parent = phys_domain; + cpu_domain->groups = &sched_group_cpus[i]; + + *phys_domain = SD_CPU_INIT; + phys_domain->span = cpu_possible_map; + phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; + } + + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + int j; + first = last = NULL; + + if (i != first_cpu(cpu_domain->span)) + continue; + + for_each_cpu_mask(j, cpu_domain->span) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + } + + first = last = NULL; + /* Set up physical groups */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + struct sched_group *cpu = &sched_group_phys[i]; + + if (i != first_cpu(cpu_domain->span)) + continue; + + cpu->cpumask = cpu_domain->span; + /* See SMT+NUMA setup for comment */ + cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; + + if (!first) + first = cpu; + if (last) + last->next = cpu; + last = cpu; + } + last->next = first; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_domain, i); + } +} +#endif /* CONFIG_NUMA */ +#endif /* CONFIG_SCHED_SMT */ --- diff/arch/ppc64/kernel/sys_ppc32.c 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/ppc64/kernel/sys_ppc32.c 2004-04-21 10:46:51.224794760 +0100 @@ -78,178 +78,6 @@ #include "pci.h" -typedef ssize_t (*io_fn_t)(struct file *, char *, size_t, loff_t *); -typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *); - -static long do_readv_writev32(int type, struct file *file, - const struct compat_iovec *vector, u32 count) -{ - 
compat_ssize_t tot_len; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov=iovstack, *ivp; - struct inode *inode; - long retval, i; - io_fn_t fn; - iov_fn_t fnv; - - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ - retval = 0; - if (count == 0) - goto out; - - /* First get the "struct iovec" from user memory and - * verify all the pointers - */ - retval = -EINVAL; - if (count > UIO_MAXIOV) - goto out; - if (!file->f_op) - goto out; - if (count > UIO_FASTIOV) { - retval = -ENOMEM; - iov = kmalloc(count*sizeof(struct iovec), GFP_KERNEL); - if (!iov) - goto out; - } - retval = -EFAULT; - if (verify_area(VERIFY_READ, vector, sizeof(struct compat_iovec)*count)) - goto out; - - /* - * Single unix specification: - * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. The total length is fitting an ssize_t - * - * Be careful here because iov_len is a size_t not an ssize_t - */ - tot_len = 0; - i = count; - ivp = iov; - retval = -EINVAL; - while(i > 0) { - compat_ssize_t tmp = tot_len; - compat_ssize_t len; - u32 buf; - - if (__get_user(len, &vector->iov_len) || - __get_user(buf, &vector->iov_base)) { - retval = -EFAULT; - goto out; - } - if (len < 0) /* size_t not fitting an compat_ssize_t .. */ - goto out; - tot_len += len; - if (tot_len < tmp) /* maths overflow on the compat_ssize_t */ - goto out; - ivp->iov_base = (void *)A(buf); - ivp->iov_len = (__kernel_size_t) len; - vector++; - ivp++; - i--; - } - if (tot_len == 0) { - retval = 0; - goto out; - } - - inode = file->f_dentry->d_inode; - /* VERIFY_WRITE actually means a read, as we write to user space */ - retval = locks_verify_area((type == READ - ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE), - inode, file, file->f_pos, tot_len); - if (retval) - goto out; - - if (type == READ) { - fn = file->f_op->read; - fnv = file->f_op->readv; - } else { - fn = (io_fn_t)file->f_op->write; - fnv = file->f_op->writev; - } - if (fnv) { - retval = fnv(file, iov, count, &file->f_pos); - goto out; - } - - /* Do it by hand, with file-ops */ - ivp = iov; - while (count > 0) { - void * base; - int len, nr; - - base = ivp->iov_base; - len = ivp->iov_len; - ivp++; - count--; - - nr = fn(file, base, len, &file->f_pos); - - if (nr < 0) { - if (!retval) - retval = nr; - break; - } - retval += nr; - if (nr != len) - break; - } -out: - if (iov != iovstack) - kfree(iov); - if ((retval + (type == READ)) > 0) - dnotify_parent(file->f_dentry, - (type == READ) ? 
DN_ACCESS : DN_MODIFY); - - return retval; -} - -asmlinkage long sys32_readv(int fd, struct compat_iovec *vector, u32 count) -{ - struct file *file; - int ret = -EBADF; - - file = fget(fd); - if (!file || !(file->f_mode & FMODE_READ)) - goto out; - - ret = -EINVAL; - if (!file->f_op || (!file->f_op->readv && !file->f_op->read)) - goto out; - - ret = do_readv_writev32(READ, file, vector, count); - -out: - if (file) - fput(file); - return ret; -} - -asmlinkage long sys32_writev(int fd, struct compat_iovec *vector, u32 count) -{ - struct file *file; - int ret = -EBADF; - - file = fget(fd); - if (!file || !(file->f_mode & FMODE_WRITE)) - goto out; - - ret = -EINVAL; - if (!file->f_op || (!file->f_op->writev && !file->f_op->write)) - goto out; - - ret = do_readv_writev32(WRITE, file, vector, count); - -out: - if (file) - fput(file); - return ret; -} - /* readdir & getdents */ #define NAME_OFFSET(de) ((int) ((de)->d_name - (char *) (de))) #define ROUND_UP(x) (((x)+sizeof(u32)-1) & ~(sizeof(u32)-1)) @@ -398,167 +226,12 @@ out: return error; } -/* - * Ooo, nasty. We need here to frob 32-bit unsigned longs to - * 64-bit unsigned longs. - */ -static inline int -get_fd_set32(unsigned long n, unsigned long *fdset, u32 *ufdset) -{ - if (ufdset) { - unsigned long odd; - - if (verify_area(VERIFY_WRITE, ufdset, n*sizeof(u32))) - return -EFAULT; - - odd = n & 1UL; - n &= ~1UL; - while (n) { - unsigned long h, l; - __get_user(l, ufdset); - __get_user(h, ufdset+1); - ufdset += 2; - *fdset++ = h << 32 | l; - n -= 2; - } - if (odd) - __get_user(*fdset, ufdset); - } else { - /* Tricky, must clear full unsigned long in the - * kernel fdset at the end, this makes sure that - * actually happens. - */ - memset(fdset, 0, ((n + 1) & ~1)*sizeof(u32)); - } - return 0; -} - -static inline void -set_fd_set32(unsigned long n, u32 *ufdset, unsigned long *fdset) -{ - unsigned long odd; - - if (!ufdset) - return; - - odd = n & 1UL; - n &= ~1UL; - while (n) { - unsigned long h, l; - l = *fdset++; - h = l >> 32; - __put_user(l, ufdset); - __put_user(h, ufdset+1); - ufdset += 2; - n -= 2; - } - if (odd) - __put_user(*fdset, ufdset); -} - - - -#define MAX_SELECT_SECONDS ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) - -asmlinkage long sys32_select(int n, u32 *inp, u32 *outp, u32 *exp, u32 tvp_x) -{ - fd_set_bits fds; - struct compat_timeval *tvp = (struct compat_timeval *)AA(tvp_x); - char *bits; - unsigned long nn; - long timeout; - int ret, size, max_fdset; - - timeout = MAX_SCHEDULE_TIMEOUT; - if (tvp) { - time_t sec, usec; - if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp))) - || (ret = __get_user(sec, &tvp->tv_sec)) - || (ret = __get_user(usec, &tvp->tv_usec))) - goto out_nofds; - - ret = -EINVAL; - if(sec < 0 || usec < 0) - goto out_nofds; - - if ((unsigned long) sec < MAX_SELECT_SECONDS) { - timeout = (usec + 1000000/HZ - 1) / (1000000/HZ); - timeout += sec * (unsigned long) HZ; - } - } - - ret = -EINVAL; - if (n < 0) - goto out_nofds; - - /* max_fdset can increase, so grab it once to avoid race */ - max_fdset = current->files->max_fdset; - if (n > max_fdset) - n = max_fdset; - - /* - * We need 6 bitmaps (in/out/ex for both incoming and outgoing), - * since we used fdset we need to allocate memory in units of - * long-words. 
- */ - ret = -ENOMEM; - size = FDS_BYTES(n); - bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); - - nn = (n + 8*sizeof(u32) - 1) / (8*sizeof(u32)); - if ((ret = get_fd_set32(nn, fds.in, inp)) || - (ret = get_fd_set32(nn, fds.out, outp)) || - (ret = get_fd_set32(nn, fds.ex, exp))) - goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); - - ret = do_select(n, &fds, &timeout); - - if (tvp && !(current->personality & STICKY_TIMEOUTS)) { - time_t sec = 0, usec = 0; - if (timeout) { - sec = timeout / HZ; - usec = timeout % HZ; - usec *= (1000000/HZ); - } - put_user(sec, &tvp->tv_sec); - put_user(usec, &tvp->tv_usec); - } - - if (ret < 0) - goto out; - if (!ret) { - ret = -ERESTARTNOHAND; - if (signal_pending(current)) - goto out; - ret = 0; - } - - set_fd_set32(nn, inp, fds.res_in); - set_fd_set32(nn, outp, fds.res_out); - set_fd_set32(nn, exp, fds.res_ex); - -out: - kfree(bits); - -out_nofds: - return ret; -} - -int ppc32_select(u32 n, u32* inp, u32* outp, u32* exp, u32 tvp_x) +asmlinkage long ppc32_select(u32 n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + compat_uptr_t tvp_x) { /* sign extend n */ - return sys32_select((int)n, inp, outp, exp, tvp_x); + return compat_sys_select((int)n, inp, outp, exp, compat_ptr(tvp_x)); } int cp_compat_stat(struct kstat *stat, struct compat_stat *statbuf) @@ -678,245 +351,6 @@ asmlinkage long sys32_adjtimex(struct ti return ret; } -/* Stuff for NFS server syscalls... 
*/ -struct nfsctl_svc32 { - u16 svc32_port; - s32 svc32_nthreads; -}; - -struct nfsctl_client32 { - s8 cl32_ident[NFSCLNT_IDMAX+1]; - s32 cl32_naddr; - struct in_addr cl32_addrlist[NFSCLNT_ADDRMAX]; - s32 cl32_fhkeytype; - s32 cl32_fhkeylen; - u8 cl32_fhkey[NFSCLNT_KEYMAX]; -}; - -struct nfsctl_export32 { - s8 ex32_client[NFSCLNT_IDMAX+1]; - s8 ex32_path[NFS_MAXPATHLEN+1]; - compat_dev_t ex32_dev; - compat_ino_t ex32_ino; - s32 ex32_flags; - compat_uid_t ex32_anon_uid; - compat_gid_t ex32_anon_gid; -}; - -struct nfsctl_fdparm32 { - struct sockaddr gd32_addr; - s8 gd32_path[NFS_MAXPATHLEN+1]; - s32 gd32_version; -}; - -struct nfsctl_fsparm32 { - struct sockaddr gd32_addr; - s8 gd32_path[NFS_MAXPATHLEN+1]; - s32 gd32_maxlen; -}; - -struct nfsctl_arg32 { - s32 ca32_version; /* safeguard */ - union { - struct nfsctl_svc32 u32_svc; - struct nfsctl_client32 u32_client; - struct nfsctl_export32 u32_export; - struct nfsctl_fdparm32 u32_getfd; - struct nfsctl_fsparm32 u32_getfs; - } u; -#define ca32_svc u.u32_svc -#define ca32_client u.u32_client -#define ca32_export u.u32_export -#define ca32_getfd u.u32_getfd -#define ca32_getfs u.u32_getfs -}; - -union nfsctl_res32 { - __u8 cr32_getfh[NFS_FHSIZE]; - struct knfsd_fh cr32_getfs; -}; - -static int nfs_svc32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= __get_user(karg->ca_svc.svc_port, &arg32->ca32_svc.svc32_port); - err |= __get_user(karg->ca_svc.svc_nthreads, &arg32->ca32_svc.svc32_nthreads); - return err; -} - -static int nfs_clnt32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_client.cl_ident[0], - &arg32->ca32_client.cl32_ident[0], - NFSCLNT_IDMAX); - err |= __get_user(karg->ca_client.cl_naddr, &arg32->ca32_client.cl32_naddr); - err |= copy_from_user(&karg->ca_client.cl_addrlist[0], - &arg32->ca32_client.cl32_addrlist[0], - (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)); - err |= __get_user(karg->ca_client.cl_fhkeytype, - &arg32->ca32_client.cl32_fhkeytype); - err |= __get_user(karg->ca_client.cl_fhkeylen, - &arg32->ca32_client.cl32_fhkeylen); - err |= copy_from_user(&karg->ca_client.cl_fhkey[0], - &arg32->ca32_client.cl32_fhkey[0], - NFSCLNT_KEYMAX); - - if(err) return -EFAULT; - return 0; -} - -static int nfs_exp32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_export.ex_client[0], - &arg32->ca32_export.ex32_client[0], - NFSCLNT_IDMAX); - err |= copy_from_user(&karg->ca_export.ex_path[0], - &arg32->ca32_export.ex32_path[0], - NFS_MAXPATHLEN); - err |= __get_user(karg->ca_export.ex_dev, - &arg32->ca32_export.ex32_dev); - err |= __get_user(karg->ca_export.ex_ino, - &arg32->ca32_export.ex32_ino); - err |= __get_user(karg->ca_export.ex_flags, - &arg32->ca32_export.ex32_flags); - err |= __get_user(karg->ca_export.ex_anon_uid, - &arg32->ca32_export.ex32_anon_uid); - err |= __get_user(karg->ca_export.ex_anon_gid, - &arg32->ca32_export.ex32_anon_gid); - karg->ca_export.ex_anon_uid = karg->ca_export.ex_anon_uid; - karg->ca_export.ex_anon_gid = karg->ca_export.ex_anon_gid; - - if(err) return -EFAULT; - return 0; -} - -static int nfs_getfd32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= 
copy_from_user(&karg->ca_getfd.gd_addr, - &arg32->ca32_getfd.gd32_addr, - (sizeof(struct sockaddr))); - err |= copy_from_user(&karg->ca_getfd.gd_path, - &arg32->ca32_getfd.gd32_path, - (NFS_MAXPATHLEN+1)); - err |= __get_user(karg->ca_getfd.gd_version, - &arg32->ca32_getfd.gd32_version); - - if(err) return -EFAULT; - return 0; -} - -static int nfs_getfs32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_getfs.gd_addr, - &arg32->ca32_getfs.gd32_addr, - (sizeof(struct sockaddr))); - err |= copy_from_user(&karg->ca_getfs.gd_path, - &arg32->ca32_getfs.gd32_path, - (NFS_MAXPATHLEN+1)); - err |= __get_user(karg->ca_getfs.gd_maxlen, - &arg32->ca32_getfs.gd32_maxlen); - - if(err) return -EFAULT; - return 0; -} - -/* This really doesn't need translations, we are only passing - * back a union which contains opaque nfs file handle data. - */ -static int nfs_getfh32_res_trans(union nfsctl_res *kres, union nfsctl_res32 *res32) -{ - int err; - - err = copy_to_user(res32, kres, sizeof(*res32)); - - if(err) return -EFAULT; - return 0; -} - -/* Note: it is necessary to treat cmd_parm as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -int asmlinkage sys32_nfsservctl(u32 cmd_parm, struct nfsctl_arg32 *arg32, union nfsctl_res32 *res32) -{ - int cmd = (int)cmd_parm; - struct nfsctl_arg *karg = NULL; - union nfsctl_res *kres = NULL; - mm_segment_t oldfs; - int err; - - karg = kmalloc(sizeof(*karg), GFP_USER); - if(!karg) - return -ENOMEM; - if(res32) { - kres = kmalloc(sizeof(*kres), GFP_USER); - if(!kres) { - kfree(karg); - return -ENOMEM; - } - } - switch(cmd) { - case NFSCTL_SVC: - err = nfs_svc32_trans(karg, arg32); - break; - case NFSCTL_ADDCLIENT: - err = nfs_clnt32_trans(karg, arg32); - break; - case NFSCTL_DELCLIENT: - err = nfs_clnt32_trans(karg, arg32); - break; - case NFSCTL_EXPORT: - case NFSCTL_UNEXPORT: - err = nfs_exp32_trans(karg, arg32); - break; - case NFSCTL_GETFD: - err = nfs_getfd32_trans(karg, arg32); - break; - case NFSCTL_GETFS: - err = nfs_getfs32_trans(karg, arg32); - break; - default: - err = -EINVAL; - break; - } - if(err) - goto done; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_nfsservctl(cmd, karg, kres); - set_fs(oldfs); - - if (err) - goto done; - - if((cmd == NFSCTL_GETFD) || - (cmd == NFSCTL_GETFS)) - err = nfs_getfh32_res_trans(kres, res32); - -done: - if(karg) - kfree(karg); - if(kres) - kfree(kres); - return err; -} - - /* These are here just in case some old sparc32 binary calls it. */ asmlinkage long sys32_pause(void) @@ -1721,191 +1155,6 @@ asmlinkage int sys32_sendfile64(int out_ return ret; } -/* - * count32() counts the number of arguments/envelopes - */ -static int count32(u32 * argv, int max) -{ - int i = 0; - - if (argv != NULL) { - for (;;) { - u32 p; int error; - - error = get_user(p,argv); - if (error) - return error; - if (!p) - break; - argv++; - if (++i > max) - return -E2BIG; - } - } - return i; -} - -/* - * 'copy_string32()' copies argument/envelope strings from user - * memory to free pages in kernel mem. These are in a format ready - * to be put directly into the top of new user memory. 
- */ -static int copy_strings32(int argc, u32 * argv, struct linux_binprm *bprm) -{ - while (argc-- > 0) { - u32 str; - int len; - unsigned long pos; - - if (get_user(str, argv + argc) || - !str || - !(len = strnlen_user((char *)A(str), bprm->p))) - return -EFAULT; - - if (bprm->p < len) - return -E2BIG; - - bprm->p -= len; - - pos = bprm->p; - while (len) { - char *kaddr; - struct page *page; - int offset, bytes_to_copy, new, err; - - offset = pos % PAGE_SIZE; - page = bprm->page[pos / PAGE_SIZE]; - new = 0; - if (!page) { - page = alloc_page(GFP_USER); - bprm->page[pos / PAGE_SIZE] = page; - if (!page) - return -ENOMEM; - new = 1; - } - kaddr = (char *)kmap(page); - - if (new && offset) - memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; - if (bytes_to_copy > len) { - bytes_to_copy = len; - if (new) - memset(kaddr+offset+len, 0, - PAGE_SIZE-offset-len); - } - - err = copy_from_user(kaddr + offset, (char *)A(str), - bytes_to_copy); - kunmap((unsigned long)kaddr); - - if (err) - return -EFAULT; - - pos += bytes_to_copy; - str += bytes_to_copy; - len -= bytes_to_copy; - } - } - return 0; -} - -/* - * sys32_execve() executes a new program. - */ -static int do_execve32(char * filename, u32 * argv, u32 * envp, struct pt_regs * regs) -{ - struct linux_binprm bprm; - struct file * file; - int retval; - int i; - - sched_balance_exec(); - - file = open_exec(filename); - - retval = PTR_ERR(file); - if (IS_ERR(file)) - return retval; - - bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - memset(bprm.page, 0, MAX_ARG_PAGES * sizeof(bprm.page[0])); - - bprm.file = file; - bprm.filename = filename; - bprm.interp = filename; - bprm.sh_bang = 0; - bprm.loader = 0; - bprm.exec = 0; - bprm.security = NULL; - bprm.mm = mm_alloc(); - retval = -ENOMEM; - if (!bprm.mm) - goto out_file; - - retval = init_new_context(current, bprm.mm); - if (retval < 0) - goto out_mm; - - bprm.argc = count32(argv, bprm.p / sizeof(u32)); - if ((retval = bprm.argc) < 0) - goto out_mm; - - bprm.envc = count32(envp, bprm.p / sizeof(u32)); - if ((retval = bprm.envc) < 0) - goto out_mm; - - retval = security_bprm_alloc(&bprm); - if (retval) - goto out; - - retval = prepare_binprm(&bprm); - if (retval < 0) - goto out; - - retval = copy_strings_kernel(1, &bprm.filename, &bprm); - if (retval < 0) - goto out; - - bprm.exec = bprm.p; - retval = copy_strings32(bprm.envc, envp, &bprm); - if (retval < 0) - goto out; - - retval = copy_strings32(bprm.argc, argv, &bprm); - if (retval < 0) - goto out; - - retval = search_binary_handler(&bprm,regs); - if (retval >= 0) { - /* execve success */ - security_bprm_free(&bprm); - return retval; - } - -out: - /* Something went wrong, return the inode and free the argument pages*/ - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page * page = bprm.page[i]; - if (page) - __free_page(page); - } - - if (bprm.security) - security_bprm_free(&bprm); - -out_mm: - if (bprm.mm) - mmdrop(bprm.mm); - -out_file: - if (bprm.file) { - allow_write_access(bprm.file); - fput(bprm.file); - } - return retval; -} - long sys32_execve(unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4, unsigned long a5, struct pt_regs *regs) @@ -1924,7 +1173,7 @@ long sys32_execve(unsigned long a0, unsi giveup_altivec(current); #endif /* CONFIG_ALTIVEC */ - error = do_execve32(filename, (u32*) a1, (u32*) a2, regs); + error = compat_do_execve(filename, compat_ptr(a1), compat_ptr(a2), regs); if (error == 0) current->ptrace &= ~PT_DTRACE; --- diff/arch/ppc64/mm/hugetlbpage.c 2004-04-21 
10:45:33.794565944 +0100 +++ source/arch/ppc64/mm/hugetlbpage.c 2004-04-21 10:46:51.225794608 +0100 @@ -25,7 +25,6 @@ #include #include #include -#include #include @@ -204,7 +203,7 @@ static int prepare_low_seg_for_htlb(stru } page = pmd_page(*pmd); pmd_clear(pmd); - pgtable_remove_rmap(page); + dec_page_state(nr_page_table_pages); pte_free_tlb(tlb, page); } tlb_finish_mmu(tlb, start, end); --- diff/arch/ppc64/mm/tlb.c 2004-03-11 10:20:22.000000000 +0000 +++ source/arch/ppc64/mm/tlb.c 2004-04-21 10:46:51.225794608 +0100 @@ -31,7 +31,6 @@ #include #include #include -#include DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); @@ -42,6 +41,33 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_ga DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); unsigned long pte_freelist_forced_free; +void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage) +{ + /* This is safe as we are holding page_table_lock */ + cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); + struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); + + if (atomic_read(&tlb->mm->mm_users) < 2 || + cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) { + pte_free(ptepage); + return; + } + + if (*batchp == NULL) { + *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); + if (*batchp == NULL) { + pte_free_now(ptepage); + return; + } + (*batchp)->index = 0; + } + (*batchp)->pages[(*batchp)->index++] = ptepage; + if ((*batchp)->index == PTE_FREELIST_SIZE) { + pte_free_submit(*batchp); + *batchp = NULL; + } +} + /* * Update the MMU hash table to correspond with a change to * a Linux PTE. If wrprot is true, it is permissible to @@ -59,7 +85,8 @@ void hpte_update(pte_t *ptep, unsigned l ptepage = virt_to_page(ptep); mm = (struct mm_struct *) ptepage->mapping; - addr = ptep_to_address(ptep); + addr = ptepage->index + + (((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE); if (REGION_ID(addr) == USER_REGION_ID) context = mm->context.id; --- diff/arch/s390/kernel/compat_exec.c 2004-04-21 10:45:33.799565184 +0100 +++ source/arch/s390/kernel/compat_exec.c 2004-04-21 10:46:51.261789136 +0100 @@ -72,6 +72,7 @@ int setup_arg_pages32(struct linux_binpr mpnt->vm_ops = NULL; mpnt->vm_pgoff = 0; mpnt->vm_file = NULL; + mpol_set_vma_default(mpnt); INIT_LIST_HEAD(&mpnt->shared); mpnt->vm_private_data = (void *) 0; insert_vm_struct(mm, mpnt); --- diff/arch/s390/kernel/compat_linux.c 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/s390/kernel/compat_linux.c 2004-04-21 10:46:51.263788832 +0100 @@ -373,144 +373,6 @@ asmlinkage long sys32_ftruncate64(unsign return sys_ftruncate(fd, (high << 32) | low); } -typedef ssize_t (*io_fn_t)(struct file *, char *, size_t, loff_t *); -typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *); - -static long do_readv_writev32(int type, struct file *file, - const struct compat_iovec *vector, u32 count) -{ - unsigned long tot_len; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov=iovstack, *ivp; - struct inode *inode; - long retval, i; - io_fn_t fn; - iov_fn_t fnv; - - /* First get the "struct iovec" from user memory and - * verify all the pointers - */ - if (!count) - return 0; - if (verify_area(VERIFY_READ, vector, sizeof(struct compat_iovec)*count)) - return -EFAULT; - if (count > UIO_MAXIOV) - return -EINVAL; - if (count > UIO_FASTIOV) { - iov = kmalloc(count*sizeof(struct iovec), GFP_KERNEL); - if (!iov) - return -ENOMEM; - } - - tot_len = 0; - i = count; - ivp = iov; - retval = -EINVAL; - while(i > 0) { - compat_ssize_t tmp 
= tot_len; - compat_ssize_t len; - u32 buf; - - if (__get_user(len, &vector->iov_len) || - __get_user(buf, &vector->iov_base)) { - retval = -EFAULT; - goto out; - } - if (len < 0) /* size_t not fitting an ssize_t32 .. */ - goto out; - tot_len += len; - if (tot_len < tmp) /* maths overflow on the compat_ssize_t */ - goto out; - ivp->iov_base = (void *)A(buf); - ivp->iov_len = (__kernel_size_t) len; - vector++; - ivp++; - i--; - } - if (tot_len == 0) { - retval = 0; - goto out; - } - - inode = file->f_dentry->d_inode; - /* VERIFY_WRITE actually means a read, as we write to user space */ - retval = locks_verify_area((type == VERIFY_WRITE - ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE), - inode, file, file->f_pos, tot_len); - if (retval) - goto out; - - /* VERIFY_WRITE actually means a read, as we write to user space */ - fnv = (type == VERIFY_WRITE ? file->f_op->readv : file->f_op->writev); - if (fnv) { - retval = fnv(file, iov, count, &file->f_pos); - goto out; - } - - fn = (type == VERIFY_WRITE ? file->f_op->read : - (io_fn_t) file->f_op->write); - - ivp = iov; - while (count > 0) { - void * base; - int len, nr; - - base = ivp->iov_base; - len = ivp->iov_len; - ivp++; - count--; - nr = fn(file, base, len, &file->f_pos); - if (nr < 0) { - if (!retval) - retval = nr; - break; - } - retval += nr; - if (nr != len) - break; - } -out: - if (iov != iovstack) - kfree(iov); - - return retval; -} - -asmlinkage long sys32_readv(int fd, struct compat_iovec *vector, unsigned long count) -{ - struct file *file; - long ret = -EBADF; - - file = fget(fd); - if(!file) - goto bad_file; - - if (file->f_op && (file->f_mode & FMODE_READ) && - (file->f_op->readv || file->f_op->read)) - ret = do_readv_writev32(VERIFY_WRITE, file, vector, count); - fput(file); - -bad_file: - return ret; -} - -asmlinkage long sys32_writev(int fd, struct compat_iovec *vector, unsigned long count) -{ - struct file *file; - int ret = -EBADF; - - file = fget(fd); - if(!file) - goto bad_file; - if (file->f_op && (file->f_mode & FMODE_WRITE) && - (file->f_op->writev || file->f_op->write)) - ret = do_readv_writev32(VERIFY_READ, file, vector, count); - fput(file); - -bad_file: - return ret; -} - /* readdir & getdents */ #define NAME_OFFSET(de) ((int) ((de)->d_name - (char *) (de))) @@ -641,160 +503,6 @@ out: /* end of readdir & getdents */ -/* - * Ooo, nasty. We need here to frob 32-bit unsigned longs to - * 64-bit unsigned longs. - */ - -static inline int -get_fd_set32(unsigned long n, unsigned long *fdset, u32 *ufdset) -{ - if (ufdset) { - unsigned long odd; - - if (verify_area(VERIFY_WRITE, ufdset, n*sizeof(u32))) - return -EFAULT; - - odd = n & 1UL; - n &= ~1UL; - while (n) { - unsigned long h, l; - __get_user(l, ufdset); - __get_user(h, ufdset+1); - ufdset += 2; - *fdset++ = h << 32 | l; - n -= 2; - } - if (odd) - __get_user(*fdset, ufdset); - } else { - /* Tricky, must clear full unsigned long in the - * kernel fdset at the end, this makes sure that - * actually happens. 
- */ - memset(fdset, 0, ((n + 1) & ~1)*sizeof(u32)); - } - return 0; -} - -static inline void -set_fd_set32(unsigned long n, u32 *ufdset, unsigned long *fdset) -{ - unsigned long odd; - - if (!ufdset) - return; - - odd = n & 1UL; - n &= ~1UL; - while (n) { - unsigned long h, l; - l = *fdset++; - h = l >> 32; - __put_user(l, ufdset); - __put_user(h, ufdset+1); - ufdset += 2; - n -= 2; - } - if (odd) - __put_user(*fdset, ufdset); -} - -#define MAX_SELECT_SECONDS \ - ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) - -asmlinkage long sys32_select(int n, u32 *inp, u32 *outp, u32 *exp, - struct compat_timeval *tvp) -{ - fd_set_bits fds; - char *bits; - unsigned long nn; - long timeout; - int ret, size; - - timeout = MAX_SCHEDULE_TIMEOUT; - if (tvp) { - int sec, usec; - - if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp))) - || (ret = __get_user(sec, &tvp->tv_sec)) - || (ret = __get_user(usec, &tvp->tv_usec))) - goto out_nofds; - - ret = -EINVAL; - if(sec < 0 || usec < 0) - goto out_nofds; - - if ((unsigned long) sec < MAX_SELECT_SECONDS) { - timeout = (usec + 1000000/HZ - 1) / (1000000/HZ); - timeout += sec * (unsigned long) HZ; - } - } - - ret = -EINVAL; - if (n < 0) - goto out_nofds; - if (n > current->files->max_fdset) - n = current->files->max_fdset; - - /* - * We need 6 bitmaps (in/out/ex for both incoming and outgoing), - * since we used fdset we need to allocate memory in units of - * long-words. - */ - ret = -ENOMEM; - size = FDS_BYTES(n); - bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); - - nn = (n + 8*sizeof(u32) - 1) / (8*sizeof(u32)); - if ((ret = get_fd_set32(nn, fds.in, inp)) || - (ret = get_fd_set32(nn, fds.out, outp)) || - (ret = get_fd_set32(nn, fds.ex, exp))) - goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); - - ret = do_select(n, &fds, &timeout); - - if (tvp && !(current->personality & STICKY_TIMEOUTS)) { - int sec = 0, usec = 0; - if (timeout) { - sec = timeout / HZ; - usec = timeout % HZ; - usec *= (1000000/HZ); - } - put_user(sec, &tvp->tv_sec); - put_user(usec, &tvp->tv_usec); - } - - if (ret < 0) - goto out; - if (!ret) { - ret = -ERESTARTNOHAND; - if (signal_pending(current)) - goto out; - ret = 0; - } - - set_fd_set32(nn, inp, fds.res_in); - set_fd_set32(nn, outp, fds.res_out); - set_fd_set32(nn, exp, fds.res_ex); - -out: - kfree(bits); -out_nofds: - return ret; -} - int cp_compat_stat(struct kstat *stat, struct compat_stat *statbuf) { int err; @@ -1044,188 +752,6 @@ sys32_rt_sigqueueinfo(int pid, int sig, return ret; } -extern void check_pending(int signum); - -/* - * count32() counts the number of arguments/envelopes - */ -static int count32(u32 * argv) -{ - int i = 0; - - if (argv != NULL) { - for (;;) { - u32 p; int error; - - error = get_user(p,argv); - if (error) return error; - if (!p) break; - argv++; i++; - } - } - return i; -} - -/* - * 'copy_string32()' copies argument/envelope strings from user - * memory to free pages in kernel mem. These are in a format ready - * to be put directly into the top of new user memory. 
- */ -static int copy_strings32(int argc, u32 * argv, struct linux_binprm *bprm) -{ - while (argc-- > 0) { - u32 str; - int len; - unsigned long pos; - - if (get_user(str, argv + argc) || - !str || - !(len = strnlen_user((char *)A(str), bprm->p))) - return -EFAULT; - - if (bprm->p < len) - return -E2BIG; - - bprm->p -= len; - - pos = bprm->p; - while (len) { - char *kaddr; - struct page *page; - int offset, bytes_to_copy, new, err; - - offset = pos % PAGE_SIZE; - page = bprm->page[pos / PAGE_SIZE]; - new = 0; - if (!page) { - page = alloc_page(GFP_USER); - bprm->page[pos / PAGE_SIZE] = page; - if (!page) - return -ENOMEM; - new = 1; - } - kaddr = (char *)kmap(page); - - if (new && offset) - memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; - if (bytes_to_copy > len) { - bytes_to_copy = len; - if (new) - memset(kaddr+offset+len, 0, - PAGE_SIZE-offset-len); - } - - err = copy_from_user(kaddr + offset, (char *)A(str), - bytes_to_copy); - kunmap(page); - - if (err) - return -EFAULT; - - pos += bytes_to_copy; - str += bytes_to_copy; - len -= bytes_to_copy; - } - } - return 0; -} - -/* - * sys32_execve() executes a new program. - */ -static inline int -do_execve32(char * filename, u32 * argv, u32 * envp, struct pt_regs * regs) -{ - struct linux_binprm bprm; - struct file * file; - int retval; - int i; - - sched_balance_exec(); - - file = open_exec(filename); - - retval = PTR_ERR(file); - if (IS_ERR(file)) - return retval; - - bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - memset(bprm.page, 0, MAX_ARG_PAGES * sizeof(bprm.page[0])); - - bprm.file = file; - bprm.filename = filename; - bprm.interp = filename; - bprm.sh_bang = 0; - bprm.loader = 0; - bprm.exec = 0; - bprm.mm = mm_alloc(); - retval = -ENOMEM; - if (!bprm.mm) - goto out_file; - - /* init_new_context is empty for s390x. */ - - bprm.argc = count32(argv); - if ((retval = bprm.argc) < 0) - goto out_mm; - - bprm.envc = count32(envp); - if ((retval = bprm.envc) < 0) - goto out_mm; - - retval = security_bprm_alloc(&bprm); - if (retval) - goto out; - - retval = prepare_binprm(&bprm); - if (retval < 0) - goto out; - - retval = copy_strings_kernel(1, &bprm.filename, &bprm); - if (retval < 0) - goto out; - - bprm.exec = bprm.p; - retval = copy_strings32(bprm.envc, envp, &bprm); - if (retval < 0) - goto out; - - retval = copy_strings32(bprm.argc, argv, &bprm); - if (retval < 0) - goto out; - - retval = search_binary_handler(&bprm, regs); - if (retval >= 0) { - /* execve success */ - security_bprm_free(&bprm); - return retval; - } - -out: - /* Something went wrong, return the inode and free the argument pages*/ - for (i=0 ; iptrace &= ~PT_DTRACE; @@ -1288,226 +815,6 @@ sys32_delete_module(const char __user *n #endif /* CONFIG_MODULES */ -/* Stuff for NFS server syscalls... 
*/ -struct nfsctl_svc32 { - u16 svc32_port; - s32 svc32_nthreads; -}; - -struct nfsctl_client32 { - s8 cl32_ident[NFSCLNT_IDMAX+1]; - s32 cl32_naddr; - struct in_addr cl32_addrlist[NFSCLNT_ADDRMAX]; - s32 cl32_fhkeytype; - s32 cl32_fhkeylen; - u8 cl32_fhkey[NFSCLNT_KEYMAX]; -}; - -struct nfsctl_export32 { - s8 ex32_client[NFSCLNT_IDMAX+1]; - s8 ex32_path[NFS_MAXPATHLEN+1]; - compat_dev_t ex32_dev; - compat_ino_t ex32_ino; - s32 ex32_flags; - compat_uid_t ex32_anon_uid; - compat_gid_t ex32_anon_gid; -}; - -struct nfsctl_fdparm32 { - struct sockaddr gd32_addr; - s8 gd32_path[NFS_MAXPATHLEN+1]; - s32 gd32_version; -}; - -struct nfsctl_fsparm32 { - struct sockaddr gd32_addr; - s8 gd32_path[NFS_MAXPATHLEN+1]; - s32 gd32_maxlen; -}; - -struct nfsctl_arg32 { - s32 ca32_version; /* safeguard */ - union { - struct nfsctl_svc32 u32_svc; - struct nfsctl_client32 u32_client; - struct nfsctl_export32 u32_export; - struct nfsctl_fdparm32 u32_getfd; - struct nfsctl_fsparm32 u32_getfs; - } u; -#define ca32_svc u.u32_svc -#define ca32_client u.u32_client -#define ca32_export u.u32_export -#define ca32_getfd u.u32_getfd -#define ca32_getfs u.u32_getfs -#define ca32_authd u.u32_authd -}; - -union nfsctl_res32 { - __u8 cr32_getfh[NFS_FHSIZE]; - struct knfsd_fh cr32_getfs; -}; - -static int nfs_svc32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= __get_user(karg->ca_svc.svc_port, &arg32->ca32_svc.svc32_port); - err |= __get_user(karg->ca_svc.svc_nthreads, &arg32->ca32_svc.svc32_nthreads); - return err; -} - -static int nfs_clnt32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_client.cl_ident[0], - &arg32->ca32_client.cl32_ident[0], - NFSCLNT_IDMAX); - err |= __get_user(karg->ca_client.cl_naddr, &arg32->ca32_client.cl32_naddr); - err |= copy_from_user(&karg->ca_client.cl_addrlist[0], - &arg32->ca32_client.cl32_addrlist[0], - (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)); - err |= __get_user(karg->ca_client.cl_fhkeytype, - &arg32->ca32_client.cl32_fhkeytype); - err |= __get_user(karg->ca_client.cl_fhkeylen, - &arg32->ca32_client.cl32_fhkeylen); - err |= copy_from_user(&karg->ca_client.cl_fhkey[0], - &arg32->ca32_client.cl32_fhkey[0], - NFSCLNT_KEYMAX); - return err; -} - -static int nfs_exp32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_export.ex_client[0], - &arg32->ca32_export.ex32_client[0], - NFSCLNT_IDMAX); - err |= copy_from_user(&karg->ca_export.ex_path[0], - &arg32->ca32_export.ex32_path[0], - NFS_MAXPATHLEN); - err |= __get_user(karg->ca_export.ex_dev, - &arg32->ca32_export.ex32_dev); - err |= __get_user(karg->ca_export.ex_ino, - &arg32->ca32_export.ex32_ino); - err |= __get_user(karg->ca_export.ex_flags, - &arg32->ca32_export.ex32_flags); - err |= __get_user(karg->ca_export.ex_anon_uid, - &arg32->ca32_export.ex32_anon_uid); - err |= __get_user(karg->ca_export.ex_anon_gid, - &arg32->ca32_export.ex32_anon_gid); - karg->ca_export.ex_anon_uid = high2lowuid(karg->ca_export.ex_anon_uid); - karg->ca_export.ex_anon_gid = high2lowgid(karg->ca_export.ex_anon_gid); - return err; -} - -static int nfs_getfd32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= 
copy_from_user(&karg->ca_getfd.gd_addr, - &arg32->ca32_getfd.gd32_addr, - (sizeof(struct sockaddr))); - err |= copy_from_user(&karg->ca_getfd.gd_path, - &arg32->ca32_getfd.gd32_path, - (NFS_MAXPATHLEN+1)); - err |= __get_user(karg->ca_getfd.gd_version, - &arg32->ca32_getfd.gd32_version); - return err; -} - -static int nfs_getfs32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_getfs.gd_addr, - &arg32->ca32_getfs.gd32_addr, - (sizeof(struct sockaddr))); - err |= copy_from_user(&karg->ca_getfs.gd_path, - &arg32->ca32_getfs.gd32_path, - (NFS_MAXPATHLEN+1)); - err |= __get_user(karg->ca_getfs.gd_maxlen, - &arg32->ca32_getfs.gd32_maxlen); - return err; -} - -/* This really doesn't need translations, we are only passing - * back a union which contains opaque nfs file handle data. - */ -static int nfs_getfh32_res_trans(union nfsctl_res *kres, union nfsctl_res32 *res32) -{ - return copy_to_user(res32, kres, sizeof(*res32)) ? -EFAULT : 0; -} - -long asmlinkage sys32_nfsservctl(int cmd, struct nfsctl_arg32 *arg32, union nfsctl_res32 *res32) -{ - struct nfsctl_arg *karg = NULL; - union nfsctl_res *kres = NULL; - mm_segment_t oldfs; - int err; - - karg = kmalloc(sizeof(*karg), GFP_USER); - if(!karg) - return -ENOMEM; - if(res32) { - kres = kmalloc(sizeof(*kres), GFP_USER); - if(!kres) { - kfree(karg); - return -ENOMEM; - } - } - switch(cmd) { - case NFSCTL_SVC: - err = nfs_svc32_trans(karg, arg32); - break; - case NFSCTL_ADDCLIENT: - err = nfs_clnt32_trans(karg, arg32); - break; - case NFSCTL_DELCLIENT: - err = nfs_clnt32_trans(karg, arg32); - break; - case NFSCTL_EXPORT: - case NFSCTL_UNEXPORT: - err = nfs_exp32_trans(karg, arg32); - break; - case NFSCTL_GETFD: - err = nfs_getfd32_trans(karg, arg32); - break; - case NFSCTL_GETFS: - err = nfs_getfs32_trans(karg, arg32); - break; - default: - err = -EINVAL; - break; - } - if(err) - goto done; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_nfsservctl(cmd, karg, kres); - set_fs(oldfs); - - if (err) - goto done; - - if((cmd == NFSCTL_GETFD) || - (cmd == NFSCTL_GETFS)) - err = nfs_getfh32_res_trans(kres, res32); - -done: - if(karg) - kfree(karg); - if(kres) - kfree(kres); - return err; -} - /* Translations due to time_t size differences. Which affects all sorts of things, like timeval and itimerval. 
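The s390 nfsservctl translation deleted above follows the same recipe the surviving time_t comment describes: fetch each 32-bit field of the user structure and widen it into the 64-bit kernel equivalent before calling the native implementation. A self-contained illustration of that widening for a timeval-shaped pair (timeval32/timeval64 and timeval_from_compat are hypothetical names for illustration, not the kernel's compat types):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical layout: a 31/32-bit task's timeval carries 32-bit
 * seconds and microseconds. */
struct timeval32 {
	int32_t tv_sec;
	int32_t tv_usec;
};

/* Native 64-bit layout. */
struct timeval64 {
	int64_t tv_sec;
	int64_t tv_usec;
};

/* Field-by-field widening; plain assignment sign-extends each field. */
static void timeval_from_compat(struct timeval64 *dst,
				const struct timeval32 *src)
{
	dst->tv_sec = src->tv_sec;
	dst->tv_usec = src->tv_usec;
}

int main(void)
{
	struct timeval32 tv32 = { 1082541600, 250000 };
	struct timeval64 tv64;

	timeval_from_compat(&tv64, &tv32);
	printf("%lld.%06lld\n",
	       (long long)tv64.tv_sec, (long long)tv64.tv_usec);
	return 0;
}

Replacing the hand-rolled sys32_nfsservctl with compat_sys_nfsservctl moves exactly this kind of per-field copying into one shared implementation.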
*/ --- diff/arch/s390/kernel/compat_wrapper.S 2004-04-21 10:45:33.801564880 +0100 +++ source/arch/s390/kernel/compat_wrapper.S 2004-04-21 10:46:51.263788832 +0100 @@ -641,14 +641,14 @@ sys32_getdents_wrapper: llgfr %r4,%r4 # unsigned int jg sys32_getdents # branch to system call - .globl sys32_select_wrapper -sys32_select_wrapper: + .globl compat_sys_select_wrapper +compat_sys_select_wrapper: lgfr %r2,%r2 # int - llgtr %r3,%r3 # fd_set * - llgtr %r4,%r4 # fd_set * - llgtr %r5,%r5 # fd_set * - llgtr %r6,%r6 # struct timeval_emu31 * - jg sys32_select # branch to system call + llgtr %r3,%r3 # compat_fd_set * + llgtr %r4,%r4 # compat_fd_set * + llgtr %r5,%r5 # compat_fd_set * + llgtr %r6,%r6 # struct compat_timeval * + jg compat_sys_select # branch to system call .globl sys32_flock_wrapper sys32_flock_wrapper: @@ -663,19 +663,19 @@ sys32_msync_wrapper: lgfr %r4,%r4 # int jg sys_msync # branch to system call - .globl sys32_readv_wrapper -sys32_readv_wrapper: + .globl compat_sys_readv_wrapper +compat_sys_readv_wrapper: lgfr %r2,%r2 # int - llgtr %r3,%r3 # const struct iovec_emu31 * + llgtr %r3,%r3 # const struct compat_iovec * llgfr %r4,%r4 # unsigned long - jg sys32_readv # branch to system call + jg compat_sys_readv # branch to system call - .globl sys32_writev_wrapper -sys32_writev_wrapper: + .globl compat_sys_writev_wrapper +compat_sys_writev_wrapper: lgfr %r2,%r2 # int - llgtr %r3,%r3 # const struct iovec_emu31 * + llgtr %r3,%r3 # const struct compat_iovec * llgfr %r4,%r4 # unsigned long - jg sys32_writev # branch to system call + jg compat_sys_writev # branch to system call .globl sys32_getsid_wrapper sys32_getsid_wrapper: @@ -786,12 +786,12 @@ sys32_poll_wrapper: lgfr %r4,%r4 # long jg sys_poll # branch to system call - .globl sys32_nfsservctl_wrapper -sys32_nfsservctl_wrapper: + .globl compat_sys_nfsservctl_wrapper +compat_sys_nfsservctl_wrapper: lgfr %r2,%r2 # int - llgtr %r3,%r3 # struct nfsctl_arg_emu31 * - llgtr %r4,%r4 # union nfsctl_res_emu31 * - jg sys32_nfsservctl # branch to system call + llgtr %r3,%r3 # struct compat_nfsctl_arg* + llgtr %r4,%r4 # union compat_nfsctl_res* + jg compat_sys_nfsservctl # branch to system call .globl sys32_setresgid16_wrapper sys32_setresgid16_wrapper: --- diff/arch/s390/kernel/process.c 2004-04-21 10:45:33.802564728 +0100 +++ source/arch/s390/kernel/process.c 2004-04-21 10:46:51.264788680 +0100 @@ -381,12 +381,6 @@ void dump_thread(struct pt_regs * regs, dump->regs.per_info = current->thread.per_info; } -/* - * These bracket the sleeping functions.. 
- */ -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long r14, r15, bc; @@ -409,12 +403,10 @@ unsigned long get_wchan(struct task_stru #else r14 = *(unsigned long *) (bc+112); #endif - if (r14 < first_sched || r14 >= last_sched) + if (!in_sched_functions(r14)) return r14; bc = (*(unsigned long *) bc) & PSW_ADDR_INSN; } while (count++ < 16); return 0; } -#undef last_sched -#undef first_sched --- diff/arch/s390/kernel/setup.c 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/s390/kernel/setup.c 2004-04-21 10:46:51.264788680 +0100 @@ -74,7 +74,6 @@ extern int _text,_etext, _edata, _end; #include static char command_line[COMMAND_LINE_SIZE] = { 0, }; - char saved_command_line[COMMAND_LINE_SIZE]; static struct resource code_resource = { "Kernel code", 0x100000, 0 }; static struct resource data_resource = { "Kernel data", 0, 0 }; --- diff/arch/s390/kernel/syscalls.S 2004-04-21 10:45:33.804564424 +0100 +++ source/arch/s390/kernel/syscalls.S 2004-04-21 10:46:51.265788528 +0100 @@ -150,11 +150,11 @@ SYSCALL(sys_setfsuid16,sys_ni_syscall,sy SYSCALL(sys_setfsgid16,sys_ni_syscall,sys32_setfsgid16_wrapper) /* old setfsgid16 syscall */ SYSCALL(sys_llseek,sys_llseek,sys32_llseek_wrapper) /* 140 */ SYSCALL(sys_getdents,sys_getdents,sys32_getdents_wrapper) -SYSCALL(sys_select,sys_select,sys32_select_wrapper) +SYSCALL(sys_select,sys_select,compat_sys_select_wrapper) SYSCALL(sys_flock,sys_flock,sys32_flock_wrapper) SYSCALL(sys_msync,sys_msync,sys32_msync_wrapper) -SYSCALL(sys_readv,sys_readv,sys32_readv_wrapper) /* 145 */ -SYSCALL(sys_writev,sys_writev,sys32_writev_wrapper) +SYSCALL(sys_readv,sys_readv,compat_sys_readv_wrapper) /* 145 */ +SYSCALL(sys_writev,sys_writev,compat_sys_writev_wrapper) SYSCALL(sys_getsid,sys_getsid,sys32_getsid_wrapper) SYSCALL(sys_fdatasync,sys_fdatasync,sys32_fdatasync_wrapper) SYSCALL(sys_sysctl,sys_sysctl,sys32_sysctl_wrapper) @@ -177,7 +177,7 @@ SYSCALL(sys_getresuid16,sys_ni_syscall,s NI_SYSCALL /* for vm86 */ NI_SYSCALL /* old sys_query_module */ SYSCALL(sys_poll,sys_poll,sys32_poll_wrapper) -SYSCALL(sys_nfsservctl,sys_nfsservctl,sys32_nfsservctl_wrapper) +SYSCALL(sys_nfsservctl,sys_nfsservctl,compat_sys_nfsservctl_wrapper) SYSCALL(sys_setresgid16,sys_ni_syscall,sys32_setresgid16_wrapper) /* 170 old setresgid16 syscall */ SYSCALL(sys_getresgid16,sys_ni_syscall,sys32_getresgid16_wrapper) /* old getresgid16 syscall */ SYSCALL(sys_prctl,sys_prctl,sys32_prctl_wrapper) --- diff/arch/sh/kernel/process.c 2004-04-21 10:45:33.805564272 +0100 +++ source/arch/sh/kernel/process.c 2004-04-21 10:46:51.265788528 +0100 @@ -461,12 +461,6 @@ out: return error; } -/* - * These bracket the sleeping functions.. - */ -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long schedule_frame; @@ -479,7 +473,7 @@ unsigned long get_wchan(struct task_stru * The same comment as on the Alpha applies here, too ... 
*/ pc = thread_saved_pc(p); - if (pc >= first_sched && pc < last_sched) { + if (in_sched_functions(pc)) { schedule_frame = ((unsigned long *)(long)p->thread.sp)[1]; return (unsigned long)((unsigned long *)schedule_frame)[1]; } --- diff/arch/sh/kernel/setup.c 2004-02-09 10:36:09.000000000 +0000 +++ source/arch/sh/kernel/setup.c 2004-04-21 10:46:51.266788376 +0100 @@ -85,14 +85,12 @@ static struct sh_machine_vector* __init #define INITRD_SIZE (*(unsigned long *) (PARAM+0x014)) /* ... */ #define COMMAND_LINE ((char *) (PARAM+0x100)) -#define COMMAND_LINE_SIZE 256 #define RAMDISK_IMAGE_START_MASK 0x07FF #define RAMDISK_PROMPT_FLAG 0x8000 #define RAMDISK_LOAD_FLAG 0x4000 static char command_line[COMMAND_LINE_SIZE] = { 0, }; - char saved_command_line[COMMAND_LINE_SIZE]; struct resource standard_io_resources[] = { { "dma1", 0x00, 0x1f }, --- diff/arch/sparc/kernel/process.c 2004-04-21 10:45:33.830560472 +0100 +++ source/arch/sparc/kernel/process.c 2004-04-21 10:46:51.292784424 +0100 @@ -721,8 +721,7 @@ unsigned long get_wchan(struct task_stru break; rw = (struct reg_window *) fp; pc = rw->ins[7]; - if (pc < ((unsigned long) scheduling_functions_start_here) || - pc >= ((unsigned long) scheduling_functions_end_here)) { + if (!in_sched_functions(pc)) { ret = pc; goto out; } --- diff/arch/sparc/kernel/setup.c 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/sparc/kernel/setup.c 2004-04-21 10:46:51.296783816 +0100 @@ -244,8 +244,7 @@ extern unsigned short ram_flags; extern int root_mountflags; -char saved_command_line[256]; -char reboot_command[256]; +char reboot_command[COMMAND_LINE_SIZE]; enum sparc_cpu sparc_cpu_model; struct tt_entry *sparc_ttable; --- diff/arch/sparc/kernel/sparc_ksyms.c 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/sparc/kernel/sparc_ksyms.c 2004-04-21 10:46:51.314781080 +0100 @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -74,7 +75,6 @@ extern void *__memscan_zero(void *, size extern void *__memscan_generic(void *, int, size_t); extern int __memcmp(const void *, const void *, __kernel_size_t); extern int __strncmp(const char *, const char *, __kernel_size_t); -extern char saved_command_line[]; extern void bcopy (const char *, char *, int); extern int __ashrdi3(int, int); --- diff/arch/sparc64/Kconfig 2004-04-21 10:45:33.852557128 +0100 +++ source/arch/sparc64/Kconfig 2004-04-21 10:46:51.266788376 +0100 @@ -687,12 +687,19 @@ config DEBUG_BOOTMEM depends on DEBUG_KERNEL bool "Debug BOOTMEM initialization" +config LOCKMETER + bool "Kernel lock metering" + depends on SMP && !PREEMPT + help + Say Y to enable kernel lock metering, which adds overhead to SMP locks, + but allows you to see various statistics using the lockstat command. + # We have a custom atomic_dec_and_lock() implementation but it's not # compatible with spinlock debugging so we need to fall back on # the generic version in that case. 
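For reference, the generic fallback selected by leaving HAVE_DEC_LOCK unset is the simple lock-first form from lib/dec_and_lock.c; a minimal sketch, assuming the usual 2.6 semantics (return 1 with the lock held once the count reaches zero):

	#include <linux/spinlock.h>
	#include <asm/atomic.h>

	int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
	{
		spin_lock(lock);		/* goes through the instrumented lock path */
		if (atomic_dec_and_test(atomic))
			return 1;		/* count hit zero: caller owns the lock */
		spin_unlock(lock);
		return 0;
	}

Routing LOCKMETER builds through this version means every acquisition is visible to the lock metering hooks, which a hand-rolled atomic fast path would bypass.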
config HAVE_DEC_LOCK bool - depends on SMP && !DEBUG_SPINLOCK + depends on SMP && !DEBUG_SPINLOCK && !LOCKMETER default y config MCOUNT --- diff/arch/sparc64/kernel/process.c 2004-04-21 10:45:33.880552872 +0100 +++ source/arch/sparc64/kernel/process.c 2004-04-21 10:46:51.267788224 +0100 @@ -847,8 +847,7 @@ unsigned long get_wchan(struct task_stru break; rw = (struct reg_window *) fp; pc = rw->ins[7]; - if (pc < ((unsigned long) scheduling_functions_start_here) || - pc >= ((unsigned long) scheduling_functions_end_here)) { + if (!in_sched_functions(pc)) { ret = pc; goto out; } --- diff/arch/sparc64/kernel/setup.c 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/sparc64/kernel/setup.c 2004-04-21 10:46:51.276786856 +0100 @@ -451,8 +451,7 @@ extern unsigned short ram_flags; extern int root_mountflags; -char saved_command_line[256]; -char reboot_command[256]; +char reboot_command[COMMAND_LINE_SIZE]; static struct pt_regs fake_swapper_regs = { { 0, }, 0, 0, 0, 0 }; --- diff/arch/sparc64/kernel/sparc64_ksyms.c 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/sparc64/kernel/sparc64_ksyms.c 2004-04-21 10:46:51.286785336 +0100 @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -76,7 +77,6 @@ extern int __memcmp(const void *, const extern int __strncmp(const char *, const char *, __kernel_size_t); extern __kernel_size_t __strlen(const char *); extern __kernel_size_t strlen(const char *); -extern char saved_command_line[]; extern void linux_sparc_syscall(void); extern void rtrap(void); extern void show_regs(struct pt_regs *); --- diff/arch/sparc64/kernel/sys_sparc32.c 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/sparc64/kernel/sys_sparc32.c 2004-04-21 10:46:51.289784880 +0100 @@ -832,182 +832,6 @@ asmlinkage int sys32_ftruncate64(unsigne return sys_ftruncate(fd, (high << 32) | low); } -typedef ssize_t (*io_fn_t)(struct file *, char *, size_t, loff_t *); -typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *); - -static long do_readv_writev32(int type, struct file *file, - const struct compat_iovec *vector, u32 count) -{ - compat_ssize_t tot_len; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov=iovstack, *ivp; - struct inode *inode; - long retval, i; - io_fn_t fn; - iov_fn_t fnv; - - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ - retval = 0; - if (count == 0) - goto out; - - /* First get the "struct iovec" from user memory and - * verify all the pointers - */ - retval = -EINVAL; - if (count > UIO_MAXIOV) - goto out; - if (!file->f_op) - goto out; - if (count > UIO_FASTIOV) { - retval = -ENOMEM; - iov = kmalloc(count*sizeof(struct iovec), GFP_KERNEL); - if (!iov) - goto out; - } - retval = -EFAULT; - if (verify_area(VERIFY_READ, vector, sizeof(struct compat_iovec)*count)) - goto out; - - /* - * Single unix specification: - * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. The total length is fitting an ssize_t - * - * Be careful here because iov_len is a size_t not an ssize_t - */ - tot_len = 0; - i = count; - ivp = iov; - retval = -EINVAL; - while(i > 0) { - compat_ssize_t tmp = tot_len; - compat_ssize_t len; - u32 buf; - - if (__get_user(len, &vector->iov_len) || - __get_user(buf, &vector->iov_base)) { - retval = -EFAULT; - goto out; - } - if (len < 0) /* size_t not fitting an ssize_t32 .. 
*/ - goto out; - tot_len += len; - if (tot_len < tmp) /* maths overflow on the compat_ssize_t */ - goto out; - ivp->iov_base = (void *)A(buf); - ivp->iov_len = (__kernel_size_t) len; - vector++; - ivp++; - i--; - } - if (tot_len == 0) { - retval = 0; - goto out; - } - - inode = file->f_dentry->d_inode; - /* VERIFY_WRITE actually means a read, as we write to user space */ - retval = locks_verify_area((type == READ - ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE), - inode, file, file->f_pos, tot_len); - if (retval) - goto out; - - if (type == READ) { - fn = file->f_op->read; - fnv = file->f_op->readv; - } else { - fn = (io_fn_t)file->f_op->write; - fnv = file->f_op->writev; - } - if (fnv) { - retval = fnv(file, iov, count, &file->f_pos); - goto out; - } - - /* Do it by hand, with file-ops */ - ivp = iov; - while (count > 0) { - void * base; - int len, nr; - - base = ivp->iov_base; - len = ivp->iov_len; - ivp++; - count--; - - nr = fn(file, base, len, &file->f_pos); - - if (nr < 0) { - if (!retval) - retval = nr; - break; - } - retval += nr; - if (nr != len) - break; - } -out: - if (iov != iovstack) - kfree(iov); - if ((retval + (type == READ)) > 0) - dnotify_parent(file->f_dentry, - (type == READ) ? DN_ACCESS : DN_MODIFY); - - return retval; -} - -asmlinkage long sys32_readv(int fd, struct compat_iovec *vector, u32 count) -{ - struct file *file; - int ret; - - file = fget(fd); - if(!file) - return -EBADF; - - ret = -EBADF; - if (!(file->f_mode & FMODE_READ)) - goto out; - ret = -EINVAL; - if (!file->f_op || (!file->f_op->readv && !file->f_op->read)) - goto out; - - ret = do_readv_writev32(READ, file, vector, count); - -out: - fput(file); - return ret; -} - -asmlinkage long sys32_writev(int fd, struct compat_iovec *vector, u32 count) -{ - struct file *file; - int ret; - - file = fget(fd); - if(!file) - return -EBADF; - - ret = -EBADF; - if (!(file->f_mode & FMODE_WRITE)) - goto out; - ret = -EINVAL; - if (!file->f_op || (!file->f_op->writev && !file->f_op->write)) - goto out; - - ret = do_readv_writev32(WRITE, file, vector, count); - -out: - fput(file); - return ret; -} - /* readdir & getdents */ #define NAME_OFFSET(de) ((int) ((de)->d_name - (char *) (de))) @@ -1140,158 +964,6 @@ out: /* end of readdir & getdents */ -/* - * Ooo, nasty. We need here to frob 32-bit unsigned longs to - * 64-bit unsigned longs. - */ - -static int get_fd_set32(unsigned long n, unsigned long *fdset, u32 *ufdset) -{ - if (ufdset) { - unsigned long odd; - - if (verify_area(VERIFY_WRITE, ufdset, n*sizeof(u32))) - return -EFAULT; - - odd = n & 1UL; - n &= ~1UL; - while (n) { - unsigned long h, l; - __get_user(l, ufdset); - __get_user(h, ufdset+1); - ufdset += 2; - *fdset++ = h << 32 | l; - n -= 2; - } - if (odd) - __get_user(*fdset, ufdset); - } else { - /* Tricky, must clear full unsigned long in the - * kernel fdset at the end, this makes sure that - * actually happens. 
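 * (Why the shift-and-or merge above is needed at all: fd n sits at bit
 * (n % 32) of u32 word (n / 32) in the compat fd_set, but at bit (n % 64)
 * of unsigned long (n / 64) in the kernel fd_set, so each kernel word is
 * assembled as h << 32 | l. A plain byte copy would only be equivalent on
 * a little-endian kernel, and sparc64 is big-endian.)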
- */ - memset(fdset, 0, ((n + 1) & ~1)*sizeof(u32)); - } - return 0; -} - -static void set_fd_set32(unsigned long n, u32 *ufdset, unsigned long *fdset) -{ - unsigned long odd; - - if (!ufdset) - return; - - odd = n & 1UL; - n &= ~1UL; - while (n) { - unsigned long h, l; - l = *fdset++; - h = l >> 32; - __put_user(l, ufdset); - __put_user(h, ufdset+1); - ufdset += 2; - n -= 2; - } - if (odd) - __put_user(*fdset, ufdset); -} - -#define MAX_SELECT_SECONDS \ - ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) - -asmlinkage int sys32_select(int n, u32 *inp, u32 *outp, u32 *exp, u32 tvp_x) -{ - fd_set_bits fds; - struct compat_timeval *tvp = (struct compat_timeval *)AA(tvp_x); - char *bits; - unsigned long nn; - long timeout; - int ret, size; - - timeout = MAX_SCHEDULE_TIMEOUT; - if (tvp) { - time_t sec, usec; - - if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp))) - || (ret = __get_user(sec, &tvp->tv_sec)) - || (ret = __get_user(usec, &tvp->tv_usec))) - goto out_nofds; - - ret = -EINVAL; - if(sec < 0 || usec < 0) - goto out_nofds; - - if ((unsigned long) sec < MAX_SELECT_SECONDS) { - timeout = (usec + 1000000/HZ - 1) / (1000000/HZ); - timeout += sec * (unsigned long) HZ; - } - } - - ret = -EINVAL; - if (n < 0) - goto out_nofds; - if (n > current->files->max_fdset) - n = current->files->max_fdset; - - /* - * We need 6 bitmaps (in/out/ex for both incoming and outgoing), - * since we used fdset we need to allocate memory in units of - * long-words. - */ - ret = -ENOMEM; - size = FDS_BYTES(n); - bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); - - nn = (n + 8*sizeof(u32) - 1) / (8*sizeof(u32)); - if ((ret = get_fd_set32(nn, fds.in, inp)) || - (ret = get_fd_set32(nn, fds.out, outp)) || - (ret = get_fd_set32(nn, fds.ex, exp))) - goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); - - ret = do_select(n, &fds, &timeout); - - if (tvp && !(current->personality & STICKY_TIMEOUTS)) { - time_t sec = 0, usec = 0; - if (timeout) { - sec = timeout / HZ; - usec = timeout % HZ; - usec *= (1000000/HZ); - } - put_user(sec, &tvp->tv_sec); - put_user(usec, &tvp->tv_usec); - } - - if (ret < 0) - goto out; - if (!ret) { - ret = -ERESTARTNOHAND; - if (signal_pending(current)) - goto out; - ret = 0; - } - - set_fd_set32(nn, inp, fds.res_in); - set_fd_set32(nn, outp, fds.res_out); - set_fd_set32(nn, exp, fds.res_ex); - -out: - kfree(bits); -out_nofds: - return ret; -} - int cp_compat_stat(struct kstat *stat, struct compat_stat *statbuf) { int err; @@ -1562,8 +1234,6 @@ sys32_rt_sigqueueinfo(int pid, int sig, return ret; } -extern void check_pending(int signum); - asmlinkage int sys32_sigaction (int sig, struct old_sigaction32 *act, struct old_sigaction32 *oact) { struct k_sigaction new_ka, old_ka; @@ -1659,193 +1329,6 @@ sys32_rt_sigaction(int sig, struct sigac return ret; } - -/* - * count32() counts the number of arguments/envelopes - */ -static int count32(u32 * argv, int max) -{ - int i = 0; - - if (argv != NULL) { - for (;;) { - u32 p; int error; - - error = get_user(p,argv); - if (error) - return error; - if (!p) - break; - argv++; - if (++i > max) - return -E2BIG; - } - } - return i; -} - -/* - * 'copy_string32()' copies argument/envelope strings from user - * memory to 
free pages in kernel mem. These are in a format ready - * to be put directly into the top of new user memory. - */ -static int copy_strings32(int argc, u32 * argv, struct linux_binprm *bprm) -{ - while (argc-- > 0) { - u32 str; - int len; - unsigned long pos; - - if (get_user(str, argv + argc) || - !str || - !(len = strnlen_user((char *)A(str), bprm->p))) - return -EFAULT; - - if (bprm->p < len) - return -E2BIG; - - bprm->p -= len; - - pos = bprm->p; - while (len) { - char *kaddr; - struct page *page; - int offset, bytes_to_copy, new, err; - - offset = pos % PAGE_SIZE; - page = bprm->page[pos / PAGE_SIZE]; - new = 0; - if (!page) { - page = alloc_page(GFP_USER); - bprm->page[pos / PAGE_SIZE] = page; - if (!page) - return -ENOMEM; - new = 1; - } - kaddr = kmap(page); - - if (new && offset) - memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; - if (bytes_to_copy > len) { - bytes_to_copy = len; - if (new) - memset(kaddr+offset+len, 0, - PAGE_SIZE-offset-len); - } - - err = copy_from_user(kaddr + offset, (char *)A(str), - bytes_to_copy); - kunmap(page); - - if (err) - return -EFAULT; - - pos += bytes_to_copy; - str += bytes_to_copy; - len -= bytes_to_copy; - } - } - return 0; -} - -/* - * sys32_execve() executes a new program. - */ -static inline int -do_execve32(char * filename, u32 * argv, u32 * envp, struct pt_regs * regs) -{ - struct linux_binprm bprm; - struct file * file; - int retval; - int i; - - sched_balance_exec(); - - file = open_exec(filename); - - retval = PTR_ERR(file); - if (IS_ERR(file)) - return retval; - - bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - memset(bprm.page, 0, MAX_ARG_PAGES * sizeof(bprm.page[0])); - - bprm.file = file; - bprm.filename = filename; - bprm.interp = filename; - bprm.sh_bang = 0; - bprm.loader = 0; - bprm.exec = 0; - bprm.security = NULL; - bprm.mm = mm_alloc(); - retval = -ENOMEM; - if (!bprm.mm) - goto out_file; - - retval = init_new_context(current, bprm.mm); - if (retval < 0) - goto out_mm; - - bprm.argc = count32(argv, bprm.p / sizeof(u32)); - if ((retval = bprm.argc) < 0) - goto out_mm; - - bprm.envc = count32(envp, bprm.p / sizeof(u32)); - if ((retval = bprm.envc) < 0) - goto out_mm; - - retval = security_bprm_alloc(&bprm); - if (retval) - goto out; - - retval = prepare_binprm(&bprm); - if (retval < 0) - goto out; - - retval = copy_strings_kernel(1, &bprm.filename, &bprm); - if (retval < 0) - goto out; - - bprm.exec = bprm.p; - retval = copy_strings32(bprm.envc, envp, &bprm); - if (retval < 0) - goto out; - - retval = copy_strings32(bprm.argc, argv, &bprm); - if (retval < 0) - goto out; - - retval = search_binary_handler(&bprm, regs); - if (retval >= 0) { - /* execve success */ - security_bprm_free(&bprm); - return retval; - } - -out: - /* Something went wrong, return the inode and free the argument pages*/ - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page * page = bprm.page[i]; - if (page) - __free_page(page); - } - - if (bprm.security) - security_bprm_free(&bprm); - -out_mm: - if (bprm.mm) - mmdrop(bprm.mm); - -out_file: - if (bprm.file) { - allow_write_access(bprm.file); - fput(bprm.file); - } - return retval; -} - /* * sparc32_execve() executes a new program after the asm stub has set * things up for us. This should basically do what I want it to. 
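The hunk below replaces the removed do_execve32() with the common compat_do_execve(), turning the 32-bit register values into user pointers via compat_ptr(). On most 64-bit architectures that helper is little more than a widening cast; a sketch of the typical definition, not necessarily sparc64's exact one (s390, for example, also masks the topmost bit):

	static inline void __user *compat_ptr(compat_uptr_t uptr)
	{
		return (void __user *)(unsigned long)uptr;
	}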
@@ -1865,9 +1348,9 @@ asmlinkage int sparc32_execve(struct pt_ error = PTR_ERR(filename); if(IS_ERR(filename)) goto out; - error = do_execve32(filename, - (u32 *)AA((u32)regs->u_regs[base + UREG_I1]), - (u32 *)AA((u32)regs->u_regs[base + UREG_I2]), regs); + error = compat_do_execve(filename, + compat_ptr((u32)regs->u_regs[base + UREG_I1]), + compat_ptr((u32)regs->u_regs[base + UREG_I2]), regs); putname(filename); if(!error) { @@ -1909,232 +1392,6 @@ sys32_delete_module(const char *name_use #endif /* CONFIG_MODULES */ -#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) -/* Stuff for NFS server syscalls... */ -struct nfsctl_svc32 { - u16 svc32_port; - s32 svc32_nthreads; -}; - -struct nfsctl_client32 { - s8 cl32_ident[NFSCLNT_IDMAX+1]; - s32 cl32_naddr; - struct in_addr cl32_addrlist[NFSCLNT_ADDRMAX]; - s32 cl32_fhkeytype; - s32 cl32_fhkeylen; - u8 cl32_fhkey[NFSCLNT_KEYMAX]; -}; - -struct nfsctl_export32 { - s8 ex32_client[NFSCLNT_IDMAX+1]; - s8 ex32_path[NFS_MAXPATHLEN+1]; - compat_dev_t ex32_dev; - compat_ino_t ex32_ino; - s32 ex32_flags; - compat_uid_t ex32_anon_uid; - compat_gid_t ex32_anon_gid; -}; - -struct nfsctl_fdparm32 { - struct sockaddr gd32_addr; - s8 gd32_path[NFS_MAXPATHLEN+1]; - s32 gd32_version; -}; - -struct nfsctl_fsparm32 { - struct sockaddr gd32_addr; - s8 gd32_path[NFS_MAXPATHLEN+1]; - s32 gd32_maxlen; -}; - -struct nfsctl_arg32 { - s32 ca32_version; /* safeguard */ - union { - struct nfsctl_svc32 u32_svc; - struct nfsctl_client32 u32_client; - struct nfsctl_export32 u32_export; - struct nfsctl_fdparm32 u32_getfd; - struct nfsctl_fsparm32 u32_getfs; - } u; -#define ca32_svc u.u32_svc -#define ca32_client u.u32_client -#define ca32_export u.u32_export -#define ca32_getfd u.u32_getfd -#define ca32_getfs u.u32_getfs -}; - -union nfsctl_res32 { - __u8 cr32_getfh[NFS_FHSIZE]; - struct knfsd_fh cr32_getfs; -}; - -static int nfs_svc32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= __get_user(karg->ca_svc.svc_port, &arg32->ca32_svc.svc32_port); - err |= __get_user(karg->ca_svc.svc_nthreads, &arg32->ca32_svc.svc32_nthreads); - return err; -} - -static int nfs_clnt32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_client.cl_ident[0], - &arg32->ca32_client.cl32_ident[0], - NFSCLNT_IDMAX); - err |= __get_user(karg->ca_client.cl_naddr, &arg32->ca32_client.cl32_naddr); - err |= copy_from_user(&karg->ca_client.cl_addrlist[0], - &arg32->ca32_client.cl32_addrlist[0], - (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)); - err |= __get_user(karg->ca_client.cl_fhkeytype, - &arg32->ca32_client.cl32_fhkeytype); - err |= __get_user(karg->ca_client.cl_fhkeylen, - &arg32->ca32_client.cl32_fhkeylen); - err |= copy_from_user(&karg->ca_client.cl_fhkey[0], - &arg32->ca32_client.cl32_fhkey[0], - NFSCLNT_KEYMAX); - return (err ? 
-EFAULT : 0); -} - -static int nfs_exp32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_export.ex_client[0], - &arg32->ca32_export.ex32_client[0], - NFSCLNT_IDMAX); - err |= copy_from_user(&karg->ca_export.ex_path[0], - &arg32->ca32_export.ex32_path[0], - NFS_MAXPATHLEN); - err |= __get_user(karg->ca_export.ex_dev, - &arg32->ca32_export.ex32_dev); - err |= __get_user(karg->ca_export.ex_ino, - &arg32->ca32_export.ex32_ino); - err |= __get_user(karg->ca_export.ex_flags, - &arg32->ca32_export.ex32_flags); - err |= __get_user(karg->ca_export.ex_anon_uid, - &arg32->ca32_export.ex32_anon_uid); - err |= __get_user(karg->ca_export.ex_anon_gid, - &arg32->ca32_export.ex32_anon_gid); - karg->ca_export.ex_anon_uid = high2lowuid(karg->ca_export.ex_anon_uid); - karg->ca_export.ex_anon_gid = high2lowgid(karg->ca_export.ex_anon_gid); - return (err ? -EFAULT : 0); -} - -static int nfs_getfd32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_getfd.gd_addr, - &arg32->ca32_getfd.gd32_addr, - (sizeof(struct sockaddr))); - err |= copy_from_user(&karg->ca_getfd.gd_path, - &arg32->ca32_getfd.gd32_path, - (NFS_MAXPATHLEN+1)); - err |= __get_user(karg->ca_getfd.gd_version, - &arg32->ca32_getfd.gd32_version); - return (err ? -EFAULT : 0); -} - -static int nfs_getfs32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = __get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_getfs.gd_addr, - &arg32->ca32_getfs.gd32_addr, - (sizeof(struct sockaddr))); - err |= copy_from_user(&karg->ca_getfs.gd_path, - &arg32->ca32_getfs.gd32_path, - (NFS_MAXPATHLEN+1)); - err |= __get_user(karg->ca_getfs.gd_maxlen, - &arg32->ca32_getfs.gd32_maxlen); - return (err ? -EFAULT : 0); -} - -/* This really doesn't need translations, we are only passing - * back a union which contains opaque nfs file handle data. - */ -static int nfs_getfh32_res_trans(union nfsctl_res *kres, union nfsctl_res32 *res32) -{ - return (copy_to_user(res32, kres, sizeof(*res32)) ? 
-EFAULT : 0); -} - -int asmlinkage sys32_nfsservctl(int cmd, struct nfsctl_arg32 *arg32, union nfsctl_res32 *res32) -{ - struct nfsctl_arg *karg = NULL; - union nfsctl_res *kres = NULL; - mm_segment_t oldfs; - int err; - - karg = kmalloc(sizeof(*karg), GFP_USER); - if(!karg) - return -ENOMEM; - if(res32) { - kres = kmalloc(sizeof(*kres), GFP_USER); - if(!kres) { - kfree(karg); - return -ENOMEM; - } - } - switch(cmd) { - case NFSCTL_SVC: - err = nfs_svc32_trans(karg, arg32); - break; - case NFSCTL_ADDCLIENT: - err = nfs_clnt32_trans(karg, arg32); - break; - case NFSCTL_DELCLIENT: - err = nfs_clnt32_trans(karg, arg32); - break; - case NFSCTL_EXPORT: - case NFSCTL_UNEXPORT: - err = nfs_exp32_trans(karg, arg32); - break; - case NFSCTL_GETFD: - err = nfs_getfd32_trans(karg, arg32); - break; - case NFSCTL_GETFS: - err = nfs_getfs32_trans(karg, arg32); - break; - default: - err = -EINVAL; - break; - } - if(err) - goto done; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_nfsservctl(cmd, karg, kres); - set_fs(oldfs); - - if (err) - goto done; - - if((cmd == NFSCTL_GETFD) || - (cmd == NFSCTL_GETFS)) - err = nfs_getfh32_res_trans(kres, res32); - -done: - if(karg) - kfree(karg); - if(kres) - kfree(kres); - return err; -} -#else /* !NFSD */ -int asmlinkage sys32_nfsservctl(int cmd, void *notused, void *notused2) -{ - return sys_ni_syscall(); -} -#endif - /* Translations due to time_t size differences. Which affects all sorts of things, like timeval and itimerval. */ --- diff/arch/sparc64/kernel/sys_sunos32.c 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/sparc64/kernel/sys_sunos32.c 2004-04-21 10:46:51.290784728 +0100 @@ -528,18 +528,15 @@ asmlinkage int sunos_pathconf(u32 u_path return ret; } -/* SunOS mount system call emulation */ -extern asmlinkage int -sys32_select(int n, u32 inp, u32 outp, u32 exp, u32 tvp); - asmlinkage int sunos_select(int width, u32 inp, u32 outp, u32 exp, u32 tvp_x) { int ret; /* SunOS binaries expect that select won't change the tvp contents */ - ret = sys32_select (width, inp, outp, exp, tvp_x); + ret = compat_sys_select(width, compat_ptr(inp), compat_ptr(outp), + compat_ptr(exp), compat_ptr(tvp_x)); if (ret == -EINTR && tvp_x) { - struct compat_timeval *tvp = (struct compat_timeval *)A(tvp_x); + struct compat_timeval *tvp = compat_ptr(tvp_x); time_t sec, usec; __get_user(sec, &tvp->tv_sec); @@ -1203,9 +1200,6 @@ static inline int check_nonblock(int ret return ret; } -extern asmlinkage int sys32_readv(u32 fd, u32 vector, s32 count); -extern asmlinkage int sys32_writev(u32 fd, u32 vector, s32 count); - asmlinkage int sunos_read(unsigned int fd, u32 buf, u32 count) { int ret; @@ -1218,7 +1212,7 @@ asmlinkage int sunos_readv(u32 fd, u32 v { int ret; - ret = check_nonblock(sys32_readv(fd, vector, count), fd); + ret = check_nonblock(compat_sys_readv(fd, (void*)A(vector), count), fd); return ret; } @@ -1234,7 +1228,7 @@ asmlinkage int sunos_writev(u32 fd, u32 { int ret; - ret = check_nonblock(sys32_writev(fd, vector, count), fd); + ret = check_nonblock(compat_sys_writev(fd, (void*)A(vector), count), fd); return ret; } --- diff/arch/sparc64/kernel/systbls.S 2004-04-21 10:45:33.928545576 +0100 +++ source/arch/sparc64/kernel/systbls.S 2004-04-21 10:46:51.291784576 +0100 @@ -37,13 +37,13 @@ sys_call_table32: .word sys_madvise, sys_vhangup, sys32_truncate64, sys_mincore, sys32_getgroups16 /*80*/ .word sys32_setgroups16, sys_getpgrp, sys_setgroups, compat_sys_setitimer, sys32_ftruncate64 .word sys_swapon, compat_sys_getitimer, sys_setuid, sys_sethostname, sys_setgid -/*90*/ 
.word sys_dup2, sys_setfsuid, compat_sys_fcntl, sys32_select, sys_setfsgid +/*90*/ .word sys_dup2, sys_setfsuid, compat_sys_fcntl, compat_sys_select, sys_setfsgid .word sys_fsync, sys_setpriority32, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall /*100*/ .word sys_getpriority, sys32_rt_sigreturn, sys32_rt_sigaction, sys32_rt_sigprocmask, sys32_rt_sigpending .word sys32_rt_sigtimedwait, sys32_rt_sigqueueinfo, sys32_rt_sigsuspend, sys_setresuid, sys_getresuid /*110*/ .word sys_setresgid, sys_getresgid, sys_setregid, sys_nis_syscall, sys_nis_syscall .word sys_getgroups, sys32_gettimeofday, compat_sys_getrusage, sys_nis_syscall, sys_getcwd -/*120*/ .word sys32_readv, sys32_writev, sys32_settimeofday, sys32_fchown16, sys_fchmod +/*120*/ .word compat_sys_readv, compat_sys_writev, sys32_settimeofday, sys32_fchown16, sys_fchmod .word sys_nis_syscall, sys32_setreuid16, sys32_setregid16, sys_rename, sys_truncate /*130*/ .word sys_ftruncate, sys_flock, sys_lstat64, sys_nis_syscall, sys_nis_syscall .word sys_nis_syscall, sys_mkdir, sys_rmdir, sys32_utimes, sys_stat64 @@ -65,11 +65,11 @@ sys_call_table32: .word sys32_ipc, sys32_sigreturn, sys_clone, sys_nis_syscall, sys32_adjtimex /*220*/ .word compat_sys_sigprocmask, sys_ni_syscall, sys32_delete_module, sys_ni_syscall, sys_getpgid .word sys32_bdflush, sys32_sysfs, sys_nis_syscall, sys32_setfsuid16, sys32_setfsgid16 -/*230*/ .word sys32_select, sys_time, sys_nis_syscall, sys_stime, compat_statfs64 +/*230*/ .word compat_sys_select, sys_time, sys_nis_syscall, sys_stime, compat_statfs64 .word compat_fstatfs64, sys_llseek, sys_mlock, sys_munlock, sys_mlockall /*240*/ .word sys_munlockall, sys_sched_setparam, sys_sched_getparam, sys_sched_setscheduler, sys_sched_getscheduler .word sys_sched_yield, sys_sched_get_priority_max, sys_sched_get_priority_min, sys32_sched_rr_get_interval, compat_sys_nanosleep -/*250*/ .word sys32_mremap, sys32_sysctl, sys_getsid, sys_fdatasync, sys32_nfsservctl +/*250*/ .word sys32_mremap, sys32_sysctl, sys_getsid, sys_fdatasync, compat_sys_nfsservctl .word sys_ni_syscall, compat_clock_settime, compat_clock_gettime, compat_clock_getres, compat_clock_nanosleep /*260*/ .word compat_sys_sched_getaffinity, compat_sys_sched_setaffinity, compat_timer_settime, compat_timer_gettime, sys_timer_getoverrun .word sys_timer_delete, sys32_timer_create, sys_ni_syscall, compat_sys_io_setup, sys_io_destroy --- diff/arch/sparc64/lib/rwlock.S 2003-11-25 15:24:57.000000000 +0000 +++ source/arch/sparc64/lib/rwlock.S 2004-04-21 10:46:51.291784576 +0100 @@ -85,5 +85,20 @@ __write_trylock_succeed: __write_trylock_fail: retl mov 0, %o0 + + .globl __read_trylock +__read_trylock: /* %o0 = lock_ptr */ + ldsw [%o0], %g5 + brlz,pn %g5, 100f + add %g5, 1, %g7 + cas [%o0], %g5, %g7 + cmp %g5, %g7 + bne,pn %icc, __read_trylock + membar #StoreLoad | #StoreStore + retl + mov 1, %o0 +100: retl + mov 0, %o0 + rwlock_impl_end: --- diff/arch/um/kernel/user_util.c 2003-10-09 09:47:16.000000000 +0100 +++ source/arch/um/kernel/user_util.c 2004-04-21 10:46:51.314781080 +0100 @@ -34,7 +34,6 @@ #define COMMAND_LINE_SIZE _POSIX_ARG_MAX /* Changed in linux_main and setup_arch, which run before SMP is started */ -char saved_command_line[COMMAND_LINE_SIZE] = { 0 }; char command_line[COMMAND_LINE_SIZE] = { 0 }; void add_arg(char *cmd_line, char *arg) --- diff/arch/v850/kernel/process.c 2004-04-21 10:45:33.952541928 +0100 +++ source/arch/v850/kernel/process.c 2004-04-21 10:46:51.315780928 +0100 @@ -203,8 +203,8 @@ int sys_execve (char *name, char **argv, /* * These bracket the 
sleeping functions.. */ -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) +#define first_sched ((unsigned long)__sched_text_start) +#define last_sched ((unsigned long)__sched_text_end) unsigned long get_wchan (struct task_struct *p) { --- diff/arch/v850/kernel/setup.c 2004-02-18 08:54:08.000000000 +0000 +++ source/arch/v850/kernel/setup.c 2004-04-21 10:46:51.315780928 +0100 @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -40,8 +41,7 @@ extern char _root_fs_image_start __attribute__ ((__weak__)); extern char _root_fs_image_end __attribute__ ((__weak__)); -char command_line[512]; -char saved_command_line[512]; +char command_line[COMMAND_LINE_SIZE]; /* Memory not used by the kernel. */ static unsigned long total_ram_pages; --- diff/arch/x86_64/Kconfig 2004-04-21 10:45:33.953541776 +0100 +++ source/arch/x86_64/Kconfig 2004-04-21 10:46:51.318780472 +0100 @@ -446,12 +446,26 @@ config INIT_DEBUG config DEBUG_INFO bool "Compile the kernel with debug info" depends on DEBUG_KERNEL + default n help If you say Y here the resulting kernel image will include debugging info resulting in a larger kernel image. Say Y here only if you plan to use gdb to debug the kernel. Please note that this option requires new binutils. If you don't debug the kernel, you can say N. + +config SCHEDSTATS + bool "Collect scheduler statistics" + depends on PROC_FS + default y + help + If you say Y here, additional code will be inserted into the + scheduler and related routines to collect statistics about + scheduler behavior and provide them in /proc/schedstat. These + stats may be useful for both tuning and debugging the scheduler. + If you aren't debugging the scheduler or trying to tune a specific + application, you can say N to avoid the very slight overhead + this adds. config FRAME_POINTER bool "Compile the kernel with frame pointers" @@ -484,9 +498,8 @@ config IOMMU_LEAK help Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. 
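The first_sched/last_sched brackets deleted from the alpha, s390, sh, sparc, sparc64 and x86_64 process.c files in this patch all collapse into one shared helper. Judging by the v850 hunk above, which redefines its brackets in terms of the __sched_text_start/__sched_text_end linker symbols, that helper is presumably along these lines (a sketch, not the patch's literal definition):

	/* linker-provided bounds of the scheduler's text section */
	extern char __sched_text_start[], __sched_text_end[];

	int in_sched_functions(unsigned long addr)
	{
		return addr >= (unsigned long)__sched_text_start &&
		       addr <  (unsigned long)__sched_text_end;
	}

Each get_wchan() then tests in_sched_functions(pc) instead of comparing against per-arch macros.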
- -#config X86_REMOTE_DEBUG -# bool "kgdb debugging stub" + +source "arch/x86_64/Kconfig.kgdb" endmenu --- diff/arch/x86_64/ia32/ia32_binfmt.c 2004-04-21 10:45:33.956541320 +0100 +++ source/arch/x86_64/ia32/ia32_binfmt.c 2004-04-21 10:46:51.316780776 +0100 @@ -365,6 +365,7 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_ops = NULL; mpnt->vm_pgoff = 0; mpnt->vm_file = NULL; + mpol_set_vma_default(mpnt); INIT_LIST_HEAD(&mpnt->shared); mpnt->vm_private_data = (void *) 0; insert_vm_struct(mm, mpnt); --- diff/arch/x86_64/ia32/ia32entry.S 2004-04-21 10:45:33.957541168 +0100 +++ source/arch/x86_64/ia32/ia32entry.S 2004-04-21 10:46:51.316780776 +0100 @@ -447,11 +447,11 @@ ia32_sys_call_table: .quad sys_setfsgid16 .quad sys_llseek /* 140 */ .quad sys32_getdents - .quad sys32_select + .quad compat_sys_select .quad sys_flock .quad sys_msync - .quad sys32_readv /* 145 */ - .quad sys32_writev + .quad compat_sys_readv /* 145 */ + .quad compat_sys_writev .quad sys_getsid .quad sys_fdatasync .quad sys32_sysctl /* sysctl */ @@ -474,7 +474,7 @@ ia32_sys_call_table: .quad sys32_vm86_warning /* vm86 */ .quad quiet_ni_syscall /* query_module */ .quad sys_poll - .quad sys32_nfsservctl + .quad compat_sys_nfsservctl .quad sys_setresgid16 /* 170 */ .quad sys_getresgid16 .quad sys_prctl --- diff/arch/x86_64/ia32/sys_ia32.c 2004-04-21 10:45:33.958541016 +0100 +++ source/arch/x86_64/ia32/sys_ia32.c 2004-04-21 10:46:51.317780624 +0100 @@ -606,107 +606,6 @@ out: return error; } -/* - * We can actually return ERESTARTSYS instead of EINTR, but I'd - * like to be certain this leads to no problems. So I return - * EINTR just for safety. - * - * Update: ERESTARTSYS breaks at least the xview clock binary, so - * I'm trying ERESTARTNOHAND which restart only when you want to. - */ -#define MAX_SELECT_SECONDS \ - ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) -#define ROUND_UP_TIME(x,y) (((x)+(y)-1)/(y)) - -asmlinkage long -sys32_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct compat_timeval *tvp32) -{ - fd_set_bits fds; - char *bits; - long timeout; - int ret, size; - - timeout = MAX_SCHEDULE_TIMEOUT; - if (tvp32) { - time_t sec, usec; - - get_user(sec, &tvp32->tv_sec); - get_user(usec, &tvp32->tv_usec); - - ret = -EINVAL; - if (sec < 0 || usec < 0) - goto out_nofds; - - if ((unsigned long) sec < MAX_SELECT_SECONDS) { - timeout = ROUND_UP_TIME(usec, 1000000/HZ); - timeout += sec * (unsigned long) HZ; - } - } - - ret = -EINVAL; - if (n < 0) - goto out_nofds; - - if (n > current->files->max_fdset) - n = current->files->max_fdset; - - /* - * We need 6 bitmaps (in/out/ex for both incoming and outgoing), - * since we used fdset we need to allocate memory in units of - * long-words. 
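 * (A worked sizing example, assuming 64-bit longs and n = 1024:
 * FDS_BYTES(1024) = 128, so the single allocation below is
 * 6 * 128 = 768 bytes, covering in/out/ex plus the three result sets.)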
- */ - ret = -ENOMEM; - size = FDS_BYTES(n); - bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); - - if ((ret = get_fd_set(n, inp, fds.in)) || - (ret = get_fd_set(n, outp, fds.out)) || - (ret = get_fd_set(n, exp, fds.ex))) - goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); - - ret = do_select(n, &fds, &timeout); - - if (tvp32 && !(current->personality & STICKY_TIMEOUTS)) { - time_t sec = 0, usec = 0; - if (timeout) { - sec = timeout / HZ; - usec = timeout % HZ; - usec *= (1000000/HZ); - } - put_user(sec, (int *)&tvp32->tv_sec); - put_user(usec, (int *)&tvp32->tv_usec); - } - - if (ret < 0) - goto out; - if (!ret) { - ret = -ERESTARTNOHAND; - if (signal_pending(current)) - goto out; - ret = 0; - } - - set_fd_set(n, inp, fds.res_in); - set_fd_set(n, outp, fds.res_out); - set_fd_set(n, exp, fds.res_ex); - -out: - kfree(bits); -out_nofds: - return ret; -} - struct sel_arg_struct { unsigned int n; unsigned int inp; @@ -722,106 +621,8 @@ sys32_old_select(struct sel_arg_struct * if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - return sys32_select(a.n, (fd_set *)A(a.inp), (fd_set *)A(a.outp), (fd_set *)A(a.exp), - (struct compat_timeval *)A(a.tvp)); -} - -static struct iovec * -get_compat_iovec(struct compat_iovec *iov32, struct iovec *iov_buf, u32 *count, int type, int *errp) -{ - int i; - u32 buf, len; - struct iovec *ivp, *iov; - unsigned long totlen; - - /* Get the "struct iovec" from user memory */ - - *errp = 0; - if (!*count) - return 0; - *errp = -EINVAL; - if (*count > UIO_MAXIOV) - return(struct iovec *)0; - *errp = -EFAULT; - if(verify_area(VERIFY_READ, iov32, sizeof(struct compat_iovec)*(*count))) - return(struct iovec *)0; - if (*count > UIO_FASTIOV) { - *errp = -ENOMEM; - iov = kmalloc(*count*sizeof(struct iovec), GFP_KERNEL); - if (!iov) - return((struct iovec *)0); - } else - iov = iov_buf; - - ivp = iov; - totlen = 0; - for (i = 0; i < *count; i++) { - *errp = __get_user(len, &iov32->iov_len) | - __get_user(buf, &iov32->iov_base); - if (*errp) - goto error; - *errp = verify_area(type, (void *)A(buf), len); - if (*errp) { - if (i > 0) { - *count = i; - break; - } - goto error; - } - /* SuS checks: */ - *errp = -EINVAL; - if ((int)len < 0) - goto error; - if ((totlen += len) >= 0x7fffffff) - goto error; - ivp->iov_base = (void *)A(buf); - ivp->iov_len = (__kernel_size_t)len; - iov32++; - ivp++; - } - *errp = 0; - return(iov); - -error: - if (iov != iov_buf) - kfree(iov); - return NULL; -} - -asmlinkage long -sys32_readv(int fd, struct compat_iovec *vector, u32 count) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov; - int ret; - mm_segment_t old_fs = get_fs(); - - if ((iov = get_compat_iovec(vector, iovstack, &count, VERIFY_WRITE, &ret)) == NULL) - return ret; - set_fs(KERNEL_DS); - ret = sys_readv(fd, iov, count); - set_fs(old_fs); - if (iov != iovstack) - kfree(iov); - return ret; -} - -asmlinkage long -sys32_writev(int fd, struct compat_iovec *vector, u32 count) -{ - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov; - int ret; - mm_segment_t old_fs = get_fs(); - - if ((iov = get_compat_iovec(vector, iovstack, &count, VERIFY_READ, &ret)) == NULL) - return ret; - set_fs(KERNEL_DS); - ret = sys_writev(fd, iov, 
count); - set_fs(old_fs); - if (iov != iovstack) - kfree(iov); - return ret; + return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); } /* @@ -1323,93 +1124,22 @@ long sys32_ustat(unsigned dev, struct us return ret; } -static int nargs(u32 src, char **dst) -{ - int cnt; - u32 val; - - cnt = 0; - do { - int ret = get_user(val, (__u32 *)(u64)src); - if (ret) - return ret; - if (dst) - dst[cnt] = (char *)(u64)val; - cnt++; - src += 4; - if (cnt >= (MAX_ARG_PAGES*PAGE_SIZE)/sizeof(void*)) - return -E2BIG; - } while(val); - if (dst) - dst[cnt-1] = 0; - return cnt; -} - -asmlinkage long sys32_execve(char *name, u32 argv, u32 envp, struct pt_regs regs) -{ - mm_segment_t oldseg; - char **buf = NULL; - int na = 0,ne = 0; - int ret; - unsigned sz = 0; - - if (argv) { - na = nargs(argv, NULL); - if (na < 0) - return -EFAULT; - } - if (envp) { - ne = nargs(envp, NULL); - if (ne < 0) - return -EFAULT; - } - - if (argv || envp) { - sz = (na+ne)*sizeof(void *); - if (sz > PAGE_SIZE) - buf = vmalloc(sz); - else - buf = kmalloc(sz, GFP_KERNEL); - if (!buf) - return -ENOMEM; - } - - if (argv) { - ret = nargs(argv, buf); - if (ret < 0) - goto free; - } - - if (envp) { - ret = nargs(envp, buf + na); - if (ret < 0) - goto free; - } - - name = getname(name); - ret = PTR_ERR(name); - if (IS_ERR(name)) - goto free; - - oldseg = get_fs(); - set_fs(KERNEL_DS); - ret = do_execve(name, argv ? buf : NULL, envp ? buf+na : NULL, &regs); - set_fs(oldseg); +asmlinkage long sys32_execve(char *name, compat_uptr_t __user *argv, + compat_uptr_t __user *envp, struct pt_regs regs) +{ + long error; + char * filename; - if (ret == 0) + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error; + error = compat_do_execve(filename, argv, envp, &regs); + if (error == 0) current->ptrace &= ~PT_DTRACE; - - putname(name); - -free: - if (argv || envp) { - if (sz > PAGE_SIZE) - vfree(buf); - else - kfree(buf); - } - return ret; -} + putname(filename); + return error; +} asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp, struct pt_regs regs) { @@ -1436,233 +1166,6 @@ long sys32_kill(int pid, int sig) } - -#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) -/* Stuff for NFS server syscalls... 
*/ -struct nfsctl_svc32 { - u16 svc32_port; - s32 svc32_nthreads; -}; - -struct nfsctl_client32 { - s8 cl32_ident[NFSCLNT_IDMAX+1]; - s32 cl32_naddr; - struct in_addr cl32_addrlist[NFSCLNT_ADDRMAX]; - s32 cl32_fhkeytype; - s32 cl32_fhkeylen; - u8 cl32_fhkey[NFSCLNT_KEYMAX]; -}; - -struct nfsctl_export32 { - s8 ex32_client[NFSCLNT_IDMAX+1]; - s8 ex32_path[NFS_MAXPATHLEN+1]; - compat_dev_t ex32_dev; - compat_ino_t ex32_ino; - s32 ex32_flags; - compat_pid_t ex32_anon_uid; - compat_gid_t ex32_anon_gid; -}; - -struct nfsctl_fdparm32 { - struct sockaddr gd32_addr; - s8 gd32_path[NFS_MAXPATHLEN+1]; - s32 gd32_version; -}; - -struct nfsctl_fsparm32 { - struct sockaddr gd32_addr; - s8 gd32_path[NFS_MAXPATHLEN+1]; - s32 gd32_maxlen; -}; - -struct nfsctl_arg32 { - s32 ca32_version; /* safeguard */ - union { - struct nfsctl_svc32 u32_svc; - struct nfsctl_client32 u32_client; - struct nfsctl_export32 u32_export; - struct nfsctl_fdparm32 u32_getfd; - struct nfsctl_fsparm32 u32_getfs; - } u; -#define ca32_svc u.u32_svc -#define ca32_client u.u32_client -#define ca32_export u.u32_export -#define ca32_getfd u.u32_getfd -#define ca32_getfs u.u32_getfs -}; - -union nfsctl_res32 { - __u8 cr32_getfh[NFS_FHSIZE]; - struct knfsd_fh cr32_getfs; -}; - -static int nfs_svc32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = get_user(karg->ca_version, &arg32->ca32_version); - err |= __get_user(karg->ca_svc.svc_port, &arg32->ca32_svc.svc32_port); - err |= __get_user(karg->ca_svc.svc_nthreads, &arg32->ca32_svc.svc32_nthreads); - return err; -} - -static int nfs_clnt32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_client.cl_ident[0], - &arg32->ca32_client.cl32_ident[0], - NFSCLNT_IDMAX); - err |= __get_user(karg->ca_client.cl_naddr, &arg32->ca32_client.cl32_naddr); - err |= copy_from_user(&karg->ca_client.cl_addrlist[0], - &arg32->ca32_client.cl32_addrlist[0], - (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)); - err |= __get_user(karg->ca_client.cl_fhkeytype, - &arg32->ca32_client.cl32_fhkeytype); - err |= __get_user(karg->ca_client.cl_fhkeylen, - &arg32->ca32_client.cl32_fhkeylen); - err |= copy_from_user(&karg->ca_client.cl_fhkey[0], - &arg32->ca32_client.cl32_fhkey[0], - NFSCLNT_KEYMAX); - return err; -} - -static int nfs_exp32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_export.ex_client[0], - &arg32->ca32_export.ex32_client[0], - NFSCLNT_IDMAX); - err |= copy_from_user(&karg->ca_export.ex_path[0], - &arg32->ca32_export.ex32_path[0], - NFS_MAXPATHLEN); - err |= __get_user(karg->ca_export.ex_dev, - &arg32->ca32_export.ex32_dev); - err |= __get_user(karg->ca_export.ex_ino, - &arg32->ca32_export.ex32_ino); - err |= __get_user(karg->ca_export.ex_flags, - &arg32->ca32_export.ex32_flags); - err |= __get_user(karg->ca_export.ex_anon_uid, - &arg32->ca32_export.ex32_anon_uid); - err |= __get_user(karg->ca_export.ex_anon_gid, - &arg32->ca32_export.ex32_anon_gid); - SET_UID(karg->ca_export.ex_anon_uid, karg->ca_export.ex_anon_uid); - SET_GID(karg->ca_export.ex_anon_gid, karg->ca_export.ex_anon_gid); - return err; -} - - -static int nfs_getfd32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_getfd.gd_addr, - &arg32->ca32_getfd.gd32_addr, - 
(sizeof(struct sockaddr))); - err |= copy_from_user(&karg->ca_getfd.gd_path, - &arg32->ca32_getfd.gd32_path, - (NFS_MAXPATHLEN+1)); - err |= get_user(karg->ca_getfd.gd_version, - &arg32->ca32_getfd.gd32_version); - return err; -} - -static int nfs_getfs32_trans(struct nfsctl_arg *karg, struct nfsctl_arg32 *arg32) -{ - int err; - - err = get_user(karg->ca_version, &arg32->ca32_version); - err |= copy_from_user(&karg->ca_getfs.gd_addr, - &arg32->ca32_getfs.gd32_addr, - (sizeof(struct sockaddr))); - err |= copy_from_user(&karg->ca_getfs.gd_path, - &arg32->ca32_getfs.gd32_path, - (NFS_MAXPATHLEN+1)); - err |= get_user(karg->ca_getfs.gd_maxlen, - &arg32->ca32_getfs.gd32_maxlen); - return err; -} - -/* This really doesn't need translations, we are only passing - * back a union which contains opaque nfs file handle data. - */ -static int nfs_getfh32_res_trans(union nfsctl_res *kres, union nfsctl_res32 *res32) -{ - return copy_to_user(res32, kres, sizeof(*res32)) ? -EFAULT : 0; -} - -long asmlinkage sys32_nfsservctl(int cmd, struct nfsctl_arg32 *arg32, union nfsctl_res32 *res32) -{ - struct nfsctl_arg *karg = NULL; - union nfsctl_res *kres = NULL; - mm_segment_t oldfs; - int err; - - karg = kmalloc(sizeof(*karg), GFP_USER); - if(!karg) - return -ENOMEM; - if(res32) { - kres = kmalloc(sizeof(*kres), GFP_USER); - if(!kres) { - kfree(karg); - return -ENOMEM; - } - } - switch(cmd) { - case NFSCTL_SVC: - err = nfs_svc32_trans(karg, arg32); - break; - case NFSCTL_ADDCLIENT: - err = nfs_clnt32_trans(karg, arg32); - break; - case NFSCTL_DELCLIENT: - err = nfs_clnt32_trans(karg, arg32); - break; - case NFSCTL_EXPORT: - case NFSCTL_UNEXPORT: - err = nfs_exp32_trans(karg, arg32); - break; - case NFSCTL_GETFD: - err = nfs_getfd32_trans(karg, arg32); - break; - case NFSCTL_GETFS: - err = nfs_getfs32_trans(karg, arg32); - break; - default: - err = -EINVAL; - break; - } - if(err) - goto done; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_nfsservctl(cmd, karg, kres); - set_fs(oldfs); - - if (err) - goto done; - - if((cmd == NFSCTL_GETFD) || - (cmd == NFSCTL_GETFS)) - err = nfs_getfh32_res_trans(kres, res32); - -done: - if(karg) - kfree(karg); - if(kres) - kfree(kres); - return err; -} -#else /* !NFSD */ -long asmlinkage sys32_nfsservctl(int cmd, void *notused, void *notused2) -{ - return sys_ni_syscall(); -} -#endif - long sys32_io_setup(unsigned nr_reqs, u32 *ctx32p) { long ret; --- diff/arch/x86_64/kernel/Makefile 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/x86_64/kernel/Makefile 2004-04-21 10:46:51.323779712 +0100 @@ -8,7 +8,7 @@ obj-y := process.o semaphore.o signal.o ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o warmreboot.o -obj-y += mce.o acpi/ +obj-y += mce.o obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ obj-$(CONFIG_ACPI_BOOT) += acpi/ @@ -27,6 +27,7 @@ obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_KGDB) += kgdb_stub.o obj-y += topology.o --- diff/arch/x86_64/kernel/i8259.c 2004-03-11 10:20:23.000000000 +0000 +++ source/arch/x86_64/kernel/i8259.c 2004-04-21 10:46:51.319780320 +0100 @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -490,5 +491,6 @@ void __init init_IRQ(void) */ setup_timer(); - setup_irq(2, &irq2); + if (!acpi_ioapic) + setup_irq(2, &irq2); } --- diff/arch/x86_64/kernel/irq.c 2004-03-11 10:20:23.000000000 +0000 +++ source/arch/x86_64/kernel/irq.c 2004-04-21 
10:46:51.319780320 +0100 @@ -405,6 +405,9 @@ out: spin_unlock(&desc->lock); irq_exit(); + + kgdb_process_breakpoint(); + return 1; } --- diff/arch/x86_64/kernel/mpparse.c 2004-04-21 10:45:33.961540560 +0100 +++ source/arch/x86_64/kernel/mpparse.c 2004-04-21 10:46:51.324779560 +0100 @@ -784,8 +784,6 @@ void __init mp_override_legacy_irq ( u32 gsi) { struct mpc_config_intsrc intsrc; - int i = 0; - int found = 0; int ioapic = -1; int pin = -1; @@ -818,23 +816,9 @@ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); - /* - * If an existing [IOAPIC.PIN -> IRQ] routing entry exists we override it. - * Otherwise create a new entry (e.g. gsi == 2). - */ - for (i = 0; i < mp_irq_entries; i++) { - if ((mp_irqs[i].mpc_srcbus == intsrc.mpc_srcbus) - && (mp_irqs[i].mpc_srcbusirq == intsrc.mpc_srcbusirq)) { - mp_irqs[i] = intsrc; - found = 1; - break; - } - } - if (!found) { - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); - } + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); return; } @@ -865,13 +849,22 @@ void __init mp_config_acpi_legacy_irqs ( intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* - * Use the default configuration for the IRQs 0-15. These may be + * Use the default configuration for the IRQs 0-15, unless * overridden by (MADT) interrupt source override entries. */ for (i = 0; i < 16; i++) { + int idx; - if (i == 2) - continue; /* Don't connect IRQ2 */ + for (idx = 0; idx < mp_irq_entries; idx++) + if (mp_irqs[idx].mpc_srcbus == MP_ISA_BUS && + (mp_irqs[idx].mpc_srcbusirq == i || + mp_irqs[idx].mpc_dstirq == i)) + break; + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } intsrc.mpc_irqtype = mp_INT; intsrc.mpc_srcbusirq = i; /* Identity mapped */ --- diff/arch/x86_64/kernel/process.c 2004-04-21 10:45:33.963540256 +0100 +++ source/arch/x86_64/kernel/process.c 2004-04-21 10:46:51.324779560 +0100 @@ -573,12 +573,6 @@ asmlinkage long sys_vfork(struct pt_regs NULL, NULL); } -/* - * These bracket the sleeping functions.. 
- */ -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - unsigned long get_wchan(struct task_struct *p) { unsigned long stack; @@ -595,14 +589,12 @@ unsigned long get_wchan(struct task_stru if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) return 0; rip = *(u64 *)(fp+8); - if (rip < first_sched || rip >= last_sched) + if (!in_sched_functions(rip)) return rip; fp = *(u64 *)fp; } while (count++ < 16); return 0; } -#undef last_sched -#undef first_sched long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) { --- diff/arch/x86_64/kernel/setup.c 2004-04-21 10:45:33.964540104 +0100 +++ source/arch/x86_64/kernel/setup.c 2004-04-21 10:46:51.325779408 +0100 @@ -100,7 +100,6 @@ extern int root_mountflags; extern char _text, _etext, _edata, _end; char command_line[COMMAND_LINE_SIZE]; -char saved_command_line[COMMAND_LINE_SIZE]; struct resource standard_io_resources[] = { { "dma1", 0x00, 0x1f, IORESOURCE_BUSY | IORESOURCE_IO }, --- diff/arch/x86_64/kernel/smp.c 2003-11-25 15:24:57.000000000 +0000 +++ source/arch/x86_64/kernel/smp.c 2004-04-21 10:46:51.325779408 +0100 @@ -362,6 +362,18 @@ void smp_send_reschedule(int cpu) send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } +#ifdef CONFIG_KGDB +/* + * By using the NMI code instead of a vector we just sneak thru the + * word generator coming out with just what we want. AND it does + * not matter if clustered_apic_mode is set or not. + */ +void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(APIC_DM_NMI); +} +#endif + /* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. --- diff/arch/x86_64/kernel/traps.c 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/x86_64/kernel/traps.c 2004-04-21 10:46:51.326779256 +0100 @@ -45,6 +45,9 @@ #include #include +#ifdef CONFIG_KGDB +#include +#endif extern struct gate_struct idt_table[256]; --- diff/arch/x86_64/lib/Makefile 2004-04-05 12:57:07.000000000 +0100 +++ source/arch/x86_64/lib/Makefile 2004-04-21 10:46:51.327779104 +0100 @@ -10,3 +10,4 @@ lib-y := csum-partial.o csum-copy.o csum lib-y += memcpy.o memmove.o memset.o copy_user.o lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o +lib-$(CONFIG_KGDB) += kgdb_serial.o --- diff/drivers/atm/firestream.c 2003-10-09 09:47:33.000000000 +0100 +++ source/drivers/atm/firestream.c 2004-04-21 10:46:51.348775912 +0100 @@ -576,7 +576,7 @@ static inline void write_fs (struct fs_d } -static inline u32 read_fs (struct fs_dev *dev, int offset) +static inline u32 read_fs (struct fs_dev *dev, int offset) { return readl (dev->base + offset); } @@ -1380,7 +1380,7 @@ static void __devinit *aligned_kmalloc ( if (alignment <= 0x10) { t = kmalloc (size, flags); - if ((unsigned int)t & (alignment-1)) { + if ((unsigned long)t & (alignment-1)) { printk ("Kmalloc doesn't align things correctly! 
%p\n", t); kfree (t); return aligned_kmalloc (size, flags, alignment * 4); @@ -1496,7 +1496,7 @@ static void top_off_fp (struct fs_dev *d ne->skb = skb; ne->fp = fp; - qe = (struct FS_BPENTRY *) (read_fs (dev, FP_EA(fp->offset))); + qe = (struct FS_BPENTRY *)(long)(read_fs (dev, FP_EA(fp->offset))); fs_dprintk (FS_DEBUG_QUEUE, "link at %p\n", qe); if (qe) { qe = bus_to_virt ((long) qe); --- diff/drivers/base/node.c 2004-03-11 10:20:23.000000000 +0000 +++ source/drivers/base/node.c 2004-04-21 10:46:51.349775760 +0100 @@ -30,13 +30,20 @@ static ssize_t node_read_cpumap(struct s static SYSDEV_ATTR(cpumap,S_IRUGO,node_read_cpumap,NULL); +/* Can be overwritten by architecture specific code. */ +int __attribute__((weak)) hugetlb_report_node_meminfo(int node, char *buf) +{ + return 0; +} + #define K(x) ((x) << (PAGE_SHIFT - 10)) static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) { + int n; int nid = dev->id; struct sysinfo i; si_meminfo_node(&i, nid); - return sprintf(buf, "\n" + n = sprintf(buf, "\n" "Node %d MemTotal: %8lu kB\n" "Node %d MemFree: %8lu kB\n" "Node %d MemUsed: %8lu kB\n" @@ -51,10 +58,52 @@ static ssize_t node_read_meminfo(struct nid, K(i.freehigh), nid, K(i.totalram-i.totalhigh), nid, K(i.freeram-i.freehigh)); + n += hugetlb_report_node_meminfo(nid, buf + n); + return n; } + #undef K static SYSDEV_ATTR(meminfo,S_IRUGO,node_read_meminfo,NULL); +static ssize_t node_read_numastat(struct sys_device * dev, char * buf) +{ + unsigned long numa_hit, numa_miss, interleave_hit, numa_foreign; + unsigned long local_node, other_node; + int i, cpu; + pg_data_t *pg = NODE_DATA(dev->id); + numa_hit = 0; + numa_miss = 0; + interleave_hit = 0; + numa_foreign = 0; + local_node = 0; + other_node = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *z = &pg->node_zones[i]; + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct per_cpu_pageset *ps = &z->pageset[cpu]; + numa_hit += ps->numa_hit; + numa_miss += ps->numa_miss; + numa_foreign += ps->numa_foreign; + interleave_hit += ps->interleave_hit; + local_node += ps->local_node; + other_node += ps->other_node; + } + } + return sprintf(buf, + "numa_hit %lu\n" + "numa_miss %lu\n" + "numa_foreign %lu\n" + "interleave_hit %lu\n" + "local_node %lu\n" + "other_node %lu\n", + numa_hit, + numa_miss, + numa_foreign, + interleave_hit, + local_node, + other_node); +} +static SYSDEV_ATTR(numastat,S_IRUGO,node_read_numastat,NULL); /* * register_node - Setup a driverfs device for a node. @@ -74,6 +123,7 @@ int __init register_node(struct node *no if (!error){ sysdev_create_file(&node->sysdev, &attr_cpumap); sysdev_create_file(&node->sysdev, &attr_meminfo); + sysdev_create_file(&node->sysdev, &attr_numastat); } return error; } --- diff/drivers/block/as-iosched.c 2004-02-09 10:36:10.000000000 +0000 +++ source/drivers/block/as-iosched.c 2004-04-21 10:46:51.350775608 +0100 @@ -43,7 +43,7 @@ * read_batch_expire describes how long we will allow a stream of reads to * persist before looking to see whether it is time to switch over to writes. */ -#define default_read_batch_expire (HZ / 4) +#define default_read_batch_expire (HZ / 2) /* * write_batch_expire describes how long we want a stream of writes to run for. @@ -51,7 +51,7 @@ * See, the problem is: we can send a lot of writes to disk cache / TCQ in * a short amount of time... 
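
The node_read_numastat() attribute added above totals the per-cpu pageset counters of every zone on a node and exports six NUMA allocation statistics through sysfs. A hypothetical userspace reader (the node0 path is inferred from the node sysdev registration and may differ on a given system):

    #include <stdio.h>

    int main(void)
    {
            char key[32];
            unsigned long val;
            FILE *f = fopen("/sys/devices/system/node/node0/numastat", "r");

            if (!f)
                    return 1;
            /* The attribute is a sequence of "name value" lines. */
            while (fscanf(f, "%31s %lu", key, &val) == 2)
                    printf("%-16s %lu\n", key, val);
            fclose(f);
            return 0;
    }
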
*/ -#define default_write_batch_expire (HZ / 16) +#define default_write_batch_expire (HZ / 8) /* * max time we may wait to anticipate a read (default around 6ms) --- diff/drivers/block/cciss.c 2004-04-21 10:45:33.975538432 +0100 +++ source/drivers/block/cciss.c 2004-04-21 10:46:51.351775456 +0100 @@ -988,7 +988,7 @@ static int revalidate_allvol(ctlr_info_t drive_info_struct *drv = &(host->drv[i]); if (!drv->nr_blocks) continue; - blk_queue_hardsect_size(host->queue, drv->block_size); + blk_queue_hardsect_size(drv->queue, drv->block_size); set_capacity(disk, drv->nr_blocks); add_disk(disk); } @@ -2013,7 +2013,7 @@ static irqreturn_t do_cciss_intr(int irq CommandList_struct *c; unsigned long flags; __u32 a, a1; - + int j; /* Is this interrupt for us? */ if (( h->access.intr_pending(h) == 0) || (h->interrupts_enabled == 0)) @@ -2059,11 +2059,18 @@ static irqreturn_t do_cciss_intr(int irq } } } - /* * See if we can queue up some more IO + * check every disk that exists on this controller + * and start it's IO */ - blk_start_queue(h->queue); + for(j=0;j < NWD; j++) { + /* make sure the disk has been added and the drive is real */ + /* because this can be called from the middle of init_one */ + if(!(h->gendisk[j]->queue) || !(h->drv[j].nr_blocks) ) + continue; + blk_start_queue(h->gendisk[j]->queue); + } spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags); return IRQ_HANDLED; } @@ -2510,7 +2517,6 @@ static void free_hba(int i) static int __devinit cciss_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { - request_queue_t *q; int i; int j; @@ -2568,13 +2574,6 @@ static int __devinit cciss_init_one(stru } spin_lock_init(&hba[i]->lock); - q = blk_init_queue(do_cciss_request, &hba[i]->lock); - if (!q) - goto clean4; - - q->backing_dev_info.ra_pages = READ_AHEAD; - hba[i]->queue = q; - q->queuedata = hba[i]; /* Initialize the pdev driver private data. have it point to hba[i]. */ @@ -2596,6 +2595,19 @@ static int __devinit cciss_init_one(stru cciss_procinit(i); + for(j=0; jdrv[j]); + struct gendisk *disk = hba[i]->gendisk[j]; + request_queue_t *q; + + q = blk_init_queue(do_cciss_request, &hba[i]->lock); + if (!q) { + printk(KERN_ERR + "cciss: unable to allocate queue for disk %d\n", + j); + break; + } + drv->queue = q; blk_queue_bounce_limit(q, hba[i]->pdev->dma_mask); /* This is a hardware imposed limit. 
*/ @@ -2606,21 +2618,17 @@ static int __devinit cciss_init_one(stru blk_queue_max_sectors(q, 512); - - for(j=0; jdrv[j]); - struct gendisk *disk = hba[i]->gendisk[j]; - + q->queuedata = hba[i]; sprintf(disk->disk_name, "cciss/c%dd%d", i, j); sprintf(disk->devfs_name, "cciss/host%d/target%d", i, j); disk->major = COMPAQ_CISS_MAJOR + i; disk->first_minor = j << NWD_SHIFT; disk->fops = &cciss_fops; - disk->queue = hba[i]->queue; + disk->queue = q; disk->private_data = drv; if( !(drv->nr_blocks)) continue; - blk_queue_hardsect_size(hba[i]->queue, drv->block_size); + blk_queue_hardsect_size(q, drv->block_size); set_capacity(disk, drv->nr_blocks); add_disk(disk); } @@ -2690,9 +2698,9 @@ static void __devexit cciss_remove_one ( struct gendisk *disk = hba[i]->gendisk[j]; if (disk->flags & GENHD_FL_UP) del_gendisk(disk); + blk_cleanup_queue(disk->queue); } - blk_cleanup_queue(hba[i]->queue); pci_free_consistent(hba[i]->pdev, NR_CMDS * sizeof(CommandList_struct), hba[i]->cmd_pool, hba[i]->cmd_pool_dhandle); pci_free_consistent(hba[i]->pdev, NR_CMDS * sizeof( ErrorInfo_struct), --- diff/drivers/block/cciss.h 2004-02-18 08:54:08.000000000 +0000 +++ source/drivers/block/cciss.h 2004-04-21 10:46:51.351775456 +0100 @@ -27,6 +27,7 @@ typedef struct _drive_info_struct { __u32 LunID; int usage_count; + struct request_queue *queue; sector_t nr_blocks; int block_size; int heads; @@ -69,7 +70,6 @@ struct ctlr_info unsigned int maxQsinceinit; unsigned int maxSG; spinlock_t lock; - struct request_queue *queue; //* pointers to command and error info pool */ CommandList_struct *cmd_pool; @@ -252,7 +252,7 @@ struct board_type { struct access_method *access; }; -#define CCISS_LOCK(i) (hba[i]->queue->queue_lock) +#define CCISS_LOCK(i) (&(hba[i]->lock)) #endif /* CCISS_H */ --- diff/drivers/block/ll_rw_blk.c 2004-04-21 10:45:33.988536456 +0100 +++ source/drivers/block/ll_rw_blk.c 2004-04-21 10:46:51.353775152 +0100 @@ -1153,6 +1153,7 @@ static inline void __generic_unplug_devi **/ void generic_unplug_device(request_queue_t *q) { + might_sleep(); spin_lock_irq(q->queue_lock); __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); --- diff/drivers/block/loop.c 2004-04-21 10:45:33.989536304 +0100 +++ source/drivers/block/loop.c 2004-04-21 10:46:51.353775152 +0100 @@ -656,7 +656,7 @@ static int loop_set_fd(struct loop_devic * If we can't read - sorry. If we only can't write - well, * it's going to be read-only. */ - if (!lo_file->f_op->sendfile) + if (!file->f_op->sendfile) goto out_putf; if (!aops->prepare_write || !aops->commit_write) --- diff/drivers/char/agp/ati-agp.c 2004-04-05 12:57:07.000000000 +0100 +++ source/drivers/char/agp/ati-agp.c 2004-04-21 10:46:51.365773328 +0100 @@ -131,6 +131,7 @@ static int ati_create_gatt_pages(int nr_ i--; } kfree (tables); + tables = NULL; retval = -ENOMEM; break; } --- diff/drivers/char/drm/drm.h 2003-07-22 18:54:27.000000000 +0100 +++ source/drivers/char/drm/drm.h 2004-04-21 10:46:51.390769528 +0100 @@ -46,8 +46,8 @@ #define DRM_IOC_WRITE _IOC_WRITE #define DRM_IOC_READWRITE _IOC_READ|_IOC_WRITE #define DRM_IOC(dir, group, nr, size) _IOC(dir, group, nr, size) -#elif defined(__FreeBSD__) || defined(__NetBSD__) -#if defined(__FreeBSD__) && defined(XFree86Server) +#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) +#if defined(__FreeBSD__) && defined(IN_MODULE) /* Prevent name collision when including sys/ioccom.h */ #undef ioctl #include @@ -130,6 +130,18 @@ typedef struct drm_tex_region { unsigned int age; } drm_tex_region_t; +/** + * Hardware lock. 
+ * + * The lock structure is a simple cache-line aligned integer. To avoid + * processor bus contention on a multiprocessor system, there should not be any + * other data stored in the same cache line. + */ +typedef struct drm_hw_lock { + __volatile__ unsigned int lock; /**< lock variable */ + char padding[60]; /**< Pad to cache line */ +} drm_hw_lock_t; + /** * DRM_IOCTL_VERSION ioctl argument type. @@ -580,6 +592,16 @@ typedef struct drm_scatter_gather { unsigned long handle; /**< Used for mapping / unmapping */ } drm_scatter_gather_t; +/** + * DRM_IOCTL_SET_VERSION ioctl argument type. + */ +typedef struct drm_set_version { + int drm_di_major; + int drm_di_minor; + int drm_dd_major; + int drm_dd_minor; +} drm_set_version_t; + #define DRM_IOCTL_BASE 'd' #define DRM_IO(nr) _IO(DRM_IOCTL_BASE,nr) @@ -594,6 +616,7 @@ typedef struct drm_scatter_gather { #define DRM_IOCTL_GET_MAP DRM_IOWR(0x04, drm_map_t) #define DRM_IOCTL_GET_CLIENT DRM_IOWR(0x05, drm_client_t) #define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, drm_stats_t) +#define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, drm_set_version_t) #define DRM_IOCTL_SET_UNIQUE DRM_IOW( 0x10, drm_unique_t) #define DRM_IOCTL_AUTH_MAGIC DRM_IOW( 0x11, drm_auth_t) --- diff/drivers/char/drm/drmP.h 2004-01-19 10:22:56.000000000 +0000 +++ source/drivers/char/drm/drmP.h 2004-04-21 10:46:51.404767400 +0100 @@ -92,8 +92,8 @@ #ifndef __HAVE_DMA #define __HAVE_DMA 0 #endif -#ifndef __HAVE_DMA_IRQ -#define __HAVE_DMA_IRQ 0 +#ifndef __HAVE_IRQ +#define __HAVE_IRQ 0 #endif #ifndef __HAVE_DMA_WAITLIST #define __HAVE_DMA_WAITLIST 0 @@ -148,6 +148,7 @@ #define DRM_MEM_CTXBITMAP 18 #define DRM_MEM_STUB 19 #define DRM_MEM_SGLISTS 20 +#define DRM_MEM_CTXLIST 21 #define DRM_MAX_CTXBITMAP (PAGE_SIZE * 8) @@ -324,6 +325,7 @@ do { \ #define DRM_BUFCOUNT(x) ((x)->count - DRM_LEFTCOUNT(x)) #define DRM_WAITCOUNT(dev,idx) DRM_BUFCOUNT(&dev->queuelist[idx]->waitlist) +#define DRM_IF_VERSION(maj, min) (maj << 16 | min) /** * Get the private SAREA mapping. * @@ -362,10 +364,12 @@ do { \ typedef int drm_ioctl_t( struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg ); -typedef struct drm_pci_list { - u16 vendor; - u16 device; -} drm_pci_list_t; +typedef struct drm_pci_id_list +{ + int vendor; + int device; + long driver_private; +} drm_pci_id_list_t; typedef struct drm_ioctl_desc { drm_ioctl_t *func; @@ -463,18 +467,6 @@ typedef struct drm_buf_entry { drm_freelist_t freelist; } drm_buf_entry_t; -/** - * Hardware lock. - * - * The lock structure is a simple cache-line aligned integer. To avoid - * processor bus contention on a multiprocessor system, there should not be any - * other data stored in the same cache line. 
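
DRM_IOCTL_SET_VERSION is new in this patch: a client states which DRM interface (di) and driver (dd) versions it wants, with -1 meaning "leave unchanged", and the kernel writes back the versions it actually supports. A hedged userspace sketch; the structure layout and ioctl number are copied from the drm.h hunk above, while the file descriptor is assumed to be an already-open DRM node:

    #include <sys/ioctl.h>

    typedef struct drm_set_version {
            int drm_di_major;
            int drm_di_minor;
            int drm_dd_major;
            int drm_dd_minor;
    } drm_set_version_t;

    #define DRM_IOCTL_BASE        'd'
    #define DRM_IOCTL_SET_VERSION _IOWR(DRM_IOCTL_BASE, 0x07, drm_set_version_t)

    /* Ask for interface 1.1 (which ties the device to its bus id);
     * sv is filled on return with what the kernel supports. */
    int drm_request_if_1_1(int fd)
    {
            drm_set_version_t sv;

            sv.drm_di_major = 1;
            sv.drm_di_minor = 1;
            sv.drm_dd_major = -1;   /* don't care about the driver version */
            sv.drm_dd_minor = -1;

            return ioctl(fd, DRM_IOCTL_SET_VERSION, &sv);
    }
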
- */ -typedef struct drm_hw_lock { - __volatile__ unsigned int lock; /**< lock variable */ - char padding[60]; /**< Pad to cache line */ -} drm_hw_lock_t; - /** File private data */ typedef struct drm_file { int authenticated; @@ -488,6 +480,9 @@ typedef struct drm_file { struct drm_device *dev; int remove_auth_on_close; unsigned long lock_count; +#ifdef DRIVER_FILE_FIELDS + DRIVER_FILE_FIELDS; +#endif } drm_file_t; /** Wait queue */ @@ -602,6 +597,15 @@ typedef struct drm_map_list { typedef drm_map_t drm_local_map_t; +/** + * Context handle list + */ +typedef struct drm_ctx_list { + struct list_head head; /**< list head */ + drm_context_t handle; /**< context handle */ + drm_file_t *tag; /**< associated fd private data */ +} drm_ctx_list_t; + #if __HAVE_VBL_IRQ typedef struct drm_vbl_sig { @@ -622,6 +626,8 @@ typedef struct drm_device { int unique_len; /**< Length of unique field */ dev_t device; /**< Device number for mknod */ char *devname; /**< For /proc/interrupts */ + int minor; /**< Minor device number */ + int if_version; /**< Highest interface version set */ int blocked; /**< Blocked due to VC switch? */ struct proc_dir_entry *root; /**< Root for this device's entries */ @@ -660,6 +666,12 @@ typedef struct drm_device { drm_map_list_t *maplist; /**< Linked list of regions */ int map_count; /**< Number of mappable regions */ + /** \name Context handle management */ + /*@{*/ + drm_ctx_list_t *ctxlist; /**< Linked list of context handles */ + int ctx_count; /**< Number of context handles */ + struct semaphore ctxlist_sem; /**< For ctxlist */ + drm_map_t **context_sareas; /**< per-context SAREA's */ int max_context; @@ -679,6 +691,7 @@ typedef struct drm_device { /** \name Context support */ /*@{*/ int irq; /**< Interrupt used by board */ + int irq_enabled; /**< True if irq handler is enabled */ __volatile__ long context_flag; /**< Context swapping flag */ __volatile__ long interrupt_flag; /**< Interruption handler flag */ __volatile__ long dma_flag; /**< DMA dispatch flag */ @@ -714,7 +727,12 @@ typedef struct drm_device { #if __REALLY_HAVE_AGP drm_agp_head_t *agp; /**< AGP data */ #endif - struct pci_dev *pdev; /**< PCI device structure */ + + struct pci_dev *pdev; /**< PCI device structure */ + int pci_domain; /**< PCI bus domain number */ + int pci_bus; /**< PCI bus number */ + int pci_slot; /**< PCI slot number */ + int pci_func; /**< PCI function number */ #ifdef __alpha__ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,3) struct pci_controler *hose; @@ -758,18 +776,6 @@ extern int DRM(flush)(struct file * extern int DRM(fasync)(int fd, struct file *filp, int on); /* Mapping support (drm_vm.h) */ -extern struct page *DRM(vm_nopage)(struct vm_area_struct *vma, - unsigned long address, - int *type); -extern struct page *DRM(vm_shm_nopage)(struct vm_area_struct *vma, - unsigned long address, - int *type); -extern struct page *DRM(vm_dma_nopage)(struct vm_area_struct *vma, - unsigned long address, - int *type); -extern struct page *DRM(vm_sg_nopage)(struct vm_area_struct *vma, - unsigned long address, - int *type); extern void DRM(vm_open)(struct vm_area_struct *vma); extern void DRM(vm_close)(struct vm_area_struct *vma); extern void DRM(vm_shm_close)(struct vm_area_struct *vma); @@ -804,8 +810,8 @@ extern int DRM(unbind_agp)(DRM #endif /* Misc. 
IOCTL support (drm_ioctl.h) */ -extern int DRM(irq_busid)(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); +extern int DRM(irq_by_busid)(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); extern int DRM(getunique)(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); extern int DRM(setunique)(struct inode *inode, struct file *filp, @@ -816,6 +822,8 @@ extern int DRM(getclient)(struct in unsigned int cmd, unsigned long arg); extern int DRM(getstats)(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); +extern int DRM(setversion)(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); /* Context IOCTL support (drm_context.h) */ extern int DRM(resctx)( struct inode *inode, struct file *filp, @@ -900,12 +908,17 @@ extern int DRM(dma_setup)(drm_devic extern void DRM(dma_takedown)(drm_device_t *dev); extern void DRM(free_buffer)(drm_device_t *dev, drm_buf_t *buf); extern void DRM(reclaim_buffers)( struct file *filp ); -#if __HAVE_DMA_IRQ +#endif /* __HAVE_DMA */ + + /* IRQ support (drm_irq.h) */ +#if __HAVE_IRQ || __HAVE_DMA extern int DRM(control)( struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg ); -extern int DRM(irq_install)( drm_device_t *dev, int irq ); +#endif +#if __HAVE_IRQ +extern int DRM(irq_install)( drm_device_t *dev ); extern int DRM(irq_uninstall)( drm_device_t *dev ); -extern irqreturn_t DRM(dma_service)( DRM_IRQ_ARGS ); +extern irqreturn_t DRM(irq_handler)( DRM_IRQ_ARGS ); extern void DRM(driver_irq_preinstall)( drm_device_t *dev ); extern void DRM(driver_irq_postinstall)( drm_device_t *dev ); extern void DRM(driver_irq_uninstall)( drm_device_t *dev ); @@ -915,12 +928,11 @@ extern int DRM(wait_vblank)(st extern int DRM(vblank_wait)(drm_device_t *dev, unsigned int *vbl_seq); extern void DRM(vbl_send_signals)( drm_device_t *dev ); #endif -#if __HAVE_DMA_IRQ_BH -extern void DRM(dma_immediate_bh)( void *dev ); +#if __HAVE_IRQ_BH +extern void DRM(irq_immediate_bh)( void *dev ); #endif #endif -#endif /* __HAVE_DMA */ #if __REALLY_HAVE_AGP /* AGP/GART support (drm_agpsupport.h) */ --- diff/drivers/char/drm/drm_agpsupport.h 2003-09-30 15:46:12.000000000 +0100 +++ source/drivers/char/drm/drm_agpsupport.h 2004-04-21 10:46:51.368772872 +0100 @@ -103,7 +103,13 @@ int DRM(agp_acquire)(struct inode *inode drm_device_t *dev = priv->dev; int retcode; - if (!dev->agp || dev->agp->acquired || !drm_agp->acquire) + if (!dev->agp) + return -ENODEV; + if (dev->agp->acquired) + return -EBUSY; + if (!drm_agp->acquire) + return -EINVAL; + if ( dev->agp->cant_use_aperture ) return -EINVAL; if ((retcode = drm_agp->acquire())) return retcode; --- diff/drivers/char/drm/drm_bufs.h 2003-07-22 18:54:27.000000000 +0100 +++ source/drivers/char/drm/drm_bufs.h 2004-04-21 10:46:51.374771960 +0100 @@ -147,7 +147,9 @@ int DRM(addmap)( struct inode *inode, st MTRR_TYPE_WRCOMB, 1 ); } #endif - map->handle = DRM(ioremap)( map->offset, map->size, dev ); + if (map->type == _DRM_REGISTERS) + map->handle = DRM(ioremap)( map->offset, map->size, + dev ); break; case _DRM_SHM: @@ -160,6 +162,12 @@ int DRM(addmap)( struct inode *inode, st } map->offset = (unsigned long)map->handle; if ( map->flags & _DRM_CONTAINS_LOCK ) { + /* Prevent a 2nd X Server from creating a 2nd lock */ + if (dev->lock.hw_lock != NULL) { + vfree( map->handle ); + DRM(free)( map, sizeof(*map), DRM_MEM_MAPS ); + return -EBUSY; + } dev->sigdata.lock = dev->lock.hw_lock = map->handle; /* Pointer to 
lock */ } @@ -767,7 +775,7 @@ int DRM(addbufs_pci)( struct inode *inod } #endif /* __HAVE_PCI_DMA */ -#ifdef __HAVE_SG +#if __HAVE_SG int DRM(addbufs_sg)( struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg ) { --- diff/drivers/char/drm/drm_context.h 2003-07-22 18:54:27.000000000 +0100 +++ source/drivers/char/drm/drm_context.h 2004-04-21 10:46:51.375771808 +0100 @@ -401,6 +401,7 @@ int DRM(addctx)( struct inode *inode, st { drm_file_t *priv = filp->private_data; drm_device_t *dev = priv->dev; + drm_ctx_list_t * ctx_entry; drm_ctx_t ctx; if ( copy_from_user( &ctx, (drm_ctx_t *)arg, sizeof(ctx) ) ) @@ -421,6 +422,20 @@ int DRM(addctx)( struct inode *inode, st if ( ctx.handle != DRM_KERNEL_CONTEXT ) DRIVER_CTX_CTOR(ctx.handle); /* XXX: also pass dev ? */ #endif + ctx_entry = DRM(alloc)( sizeof(*ctx_entry), DRM_MEM_CTXLIST ); + if ( !ctx_entry ) { + DRM_DEBUG("out of memory\n"); + return -ENOMEM; + } + + INIT_LIST_HEAD( &ctx_entry->head ); + ctx_entry->handle = ctx.handle; + ctx_entry->tag = priv; + + down( &dev->ctxlist_sem ); + list_add( &ctx_entry->head, &dev->ctxlist->head ); + ++dev->ctx_count; + up( &dev->ctxlist_sem ); if ( copy_to_user( (drm_ctx_t *)arg, &ctx, sizeof(ctx) ) ) return -EFAULT; @@ -543,6 +558,20 @@ int DRM(rmctx)( struct inode *inode, str DRM(ctxbitmap_free)( dev, ctx.handle ); } + down( &dev->ctxlist_sem ); + if ( !list_empty( &dev->ctxlist->head ) ) { + drm_ctx_list_t *pos, *n; + + list_for_each_entry_safe( pos, n, &dev->ctxlist->head, head ) { + if ( pos->handle == ctx.handle ) { + list_del( &pos->head ); + DRM(free)( pos, sizeof(*pos), DRM_MEM_CTXLIST ); + --dev->ctx_count; + } + } + } + up( &dev->ctxlist_sem ); + return 0; } --- diff/drivers/char/drm/drm_dma.h 2003-07-22 18:54:27.000000000 +0100 +++ source/drivers/char/drm/drm_dma.h 2004-04-21 10:46:51.376771656 +0100 @@ -35,7 +35,6 @@ #include "drmP.h" -#include /* For task queue support */ #ifndef __HAVE_DMA_WAITQUEUE #define __HAVE_DMA_WAITQUEUE 0 @@ -43,15 +42,6 @@ #ifndef __HAVE_DMA_RECLAIM #define __HAVE_DMA_RECLAIM 0 #endif -#ifndef __HAVE_SHARED_IRQ -#define __HAVE_SHARED_IRQ 0 -#endif - -#if __HAVE_SHARED_IRQ -#define DRM_IRQ_TYPE SA_SHIRQ -#else -#define DRM_IRQ_TYPE 0 -#endif #if __HAVE_DMA @@ -214,293 +204,11 @@ void DRM(reclaim_buffers)( struct file * } #endif - - - -#if __HAVE_DMA_IRQ - -/** - * Install IRQ handler. - * - * \param dev DRM device. - * \param irq IRQ number. - * - * Initializes the IRQ related data, and setups drm_device::vbl_queue. Installs the handler, calling the driver - * \c DRM(driver_irq_preinstall)() and \c DRM(driver_irq_postinstall)() functions - * before and after the installation. 
- */ -int DRM(irq_install)( drm_device_t *dev, int irq ) -{ - int ret; - - if ( !irq ) - return -EINVAL; - - down( &dev->struct_sem ); - - /* Driver must have been initialized */ - if ( !dev->dev_private ) { - up( &dev->struct_sem ); - return -EINVAL; - } - - if ( dev->irq ) { - up( &dev->struct_sem ); - return -EBUSY; - } - dev->irq = irq; - up( &dev->struct_sem ); - - DRM_DEBUG( "%s: irq=%d\n", __FUNCTION__, irq ); - - dev->context_flag = 0; - dev->interrupt_flag = 0; - dev->dma_flag = 0; - - dev->dma->next_buffer = NULL; - dev->dma->next_queue = NULL; - dev->dma->this_buffer = NULL; - -#if __HAVE_DMA_IRQ_BH - INIT_WORK(&dev->work, DRM(dma_immediate_bh), dev); -#endif - -#if __HAVE_VBL_IRQ - init_waitqueue_head(&dev->vbl_queue); - - spin_lock_init( &dev->vbl_lock ); - - INIT_LIST_HEAD( &dev->vbl_sigs.head ); - - dev->vbl_pending = 0; -#endif - - /* Before installing handler */ - DRM(driver_irq_preinstall)(dev); - - /* Install handler */ - ret = request_irq( dev->irq, DRM(dma_service), - DRM_IRQ_TYPE, dev->devname, dev ); - if ( ret < 0 ) { - down( &dev->struct_sem ); - dev->irq = 0; - up( &dev->struct_sem ); - return ret; - } - - /* After installing handler */ - DRM(driver_irq_postinstall)(dev); - - return 0; -} - -/** - * Uninstall the IRQ handler. - * - * \param dev DRM device. - * - * Calls the driver's \c DRM(driver_irq_uninstall)() function, and stops the irq. - */ -int DRM(irq_uninstall)( drm_device_t *dev ) -{ - int irq; - - down( &dev->struct_sem ); - irq = dev->irq; - dev->irq = 0; - up( &dev->struct_sem ); - - if ( !irq ) - return -EINVAL; - - DRM_DEBUG( "%s: irq=%d\n", __FUNCTION__, irq ); - - DRM(driver_irq_uninstall)( dev ); - - free_irq( irq, dev ); - - return 0; -} - -/** - * IRQ control ioctl. - * - * \param inode device inode. - * \param filp file pointer. - * \param cmd command. - * \param arg user argument, pointing to a drm_control structure. - * \return zero on success or a negative number on failure. - * - * Calls irq_install() or irq_uninstall() according to \p arg. - */ -int DRM(control)( struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg ) -{ - drm_file_t *priv = filp->private_data; - drm_device_t *dev = priv->dev; - drm_control_t ctl; - - if ( copy_from_user( &ctl, (drm_control_t *)arg, sizeof(ctl) ) ) - return -EFAULT; - - switch ( ctl.func ) { - case DRM_INST_HANDLER: - return DRM(irq_install)( dev, ctl.irq ); - case DRM_UNINST_HANDLER: - return DRM(irq_uninstall)( dev ); - default: - return -EINVAL; - } -} - -#if __HAVE_VBL_IRQ - -/** - * Wait for VBLANK. - * - * \param inode device inode. - * \param filp file pointer. - * \param cmd command. - * \param data user argument, pointing to a drm_wait_vblank structure. - * \return zero on success or a negative number on failure. - * - * Verifies the IRQ is installed. - * - * If a signal is requested checks if this task has already scheduled the same signal - * for the same vblank sequence number - nothing to be done in - * that case. If the number of tasks waiting for the interrupt exceeds 100 the - * function fails. Otherwise adds a new entry to drm_device::vbl_sigs for this - * task. - * - * If a signal is not requested, then calls vblank_wait(). 
- */ -int DRM(wait_vblank)( DRM_IOCTL_ARGS ) -{ - drm_file_t *priv = filp->private_data; - drm_device_t *dev = priv->dev; - drm_wait_vblank_t vblwait; - struct timeval now; - int ret = 0; - unsigned int flags; - - if (!dev->irq) - return -EINVAL; - - DRM_COPY_FROM_USER_IOCTL( vblwait, (drm_wait_vblank_t *)data, - sizeof(vblwait) ); - - switch ( vblwait.request.type & ~_DRM_VBLANK_FLAGS_MASK ) { - case _DRM_VBLANK_RELATIVE: - vblwait.request.sequence += atomic_read( &dev->vbl_received ); - vblwait.request.type &= ~_DRM_VBLANK_RELATIVE; - case _DRM_VBLANK_ABSOLUTE: - break; - default: - return -EINVAL; - } - - flags = vblwait.request.type & _DRM_VBLANK_FLAGS_MASK; - - if ( flags & _DRM_VBLANK_SIGNAL ) { - unsigned long irqflags; - drm_vbl_sig_t *vbl_sig; - - vblwait.reply.sequence = atomic_read( &dev->vbl_received ); - - spin_lock_irqsave( &dev->vbl_lock, irqflags ); - - /* Check if this task has already scheduled the same signal - * for the same vblank sequence number; nothing to be done in - * that case - */ - list_for_each_entry( vbl_sig, &dev->vbl_sigs.head, head ) { - if (vbl_sig->sequence == vblwait.request.sequence - && vbl_sig->info.si_signo == vblwait.request.signal - && vbl_sig->task == current) - { - spin_unlock_irqrestore( &dev->vbl_lock, irqflags ); - goto done; - } - } - - if ( dev->vbl_pending >= 100 ) { - spin_unlock_irqrestore( &dev->vbl_lock, irqflags ); - return -EBUSY; - } - - dev->vbl_pending++; - - spin_unlock_irqrestore( &dev->vbl_lock, irqflags ); - - if ( !( vbl_sig = DRM_MALLOC( sizeof( drm_vbl_sig_t ) ) ) ) { - return -ENOMEM; - } - - memset( (void *)vbl_sig, 0, sizeof(*vbl_sig) ); - - vbl_sig->sequence = vblwait.request.sequence; - vbl_sig->info.si_signo = vblwait.request.signal; - vbl_sig->task = current; - - spin_lock_irqsave( &dev->vbl_lock, irqflags ); - - list_add_tail( (struct list_head *) vbl_sig, &dev->vbl_sigs.head ); - - spin_unlock_irqrestore( &dev->vbl_lock, irqflags ); - } else { - ret = DRM(vblank_wait)( dev, &vblwait.request.sequence ); - - do_gettimeofday( &now ); - vblwait.reply.tval_sec = now.tv_sec; - vblwait.reply.tval_usec = now.tv_usec; - } - -done: - DRM_COPY_TO_USER_IOCTL( (drm_wait_vblank_t *)data, vblwait, - sizeof(vblwait) ); - - return ret; -} - -/** - * Send the VBLANK signals. - * - * \param dev DRM device. - * - * Sends a signal for each task in drm_device::vbl_sigs and empties the list. - * - * If a signal is not requested, then calls vblank_wait(). +#if !__HAVE_IRQ +/* This stub DRM_IOCTL_CONTROL handler is for the drivers that used to require + * IRQs for DMA but no longer do. It maintains compatibility with the X Servers + * that try to use the control ioctl by simply returning success. 
*/ -void DRM(vbl_send_signals)( drm_device_t *dev ) -{ - struct list_head *list, *tmp; - drm_vbl_sig_t *vbl_sig; - unsigned int vbl_seq = atomic_read( &dev->vbl_received ); - unsigned long flags; - - spin_lock_irqsave( &dev->vbl_lock, flags ); - - list_for_each_safe( list, tmp, &dev->vbl_sigs.head ) { - vbl_sig = list_entry( list, drm_vbl_sig_t, head ); - if ( ( vbl_seq - vbl_sig->sequence ) <= (1<<23) ) { - vbl_sig->info.si_code = vbl_seq; - send_sig_info( vbl_sig->info.si_signo, &vbl_sig->info, vbl_sig->task ); - - list_del( list ); - - DRM_FREE( vbl_sig, sizeof(*vbl_sig) ); - - dev->vbl_pending--; - } - } - - spin_unlock_irqrestore( &dev->vbl_lock, flags ); -} - -#endif /* __HAVE_VBL_IRQ */ - -#else - int DRM(control)( struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg ) { @@ -517,7 +225,6 @@ int DRM(control)( struct inode *inode, s return -EINVAL; } } - -#endif /* __HAVE_DMA_IRQ */ +#endif #endif /* __HAVE_DMA */ --- diff/drivers/char/drm/drm_drv.h 2003-10-27 09:20:43.000000000 +0000 +++ source/drivers/char/drm/drm_drv.h 2004-04-21 10:46:51.383770592 +0100 @@ -58,8 +58,8 @@ #ifndef __HAVE_CTX_BITMAP #define __HAVE_CTX_BITMAP 0 #endif -#ifndef __HAVE_DMA_IRQ -#define __HAVE_DMA_IRQ 0 +#ifndef __HAVE_IRQ +#define __HAVE_IRQ 0 #endif #ifndef __HAVE_DMA_QUEUE #define __HAVE_DMA_QUEUE 0 @@ -126,6 +126,9 @@ #ifndef DRIVER_IOCTLS #define DRIVER_IOCTLS #endif +#ifndef DRIVER_OPEN_HELPER +#define DRIVER_OPEN_HELPER( priv, dev ) +#endif #ifndef DRIVER_FOPS #define DRIVER_FOPS \ static struct file_operations DRM(fops) = { \ @@ -159,15 +162,8 @@ __setup( DRIVER_NAME "=", DRM_OPTIONS_FU #undef DRM_OPTIONS_FUNC #endif -/** - * The default number of instances (minor numbers) to initialize. - */ -#ifndef DRIVER_NUM_CARDS -#define DRIVER_NUM_CARDS 1 -#endif - -static drm_device_t *DRM(device); -static int *DRM(minor); +#define MAX_DEVICES 4 +static drm_device_t DRM(device)[MAX_DEVICES]; static int DRM(numdevs) = 0; DRIVER_FOPS; @@ -177,10 +173,13 @@ static drm_ioctl_desc_t DRM(ioctls)[] [DRM_IOCTL_NR(DRM_IOCTL_VERSION)] = { DRM(version), 0, 0 }, [DRM_IOCTL_NR(DRM_IOCTL_GET_UNIQUE)] = { DRM(getunique), 0, 0 }, [DRM_IOCTL_NR(DRM_IOCTL_GET_MAGIC)] = { DRM(getmagic), 0, 0 }, - [DRM_IOCTL_NR(DRM_IOCTL_IRQ_BUSID)] = { DRM(irq_busid), 0, 1 }, +#if __HAVE_IRQ + [DRM_IOCTL_NR(DRM_IOCTL_IRQ_BUSID)] = { DRM(irq_by_busid), 0, 1 }, +#endif [DRM_IOCTL_NR(DRM_IOCTL_GET_MAP)] = { DRM(getmap), 0, 0 }, [DRM_IOCTL_NR(DRM_IOCTL_GET_CLIENT)] = { DRM(getclient), 0, 0 }, [DRM_IOCTL_NR(DRM_IOCTL_GET_STATS)] = { DRM(getstats), 0, 0 }, + [DRM_IOCTL_NR(DRM_IOCTL_SET_VERSION)] = { DRM(setversion), 0, 1 }, [DRM_IOCTL_NR(DRM_IOCTL_SET_UNIQUE)] = { DRM(setunique), 1, 1 }, [DRM_IOCTL_NR(DRM_IOCTL_BLOCK)] = { DRM(noop), 1, 1 }, @@ -222,9 +221,9 @@ static drm_ioctl_desc_t DRM(ioctls)[] [DRM_IOCTL_NR(DRM_IOCTL_INFO_BUFS)] = { DRM(infobufs), 1, 0 }, [DRM_IOCTL_NR(DRM_IOCTL_MAP_BUFS)] = { DRM(mapbufs), 1, 0 }, [DRM_IOCTL_NR(DRM_IOCTL_FREE_BUFS)] = { DRM(freebufs), 1, 0 }, - - /* The DRM_IOCTL_DMA ioctl should be defined by the driver. - */ + /* The DRM_IOCTL_DMA ioctl should be defined by the driver. 
*/ +#endif +#if __HAVE_IRQ || __HAVE_DMA [DRM_IOCTL_NR(DRM_IOCTL_CONTROL)] = { DRM(control), 1, 1 }, #endif @@ -330,6 +329,12 @@ static int DRM(setup)( drm_device_t *dev memset(dev->maplist, 0, sizeof(*dev->maplist)); INIT_LIST_HEAD(&dev->maplist->head); + dev->ctxlist = DRM(alloc)(sizeof(*dev->ctxlist), + DRM_MEM_CTXLIST); + if(dev->ctxlist == NULL) return -ENOMEM; + memset(dev->ctxlist, 0, sizeof(*dev->ctxlist)); + INIT_LIST_HEAD(&dev->ctxlist->head); + dev->vmalist = NULL; dev->sigdata.lock = dev->lock.hw_lock = NULL; init_waitqueue_head( &dev->lock.lock_queue ); @@ -337,7 +342,7 @@ static int DRM(setup)( drm_device_t *dev dev->queue_reserved = 0; dev->queue_slots = 0; dev->queuelist = NULL; - dev->irq = 0; + dev->irq_enabled = 0; dev->context_flag = 0; dev->interrupt_flag = 0; dev->dma_flag = 0; @@ -345,6 +350,7 @@ static int DRM(setup)( drm_device_t *dev dev->last_switch = 0; dev->last_checked = 0; init_waitqueue_head( &dev->context_wait ); + dev->if_version = 0; dev->ctx_start = 0; dev->lck_start = 0; @@ -391,8 +397,8 @@ static int DRM(takedown)( drm_device_t * DRM_DEBUG( "\n" ); DRIVER_PRETAKEDOWN(); -#if __HAVE_DMA_IRQ - if ( dev->irq ) DRM(irq_uninstall)( dev ); +#if __HAVE_IRQ + if ( dev->irq_enabled ) DRM(irq_uninstall)( dev ); #endif down( &dev->struct_sem ); @@ -534,43 +540,101 @@ static int DRM(takedown)( drm_device_t * return 0; } -/** - * Figure out how many instances to initialize. - * - * \return number of cards found. - * - * Searches for every PCI card in \c DRIVER_CARD_LIST with matching vendor and device ids. - */ -static int drm_count_cards(void) +static drm_pci_id_list_t DRM(pciidlist)[] = { + DRIVER_PCI_IDS +}; + +static int DRM(probe)(struct pci_dev *pdev) { - int num = 0; -#if defined(DRIVER_CARD_LIST) - int i; - drm_pci_list_t *l; - u16 device, vendor; - struct pci_dev *pdev = NULL; + drm_device_t *dev; +#if __HAVE_CTX_BITMAP + int retcode; #endif + int i; + int is_compat = 0; DRM_DEBUG( "\n" ); -#if defined(DRIVER_COUNT_CARDS) - num = DRIVER_COUNT_CARDS(); -#elif defined(DRIVER_CARD_LIST) - for (i = 0, l = DRIVER_CARD_LIST; l[i].vendor != 0; i++) { - pdev = NULL; - vendor = l[i].vendor; - device = l[i].device; - if(device == 0xffff) device = PCI_ANY_ID; - if(vendor == 0xffff) vendor = PCI_ANY_ID; - while ((pdev = pci_find_device(vendor, device, pdev))) { - num++; + for (i = 0; DRM(pciidlist)[i].vendor != 0; i++) { + if ((DRM(pciidlist)[i].vendor == pdev->vendor) && + (DRM(pciidlist)[i].device == pdev->device)) { + is_compat = 1; } } + if (is_compat == 0) + return -ENODEV; + + if (DRM(numdevs) >= MAX_DEVICES) + return -ENODEV; + + dev = &(DRM(device)[DRM(numdevs)]); + + memset( (void *)dev, 0, sizeof(*dev) ); + dev->count_lock = SPIN_LOCK_UNLOCKED; + init_timer( &dev->timer ); + sema_init( &dev->struct_sem, 1 ); + sema_init( &dev->ctxlist_sem, 1 ); + + if ((dev->minor = DRM(stub_register)(DRIVER_NAME, &DRM(fops),dev)) < 0) + return -EPERM; + dev->device = MKDEV(DRM_MAJOR, dev->minor ); + dev->name = DRIVER_NAME; + + dev->pdev = pdev; +#ifdef __alpha__ + dev->hose = pdev->sysdata; + dev->pci_domain = dev->hose->bus->number; #else - num = DRIVER_NUM_CARDS; + dev->pci_domain = 0; #endif - DRM_DEBUG("numdevs = %d\n", num); - return num; + dev->pci_bus = pdev->bus->number; + dev->pci_slot = PCI_SLOT(pdev->devfn); + dev->pci_func = PCI_FUNC(pdev->devfn); + dev->irq = pdev->irq; + + DRIVER_PREINIT(); + +#if __REALLY_HAVE_AGP + dev->agp = DRM(agp_init)(); +#if __MUST_HAVE_AGP + if ( dev->agp == NULL ) { + DRM_ERROR( "Cannot initialize the agpgart module.\n" ); + 
DRM(stub_unregister)(dev->minor); + DRM(takedown)( dev ); + return -EINVAL; + } +#endif +#if __REALLY_HAVE_MTRR + if (dev->agp) + dev->agp->agp_mtrr = mtrr_add( dev->agp->agp_info.aper_base, + dev->agp->agp_info.aper_size*1024*1024, + MTRR_TYPE_WRCOMB, + 1 ); +#endif +#endif + +#if __HAVE_CTX_BITMAP + retcode = DRM(ctxbitmap_init)( dev ); + if( retcode ) { + DRM_ERROR( "Cannot allocate memory for context bitmap.\n" ); + DRM(stub_unregister)(dev->minor); + DRM(takedown)( dev ); + return retcode; + } +#endif + DRM(numdevs)++; /* no errors, mark it reserved */ + + DRM_INFO( "Initialized %s %d.%d.%d %s on minor %d\n", + DRIVER_NAME, + DRIVER_MAJOR, + DRIVER_MINOR, + DRIVER_PATCHLEVEL, + DRIVER_DATE, + dev->minor); + + DRIVER_POSTINIT(); + + return 0; } /** @@ -579,7 +643,7 @@ static int drm_count_cards(void) * * \return zero on success or a negative number on failure. * - * Allocates and initialize an array of drm_device structures, and attempts to + * Initializes an array of drm_device structures, and attempts to * initialize all available devices, using consecutive minors, registering the * stubs and initializing the AGP device. * @@ -588,88 +652,19 @@ static int drm_count_cards(void) */ static int __init drm_init( void ) { + struct pci_dev *pdev = NULL; - drm_device_t *dev; - int i; -#if __HAVE_CTX_BITMAP - int retcode; -#endif DRM_DEBUG( "\n" ); #ifdef MODULE DRM(parse_options)( drm_opts ); #endif - DRM(numdevs) = drm_count_cards(); - /* Force at least one instance. */ - if (DRM(numdevs) <= 0) - DRM(numdevs) = 1; - - DRM(device) = kmalloc(sizeof(*DRM(device)) * DRM(numdevs), GFP_KERNEL); - if (!DRM(device)) { - return -ENOMEM; - } - DRM(minor) = kmalloc(sizeof(*DRM(minor)) * DRM(numdevs), GFP_KERNEL); - if (!DRM(minor)) { - kfree(DRM(device)); - return -ENOMEM; - } - - DRIVER_PREINIT(); - DRM(mem_init)(); - for (i = 0; i < DRM(numdevs); i++) { - dev = &(DRM(device)[i]); - memset( (void *)dev, 0, sizeof(*dev) ); - dev->count_lock = SPIN_LOCK_UNLOCKED; - init_timer( &dev->timer ); - sema_init( &dev->struct_sem, 1 ); - - if ((DRM(minor)[i] = DRM(stub_register)(DRIVER_NAME, &DRM(fops),dev)) < 0) - return -EPERM; - dev->device = MKDEV(DRM_MAJOR, DRM(minor)[i] ); - dev->name = DRIVER_NAME; - -#if __REALLY_HAVE_AGP - dev->agp = DRM(agp_init)(); -#if __MUST_HAVE_AGP - if ( dev->agp == NULL ) { - DRM_ERROR( "Cannot initialize the agpgart module.\n" ); - DRM(stub_unregister)(DRM(minor)[i]); - DRM(takedown)( dev ); - return -EINVAL; - } -#endif -#if __REALLY_HAVE_MTRR - if (dev->agp) - dev->agp->agp_mtrr = mtrr_add( dev->agp->agp_info.aper_base, - dev->agp->agp_info.aper_size*1024*1024, - MTRR_TYPE_WRCOMB, - 1 ); -#endif -#endif - -#if __HAVE_CTX_BITMAP - retcode = DRM(ctxbitmap_init)( dev ); - if( retcode ) { - DRM_ERROR( "Cannot allocate memory for context bitmap.\n" ); - DRM(stub_unregister)(DRM(minor)[i]); - DRM(takedown)( dev ); - return retcode; - } -#endif - DRM_INFO( "Initialized %s %d.%d.%d %s on minor %d\n", - DRIVER_NAME, - DRIVER_MAJOR, - DRIVER_MINOR, - DRIVER_PATCHLEVEL, - DRIVER_DATE, - DRM(minor)[i] ); + while ((pdev = pci_find_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL) { + DRM(probe)(pdev); } - - DRIVER_POSTINIT(); - return 0; } @@ -689,10 +684,10 @@ static void __exit drm_cleanup( void ) for (i = DRM(numdevs) - 1; i >= 0; i--) { dev = &(DRM(device)[i]); - if ( DRM(stub_unregister)(DRM(minor)[i]) ) { + if ( DRM(stub_unregister)(dev->minor) ) { DRM_ERROR( "Cannot unload module\n" ); } else { - DRM_DEBUG("minor %d unregistered\n", DRM(minor)[i]); + DRM_DEBUG("minor %d 
unregistered\n", dev->minor); if (i == 0) { DRM_INFO( "Module unloaded\n" ); } @@ -722,8 +717,6 @@ static void __exit drm_cleanup( void ) #endif } DRIVER_POSTCLEANUP(); - kfree(DRM(minor)); - kfree(DRM(device)); DRM(numdevs) = 0; } @@ -795,7 +788,7 @@ int DRM(open)( struct inode *inode, stru int i; for (i = 0; i < DRM(numdevs); i++) { - if (iminor(inode) == DRM(minor)[i]) { + if (iminor(inode) == DRM(device)[i].minor) { dev = &(DRM(device)[i]); break; } @@ -908,6 +901,26 @@ int DRM(release)( struct inode *inode, s DRM(fasync)( -1, filp, 0 ); + down( &dev->ctxlist_sem ); + if ( !list_empty( &dev->ctxlist->head ) ) { + drm_ctx_list_t *pos, *n; + + list_for_each_entry_safe( pos, n, &dev->ctxlist->head, head ) { + if ( pos->tag == priv && + pos->handle != DRM_KERNEL_CONTEXT ) { +#ifdef DRIVER_CTX_DTOR + DRIVER_CTX_DTOR(pos->handle); +#endif +#if __HAVE_CTX_BITMAP + DRM(ctxbitmap_free)( dev, pos->handle ); +#endif + list_del( &pos->head ); + DRM(free)( pos, sizeof(*pos), DRM_MEM_CTXLIST ); + } + } + } + up( &dev->ctxlist_sem ); + down( &dev->struct_sem ); if ( priv->remove_auth_on_close == 1 ) { drm_file_t *temp = dev->file_first; --- diff/drivers/char/drm/drm_fops.h 2003-10-09 09:47:16.000000000 +0100 +++ source/drivers/char/drm/drm_fops.h 2004-04-21 10:46:51.387769984 +0100 @@ -72,6 +72,8 @@ int DRM(open_helper)(struct inode *inode priv->authenticated = capable(CAP_SYS_ADMIN); priv->lock_count = 0; + DRIVER_OPEN_HELPER( priv, dev ); + down(&dev->struct_sem); if (!dev->file_last) { priv->next = NULL; --- diff/drivers/char/drm/drm_ioctl.h 2003-07-22 18:54:27.000000000 +0100 +++ source/drivers/char/drm/drm_ioctl.h 2004-04-21 10:46:51.392769224 +0100 @@ -35,69 +35,7 @@ #include "drmP.h" - -/** - * Get interrupt from bus id. - * - * \param inode device inode. - * \param filp file pointer. - * \param cmd command. - * \param arg user argument, pointing to a drm_irq_busid structure. - * \return zero on success or a negative number on failure. - * - * Finds the PCI device with the specified bus id and gets its IRQ number. - */ -int DRM(irq_busid)(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - drm_irq_busid_t p; - struct pci_dev *dev; - - if (copy_from_user(&p, (drm_irq_busid_t *)arg, sizeof(p))) - return -EFAULT; -#ifdef __alpha__ - { - int domain = p.busnum >> 8; - p.busnum &= 0xff; - - /* - * Find the hose the device is on (the domain number is the - * hose index) and offset the bus by the root bus of that - * hose. - */ - for(dev = pci_find_device(PCI_ANY_ID,PCI_ANY_ID,NULL); - dev; - dev = pci_find_device(PCI_ANY_ID,PCI_ANY_ID,dev)) { - struct pci_controller *hose = dev->sysdata; - - if (hose->index == domain) { - p.busnum += hose->bus->number; - break; - } - } - } -#endif - dev = pci_find_slot(p.busnum, PCI_DEVFN(p.devnum, p.funcnum)); - if (!dev) { - DRM_ERROR("pci_find_slot failed for %d:%d:%d\n", - p.busnum, p.devnum, p.funcnum); - p.irq = 0; - goto out; - } - if (pci_enable_device(dev) != 0) { - DRM_ERROR("pci_enable_device failed for %d:%d:%d\n", - p.busnum, p.devnum, p.funcnum); - p.irq = 0; - goto out; - } - p.irq = dev->irq; - out: - DRM_DEBUG("%d:%d:%d => IRQ %d\n", - p.busnum, p.devnum, p.funcnum, p.irq); - if (copy_to_user((drm_irq_busid_t *)arg, &p, sizeof(p))) - return -EFAULT; - return 0; -} +#include "linux/pci.h" /** * Get the bus id. @@ -138,8 +76,10 @@ int DRM(getunique)(struct inode *inode, * \param arg user argument, pointing to a drm_unique structure. * \return zero on success or a negative number on failure. 
* - * Copies the bus id from userspace into drm_device::unique, and searches for - * the respective PCI device, updating drm_device::pdev. + * Copies the bus id from userspace into drm_device::unique, and verifies that + * it matches the device this DRM is attached to (EINVAL otherwise). Deprecated + * in interface version 1.1 and will return EBUSY when setversion has requested + * version 1.1 or greater. */ int DRM(setunique)(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) @@ -147,6 +87,7 @@ int DRM(setunique)(struct inode *inode, drm_file_t *priv = filp->private_data; drm_device_t *dev = priv->dev; drm_unique_t u; + int domain, bus, slot, func, ret; if (dev->unique_len || dev->unique) return -EBUSY; @@ -164,55 +105,42 @@ int DRM(setunique)(struct inode *inode, dev->devname = DRM(alloc)(strlen(dev->name) + strlen(dev->unique) + 2, DRM_MEM_DRIVER); - if(!dev->devname) { - DRM(free)(dev->devname, sizeof(*dev->devname), DRM_MEM_DRIVER); + if (!dev->devname) return -ENOMEM; - } + sprintf(dev->devname, "%s@%s", dev->name, dev->unique); - do { - struct pci_dev *pci_dev; - int domain, b, d, f; - char *p; - - for(p = dev->unique; p && *p && *p != ':'; p++); - if (!p || !*p) break; - b = (int)simple_strtoul(p+1, &p, 10); - if (*p != ':') break; - d = (int)simple_strtoul(p+1, &p, 10); - if (*p != ':') break; - f = (int)simple_strtoul(p+1, &p, 10); - if (*p) break; - - domain = b >> 8; - b &= 0xff; - -#ifdef __alpha__ - /* - * Find the hose the device is on (the domain number is the - * hose index) and offset the bus by the root bus of that - * hose. - */ - for(pci_dev = pci_find_device(PCI_ANY_ID,PCI_ANY_ID,NULL); - pci_dev; - pci_dev = pci_find_device(PCI_ANY_ID,PCI_ANY_ID,pci_dev)) { - struct pci_controller *hose = pci_dev->sysdata; - - if (hose->index == domain) { - b += hose->bus->number; - break; - } - } -#endif + /* Return error if the busid submitted doesn't match the device's actual + * busid. 
+ */ + ret = sscanf(dev->unique, "PCI:%d:%d:%d", &bus, &slot, &func); + if (ret != 3) + return DRM_ERR(EINVAL); + domain = bus >> 8; + bus &= 0xff; + + if ((domain != dev->pci_domain) || + (bus != dev->pci_bus) || + (slot != dev->pci_slot) || + (func != dev->pci_func)) + return -EINVAL; - pci_dev = pci_find_slot(b, PCI_DEVFN(d,f)); - if (pci_dev) { - dev->pdev = pci_dev; -#ifdef __alpha__ - dev->hose = pci_dev->sysdata; -#endif - } - } while(0); + return 0; +} + +static int +DRM(set_busid)(drm_device_t *dev) +{ + if (dev->unique != NULL) + return EBUSY; + + dev->unique_len = 20; + dev->unique = DRM(alloc)(dev->unique_len + 1, DRM_MEM_DRIVER); + if (dev->unique == NULL) + return ENOMEM; + + snprintf(dev->unique, dev->unique_len, "pci:%04x:%02x:%02x.%d", + dev->pci_domain, dev->pci_bus, dev->pci_slot, dev->pci_func); return 0; } @@ -363,3 +291,47 @@ int DRM(getstats)( struct inode *inode, return -EFAULT; return 0; } + +#define DRM_IF_MAJOR 1 +#define DRM_IF_MINOR 2 + +int DRM(setversion)(DRM_IOCTL_ARGS) +{ + DRM_DEVICE; + drm_set_version_t sv; + drm_set_version_t retv; + int if_version; + + DRM_COPY_FROM_USER_IOCTL(sv, (drm_set_version_t *)data, sizeof(sv)); + + retv.drm_di_major = DRM_IF_MAJOR; + retv.drm_di_minor = DRM_IF_MINOR; + retv.drm_dd_major = DRIVER_MAJOR; + retv.drm_dd_minor = DRIVER_MINOR; + + DRM_COPY_TO_USER_IOCTL((drm_set_version_t *)data, retv, sizeof(sv)); + + if (sv.drm_di_major != -1) { + if (sv.drm_di_major != DRM_IF_MAJOR || + sv.drm_di_minor < 0 || sv.drm_di_minor > DRM_IF_MINOR) + return EINVAL; + if_version = DRM_IF_VERSION(sv.drm_di_major, sv.drm_dd_minor); + dev->if_version = DRM_MAX(if_version, dev->if_version); + if (sv.drm_di_minor >= 1) { + /* + * Version 1.1 includes tying of DRM to specific device + */ + DRM(set_busid)(dev); + } + } + + if (sv.drm_dd_major != -1) { + if (sv.drm_dd_major != DRIVER_MAJOR || + sv.drm_dd_minor < 0 || sv.drm_dd_minor > DRIVER_MINOR) + return EINVAL; +#ifdef DRIVER_SETVERSION + DRIVER_SETVERSION(dev, &sv); +#endif + } + return 0; +} --- diff/drivers/char/drm/drm_memory_debug.h 2004-04-05 12:57:07.000000000 +0100 +++ source/drivers/char/drm/drm_memory_debug.h 2004-04-21 10:46:51.402767704 +0100 @@ -67,6 +67,7 @@ static drm_mem_stats_t DRM(mem_stats)[ [DRM_MEM_TOTALAGP] = { "totalagp" }, [DRM_MEM_BOUNDAGP] = { "boundagp" }, [DRM_MEM_CTXBITMAP] = { "ctxbitmap"}, + [DRM_MEM_CTXLIST] = { "ctxlist" }, [DRM_MEM_STUB] = { "stub" }, { NULL, 0, } /* Last entry must be null */ }; --- diff/drivers/char/drm/drm_os_linux.h 2003-09-30 15:46:12.000000000 +0100 +++ source/drivers/char/drm/drm_os_linux.h 2004-04-21 10:46:51.402767704 +0100 @@ -62,8 +62,12 @@ verify_area( VERIFY_READ, uaddr, size ) #define DRM_COPY_FROM_USER_UNCHECKED(arg1, arg2, arg3) \ __copy_from_user(arg1, arg2, arg3) +#define DRM_COPY_TO_USER_UNCHECKED(arg1, arg2, arg3) \ + __copy_to_user(arg1, arg2, arg3) #define DRM_GET_USER_UNCHECKED(val, uaddr) \ __get_user(val, uaddr) +#define DRM_PUT_USER_UNCHECKED(uaddr, val) \ + __put_user(val, uaddr) /** 'malloc' without the overhead of DRM(alloc)() */ @@ -71,6 +75,8 @@ /** 'free' without the overhead of DRM(free)() */ #define DRM_FREE(x,size) kfree(x) +#define DRM_GET_PRIV_WITH_RETURN(_priv, _filp) _priv = _filp->private_data + /** * Get the pointer to the SAREA. 
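
Note the two bus id formats in play above: DRM(set_busid)() now generates the canonical id inside the kernel once interface 1.1 is requested, while DRM(setunique)() only verifies a legacy id supplied by userspace against the device (with the domain folded into the high bits of the bus number). A small illustration with hypothetical values:

    #include <stdio.h>

    int main(void)
    {
            int domain = 0, bus = 1, slot = 0, func = 0;
            char unique[32];

            /* Format written by DRM(set_busid)() for interface >= 1.1. */
            snprintf(unique, sizeof(unique), "pci:%04x:%02x:%02x.%d",
                     domain, bus, slot, func);
            puts(unique);   /* prints "pci:0000:01:00.0" */

            /* Legacy format parsed by DRM(setunique)(). */
            printf("PCI:%d:%d:%d\n", (domain << 8) | bus, slot, func);
            return 0;
    }
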
* --- diff/drivers/char/drm/drm_sarea.h 2003-07-22 18:54:27.000000000 +0100 +++ source/drivers/char/drm/drm_sarea.h 2004-04-21 10:46:51.405767248 +0100 @@ -32,9 +32,23 @@ #ifndef _DRM_SAREA_H_ #define _DRM_SAREA_H_ +#include "drm.h" + +/* SAREA area needs to be at least a page */ +#if defined(__alpha__) +#define SAREA_MAX 0x2000 +#elif defined(__ia64__) +#define SAREA_MAX 0x10000 /* 64kB */ +#else +/* Intel 830M driver needs at least 8k SAREA */ +#define SAREA_MAX 0x2000 +#endif + /** Maximum number of drawables in the SAREA */ #define SAREA_MAX_DRAWABLES 256 +#define SAREA_DRAWABLE_CLAIMED_ENTRY 0x80000000 + /** SAREA drawable */ typedef struct drm_sarea_drawable { unsigned int stamp; --- diff/drivers/char/drm/drm_stub.h 2004-04-05 12:57:07.000000000 +0100 +++ source/drivers/char/drm/drm_stub.h 2004-04-21 10:46:51.405767248 +0100 @@ -209,8 +209,8 @@ int DRM(stub_register)(const char *name, ret2 = DRM(stub_info).info_register(name, fops, dev); if (ret2) { if (!ret1) { - unregister_chrdev(DRM_MAJOR, "drm"); - class_simple_destroy(drm_class); + unregister_chrdev(DRM_MAJOR, "drm"); + class_simple_destroy(drm_class); } if (!i) inter_module_unregister("drm"); --- diff/drivers/char/drm/drm_vm.h 2004-01-19 10:22:56.000000000 +0000 +++ source/drivers/char/drm/drm_vm.h 2004-04-21 10:46:51.406767096 +0100 @@ -35,48 +35,19 @@ #include "drmP.h" -/** AGP virtual memory operations */ -struct vm_operations_struct DRM(vm_ops) = { - .nopage = DRM(vm_nopage), - .open = DRM(vm_open), - .close = DRM(vm_close), -}; - -/** Shared virtual memory operations */ -struct vm_operations_struct DRM(vm_shm_ops) = { - .nopage = DRM(vm_shm_nopage), - .open = DRM(vm_open), - .close = DRM(vm_shm_close), -}; - -/** DMA virtual memory operations */ -struct vm_operations_struct DRM(vm_dma_ops) = { - .nopage = DRM(vm_dma_nopage), - .open = DRM(vm_open), - .close = DRM(vm_close), -}; - -/** Scatter-gather virtual memory operations */ -struct vm_operations_struct DRM(vm_sg_ops) = { - .nopage = DRM(vm_sg_nopage), - .open = DRM(vm_open), - .close = DRM(vm_close), -}; /** * \c nopage method for AGP virtual memory. * * \param vma virtual memory area. * \param address access address. - * \param write_access sharing. * \return pointer to the page structure. * * Find the right map and if it's AGP memory find the real physical page to * map, get the page, increment the use count and return it. */ -struct page *DRM(vm_nopage)(struct vm_area_struct *vma, - unsigned long address, - int *type) +static __inline__ struct page *DRM(do_vm_nopage)(struct vm_area_struct *vma, + unsigned long address) { #if __REALLY_HAVE_AGP drm_file_t *priv = vma->vm_file->private_data; @@ -133,8 +104,6 @@ struct page *DRM(vm_nopage)(struct vm_ar baddr, __va(agpmem->memory->memory[offset]), offset, atomic_read(&page->count)); - if (type) - *type = VM_FAULT_MINOR; return page; } vm_nopage_error: @@ -148,15 +117,13 @@ vm_nopage_error: * * \param vma virtual memory area. * \param address access address. - * \param write_access sharing. * \return pointer to the page structure. * * Get the the mapping, find the real physical page to map, get the page, and * return it. 
*/ -struct page *DRM(vm_shm_nopage)(struct vm_area_struct *vma, - unsigned long address, - int *type) +static __inline__ struct page *DRM(do_vm_shm_nopage)(struct vm_area_struct *vma, + unsigned long address) { drm_map_t *map = (drm_map_t *)vma->vm_private_data; unsigned long offset; @@ -172,8 +139,6 @@ struct page *DRM(vm_shm_nopage)(struct v if (!page) return NOPAGE_OOM; get_page(page); - if (type) - *type = VM_FAULT_MINOR; DRM_DEBUG("shm_nopage 0x%lx\n", address); return page; @@ -265,14 +230,12 @@ void DRM(vm_shm_close)(struct vm_area_st * * \param vma virtual memory area. * \param address access address. - * \param write_access sharing. * \return pointer to the page structure. * * Determine the page number from the page offset and get it from drm_device_dma::pagelist. */ -struct page *DRM(vm_dma_nopage)(struct vm_area_struct *vma, - unsigned long address, - int *type) +static __inline__ struct page *DRM(do_vm_dma_nopage)(struct vm_area_struct *vma, + unsigned long address) { drm_file_t *priv = vma->vm_file->private_data; drm_device_t *dev = priv->dev; @@ -291,8 +254,6 @@ struct page *DRM(vm_dma_nopage)(struct v (offset & (~PAGE_MASK)))); get_page(page); - if (type) - *type = VM_FAULT_MINOR; DRM_DEBUG("dma_nopage 0x%lx (page %lu)\n", address, page_nr); return page; @@ -303,14 +264,12 @@ struct page *DRM(vm_dma_nopage)(struct v * * \param vma virtual memory area. * \param address access address. - * \param write_access sharing. * \return pointer to the page structure. * * Determine the map offset from the page offset and get it from drm_sg_mem::pagelist. */ -struct page *DRM(vm_sg_nopage)(struct vm_area_struct *vma, - unsigned long address, - int *type) +static __inline__ struct page *DRM(do_vm_sg_nopage)(struct vm_area_struct *vma, + unsigned long address) { drm_map_t *map = (drm_map_t *)vma->vm_private_data; drm_file_t *priv = vma->vm_file->private_data; @@ -331,12 +290,99 @@ struct page *DRM(vm_sg_nopage)(struct vm page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT); page = entry->pagelist[page_offset]; get_page(page); - if (type) - *type = VM_FAULT_MINOR; return page; } + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) + +static struct page *DRM(vm_nopage)(struct vm_area_struct *vma, + unsigned long address, + int *type) { + if (type) *type = VM_FAULT_MINOR; + return DRM(do_vm_nopage)(vma, address); +} + +static struct page *DRM(vm_shm_nopage)(struct vm_area_struct *vma, + unsigned long address, + int *type) { + if (type) *type = VM_FAULT_MINOR; + return DRM(do_vm_shm_nopage)(vma, address); +} + +static struct page *DRM(vm_dma_nopage)(struct vm_area_struct *vma, + unsigned long address, + int *type) { + if (type) *type = VM_FAULT_MINOR; + return DRM(do_vm_dma_nopage)(vma, address); +} + +static struct page *DRM(vm_sg_nopage)(struct vm_area_struct *vma, + unsigned long address, + int *type) { + if (type) *type = VM_FAULT_MINOR; + return DRM(do_vm_sg_nopage)(vma, address); +} + +#else /* LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,0) */ + +static struct page *DRM(vm_nopage)(struct vm_area_struct *vma, + unsigned long address, + int unused) { + return DRM(do_vm_nopage)(vma, address); +} + +static struct page *DRM(vm_shm_nopage)(struct vm_area_struct *vma, + unsigned long address, + int unused) { + return DRM(do_vm_shm_nopage)(vma, address); +} + +static struct page *DRM(vm_dma_nopage)(struct vm_area_struct *vma, + unsigned long address, + int unused) { + return DRM(do_vm_dma_nopage)(vma, address); +} + +static struct page *DRM(vm_sg_nopage)(struct vm_area_struct *vma, + 
unsigned long address, + int unused) { + return DRM(do_vm_sg_nopage)(vma, address); +} + +#endif + + +/** AGP virtual memory operations */ +static struct vm_operations_struct DRM(vm_ops) = { + .nopage = DRM(vm_nopage), + .open = DRM(vm_open), + .close = DRM(vm_close), +}; + +/** Shared virtual memory operations */ +static struct vm_operations_struct DRM(vm_shm_ops) = { + .nopage = DRM(vm_shm_nopage), + .open = DRM(vm_open), + .close = DRM(vm_shm_close), +}; + +/** DMA virtual memory operations */ +static struct vm_operations_struct DRM(vm_dma_ops) = { + .nopage = DRM(vm_dma_nopage), + .open = DRM(vm_open), + .close = DRM(vm_close), +}; + +/** Scatter-gather virtual memory operations */ +static struct vm_operations_struct DRM(vm_sg_ops) = { + .nopage = DRM(vm_sg_nopage), + .open = DRM(vm_open), + .close = DRM(vm_close), +}; + + /** * \c open method for shared virtual memory. * --- diff/drivers/char/drm/gamma.h 2003-05-21 11:50:14.000000000 +0100 +++ source/drivers/char/drm/gamma.h 2004-04-21 10:46:51.408766792 +0100 @@ -53,6 +53,10 @@ [DRM_IOCTL_NR(DRM_IOCTL_GAMMA_INIT)] = { gamma_dma_init, 1, 1 }, \ [DRM_IOCTL_NR(DRM_IOCTL_GAMMA_COPY)] = { gamma_dma_copy, 1, 1 } +#define DRIVER_PCI_IDS \ + {0x3d3d, 0x0008, 0}, \ + {0, 0, 0} + #define IOCTL_TABLE_NAME DRM(ioctls) #define IOCTL_FUNC_NAME DRM(ioctl) @@ -104,8 +108,8 @@ return 0; \ } while (0) -#define __HAVE_DMA_IRQ 1 -#define __HAVE_DMA_IRQ_BH 1 +#define __HAVE_IRQ 1 +#define __HAVE_IRQ_BH 1 #define DRIVER_AGP_BUFFERS_MAP( dev ) \ ((drm_gamma_private_t *)((dev)->dev_private))->buffers --- diff/drivers/char/drm/gamma_dma.c 2003-09-30 15:46:12.000000000 +0100 +++ source/drivers/char/drm/gamma_dma.c 2004-04-21 10:46:51.408766792 +0100 @@ -116,7 +116,7 @@ static inline int gamma_dma_is_ready(drm return (!GAMMA_READ(GAMMA_DMACOUNT)); } -irqreturn_t gamma_dma_service( DRM_IRQ_ARGS ) +irqreturn_t gamma_irq_handler( DRM_IRQ_ARGS ) { drm_device_t *dev = (drm_device_t *)arg; drm_device_dma_t *dma = dev->dma; @@ -262,7 +262,7 @@ static void gamma_dma_timer_bh(unsigned gamma_dma_schedule((drm_device_t *)dev, 0); } -void gamma_dma_immediate_bh(void *dev) +void gamma_irq_immediate_bh(void *dev) { gamma_dma_schedule(dev, 0); } @@ -656,12 +656,12 @@ int gamma_do_cleanup_dma( drm_device_t * { DRM_DEBUG( "%s\n", __FUNCTION__ ); -#if _HAVE_DMA_IRQ +#if __HAVE_IRQ /* Make sure interrupts are disabled here because the uninstall ioctl * may not have been called from userspace and after dev_private * is freed, it's too late. 
*/ - if ( dev->irq ) DRM(irq_uninstall)(dev); + if ( dev->irq_enabled ) DRM(irq_uninstall)(dev); #endif if ( dev->dev_private ) { --- diff/drivers/char/drm/gamma_drv.c 2003-05-21 11:50:14.000000000 +0100 +++ source/drivers/char/drm/gamma_drv.c 2004-04-21 10:46:51.408766792 +0100 @@ -48,6 +48,7 @@ #include "drm_fops.h" #include "drm_init.h" #include "drm_ioctl.h" +#include "drm_irq.h" #include "gamma_lists.h" /* NOTE */ #include "drm_lock.h" #include "gamma_lock.h" /* NOTE */ --- diff/drivers/char/drm/i810.h 2003-08-26 10:00:52.000000000 +0100 +++ source/drivers/char/drm/i810.h 2004-04-21 10:46:51.410766488 +0100 @@ -77,7 +77,14 @@ [DRM_IOCTL_NR(DRM_IOCTL_I810_MC)] = { i810_dma_mc, 1, 1 }, \ [DRM_IOCTL_NR(DRM_IOCTL_I810_RSTATUS)] = { i810_rstatus, 1, 0 }, \ [DRM_IOCTL_NR(DRM_IOCTL_I810_FLIP)] = { i810_flip_bufs, 1, 0 } - + +#define DRIVER_PCI_IDS \ + {0x8086, 0x7121, 0}, \ + {0x8086, 0x7123, 0}, \ + {0x8086, 0x7125, 0}, \ + {0x8086, 0x1132, 0}, \ + {0, 0, 0} + #define __HAVE_COUNTERS 4 #define __HAVE_COUNTER6 _DRM_STAT_IRQ @@ -112,7 +119,7 @@ * a noop stub is generated for compatibility. */ /* XXX: Add vblank support? */ -#define __HAVE_DMA_IRQ 0 +#define __HAVE_IRQ 0 /* Buffer customization: */ --- diff/drivers/char/drm/i810_dma.c 2004-04-21 10:45:34.033529616 +0100 +++ source/drivers/char/drm/i810_dma.c 2004-04-21 10:46:51.409766640 +0100 @@ -232,12 +232,12 @@ int i810_dma_cleanup(drm_device_t *dev) { drm_device_dma_t *dma = dev->dma; -#if _HAVE_DMA_IRQ +#if __HAVE_IRQ /* Make sure interrupts are disabled here because the uninstall ioctl * may not have been called from userspace and after dev_private * is freed, it's too late. */ - if (dev->irq) DRM(irq_uninstall)(dev); + if ( dev->irq_enabled ) DRM(irq_uninstall)(dev); #endif if (dev->dev_private) { @@ -1275,6 +1275,9 @@ int i810_dma_mc(struct inode *inode, str return -EINVAL; } + if (mc.idx >= dma->buf_count || mc.idx < 0) + return -EINVAL; + i810_dma_dispatch_mc(dev, dma->buflist[mc.idx], mc.used, mc.last_render ); --- diff/drivers/char/drm/i830.h 2003-05-21 11:50:14.000000000 +0100 +++ source/drivers/char/drm/i830.h 2004-04-21 10:46:51.411766336 +0100 @@ -77,6 +77,13 @@ [DRM_IOCTL_NR(DRM_IOCTL_I830_GETPARAM)] = { i830_getparam, 1, 0 }, \ [DRM_IOCTL_NR(DRM_IOCTL_I830_SETPARAM)] = { i830_setparam, 1, 0 } +#define DRIVER_PCI_IDS \ + {0x8086, 0x3577, 0}, \ + {0x8086, 0x2562, 0}, \ + {0x8086, 0x3582, 0}, \ + {0x8086, 0x2572, 0}, \ + {0, 0, 0} + #define __HAVE_COUNTERS 4 #define __HAVE_COUNTER6 _DRM_STAT_IRQ #define __HAVE_COUNTER7 _DRM_STAT_PRIMARY @@ -115,10 +122,10 @@ #define USE_IRQS 0 #if USE_IRQS -#define __HAVE_DMA_IRQ 1 +#define __HAVE_IRQ 1 #define __HAVE_SHARED_IRQ 1 #else -#define __HAVE_DMA_IRQ 0 +#define __HAVE_IRQ 0 #endif --- diff/drivers/char/drm/i830_dma.c 2004-04-21 10:45:34.043528096 +0100 +++ source/drivers/char/drm/i830_dma.c 2004-04-21 10:46:51.410766488 +0100 @@ -232,12 +232,12 @@ int i830_dma_cleanup(drm_device_t *dev) { drm_device_dma_t *dma = dev->dma; -#if _HAVE_DMA_IRQ +#if __HAVE_IRQ /* Make sure interrupts are disabled here because the uninstall ioctl * may not have been called from userspace and after dev_private * is freed, it's too late. */ - if (dev->irq) DRM(irq_uninstall)(dev); + if ( dev->irq_enabled ) DRM(irq_uninstall)(dev); #endif if (dev->dev_private) { @@ -1540,7 +1540,7 @@ int i830_getparam( struct inode *inode, switch( param.param ) { case I830_PARAM_IRQ_ACTIVE: - value = dev->irq ? 
1 : 0; + value = dev->irq_enabled; break; default: return -EINVAL; --- diff/drivers/char/drm/i830_drv.c 2003-05-21 11:50:14.000000000 +0100 +++ source/drivers/char/drm/i830_drv.c 2004-04-21 10:46:51.411766336 +0100 @@ -50,6 +50,7 @@ #include "drm_fops.h" #include "drm_init.h" #include "drm_ioctl.h" +#include "drm_irq.h" #include "drm_lock.h" #include "drm_memory.h" #include "drm_proc.h" --- diff/drivers/char/drm/i830_irq.c 2004-04-21 10:45:34.053526576 +0100 +++ source/drivers/char/drm/i830_irq.c 2004-04-21 10:46:51.412766184 +0100 @@ -35,7 +35,7 @@ #include -irqreturn_t DRM(dma_service)( DRM_IRQ_ARGS ) +irqreturn_t DRM(irq_handler)( DRM_IRQ_ARGS ) { drm_device_t *dev = (drm_device_t *)arg; drm_i830_private_t *dev_priv = (drm_i830_private_t *)dev->dev_private; --- diff/drivers/char/drm/mga.h 2003-05-21 11:50:14.000000000 +0100 +++ source/drivers/char/drm/mga.h 2004-04-21 10:46:51.425764208 +0100 @@ -64,6 +64,12 @@ [DRM_IOCTL_NR(DRM_IOCTL_MGA_BLIT)] = { mga_dma_blit, 1, 0 }, \ [DRM_IOCTL_NR(DRM_IOCTL_MGA_GETPARAM)]= { mga_getparam, 1, 0 }, +#define DRIVER_PCI_IDS \ + {0x102b, 0x0521, 0}, \ + {0x102b, 0x0525, 0}, \ + {0x102b, 0x2527, 0}, \ + {0, 0, 0} + #define __HAVE_COUNTERS 3 #define __HAVE_COUNTER6 _DRM_STAT_IRQ #define __HAVE_COUNTER7 _DRM_STAT_PRIMARY @@ -78,7 +84,7 @@ /* DMA customization: */ #define __HAVE_DMA 1 -#define __HAVE_DMA_IRQ 1 +#define __HAVE_IRQ 1 #define __HAVE_VBL_IRQ 1 #define __HAVE_SHARED_IRQ 1 --- diff/drivers/char/drm/mga_dma.c 2003-06-09 14:18:18.000000000 +0100 +++ source/drivers/char/drm/mga_dma.c 2004-04-21 10:46:51.422764664 +0100 @@ -500,14 +500,6 @@ static int mga_do_init_dma( drm_device_t return DRM_ERR(EINVAL); } - DRM_FIND_MAP( dev_priv->fb, init->fb_offset ); - if(!dev_priv->fb) { - DRM_ERROR( "failed to find framebuffer!\n" ); - /* Assign dev_private so we can do cleanup. */ - dev->dev_private = (void *)dev_priv; - mga_do_cleanup_dma( dev ); - return DRM_ERR(EINVAL); - } DRM_FIND_MAP( dev_priv->mmio, init->mmio_offset ); if(!dev_priv->mmio) { DRM_ERROR( "failed to find mmio region!\n" ); @@ -639,12 +631,12 @@ int mga_do_cleanup_dma( drm_device_t *de { DRM_DEBUG( "\n" ); -#if _HAVE_DMA_IRQ +#if __HAVE_IRQ /* Make sure interrupts are disabled here because the uninstall ioctl * may not have been called from userspace and after dev_private * is freed, it's too late. */ - if ( dev->irq ) DRM(irq_uninstall)(dev); + if ( dev->irq_enabled ) DRM(irq_uninstall)(dev); #endif if ( dev->dev_private ) { --- diff/drivers/char/drm/mga_drm.h 2002-11-18 10:11:54.000000000 +0000 +++ source/drivers/char/drm/mga_drm.h 2004-04-21 10:46:51.423764512 +0100 @@ -117,6 +117,8 @@ #define MGA_NR_TEX_REGIONS 16 #define MGA_LOG_MIN_TEX_REGION_SIZE 16 +#define DRM_MGA_IDLE_RETRY 2048 + #endif /* __MGA_SAREA_DEFINES__ */ @@ -230,16 +232,27 @@ typedef struct _drm_mga_sarea { /* MGA specific ioctls * The device specific ioctl range is 0x40 to 0x79. 
*/ -#define DRM_IOCTL_MGA_INIT DRM_IOW( 0x40, drm_mga_init_t) -#define DRM_IOCTL_MGA_FLUSH DRM_IOW( 0x41, drm_lock_t) -#define DRM_IOCTL_MGA_RESET DRM_IO( 0x42) -#define DRM_IOCTL_MGA_SWAP DRM_IO( 0x43) -#define DRM_IOCTL_MGA_CLEAR DRM_IOW( 0x44, drm_mga_clear_t) -#define DRM_IOCTL_MGA_VERTEX DRM_IOW( 0x45, drm_mga_vertex_t) -#define DRM_IOCTL_MGA_INDICES DRM_IOW( 0x46, drm_mga_indices_t) -#define DRM_IOCTL_MGA_ILOAD DRM_IOW( 0x47, drm_mga_iload_t) -#define DRM_IOCTL_MGA_BLIT DRM_IOW( 0x48, drm_mga_blit_t) -#define DRM_IOCTL_MGA_GETPARAM DRM_IOWR(0x49, drm_mga_getparam_t) +#define DRM_MGA_INIT 0x00 +#define DRM_MGA_FLUSH 0x01 +#define DRM_MGA_RESET 0x02 +#define DRM_MGA_SWAP 0x03 +#define DRM_MGA_CLEAR 0x04 +#define DRM_MGA_VERTEX 0x05 +#define DRM_MGA_INDICES 0x06 +#define DRM_MGA_ILOAD 0x07 +#define DRM_MGA_BLIT 0x08 +#define DRM_MGA_GETPARAM 0x09 + +#define DRM_IOCTL_MGA_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_MGA_INIT, drm_mga_init_t) +#define DRM_IOCTL_MGA_FLUSH DRM_IOW( DRM_COMMAND_BASE + DRM_MGA_FLUSH, drm_lock_t) +#define DRM_IOCTL_MGA_RESET DRM_IO( DRM_COMMAND_BASE + DRM_MGA_RESET) +#define DRM_IOCTL_MGA_SWAP DRM_IO( DRM_COMMAND_BASE + DRM_MGA_SWAP) +#define DRM_IOCTL_MGA_CLEAR DRM_IOW( DRM_COMMAND_BASE + DRM_MGA_CLEAR, drm_mga_clear_t) +#define DRM_IOCTL_MGA_VERTEX DRM_IOW( DRM_COMMAND_BASE + DRM_MGA_VERTEX, drm_mga_vertex_t) +#define DRM_IOCTL_MGA_INDICES DRM_IOW( DRM_COMMAND_BASE + DRM_MGA_INDICES, drm_mga_indices_t) +#define DRM_IOCTL_MGA_ILOAD DRM_IOW( DRM_COMMAND_BASE + DRM_MGA_ILOAD, drm_mga_iload_t) +#define DRM_IOCTL_MGA_BLIT DRM_IOW( DRM_COMMAND_BASE + DRM_MGA_BLIT, drm_mga_blit_t) +#define DRM_IOCTL_MGA_GETPARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_MGA_GETPARAM, drm_mga_getparam_t) typedef struct _drm_mga_warp_index { int installed; @@ -330,7 +343,7 @@ typedef struct _drm_mga_blit { typedef struct drm_mga_getparam { int param; - int *value; + void *value; } drm_mga_getparam_t; #endif --- diff/drivers/char/drm/mga_drv.c 2002-10-16 04:27:56.000000000 +0100 +++ source/drivers/char/drm/mga_drv.c 2004-04-21 10:46:51.424764360 +0100 @@ -45,6 +45,7 @@ #include "drm_fops.h" #include "drm_init.h" #include "drm_ioctl.h" +#include "drm_irq.h" #include "drm_lock.h" #include "drm_memory.h" #include "drm_proc.h" --- diff/drivers/char/drm/mga_drv.h 2003-06-09 14:18:18.000000000 +0100 +++ source/drivers/char/drm/mga_drv.h 2004-04-21 10:46:51.425764208 +0100 @@ -91,7 +91,6 @@ typedef struct drm_mga_private { unsigned int texture_size; drm_local_map_t *sarea; - drm_local_map_t *fb; drm_local_map_t *mmio; drm_local_map_t *status; drm_local_map_t *warp; --- diff/drivers/char/drm/mga_irq.c 2003-05-21 11:50:14.000000000 +0100 +++ source/drivers/char/drm/mga_irq.c 2004-04-21 10:46:51.425764208 +0100 @@ -36,7 +36,7 @@ #include "mga_drm.h" #include "mga_drv.h" -irqreturn_t mga_dma_service( DRM_IRQ_ARGS ) +irqreturn_t mga_irq_handler( DRM_IRQ_ARGS ) { drm_device_t *dev = (drm_device_t *) arg; drm_mga_private_t *dev_priv = --- diff/drivers/char/drm/r128.h 2003-08-26 10:00:52.000000000 +0100 +++ source/drivers/char/drm/r128.h 2004-04-21 10:46:51.441761776 +0100 @@ -79,6 +79,46 @@ [DRM_IOCTL_NR(DRM_IOCTL_R128_INDIRECT)] = { r128_cce_indirect, 1, 1 }, \ [DRM_IOCTL_NR(DRM_IOCTL_R128_GETPARAM)] = { r128_getparam, 1, 0 }, +#define DRIVER_PCI_IDS \ + {0x1002, 0x4c45, 0}, \ + {0x1002, 0x4c46, 0}, \ + {0x1002, 0x4d46, 0}, \ + {0x1002, 0x4d4c, 0}, \ + {0x1002, 0x5041, 0}, \ + {0x1002, 0x5042, 0}, \ + {0x1002, 0x5043, 0}, \ + {0x1002, 0x5044, 0}, \ + {0x1002, 0x5045, 0}, \ + {0x1002, 0x5046, 0}, \ + {0x1002, 
0x5047, 0}, \ + {0x1002, 0x5048, 0}, \ + {0x1002, 0x5049, 0}, \ + {0x1002, 0x504A, 0}, \ + {0x1002, 0x504B, 0}, \ + {0x1002, 0x504C, 0}, \ + {0x1002, 0x504D, 0}, \ + {0x1002, 0x504E, 0}, \ + {0x1002, 0x504F, 0}, \ + {0x1002, 0x5050, 0}, \ + {0x1002, 0x5051, 0}, \ + {0x1002, 0x5052, 0}, \ + {0x1002, 0x5053, 0}, \ + {0x1002, 0x5054, 0}, \ + {0x1002, 0x5055, 0}, \ + {0x1002, 0x5056, 0}, \ + {0x1002, 0x5057, 0}, \ + {0x1002, 0x5058, 0}, \ + {0x1002, 0x5245, 0}, \ + {0x1002, 0x5246, 0}, \ + {0x1002, 0x5247, 0}, \ + {0x1002, 0x524b, 0}, \ + {0x1002, 0x524c, 0}, \ + {0x1002, 0x534d, 0}, \ + {0x1002, 0x5446, 0}, \ + {0x1002, 0x544C, 0}, \ + {0x1002, 0x5452, 0}, \ + {0, 0, 0} + /* Driver customization: */ #define DRIVER_PRERELEASE() do { \ @@ -97,7 +137,7 @@ /* DMA customization: */ #define __HAVE_DMA 1 -#define __HAVE_DMA_IRQ 1 +#define __HAVE_IRQ 1 #define __HAVE_VBL_IRQ 1 #define __HAVE_SHARED_IRQ 1 --- diff/drivers/char/drm/r128_cce.c 2003-09-30 15:46:12.000000000 +0100 +++ source/drivers/char/drm/r128_cce.c 2004-04-21 10:46:51.439762080 +0100 @@ -212,7 +212,7 @@ int r128_do_cce_idle( drm_r128_private_t int i; for ( i = 0 ; i < dev_priv->usec_timeout ; i++ ) { - if ( GET_RING_HEAD( &dev_priv->ring ) == dev_priv->ring.tail ) { + if ( GET_RING_HEAD( dev_priv ) == dev_priv->ring.tail ) { int pm4stat = R128_READ( R128_PM4_STAT ); if ( ( (pm4stat & R128_PM4_FIFOCNT_MASK) >= dev_priv->cce_fifo_size ) && @@ -238,7 +238,8 @@ static void r128_do_cce_start( drm_r128_ r128_do_wait_for_idle( dev_priv ); R128_WRITE( R128_PM4_BUFFER_CNTL, - dev_priv->cce_mode | dev_priv->ring.size_l2qw ); + dev_priv->cce_mode | dev_priv->ring.size_l2qw + | R128_PM4_BUFFER_CNTL_NOUPDATE ); R128_READ( R128_PM4_BUFFER_ADDR ); /* as per the sample code */ R128_WRITE( R128_PM4_MICRO_CNTL, R128_PM4_MICRO_FREERUN ); @@ -253,7 +254,6 @@ static void r128_do_cce_reset( drm_r128_ { R128_WRITE( R128_PM4_BUFFER_DL_WPTR, 0 ); R128_WRITE( R128_PM4_BUFFER_DL_RPTR, 0 ); - SET_RING_HEAD( &dev_priv->ring, 0 ); dev_priv->ring.tail = 0; } @@ -264,7 +264,8 @@ static void r128_do_cce_reset( drm_r128_ static void r128_do_cce_stop( drm_r128_private_t *dev_priv ) { R128_WRITE( R128_PM4_MICRO_CNTL, 0 ); - R128_WRITE( R128_PM4_BUFFER_CNTL, R128_PM4_NONPM4 ); + R128_WRITE( R128_PM4_BUFFER_CNTL, + R128_PM4_NONPM4 | R128_PM4_BUFFER_CNTL_NOUPDATE ); dev_priv->cce_running = 0; } @@ -333,26 +334,6 @@ static void r128_cce_init_ring_buffer( d R128_WRITE( R128_PM4_BUFFER_DL_WPTR, 0 ); R128_WRITE( R128_PM4_BUFFER_DL_RPTR, 0 ); - /* DL_RPTR_ADDR is a physical address in AGP space. 
*/ - SET_RING_HEAD( &dev_priv->ring, 0 ); - - if ( !dev_priv->is_pci ) { - R128_WRITE( R128_PM4_BUFFER_DL_RPTR_ADDR, - dev_priv->ring_rptr->offset ); - } else { - drm_sg_mem_t *entry = dev->sg; - unsigned long tmp_ofs, page_ofs; - - tmp_ofs = dev_priv->ring_rptr->offset - dev->sg->handle; - page_ofs = tmp_ofs >> PAGE_SHIFT; - - R128_WRITE( R128_PM4_BUFFER_DL_RPTR_ADDR, - entry->busaddr[page_ofs]); - DRM_DEBUG( "ring rptr: offset=0x%08lx handle=0x%08lx\n", - (unsigned long) entry->busaddr[page_ofs], - entry->handle + tmp_ofs ); - } - /* Set watermark control */ R128_WRITE( R128_PM4_BUFFER_WM_CNTL, ((R128_WATERMARK_L/4) << R128_WMA_SHIFT) @@ -486,13 +467,6 @@ static int r128_do_init_cce( drm_device_ return DRM_ERR(EINVAL); } - DRM_FIND_MAP( dev_priv->fb, init->fb_offset ); - if(!dev_priv->fb) { - DRM_ERROR("could not find framebuffer!\n"); - dev->dev_private = (void *)dev_priv; - r128_do_cleanup_cce( dev ); - return DRM_ERR(EINVAL); - } DRM_FIND_MAP( dev_priv->mmio, init->mmio_offset ); if(!dev_priv->mmio) { DRM_ERROR("could not find mmio region!\n"); @@ -567,9 +541,6 @@ static int r128_do_init_cce( drm_device_ #endif dev_priv->cce_buffers_offset = dev->sg->handle; - dev_priv->ring.head = ((__volatile__ u32 *) - dev_priv->ring_rptr->handle); - dev_priv->ring.start = (u32 *)dev_priv->cce_ring->handle; dev_priv->ring.end = ((u32 *)dev_priv->cce_ring->handle + init->ring_size / sizeof(u32)); @@ -580,7 +551,6 @@ static int r128_do_init_cce( drm_device_ (dev_priv->ring.size / sizeof(u32)) - 1; dev_priv->ring.high_mark = 128; - dev_priv->ring.ring_rptr = dev_priv->ring_rptr; dev_priv->sarea_priv->last_frame = 0; R128_WRITE( R128_LAST_FRAME_REG, dev_priv->sarea_priv->last_frame ); @@ -589,8 +559,9 @@ static int r128_do_init_cce( drm_device_ R128_WRITE( R128_LAST_DISPATCH_REG, dev_priv->sarea_priv->last_dispatch ); -#if __REALLY_HAVE_SG +#if __REALLY_HAVE_AGP if ( dev_priv->is_pci ) { +#endif if (!DRM(ati_pcigart_init)( dev, &dev_priv->phys_pci_gart, &dev_priv->bus_pci_gart) ) { DRM_ERROR( "failed to init PCI GART!\n" ); @@ -599,6 +570,7 @@ static int r128_do_init_cce( drm_device_ return DRM_ERR(ENOMEM); } R128_WRITE( R128_PCI_GART_PAGE, dev_priv->bus_pci_gart ); +#if __REALLY_HAVE_AGP } #endif @@ -615,12 +587,12 @@ static int r128_do_init_cce( drm_device_ int r128_do_cleanup_cce( drm_device_t *dev ) { -#if _HAVE_DMA_IRQ +#if __HAVE_IRQ /* Make sure interrupts are disabled here because the uninstall ioctl * may not have been called from userspace and after dev_private * is freed, it's too late. */ - if ( dev->irq ) DRM(irq_uninstall)(dev); + if ( dev->irq_enabled ) DRM(irq_uninstall)(dev); #endif if ( dev->dev_private ) { @@ -901,7 +873,7 @@ int r128_wait_ring( drm_r128_private_t * int i; for ( i = 0 ; i < dev_priv->usec_timeout ; i++ ) { - r128_update_ring_snapshot( ring ); + r128_update_ring_snapshot( dev_priv ); if ( ring->space >= n ) return 0; DRM_UDELAY( 1 ); --- diff/drivers/char/drm/r128_drm.h 2003-08-20 14:16:27.000000000 +0100 +++ source/drivers/char/drm/r128_drm.h 2004-04-21 10:46:51.439762080 +0100 @@ -176,24 +176,47 @@ typedef struct drm_r128_sarea { /* Rage 128 specific ioctls * The device specific ioctl range is 0x40 to 0x79. 
*/ -#define DRM_IOCTL_R128_INIT DRM_IOW( 0x40, drm_r128_init_t) -#define DRM_IOCTL_R128_CCE_START DRM_IO( 0x41) -#define DRM_IOCTL_R128_CCE_STOP DRM_IOW( 0x42, drm_r128_cce_stop_t) -#define DRM_IOCTL_R128_CCE_RESET DRM_IO( 0x43) -#define DRM_IOCTL_R128_CCE_IDLE DRM_IO( 0x44) -#define DRM_IOCTL_R128_RESET DRM_IO( 0x46) -#define DRM_IOCTL_R128_SWAP DRM_IO( 0x47) -#define DRM_IOCTL_R128_CLEAR DRM_IOW( 0x48, drm_r128_clear_t) -#define DRM_IOCTL_R128_VERTEX DRM_IOW( 0x49, drm_r128_vertex_t) -#define DRM_IOCTL_R128_INDICES DRM_IOW( 0x4a, drm_r128_indices_t) -#define DRM_IOCTL_R128_BLIT DRM_IOW( 0x4b, drm_r128_blit_t) -#define DRM_IOCTL_R128_DEPTH DRM_IOW( 0x4c, drm_r128_depth_t) -#define DRM_IOCTL_R128_STIPPLE DRM_IOW( 0x4d, drm_r128_stipple_t) -#define DRM_IOCTL_R128_INDIRECT DRM_IOWR(0x4f, drm_r128_indirect_t) -#define DRM_IOCTL_R128_FULLSCREEN DRM_IOW( 0x50, drm_r128_fullscreen_t) -#define DRM_IOCTL_R128_CLEAR2 DRM_IOW( 0x51, drm_r128_clear2_t) -#define DRM_IOCTL_R128_GETPARAM DRM_IOW( 0x52, drm_r128_getparam_t) -#define DRM_IOCTL_R128_FLIP DRM_IO( 0x53) +#define DRM_R128_INIT 0x00 +#define DRM_R128_CCE_START 0x01 +#define DRM_R128_CCE_STOP 0x02 +#define DRM_R128_CCE_RESET 0x03 +#define DRM_R128_CCE_IDLE 0x04 +/* 0x05 not used */ +#define DRM_R128_RESET 0x06 +#define DRM_R128_SWAP 0x07 +#define DRM_R128_CLEAR 0x08 +#define DRM_R128_VERTEX 0x09 +#define DRM_R128_INDICES 0x0a +#define DRM_R128_BLIT 0x0b +#define DRM_R128_DEPTH 0x0c +#define DRM_R128_STIPPLE 0x0d +/* 0x0e not used */ +#define DRM_R128_INDIRECT 0x0f +#define DRM_R128_FULLSCREEN 0x10 +#define DRM_R128_CLEAR2 0x11 +#define DRM_R128_GETPARAM 0x12 +#define DRM_R128_FLIP 0x13 + +#define DRM_IOCTL_R128_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_R128_INIT, drm_r128_init_t) +#define DRM_IOCTL_R128_CCE_START DRM_IO( DRM_COMMAND_BASE + DRM_R128_CCE_START) +#define DRM_IOCTL_R128_CCE_STOP DRM_IOW( DRM_COMMAND_BASE + DRM_R128_CCE_STOP, drm_r128_cce_stop_t) +#define DRM_IOCTL_R128_CCE_RESET DRM_IO( DRM_COMMAND_BASE + DRM_R128_CCE_RESET) +#define DRM_IOCTL_R128_CCE_IDLE DRM_IO( DRM_COMMAND_BASE + DRM_R128_CCE_IDLE) +/* 0x05 not used */ +#define DRM_IOCTL_R128_RESET DRM_IO( DRM_COMMAND_BASE + DRM_R128_RESET) +#define DRM_IOCTL_R128_SWAP DRM_IO( DRM_COMMAND_BASE + DRM_R128_SWAP) +#define DRM_IOCTL_R128_CLEAR DRM_IOW( DRM_COMMAND_BASE + DRM_R128_CLEAR, drm_r128_clear_t) +#define DRM_IOCTL_R128_VERTEX DRM_IOW( DRM_COMMAND_BASE + DRM_R128_VERTEX, drm_r128_vertex_t) +#define DRM_IOCTL_R128_INDICES DRM_IOW( DRM_COMMAND_BASE + DRM_R128_INDICES, drm_r128_indices_t) +#define DRM_IOCTL_R128_BLIT DRM_IOW( DRM_COMMAND_BASE + DRM_R128_BLIT, drm_r128_blit_t) +#define DRM_IOCTL_R128_DEPTH DRM_IOW( DRM_COMMAND_BASE + DRM_R128_DEPTH, drm_r128_depth_t) +#define DRM_IOCTL_R128_STIPPLE DRM_IOW( DRM_COMMAND_BASE + DRM_R128_STIPPLE, drm_r128_stipple_t) +/* 0x0e not used */ +#define DRM_IOCTL_R128_INDIRECT DRM_IOWR(DRM_COMMAND_BASE + DRM_R128_INDIRECT, drm_r128_indirect_t) +#define DRM_IOCTL_R128_FULLSCREEN DRM_IOW( DRM_COMMAND_BASE + DRM_R128_FULLSCREEN, drm_r128_fullscreen_t) +#define DRM_IOCTL_R128_CLEAR2 DRM_IOW( DRM_COMMAND_BASE + DRM_R128_CLEAR2, drm_r128_clear2_t) +#define DRM_IOCTL_R128_GETPARAM DRM_IOW( DRM_COMMAND_BASE + DRM_R128_GETPARAM, drm_r128_getparam_t) +#define DRM_IOCTL_R128_FLIP DRM_IO( DRM_COMMAND_BASE + DRM_R128_FLIP) typedef struct drm_r128_init { enum { @@ -316,7 +339,7 @@ typedef struct drm_r128_fullscreen { typedef struct drm_r128_getparam { int param; - int *value; + void *value; } drm_r128_getparam_t; #endif --- diff/drivers/char/drm/r128_drv.c 
2002-10-16 04:27:48.000000000 +0100 +++ source/drivers/char/drm/r128_drv.c 2004-04-21 10:46:51.440761928 +0100 @@ -47,6 +47,7 @@ #include "drm_fops.h" #include "drm_init.h" #include "drm_ioctl.h" +#include "drm_irq.h" #include "drm_lock.h" #include "drm_memory.h" #include "drm_proc.h" --- diff/drivers/char/drm/r128_drv.h 2003-08-20 14:16:27.000000000 +0100 +++ source/drivers/char/drm/r128_drv.h 2004-04-21 10:46:51.441761776 +0100 @@ -34,8 +34,7 @@ #ifndef __R128_DRV_H__ #define __R128_DRV_H__ -#define GET_RING_HEAD(ring) DRM_READ32( (ring)->ring_rptr, 0 ) /* (ring)->head */ -#define SET_RING_HEAD(ring,val) DRM_WRITE32( (ring)->ring_rptr, 0, (val) ) /* (ring)->head */ +#define GET_RING_HEAD(dev_priv) R128_READ( R128_PM4_BUFFER_DL_RPTR ) typedef struct drm_r128_freelist { unsigned int age; @@ -50,13 +49,11 @@ typedef struct drm_r128_ring_buffer { int size; int size_l2qw; - volatile u32 *head; u32 tail; u32 tail_mask; int space; int high_mark; - drm_local_map_t *ring_rptr; } drm_r128_ring_buffer_t; typedef struct drm_r128_private { @@ -100,7 +97,6 @@ typedef struct drm_r128_private { u32 span_pitch_offset_c; drm_local_map_t *sarea; - drm_local_map_t *fb; drm_local_map_t *mmio; drm_local_map_t *cce_ring; drm_local_map_t *ring_rptr; @@ -132,14 +128,6 @@ extern drm_buf_t *r128_freelist_get( drm extern int r128_wait_ring( drm_r128_private_t *dev_priv, int n ); -static __inline__ void -r128_update_ring_snapshot( drm_r128_ring_buffer_t *ring ) -{ - ring->space = (GET_RING_HEAD( ring ) - ring->tail) * sizeof(u32); - if ( ring->space <= 0 ) - ring->space += ring->size; -} - extern int r128_do_cce_idle( drm_r128_private_t *dev_priv ); extern int r128_do_cleanup_cce( drm_device_t *dev ); extern int r128_do_cleanup_pageflip( drm_device_t *dev ); @@ -279,6 +267,7 @@ extern int r128_cce_indirect( DRM_IOCTL_ # define R128_PM4_64PIO_64VCBM_64INDBM (7 << 28) # define R128_PM4_64BM_64VCBM_64INDBM (8 << 28) # define R128_PM4_64PIO_64VCPIO_64INDPIO (15 << 28) +# define R128_PM4_BUFFER_CNTL_NOUPDATE (1 << 27) #define R128_PM4_BUFFER_WM_CNTL 0x0708 # define R128_WMA_SHIFT 0 @@ -403,6 +392,15 @@ extern int R128_READ_PLL(drm_device_t *d (pkt) | ((n) << 16)) +static __inline__ void +r128_update_ring_snapshot( drm_r128_private_t *dev_priv ) +{ + drm_r128_ring_buffer_t *ring = &dev_priv->ring; + ring->space = (GET_RING_HEAD( dev_priv ) - ring->tail) * sizeof(u32); + if ( ring->space <= 0 ) + ring->space += ring->size; +} + /* ================================================================ * Misc helper macros */ @@ -412,7 +410,7 @@ do { \ drm_r128_ring_buffer_t *ring = &dev_priv->ring; int i; \ if ( ring->space < ring->high_mark ) { \ for ( i = 0 ; i < dev_priv->usec_timeout ; i++ ) { \ - r128_update_ring_snapshot( ring ); \ + r128_update_ring_snapshot( dev_priv ); \ if ( ring->space >= ring->high_mark ) \ goto __ring_space_done; \ DRM_UDELAY(1); \ @@ -445,17 +443,10 @@ do { \ * Ring control */ -#if defined(__powerpc__) -#define r128_flush_write_combine() (void) GET_RING_HEAD( &dev_priv->ring ) -#else -#define r128_flush_write_combine() DRM_WRITEMEMORYBARRIER() -#endif - - #define R128_VERBOSE 0 #define RING_LOCALS \ - int write; unsigned int tail_mask; volatile u32 *ring; + int write, _nr; unsigned int tail_mask; volatile u32 *ring; #define BEGIN_RING( n ) do { \ if ( R128_VERBOSE ) { \ @@ -463,9 +454,10 @@ do { \ (n), __FUNCTION__ ); \ } \ if ( dev_priv->ring.space <= (n) * sizeof(u32) ) { \ + COMMIT_RING(); \ r128_wait_ring( dev_priv, (n) * sizeof(u32) ); \ } \ - dev_priv->ring.space -= (n) * sizeof(u32); \ + _nr 
= n; dev_priv->ring.space -= (n) * sizeof(u32); \ ring = dev_priv->ring.start; \ write = dev_priv->ring.tail; \ tail_mask = dev_priv->ring.tail_mask; \ @@ -488,9 +480,23 @@ do { \ dev_priv->ring.start, \ write * sizeof(u32) ); \ } \ - r128_flush_write_combine(); \ - dev_priv->ring.tail = write; \ - R128_WRITE( R128_PM4_BUFFER_DL_WPTR, write ); \ + if (((dev_priv->ring.tail + _nr) & tail_mask) != write) { \ + DRM_ERROR( \ + "ADVANCE_RING(): mismatch: nr: %x write: %x line: %d\n", \ + ((dev_priv->ring.tail + _nr) & tail_mask), \ + write, __LINE__); \ + } else \ + dev_priv->ring.tail = write; \ +} while (0) + +#define COMMIT_RING() do { \ + if ( R128_VERBOSE ) { \ + DRM_INFO( "COMMIT_RING() tail=0x%06x\n", \ + dev_priv->ring.tail ); \ + } \ + DRM_MEMORYBARRIER(); \ + R128_WRITE( R128_PM4_BUFFER_DL_WPTR, dev_priv->ring.tail ); \ + R128_READ( R128_PM4_BUFFER_DL_WPTR ); \ } while (0) #define OUT_RING( x ) do { \ --- diff/drivers/char/drm/r128_irq.c 2003-05-21 11:50:14.000000000 +0100 +++ source/drivers/char/drm/r128_irq.c 2004-04-21 10:46:51.442761624 +0100 @@ -36,7 +36,7 @@ #include "r128_drm.h" #include "r128_drv.h" -irqreturn_t r128_dma_service( DRM_IRQ_ARGS ) +irqreturn_t r128_irq_handler( DRM_IRQ_ARGS ) { drm_device_t *dev = (drm_device_t *) arg; drm_r128_private_t *dev_priv = --- diff/drivers/char/drm/r128_state.c 2004-03-11 10:20:23.000000000 +0000 +++ source/drivers/char/drm/r128_state.c 2004-04-21 10:46:51.451760256 +0100 @@ -45,7 +45,7 @@ static void r128_emit_clip_rects( drm_r1 RING_LOCALS; DRM_DEBUG( " %s\n", __FUNCTION__ ); - BEGIN_RING( 17 ); + BEGIN_RING( (count < 3? count: 3) * 5 + 2 ); if ( count >= 1 ) { OUT_RING( CCE_PACKET0( R128_AUX1_SC_LEFT, 3 ) ); @@ -1011,7 +1011,7 @@ static int r128_cce_dispatch_write_pixel DRM_DEBUG( "\n" ); count = depth->n; - if (count > 4096 || count <= 0) + if (count > 4096 || count <= 0) return -EMSGSIZE; xbuf_size = count * sizeof(*x); @@ -1280,6 +1280,7 @@ int r128_cce_clear( DRM_IOCTL_ARGS ) sarea_priv->nbox = R128_NR_SAREA_CLIPRECTS; r128_cce_dispatch_clear( dev, &clear ); + COMMIT_RING(); /* Make sure we restore the 3D state next time. 
*/ @@ -1315,8 +1316,10 @@ int r128_do_cleanup_pageflip( drm_device R128_WRITE( R128_CRTC_OFFSET, dev_priv->crtc_offset ); R128_WRITE( R128_CRTC_OFFSET_CNTL, dev_priv->crtc_offset_cntl ); - if (dev_priv->current_page != 0) + if (dev_priv->current_page != 0) { r128_cce_dispatch_flip( dev ); + COMMIT_RING(); + } dev_priv->page_flipping = 0; return 0; @@ -1341,6 +1344,7 @@ int r128_cce_flip( DRM_IOCTL_ARGS ) r128_cce_dispatch_flip( dev ); + COMMIT_RING(); return 0; } @@ -1362,6 +1366,7 @@ int r128_cce_swap( DRM_IOCTL_ARGS ) dev_priv->sarea_priv->dirty |= (R128_UPLOAD_CONTEXT | R128_UPLOAD_MASKS); + COMMIT_RING(); return 0; } @@ -1421,6 +1426,7 @@ int r128_cce_vertex( DRM_IOCTL_ARGS ) r128_cce_dispatch_vertex( dev, buf ); + COMMIT_RING(); return 0; } @@ -1492,6 +1498,7 @@ int r128_cce_indices( DRM_IOCTL_ARGS ) r128_cce_dispatch_indices( dev, buf, elts.start, elts.end, count ); + COMMIT_RING(); return 0; } @@ -1501,6 +1508,7 @@ int r128_cce_blit( DRM_IOCTL_ARGS ) drm_device_dma_t *dma = dev->dma; drm_r128_private_t *dev_priv = dev->dev_private; drm_r128_blit_t blit; + int ret; LOCK_TEST_WITH_RETURN( dev, filp ); @@ -1518,7 +1526,10 @@ int r128_cce_blit( DRM_IOCTL_ARGS ) RING_SPACE_TEST_WITH_RETURN( dev_priv ); VB_AGE_TEST_WITH_RETURN( dev_priv ); - return r128_cce_dispatch_blit( filp, dev, &blit ); + ret = r128_cce_dispatch_blit( filp, dev, &blit ); + + COMMIT_RING(); + return ret; } int r128_cce_depth( DRM_IOCTL_ARGS ) @@ -1526,6 +1537,7 @@ DRM_DEVICE; drm_r128_private_t *dev_priv = dev->dev_private; drm_r128_depth_t depth; + int ret; LOCK_TEST_WITH_RETURN( dev, filp ); @@ -1534,18 +1546,24 @@ RING_SPACE_TEST_WITH_RETURN( dev_priv ); + ret = DRM_ERR(EINVAL); switch ( depth.func ) { case R128_WRITE_SPAN: - return r128_cce_dispatch_write_span( dev, &depth ); + ret = r128_cce_dispatch_write_span( dev, &depth ); + break; case R128_WRITE_PIXELS: - return r128_cce_dispatch_write_pixels( dev, &depth ); + ret = r128_cce_dispatch_write_pixels( dev, &depth ); + break; case R128_READ_SPAN: - return r128_cce_dispatch_read_span( dev, &depth ); + ret = r128_cce_dispatch_read_span( dev, &depth ); + break; case R128_READ_PIXELS: - return r128_cce_dispatch_read_pixels( dev, &depth ); + ret = r128_cce_dispatch_read_pixels( dev, &depth ); + break; } - return DRM_ERR(EINVAL); + COMMIT_RING(); + return ret; } int r128_cce_stipple( DRM_IOCTL_ARGS ) @@ -1568,6 +1582,7 @@ r128_cce_dispatch_stipple( dev, mask ); + COMMIT_RING(); return 0; } @@ -1643,6 +1658,7 @@ int r128_cce_indirect( DRM_IOCTL_ARGS ) */ r128_cce_dispatch_indirect( dev, buf, indirect.start, indirect.end ); + COMMIT_RING(); return 0; } --- diff/drivers/char/drm/radeon.h 2003-09-30 15:46:12.000000000 +0100 +++ source/drivers/char/drm/radeon.h 2004-04-21 10:46:51.456759496 +0100 @@ -51,7 +51,7 @@ #define DRIVER_DATE "20020828" #define DRIVER_MAJOR 1 -#define DRIVER_MINOR 9 +#define DRIVER_MINOR 10 #define DRIVER_PATCHLEVEL 0 /* Interface history: @@ -81,6 +81,9 @@ * Add 'GET' queries for starting additional clients on different VT's. * 1.9 - Add DRM_IOCTL_RADEON_CP_RESUME ioctl. * Add texture rectangle support for r100.
+ * 1.10- Add SETPARAM ioctl; first parameter to set is FB_LOCATION, which + * clients use to tell the DRM where they think the framebuffer is + * located in the card's address space */ #define DRIVER_IOCTLS \ [DRM_IOCTL_NR(DRM_IOCTL_DMA)] = { radeon_cp_buffers, 1, 0 }, \ @@ -106,10 +109,82 @@ [DRM_IOCTL_NR(DRM_IOCTL_RADEON_ALLOC)] = { radeon_mem_alloc, 1, 0 }, \ [DRM_IOCTL_NR(DRM_IOCTL_RADEON_FREE)] = { radeon_mem_free, 1, 0 }, \ [DRM_IOCTL_NR(DRM_IOCTL_RADEON_INIT_HEAP)] = { radeon_mem_init_heap, 1, 1 }, \ - [DRM_IOCTL_NR(DRM_IOCTL_RADEON_IRQ_EMIT)] = { radeon_irq_emit, 1, 0 }, \ - [DRM_IOCTL_NR(DRM_IOCTL_RADEON_IRQ_WAIT)] = { radeon_irq_wait, 1, 0 }, + [DRM_IOCTL_NR(DRM_IOCTL_RADEON_IRQ_EMIT)] = { radeon_irq_emit, 1, 0 }, \ + [DRM_IOCTL_NR(DRM_IOCTL_RADEON_IRQ_WAIT)] = { radeon_irq_wait, 1, 0 }, \ + [DRM_IOCTL_NR(DRM_IOCTL_RADEON_SETPARAM)] = { radeon_cp_setparam, 1, 0 }, \ + +#define DRIVER_PCI_IDS \ + {0x1002, 0x4136, 0}, \ + {0x1002, 0x4137, 0}, \ + {0x1002, 0x4237, 0}, \ + {0x1002, 0x4242, 0}, \ + {0x1002, 0x4242, 0}, \ + {0x1002, 0x4336, 0}, \ + {0x1002, 0x4337, 0}, \ + {0x1002, 0x4437, 0}, \ + {0x1002, 0x4964, 0}, \ + {0x1002, 0x4965, 0}, \ + {0x1002, 0x4966, 0}, \ + {0x1002, 0x4967, 0}, \ + {0x1002, 0x4C57, 0}, \ + {0x1002, 0x4C58, 0}, \ + {0x1002, 0x4C59, 0}, \ + {0x1002, 0x4C5A, 0}, \ + {0x1002, 0x4C64, 0}, \ + {0x1002, 0x4C65, 0}, \ + {0x1002, 0x4C66, 0}, \ + {0x1002, 0x4C67, 0}, \ + {0x1002, 0x5144, 0}, \ + {0x1002, 0x5145, 0}, \ + {0x1002, 0x5146, 0}, \ + {0x1002, 0x5147, 0}, \ + {0x1002, 0x5148, 0}, \ + {0x1002, 0x5149, 0}, \ + {0x1002, 0x514A, 0}, \ + {0x1002, 0x514B, 0}, \ + {0x1002, 0x514C, 0}, \ + {0x1002, 0x514D, 0}, \ + {0x1002, 0x514E, 0}, \ + {0x1002, 0x514F, 0}, \ + {0x1002, 0x5157, 0}, \ + {0x1002, 0x5158, 0}, \ + {0x1002, 0x5159, 0}, \ + {0x1002, 0x515A, 0}, \ + {0x1002, 0x5168, 0}, \ + {0x1002, 0x5169, 0}, \ + {0x1002, 0x516A, 0}, \ + {0x1002, 0x516B, 0}, \ + {0x1002, 0x516C, 0}, \ + {0x1002, 0x5834, 0}, \ + {0x1002, 0x5835, 0}, \ + {0x1002, 0x5836, 0}, \ + {0x1002, 0x5837, 0}, \ + {0x1002, 0x5960, 0}, \ + {0x1002, 0x5961, 0}, \ + {0x1002, 0x5962, 0}, \ + {0x1002, 0x5963, 0}, \ + {0x1002, 0x5964, 0}, \ + {0x1002, 0x5968, 0}, \ + {0x1002, 0x5969, 0}, \ + {0x1002, 0x596A, 0}, \ + {0x1002, 0x596B, 0}, \ + {0x1002, 0x5c61, 0}, \ + {0x1002, 0x5c62, 0}, \ + {0x1002, 0x5c63, 0}, \ + {0x1002, 0x5c64, 0}, \ + {0, 0, 0} +#define DRIVER_FILE_FIELDS \ + int64_t radeon_fb_delta; \ +#define DRIVER_OPEN_HELPER( filp_priv, dev ) \ +do { \ + drm_radeon_private_t *dev_priv = dev->dev_private; \ + if ( dev_priv ) \ + filp_priv->radeon_fb_delta = dev_priv->fb_location; \ + else \ + filp_priv->radeon_fb_delta = 0; \ +} while( 0 ) /* When a client dies: * - Check for and clean up flipped page state @@ -142,7 +217,7 @@ do { \ /* DMA customization: */ #define __HAVE_DMA 1 -#define __HAVE_DMA_IRQ 1 +#define __HAVE_IRQ 1 #define __HAVE_VBL_IRQ 1 #define __HAVE_SHARED_IRQ 1 --- diff/drivers/char/drm/radeon_cp.c 2003-09-30 15:46:12.000000000 +0100 +++ source/drivers/char/drm/radeon_cp.c 2004-04-21 10:46:51.453759952 +0100 @@ -855,7 +855,8 @@ static void radeon_cp_init_ring_buffer( /* Initialize the memory controller */ RADEON_WRITE( RADEON_MC_FB_LOCATION, - (dev_priv->gart_vm_start - 1) & 0xffff0000 ); + ( ( dev_priv->gart_vm_start - 1 ) & 0xffff0000 ) + | ( dev_priv->fb_location >> 16 ) ); #if __REALLY_HAVE_AGP if ( !dev_priv->is_pci ) { @@ -1071,13 +1072,6 @@ static int radeon_do_init_cp( drm_device dev_priv->depth_offset = init->depth_offset; dev_priv->depth_pitch = init->depth_pitch; - 
dev_priv->front_pitch_offset = (((dev_priv->front_pitch/64) << 22) | - (dev_priv->front_offset >> 10)); - dev_priv->back_pitch_offset = (((dev_priv->back_pitch/64) << 22) | - (dev_priv->back_offset >> 10)); - dev_priv->depth_pitch_offset = (((dev_priv->depth_pitch/64) << 22) | - (dev_priv->depth_offset >> 10)); - /* Hardware state for depth clears. Remove this if/when we no * longer clear the depth buffer with a 3D rectangle. Hard-code * all values to prevent unwanted 3D state from slipping through @@ -1124,13 +1118,6 @@ static int radeon_do_init_cp( drm_device return DRM_ERR(EINVAL); } - DRM_FIND_MAP( dev_priv->fb, init->fb_offset ); - if(!dev_priv->fb) { - DRM_ERROR("could not find framebuffer!\n"); - dev->dev_private = (void *)dev_priv; - radeon_do_cleanup_cp(dev); - return DRM_ERR(EINVAL); - } DRM_FIND_MAP( dev_priv->mmio, init->mmio_offset ); if(!dev_priv->mmio) { DRM_ERROR("could not find mmio region!\n"); @@ -1204,9 +1191,26 @@ static int radeon_do_init_cp( drm_device dev_priv->buffers->handle ); } + dev_priv->fb_location = ( RADEON_READ( RADEON_MC_FB_LOCATION ) + & 0xffff ) << 16; + + dev_priv->front_pitch_offset = (((dev_priv->front_pitch/64) << 22) | + ( ( dev_priv->front_offset + + dev_priv->fb_location ) >> 10 ) ); + + dev_priv->back_pitch_offset = (((dev_priv->back_pitch/64) << 22) | + ( ( dev_priv->back_offset + + dev_priv->fb_location ) >> 10 ) ); + + dev_priv->depth_pitch_offset = (((dev_priv->depth_pitch/64) << 22) | + ( ( dev_priv->depth_offset + + dev_priv->fb_location ) >> 10 ) ); + dev_priv->gart_size = init->gart_size; - dev_priv->gart_vm_start = RADEON_READ( RADEON_CONFIG_APER_SIZE ); + dev_priv->gart_vm_start = dev_priv->fb_location + + RADEON_READ( RADEON_CONFIG_APER_SIZE ); + #if __REALLY_HAVE_AGP if ( !dev_priv->is_pci ) dev_priv->gart_buffers_offset = (dev_priv->buffers->offset @@ -1271,12 +1275,12 @@ int radeon_do_cleanup_cp( drm_device_t * { DRM_DEBUG( "\n" ); -#if _HAVE_DMA_IRQ +#if __HAVE_IRQ /* Make sure interrupts are disabled here because the uninstall ioctl * may not have been called from userspace and after dev_private * is freed, it's too late. */ - if ( dev->irq ) DRM(irq_uninstall)(dev); + if ( dev->irq_enabled ) DRM(irq_uninstall)(dev); #endif if ( dev->dev_private ) { --- diff/drivers/char/drm/radeon_drm.h 2003-09-30 15:46:12.000000000 +0100 +++ source/drivers/char/drm/radeon_drm.h 2004-04-21 10:46:51.454759800 +0100 @@ -226,6 +226,13 @@ typedef union { #define RADEON_MAX_TEXTURE_LEVELS 12 #define RADEON_MAX_TEXTURE_UNITS 3 +/* Blits have strict offset rules. All blit offsets must be aligned on + * a 1K-byte boundary. + */ +#define RADEON_OFFSET_SHIFT 10 +#define RADEON_OFFSET_ALIGN (1 << RADEON_OFFSET_SHIFT) +#define RADEON_OFFSET_MASK (RADEON_OFFSET_ALIGN - 1) + #endif /* __RADEON_SAREA_DEFINES__ */ typedef struct { @@ -365,31 +372,58 @@ typedef struct { /* Radeon specific ioctls * The device specific ioctl range is 0x40 to 0x79.
*/ -#define DRM_IOCTL_RADEON_CP_INIT DRM_IOW( 0x40, drm_radeon_init_t) -#define DRM_IOCTL_RADEON_CP_START DRM_IO( 0x41) -#define DRM_IOCTL_RADEON_CP_STOP DRM_IOW( 0x42, drm_radeon_cp_stop_t) -#define DRM_IOCTL_RADEON_CP_RESET DRM_IO( 0x43) -#define DRM_IOCTL_RADEON_CP_IDLE DRM_IO( 0x44) -#define DRM_IOCTL_RADEON_RESET DRM_IO( 0x45) -#define DRM_IOCTL_RADEON_FULLSCREEN DRM_IOW( 0x46, drm_radeon_fullscreen_t) -#define DRM_IOCTL_RADEON_SWAP DRM_IO( 0x47) -#define DRM_IOCTL_RADEON_CLEAR DRM_IOW( 0x48, drm_radeon_clear_t) -#define DRM_IOCTL_RADEON_VERTEX DRM_IOW( 0x49, drm_radeon_vertex_t) -#define DRM_IOCTL_RADEON_INDICES DRM_IOW( 0x4a, drm_radeon_indices_t) -#define DRM_IOCTL_RADEON_STIPPLE DRM_IOW( 0x4c, drm_radeon_stipple_t) -#define DRM_IOCTL_RADEON_INDIRECT DRM_IOWR(0x4d, drm_radeon_indirect_t) -#define DRM_IOCTL_RADEON_TEXTURE DRM_IOWR(0x4e, drm_radeon_texture_t) -#define DRM_IOCTL_RADEON_VERTEX2 DRM_IOW( 0x4f, drm_radeon_vertex2_t) -#define DRM_IOCTL_RADEON_CMDBUF DRM_IOW( 0x50, drm_radeon_cmd_buffer_t) -#define DRM_IOCTL_RADEON_GETPARAM DRM_IOWR(0x51, drm_radeon_getparam_t) -#define DRM_IOCTL_RADEON_FLIP DRM_IO( 0x52) -#define DRM_IOCTL_RADEON_ALLOC DRM_IOWR( 0x53, drm_radeon_mem_alloc_t) -#define DRM_IOCTL_RADEON_FREE DRM_IOW( 0x54, drm_radeon_mem_free_t) -#define DRM_IOCTL_RADEON_INIT_HEAP DRM_IOW( 0x55, drm_radeon_mem_init_heap_t) -#define DRM_IOCTL_RADEON_IRQ_EMIT DRM_IOWR( 0x56, drm_radeon_irq_emit_t) -#define DRM_IOCTL_RADEON_IRQ_WAIT DRM_IOW( 0x57, drm_radeon_irq_wait_t) -/* added by Charl P. Botha - see radeon_cp.c for details */ -#define DRM_IOCTL_RADEON_CP_RESUME DRM_IO(0x58) +#define DRM_RADEON_CP_INIT 0x00 +#define DRM_RADEON_CP_START 0x01 +#define DRM_RADEON_CP_STOP 0x02 +#define DRM_RADEON_CP_RESET 0x03 +#define DRM_RADEON_CP_IDLE 0x04 +#define DRM_RADEON_RESET 0x05 +#define DRM_RADEON_FULLSCREEN 0x06 +#define DRM_RADEON_SWAP 0x07 +#define DRM_RADEON_CLEAR 0x08 +#define DRM_RADEON_VERTEX 0x09 +#define DRM_RADEON_INDICES 0x0A +#define DRM_RADEON_NOT_USED +#define DRM_RADEON_STIPPLE 0x0C +#define DRM_RADEON_INDIRECT 0x0D +#define DRM_RADEON_TEXTURE 0x0E +#define DRM_RADEON_VERTEX2 0x0F +#define DRM_RADEON_CMDBUF 0x10 +#define DRM_RADEON_GETPARAM 0x11 +#define DRM_RADEON_FLIP 0x12 +#define DRM_RADEON_ALLOC 0x13 +#define DRM_RADEON_FREE 0x14 +#define DRM_RADEON_INIT_HEAP 0x15 +#define DRM_RADEON_IRQ_EMIT 0x16 +#define DRM_RADEON_IRQ_WAIT 0x17 +#define DRM_RADEON_CP_RESUME 0x18 +#define DRM_RADEON_SETPARAM 0x19 + +#define DRM_IOCTL_RADEON_CP_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_CP_INIT, drm_radeon_init_t) +#define DRM_IOCTL_RADEON_CP_START DRM_IO( DRM_COMMAND_BASE + DRM_RADEON_CP_START) +#define DRM_IOCTL_RADEON_CP_STOP DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_CP_STOP, drm_radeon_cp_stop_t) +#define DRM_IOCTL_RADEON_CP_RESET DRM_IO( DRM_COMMAND_BASE + DRM_RADEON_CP_RESET) +#define DRM_IOCTL_RADEON_CP_IDLE DRM_IO( DRM_COMMAND_BASE + DRM_RADEON_CP_IDLE) +#define DRM_IOCTL_RADEON_RESET DRM_IO( DRM_COMMAND_BASE + DRM_RADEON_RESET) +#define DRM_IOCTL_RADEON_FULLSCREEN DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_FULLSCREEN, drm_radeon_fullscreen_t) +#define DRM_IOCTL_RADEON_SWAP DRM_IO( DRM_COMMAND_BASE + DRM_RADEON_SWAP) +#define DRM_IOCTL_RADEON_CLEAR DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_CLEAR, drm_radeon_clear_t) +#define DRM_IOCTL_RADEON_VERTEX DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_VERTEX, drm_radeon_vertex_t) +#define DRM_IOCTL_RADEON_INDICES DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_INDICES, drm_radeon_indices_t) +#define DRM_IOCTL_RADEON_STIPPLE DRM_IOW( DRM_COMMAND_BASE + 
DRM_RADEON_STIPPLE, drm_radeon_stipple_t) +#define DRM_IOCTL_RADEON_INDIRECT DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_INDIRECT, drm_radeon_indirect_t) +#define DRM_IOCTL_RADEON_TEXTURE DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_TEXTURE, drm_radeon_texture_t) +#define DRM_IOCTL_RADEON_VERTEX2 DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_VERTEX2, drm_radeon_vertex2_t) +#define DRM_IOCTL_RADEON_CMDBUF DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_CMDBUF, drm_radeon_cmd_buffer_t) +#define DRM_IOCTL_RADEON_GETPARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GETPARAM, drm_radeon_getparam_t) +#define DRM_IOCTL_RADEON_FLIP DRM_IO( DRM_COMMAND_BASE + DRM_RADEON_FLIP) +#define DRM_IOCTL_RADEON_ALLOC DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_ALLOC, drm_radeon_mem_alloc_t) +#define DRM_IOCTL_RADEON_FREE DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_FREE, drm_radeon_mem_free_t) +#define DRM_IOCTL_RADEON_INIT_HEAP DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_INIT_HEAP, drm_radeon_mem_init_heap_t) +#define DRM_IOCTL_RADEON_IRQ_EMIT DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_IRQ_EMIT, drm_radeon_irq_emit_t) +#define DRM_IOCTL_RADEON_IRQ_WAIT DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_IRQ_WAIT, drm_radeon_irq_wait_t) +#define DRM_IOCTL_RADEON_CP_RESUME DRM_IO( DRM_COMMAND_BASE + DRM_RADEON_CP_RESUME) +#define DRM_IOCTL_RADEON_SETPARAM DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_SETPARAM, drm_radeon_setparam_t) typedef struct drm_radeon_init { enum { @@ -502,7 +536,7 @@ typedef struct drm_radeon_tex_image { } drm_radeon_tex_image_t; typedef struct drm_radeon_texture { - int offset; + unsigned int offset; int pitch; int format; int width; /* Texture image coordinates */ @@ -537,10 +571,11 @@ typedef struct drm_radeon_indirect { #define RADEON_PARAM_STATUS_HANDLE 8 #define RADEON_PARAM_SAREA_HANDLE 9 #define RADEON_PARAM_GART_TEX_HANDLE 10 +#define RADEON_PARAM_SCRATCH_OFFSET 11 typedef struct drm_radeon_getparam { int param; - int *value; + void *value; } drm_radeon_getparam_t; /* 1.6: Set up a memory manager for regions of shared memory: @@ -578,4 +613,16 @@ typedef struct drm_radeon_irq_wait { } drm_radeon_irq_wait_t; +/* 1.10: Clients tell the DRM where they think the framebuffer is located in + * the card's address space, via a new generic ioctl to set parameters + */ + +typedef struct drm_radeon_setparam { + unsigned int param; + int64_t value; +} drm_radeon_setparam_t; + +#define RADEON_SETPARAM_FB_LOCATION 1 /* determined framebuffer location */ + + #endif --- diff/drivers/char/drm/radeon_drv.c 2003-07-22 18:54:27.000000000 +0100 +++ source/drivers/char/drm/radeon_drv.c 2004-04-21 10:46:51.454759800 +0100 @@ -48,6 +48,7 @@ #include "drm_fops.h" #include "drm_init.h" #include "drm_ioctl.h" +#include "drm_irq.h" #include "drm_lock.h" #include "drm_memory.h" #include "drm_proc.h" --- diff/drivers/char/drm/radeon_drv.h 2003-09-30 15:46:12.000000000 +0100 +++ source/drivers/char/drm/radeon_drv.h 2004-04-21 10:46:51.455759648 +0100 @@ -73,6 +73,8 @@ typedef struct drm_radeon_private { drm_radeon_ring_buffer_t ring; drm_radeon_sarea_t *sarea_priv; + u32 fb_location; + int gart_size; u32 gart_vm_start; unsigned long gart_buffers_offset; @@ -133,7 +135,6 @@ typedef struct drm_radeon_private { unsigned long gart_textures_offset; drm_local_map_t *sarea; - drm_local_map_t *fb; drm_local_map_t *mmio; drm_local_map_t *cp_ring; drm_local_map_t *ring_rptr; @@ -184,6 +185,7 @@ extern int radeon_cp_indirect( DRM_IOCTL extern int radeon_cp_vertex2( DRM_IOCTL_ARGS ); extern int radeon_cp_cmdbuf( DRM_IOCTL_ARGS ); extern int radeon_cp_getparam( DRM_IOCTL_ARGS ); +extern 
int radeon_cp_setparam( DRM_IOCTL_ARGS ); extern int radeon_cp_flip( DRM_IOCTL_ARGS ); extern int radeon_mem_alloc( DRM_IOCTL_ARGS ); @@ -239,6 +241,7 @@ extern void radeon_do_release(drm_device #define RADEON_CRTC2_OFFSET 0x0324 #define RADEON_CRTC2_OFFSET_CNTL 0x0328 +#define RADEON_RB3D_COLOROFFSET 0x1c40 #define RADEON_RB3D_COLORPITCH 0x1c48 #define RADEON_DP_GUI_MASTER_CNTL 0x146c @@ -332,6 +335,7 @@ extern void radeon_do_release(drm_device #define RADEON_PP_MISC 0x1c14 #define RADEON_PP_ROT_MATRIX_0 0x1d58 #define RADEON_PP_TXFILTER_0 0x1c54 +#define RADEON_PP_TXOFFSET_0 0x1c5c #define RADEON_PP_TXFILTER_1 0x1c6c #define RADEON_PP_TXFILTER_2 0x1c84 --- diff/drivers/char/drm/radeon_irq.c 2003-05-21 11:50:14.000000000 +0100 +++ source/drivers/char/drm/radeon_irq.c 2004-04-21 10:46:51.463758432 +0100 @@ -54,7 +54,7 @@ * tied to dma at all, this is just a hangover from dri prehistory. */ -irqreturn_t DRM(dma_service)( DRM_IRQ_ARGS ) +irqreturn_t DRM(irq_handler)( DRM_IRQ_ARGS ) { drm_device_t *dev = (drm_device_t *) arg; drm_radeon_private_t *dev_priv = --- diff/drivers/char/drm/radeon_state.c 2004-02-18 08:54:09.000000000 +0000 +++ source/drivers/char/drm/radeon_state.c 2004-04-21 10:46:51.471757216 +0100 @@ -36,6 +36,240 @@ /* ================================================================ + * Helper functions for client state checking and fixup + */ + +static __inline__ int radeon_check_and_fixup_offset( drm_radeon_private_t *dev_priv, + drm_file_t *filp_priv, + u32 *offset ) { + u32 off = *offset; + + if ( off >= dev_priv->fb_location && + off < ( dev_priv->gart_vm_start + dev_priv->gart_size ) ) + return 0; + + off += filp_priv->radeon_fb_delta; + + DRM_DEBUG( "offset fixed up to 0x%x\n", off ); + + if ( off < dev_priv->fb_location || + off >= ( dev_priv->gart_vm_start + dev_priv->gart_size ) ) + return DRM_ERR( EINVAL ); + + *offset = off; + + return 0; +} + +static __inline__ int radeon_check_and_fixup_offset_user( drm_radeon_private_t *dev_priv, + drm_file_t *filp_priv, + u32 *offset ) { + u32 off; + + DRM_GET_USER_UNCHECKED( off, offset ); + + if ( radeon_check_and_fixup_offset( dev_priv, filp_priv, &off ) ) + return DRM_ERR( EINVAL ); + + DRM_PUT_USER_UNCHECKED( offset, off ); + + return 0; +} + +static __inline__ int radeon_check_and_fixup_packets( drm_radeon_private_t *dev_priv, + drm_file_t *filp_priv, + int id, + u32 *data ) { + switch ( id ) { + + case RADEON_EMIT_PP_MISC: + if ( radeon_check_and_fixup_offset_user( dev_priv, filp_priv, + &data[( RADEON_RB3D_DEPTHOFFSET + - RADEON_PP_MISC ) / 4] ) ) { + DRM_ERROR( "Invalid depth buffer offset\n" ); + return DRM_ERR( EINVAL ); + } + break; + + case RADEON_EMIT_PP_CNTL: + if ( radeon_check_and_fixup_offset_user( dev_priv, filp_priv, + &data[( RADEON_RB3D_COLOROFFSET + - RADEON_PP_CNTL ) / 4] ) ) { + DRM_ERROR( "Invalid colour buffer offset\n" ); + return DRM_ERR( EINVAL ); + } + break; + + case R200_EMIT_PP_TXOFFSET_0: + case R200_EMIT_PP_TXOFFSET_1: + case R200_EMIT_PP_TXOFFSET_2: + case R200_EMIT_PP_TXOFFSET_3: + case R200_EMIT_PP_TXOFFSET_4: + case R200_EMIT_PP_TXOFFSET_5: + if ( radeon_check_and_fixup_offset_user( dev_priv, filp_priv, + &data[0] ) ) { + DRM_ERROR( "Invalid R200 texture offset\n" ); + return DRM_ERR( EINVAL ); + } + break; + + case RADEON_EMIT_PP_TXFILTER_0: + case RADEON_EMIT_PP_TXFILTER_1: + case RADEON_EMIT_PP_TXFILTER_2: + if ( radeon_check_and_fixup_offset_user( dev_priv, filp_priv, + &data[( RADEON_PP_TXOFFSET_0 + - RADEON_PP_TXFILTER_0 ) / 4] ) ) { + DRM_ERROR( "Invalid R100 texture offset\n" ); + 
return DRM_ERR( EINVAL ); + } + break; + + case R200_EMIT_PP_CUBIC_OFFSETS_0: + case R200_EMIT_PP_CUBIC_OFFSETS_1: + case R200_EMIT_PP_CUBIC_OFFSETS_2: + case R200_EMIT_PP_CUBIC_OFFSETS_3: + case R200_EMIT_PP_CUBIC_OFFSETS_4: + case R200_EMIT_PP_CUBIC_OFFSETS_5: { + int i; + for ( i = 0; i < 5; i++ ) { + if ( radeon_check_and_fixup_offset_user( dev_priv, + filp_priv, + &data[i] ) ) { + DRM_ERROR( "Invalid R200 cubic texture offset\n" ); + return DRM_ERR( EINVAL ); + } + } + break; + } + + case RADEON_EMIT_RB3D_COLORPITCH: + case RADEON_EMIT_RE_LINE_PATTERN: + case RADEON_EMIT_SE_LINE_WIDTH: + case RADEON_EMIT_PP_LUM_MATRIX: + case RADEON_EMIT_PP_ROT_MATRIX_0: + case RADEON_EMIT_RB3D_STENCILREFMASK: + case RADEON_EMIT_SE_VPORT_XSCALE: + case RADEON_EMIT_SE_CNTL: + case RADEON_EMIT_SE_CNTL_STATUS: + case RADEON_EMIT_RE_MISC: + case RADEON_EMIT_PP_BORDER_COLOR_0: + case RADEON_EMIT_PP_BORDER_COLOR_1: + case RADEON_EMIT_PP_BORDER_COLOR_2: + case RADEON_EMIT_SE_ZBIAS_FACTOR: + case RADEON_EMIT_SE_TCL_OUTPUT_VTX_FMT: + case RADEON_EMIT_SE_TCL_MATERIAL_EMMISSIVE_RED: + case R200_EMIT_PP_TXCBLEND_0: + case R200_EMIT_PP_TXCBLEND_1: + case R200_EMIT_PP_TXCBLEND_2: + case R200_EMIT_PP_TXCBLEND_3: + case R200_EMIT_PP_TXCBLEND_4: + case R200_EMIT_PP_TXCBLEND_5: + case R200_EMIT_PP_TXCBLEND_6: + case R200_EMIT_PP_TXCBLEND_7: + case R200_EMIT_TCL_LIGHT_MODEL_CTL_0: + case R200_EMIT_TFACTOR_0: + case R200_EMIT_VTX_FMT_0: + case R200_EMIT_VAP_CTL: + case R200_EMIT_MATRIX_SELECT_0: + case R200_EMIT_TEX_PROC_CTL_2: + case R200_EMIT_TCL_UCP_VERT_BLEND_CTL: + case R200_EMIT_PP_TXFILTER_0: + case R200_EMIT_PP_TXFILTER_1: + case R200_EMIT_PP_TXFILTER_2: + case R200_EMIT_PP_TXFILTER_3: + case R200_EMIT_PP_TXFILTER_4: + case R200_EMIT_PP_TXFILTER_5: + case R200_EMIT_VTE_CNTL: + case R200_EMIT_OUTPUT_VTX_COMP_SEL: + case R200_EMIT_PP_TAM_DEBUG3: + case R200_EMIT_PP_CNTL_X: + case R200_EMIT_RB3D_DEPTHXY_OFFSET: + case R200_EMIT_RE_AUX_SCISSOR_CNTL: + case R200_EMIT_RE_SCISSOR_TL_0: + case R200_EMIT_RE_SCISSOR_TL_1: + case R200_EMIT_RE_SCISSOR_TL_2: + case R200_EMIT_SE_VAP_CNTL_STATUS: + case R200_EMIT_SE_VTX_STATE_CNTL: + case R200_EMIT_RE_POINTSIZE: + case R200_EMIT_TCL_INPUT_VTX_VECTOR_ADDR_0: + case R200_EMIT_PP_CUBIC_FACES_0: + case R200_EMIT_PP_CUBIC_FACES_1: + case R200_EMIT_PP_CUBIC_FACES_2: + case R200_EMIT_PP_CUBIC_FACES_3: + case R200_EMIT_PP_CUBIC_FACES_4: + case R200_EMIT_PP_CUBIC_FACES_5: + case RADEON_EMIT_PP_TEX_SIZE_0: + case RADEON_EMIT_PP_TEX_SIZE_1: + case RADEON_EMIT_PP_TEX_SIZE_2: + /* These packets don't contain memory offsets */ + break; + + default: + DRM_ERROR( "Unknown state packet ID %d\n", id ); + return DRM_ERR( EINVAL ); + } + + return 0; +} + +static __inline__ int radeon_check_and_fixup_packet3( drm_radeon_private_t *dev_priv, + drm_file_t *filp_priv, + drm_radeon_cmd_buffer_t *cmdbuf, + unsigned int *cmdsz ) { + u32 tmp[4], *cmd = ( u32* )cmdbuf->buf; + + if ( DRM_COPY_FROM_USER_UNCHECKED( tmp, cmd, sizeof( tmp ) ) ) { + DRM_ERROR( "Failed to copy data from user space\n" ); + return DRM_ERR( EFAULT ); + } + + *cmdsz = 2 + ( ( tmp[0] & RADEON_CP_PACKET_COUNT_MASK ) >> 16 ); + + if ( ( tmp[0] & 0xc0000000 ) != RADEON_CP_PACKET3 ) { + DRM_ERROR( "Not a type 3 packet\n" ); + return DRM_ERR( EINVAL ); + } + + if ( 4 * *cmdsz > cmdbuf->bufsz ) { + DRM_ERROR( "Packet size larger than size of data provided\n" ); + return DRM_ERR( EINVAL ); + } + + /* Check client state and fix it up if necessary */ + if ( tmp[0] & 0x8000 ) { /* MSB of opcode: next DWORD GUI_CNTL */ + u32 offset; + + if ( 
tmp[1] & ( RADEON_GMC_SRC_PITCH_OFFSET_CNTL + | RADEON_GMC_DST_PITCH_OFFSET_CNTL ) ) { + offset = tmp[2] << 10; + if ( radeon_check_and_fixup_offset( dev_priv, filp_priv, &offset ) ) { + DRM_ERROR( "Invalid first packet offset\n" ); + return DRM_ERR( EINVAL ); + } + tmp[2] = ( tmp[2] & 0xffc00000 ) | offset >> 10; + } + + if ( ( tmp[1] & RADEON_GMC_SRC_PITCH_OFFSET_CNTL ) && + ( tmp[1] & RADEON_GMC_DST_PITCH_OFFSET_CNTL ) ) { + offset = tmp[3] << 10; + if ( radeon_check_and_fixup_offset( dev_priv, filp_priv, &offset ) ) { + DRM_ERROR( "Invalid second packet offset\n" ); + return DRM_ERR( EINVAL ); + } + tmp[3] = ( tmp[3] & 0xffc00000 ) | offset >> 10; + } + + if ( DRM_COPY_TO_USER_UNCHECKED( cmd, tmp, sizeof( tmp ) ) ) { + DRM_ERROR( "Failed to copy data to user space\n" ); + return DRM_ERR( EFAULT ); + } + } + + return 0; +} + + +/* ================================================================ * CP hardware state programming functions */ @@ -57,15 +291,28 @@ static __inline__ void radeon_emit_clip_ /* Emit 1.1 state */ -static void radeon_emit_state( drm_radeon_private_t *dev_priv, - drm_radeon_context_regs_t *ctx, - drm_radeon_texture_regs_t *tex, - unsigned int dirty ) +static int radeon_emit_state( drm_radeon_private_t *dev_priv, + drm_file_t *filp_priv, + drm_radeon_context_regs_t *ctx, + drm_radeon_texture_regs_t *tex, + unsigned int dirty ) { RING_LOCALS; DRM_DEBUG( "dirty=0x%08x\n", dirty ); if ( dirty & RADEON_UPLOAD_CONTEXT ) { + if ( radeon_check_and_fixup_offset( dev_priv, filp_priv, + &ctx->rb3d_depthoffset ) ) { + DRM_ERROR( "Invalid depth buffer offset\n" ); + return DRM_ERR( EINVAL ); + } + + if ( radeon_check_and_fixup_offset( dev_priv, filp_priv, + &ctx->rb3d_coloroffset ) ) { + DRM_ERROR( "Invalid colour buffer offset\n" ); + return DRM_ERR( EINVAL ); + } + BEGIN_RING( 14 ); OUT_RING( CP_PACKET0( RADEON_PP_MISC, 6 ) ); OUT_RING( ctx->pp_misc ); @@ -149,6 +396,12 @@ static void radeon_emit_state( drm_radeo } if ( dirty & RADEON_UPLOAD_TEX0 ) { + if ( radeon_check_and_fixup_offset( dev_priv, filp_priv, + &tex[0].pp_txoffset ) ) { + DRM_ERROR( "Invalid texture offset for unit 0\n" ); + return DRM_ERR( EINVAL ); + } + BEGIN_RING( 9 ); OUT_RING( CP_PACKET0( RADEON_PP_TXFILTER_0, 5 ) ); OUT_RING( tex[0].pp_txfilter ); @@ -163,6 +416,12 @@ static void radeon_emit_state( drm_radeo } if ( dirty & RADEON_UPLOAD_TEX1 ) { + if ( radeon_check_and_fixup_offset( dev_priv, filp_priv, + &tex[1].pp_txoffset ) ) { + DRM_ERROR( "Invalid texture offset for unit 1\n" ); + return DRM_ERR( EINVAL ); + } + BEGIN_RING( 9 ); OUT_RING( CP_PACKET0( RADEON_PP_TXFILTER_1, 5 ) ); OUT_RING( tex[1].pp_txfilter ); @@ -177,6 +436,12 @@ static void radeon_emit_state( drm_radeo } if ( dirty & RADEON_UPLOAD_TEX2 ) { + if ( radeon_check_and_fixup_offset( dev_priv, filp_priv, + &tex[2].pp_txoffset ) ) { + DRM_ERROR( "Invalid texture offset for unit 2\n" ); + return DRM_ERR( EINVAL ); + } + BEGIN_RING( 9 ); OUT_RING( CP_PACKET0( RADEON_PP_TXFILTER_2, 5 ) ); OUT_RING( tex[2].pp_txfilter ); @@ -189,12 +454,15 @@ static void radeon_emit_state( drm_radeo OUT_RING( tex[2].pp_border_color ); ADVANCE_RING(); } + + return 0; } /* Emit 1.2 state */ -static void radeon_emit_state2( drm_radeon_private_t *dev_priv, - drm_radeon_state_t *state ) +static int radeon_emit_state2( drm_radeon_private_t *dev_priv, + drm_file_t *filp_priv, + drm_radeon_state_t *state ) { RING_LOCALS; @@ -206,7 +474,7 @@ static void radeon_emit_state2( drm_rade ADVANCE_RING(); } - radeon_emit_state( dev_priv, &state->context, + return
radeon_emit_state( dev_priv, filp_priv, &state->context, state->tex, state->dirty ); } @@ -1065,6 +1333,7 @@ static int radeon_cp_dispatch_texture( D drm_radeon_tex_image_t *image ) { drm_radeon_private_t *dev_priv = dev->dev_private; + drm_file_t *filp_priv; drm_buf_t *buf; u32 format; u32 *buffer; @@ -1074,6 +1343,13 @@ static int radeon_cp_dispatch_texture( D int i; RING_LOCALS; + DRM_GET_PRIV_WITH_RETURN( filp_priv, filp ); + + if ( radeon_check_and_fixup_offset( dev_priv, filp_priv, &tex->offset ) ) { + DRM_ERROR( "Invalid destination offset\n" ); + return DRM_ERR( EINVAL ); + } + dev_priv->stats.boxes |= RADEON_BOX_TEXTURE_LOAD; /* Flush the pixel cache. This ensures no pixel data gets mixed @@ -1377,6 +1653,7 @@ int radeon_cp_vertex( DRM_IOCTL_ARGS ) { DRM_DEVICE; drm_radeon_private_t *dev_priv = dev->dev_private; + drm_file_t *filp_priv; drm_radeon_sarea_t *sarea_priv = dev_priv->sarea_priv; drm_device_dma_t *dma = dev->dma; drm_buf_t *buf; @@ -1390,6 +1667,8 @@ int radeon_cp_vertex( DRM_IOCTL_ARGS ) return DRM_ERR(EINVAL); } + DRM_GET_PRIV_WITH_RETURN( filp_priv, filp ); + DRM_COPY_FROM_USER_IOCTL( vertex, (drm_radeon_vertex_t *)data, sizeof(vertex) ); @@ -1429,11 +1708,14 @@ int radeon_cp_vertex( DRM_IOCTL_ARGS ) buf->used = vertex.count; /* not used? */ if ( sarea_priv->dirty & ~RADEON_UPLOAD_CLIPRECTS ) { - radeon_emit_state( dev_priv, - &sarea_priv->context_state, - sarea_priv->tex_state, - sarea_priv->dirty ); - + if ( radeon_emit_state( dev_priv, filp_priv, + &sarea_priv->context_state, + sarea_priv->tex_state, + sarea_priv->dirty ) ) { + DRM_ERROR( "radeon_emit_state failed\n" ); + return DRM_ERR( EINVAL ); + } + sarea_priv->dirty &= ~(RADEON_UPLOAD_TEX0IMAGES | RADEON_UPLOAD_TEX1IMAGES | RADEON_UPLOAD_TEX2IMAGES | @@ -1461,6 +1743,7 @@ int radeon_cp_indices( DRM_IOCTL_ARGS ) { DRM_DEVICE; drm_radeon_private_t *dev_priv = dev->dev_private; + drm_file_t *filp_priv; drm_radeon_sarea_t *sarea_priv = dev_priv->sarea_priv; drm_device_dma_t *dma = dev->dma; drm_buf_t *buf; @@ -1475,6 +1758,8 @@ int radeon_cp_indices( DRM_IOCTL_ARGS ) return DRM_ERR(EINVAL); } + DRM_GET_PRIV_WITH_RETURN( filp_priv, filp ); + DRM_COPY_FROM_USER_IOCTL( elts, (drm_radeon_indices_t *)data, sizeof(elts) ); @@ -1523,10 +1808,13 @@ int radeon_cp_indices( DRM_IOCTL_ARGS ) buf->used = elts.end; if ( sarea_priv->dirty & ~RADEON_UPLOAD_CLIPRECTS ) { - radeon_emit_state( dev_priv, - &sarea_priv->context_state, - sarea_priv->tex_state, - sarea_priv->dirty ); + if ( radeon_emit_state( dev_priv, filp_priv, + &sarea_priv->context_state, + sarea_priv->tex_state, + sarea_priv->dirty ) ) { + DRM_ERROR( "radeon_emit_state failed\n" ); + return DRM_ERR( EINVAL ); + } sarea_priv->dirty &= ~(RADEON_UPLOAD_TEX0IMAGES | RADEON_UPLOAD_TEX1IMAGES | @@ -1686,6 +1974,7 @@ int radeon_cp_vertex2( DRM_IOCTL_ARGS ) { DRM_DEVICE; drm_radeon_private_t *dev_priv = dev->dev_private; + drm_file_t *filp_priv; drm_radeon_sarea_t *sarea_priv = dev_priv->sarea_priv; drm_device_dma_t *dma = dev->dma; drm_buf_t *buf; @@ -1700,6 +1989,8 @@ int radeon_cp_vertex2( DRM_IOCTL_ARGS ) return DRM_ERR(EINVAL); } + DRM_GET_PRIV_WITH_RETURN( filp_priv, filp ); + DRM_COPY_FROM_USER_IOCTL( vertex, (drm_radeon_vertex2_t *)data, sizeof(vertex) ); @@ -1747,7 +2038,10 @@ int radeon_cp_vertex2( DRM_IOCTL_ARGS ) sizeof(state) ) ) return DRM_ERR(EFAULT); - radeon_emit_state2( dev_priv, &state ); + if ( radeon_emit_state2( dev_priv, filp_priv, &state ) ) { + DRM_ERROR( "radeon_emit_state2 failed\n" ); + return DRM_ERR( EINVAL ); + } laststate = prim.stateidx; } 
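The offset checks above depend on each client first telling the DRM where it believes the framebuffer sits in the card's address space; radeon_cp_setparam(), added at the end of radeon_state.c below, stores the difference from the real base in radeon_fb_delta. A minimal, hypothetical user-space sketch of that handshake (not part of the patch), assuming fd is an open radeon DRM node and the updated drm.h and radeon_drm.h headers are on the include path:

#include <sys/ioctl.h>
#include "drm.h"
#include "radeon_drm.h"

/* Hypothetical helper: announce the client's notion of the framebuffer
 * base via the new interface-1.10 SETPARAM ioctl.  FB_LOCATION is the
 * only parameter the kernel accepts so far. */
static int radeon_set_fb_location(int fd, long long fb_base)
{
	drm_radeon_setparam_t sp;

	sp.param = RADEON_SETPARAM_FB_LOCATION;
	sp.value = fb_base;	/* card-space framebuffer address */
	return ioctl(fd, DRM_IOCTL_RADEON_SETPARAM, &sp);
}

If the announced base disagrees with what the kernel reads back from RADEON_MC_FB_LOCATION, radeon_check_and_fixup_offset() rebases each client-supplied offset by radeon_fb_delta before it is emitted to the ring.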
@@ -1784,6 +2078,7 @@ int radeon_cp_vertex2( DRM_IOCTL_ARGS ) static int radeon_emit_packets( drm_radeon_private_t *dev_priv, + drm_file_t *filp_priv, drm_radeon_cmd_header_t header, drm_radeon_cmd_buffer_t *cmdbuf ) { @@ -1798,8 +2093,15 @@ static int radeon_emit_packets( sz = packet[id].len; reg = packet[id].start; - if (sz * sizeof(int) > cmdbuf->bufsz) + if (sz * sizeof(int) > cmdbuf->bufsz) { + DRM_ERROR( "Packet size provided larger than data provided\n" ); return DRM_ERR(EINVAL); + } + + if ( radeon_check_and_fixup_packets( dev_priv, filp_priv, id, data ) ) { + DRM_ERROR( "Packet verification failed\n" ); + return DRM_ERR( EINVAL ); + } BEGIN_RING(sz+1); OUT_RING( CP_PACKET0( reg, (sz-1) ) ); @@ -1882,24 +2184,21 @@ static __inline__ int radeon_emit_vector static int radeon_emit_packet3( drm_device_t *dev, + drm_file_t *filp_priv, drm_radeon_cmd_buffer_t *cmdbuf ) { drm_radeon_private_t *dev_priv = dev->dev_private; - int cmdsz, tmp; - int *cmd = (int *)cmdbuf->buf; + unsigned int cmdsz; + int *cmd = (int *)cmdbuf->buf, ret; RING_LOCALS; - DRM_DEBUG("\n"); - if (DRM_GET_USER_UNCHECKED( tmp, &cmd[0])) - return DRM_ERR(EFAULT); - - cmdsz = 2 + ((tmp & RADEON_CP_PACKET_COUNT_MASK) >> 16); - - if ((tmp & 0xc0000000) != RADEON_CP_PACKET3 || - cmdsz * 4 > cmdbuf->bufsz) - return DRM_ERR(EINVAL); + if ( ( ret = radeon_check_and_fixup_packet3( dev_priv, filp_priv, + cmdbuf, &cmdsz ) ) ) { + DRM_ERROR( "Packet verification failed\n" ); + return ret; + } BEGIN_RING( cmdsz ); OUT_RING_USER_TABLE( cmd, cmdsz ); @@ -1912,27 +2211,25 @@ static int radeon_emit_packet3( drm_devi static int radeon_emit_packet3_cliprect( drm_device_t *dev, + drm_file_t *filp_priv, drm_radeon_cmd_buffer_t *cmdbuf, int orig_nbox ) { drm_radeon_private_t *dev_priv = dev->dev_private; drm_clip_rect_t box; - int cmdsz, tmp; - int *cmd = (int *)cmdbuf->buf; + unsigned int cmdsz; + int *cmd = (int *)cmdbuf->buf, ret; drm_clip_rect_t *boxes = cmdbuf->boxes; int i = 0; RING_LOCALS; DRM_DEBUG("\n"); - if (DRM_GET_USER_UNCHECKED( tmp, &cmd[0])) - return DRM_ERR(EFAULT); - - cmdsz = 2 + ((tmp & RADEON_CP_PACKET_COUNT_MASK) >> 16); - - if ((tmp & 0xc0000000) != RADEON_CP_PACKET3 || - cmdsz * 4 > cmdbuf->bufsz) - return DRM_ERR(EINVAL); + if ( ( ret = radeon_check_and_fixup_packet3( dev_priv, filp_priv, + cmdbuf, &cmdsz ) ) ) { + DRM_ERROR( "Packet verification failed\n" ); + return ret; + } if (!orig_nbox) goto out; @@ -2009,6 +2306,7 @@ int radeon_cp_cmdbuf( DRM_IOCTL_ARGS ) { DRM_DEVICE; drm_radeon_private_t *dev_priv = dev->dev_private; + drm_file_t *filp_priv; drm_device_dma_t *dma = dev->dma; drm_buf_t *buf = 0; int idx; @@ -2023,6 +2321,8 @@ int radeon_cp_cmdbuf( DRM_IOCTL_ARGS ) return DRM_ERR(EINVAL); } + DRM_GET_PRIV_WITH_RETURN( filp_priv, filp ); + DRM_COPY_FROM_USER_IOCTL( cmdbuf, (drm_radeon_cmd_buffer_t *)data, sizeof(cmdbuf) ); @@ -2053,7 +2353,7 @@ int radeon_cp_cmdbuf( DRM_IOCTL_ARGS ) switch (header.header.cmd_type) { case RADEON_CMD_PACKET: DRM_DEBUG("RADEON_CMD_PACKET\n"); - if (radeon_emit_packets( dev_priv, header, &cmdbuf )) { + if (radeon_emit_packets( dev_priv, filp_priv, header, &cmdbuf )) { DRM_ERROR("radeon_emit_packets failed\n"); return DRM_ERR(EINVAL); } @@ -2096,7 +2396,7 @@ int radeon_cp_cmdbuf( DRM_IOCTL_ARGS ) case RADEON_CMD_PACKET3: DRM_DEBUG("RADEON_CMD_PACKET3\n"); - if (radeon_emit_packet3( dev, &cmdbuf )) { + if (radeon_emit_packet3( dev, filp_priv, &cmdbuf )) { DRM_ERROR("radeon_emit_packet3 failed\n"); return DRM_ERR(EINVAL); } @@ -2104,7 +2404,7 @@ int radeon_cp_cmdbuf( DRM_IOCTL_ARGS ) 
case RADEON_CMD_PACKET3_CLIP: DRM_DEBUG("RADEON_CMD_PACKET3_CLIP\n"); - if (radeon_emit_packet3_cliprect( dev, &cmdbuf, orig_nbox )) { + if (radeon_emit_packet3_cliprect( dev, filp_priv, &cmdbuf, orig_nbox )) { DRM_ERROR("radeon_emit_packet3_clip failed\n"); return DRM_ERR(EINVAL); } @@ -2214,3 +2514,31 @@ int radeon_cp_getparam( DRM_IOCTL_ARGS ) return 0; } + +int radeon_cp_setparam( DRM_IOCTL_ARGS ) { + DRM_DEVICE; + drm_radeon_private_t *dev_priv = dev->dev_private; + drm_file_t *filp_priv; + drm_radeon_setparam_t sp; + + if ( !dev_priv ) { + DRM_ERROR( "%s called with no initialization\n", __FUNCTION__ ); + return DRM_ERR( EINVAL ); + } + + DRM_GET_PRIV_WITH_RETURN( filp_priv, filp ); + + DRM_COPY_FROM_USER_IOCTL( sp, ( drm_radeon_setparam_t* )data, + sizeof( sp ) ); + + switch( sp.param ) { + case RADEON_SETPARAM_FB_LOCATION: + filp_priv->radeon_fb_delta = dev_priv->fb_location - sp.value; + break; + default: + DRM_DEBUG( "Invalid parameter %d\n", sp.param ); + return DRM_ERR( EINVAL ); + } + + return 0; +} --- diff/drivers/char/drm/sis.h 2003-09-30 15:46:12.000000000 +0100 +++ source/drivers/char/drm/sis.h 2004-04-21 10:46:51.472757064 +0100 @@ -62,6 +62,13 @@ [DRM_IOCTL_NR(DRM_IOCTL_SIS_AGP_FREE)] = { sis_ioctl_agp_free, 1, 0 }, \ [DRM_IOCTL_NR(DRM_IOCTL_SIS_FB_INIT)] = { sis_fb_init, 1, 1 } +#define DRIVER_PCI_IDS \ + {0x1039, 0x0300, 0}, \ + {0x1039, 0x5300, 0}, \ + {0x1039, 0x6300, 0}, \ + {0x1039, 0x7300, 0}, \ + {0, 0, 0} + #define __HAVE_COUNTERS 5 /* Buffer customization: --- diff/drivers/char/drm/tdfx.h 2002-10-16 04:28:22.000000000 +0100 +++ source/drivers/char/drm/tdfx.h 2004-04-21 10:46:51.472757064 +0100 @@ -39,4 +39,22 @@ #define __HAVE_MTRR 1 #define __HAVE_CTX_BITMAP 1 +#define DRIVER_AUTHOR "VA Linux Systems Inc." + +#define DRIVER_NAME "tdfx" +#define DRIVER_DESC "3dfx Banshee/Voodoo3+" +#define DRIVER_DATE "20010216" + +#define DRIVER_MAJOR 1 +#define DRIVER_MINOR 0 +#define DRIVER_PATCHLEVEL 0 + +#define DRIVER_PCI_IDS \ + {0x121a, 0x0003, 0}, \ + {0x121a, 0x0004, 0}, \ + {0x121a, 0x0005, 0}, \ + {0x121a, 0x0007, 0}, \ + {0x121a, 0x0009, 0}, \ + {0, 0, 0} + #endif --- diff/drivers/char/drm/tdfx_drv.c 2002-10-16 04:28:22.000000000 +0100 +++ source/drivers/char/drm/tdfx_drv.c 2004-04-21 10:46:51.472757064 +0100 @@ -34,47 +34,6 @@ #include "tdfx.h" #include "drmP.h" -#define DRIVER_AUTHOR "VA Linux Systems Inc." 
- -#define DRIVER_NAME "tdfx" -#define DRIVER_DESC "3dfx Banshee/Voodoo3+" -#define DRIVER_DATE "20010216" - -#define DRIVER_MAJOR 1 -#define DRIVER_MINOR 0 -#define DRIVER_PATCHLEVEL 0 - -#ifndef PCI_VENDOR_ID_3DFX -#define PCI_VENDOR_ID_3DFX 0x121A -#endif -#ifndef PCI_DEVICE_ID_3DFX_VOODOO5 -#define PCI_DEVICE_ID_3DFX_VOODOO5 0x0009 -#endif -#ifndef PCI_DEVICE_ID_3DFX_VOODOO4 -#define PCI_DEVICE_ID_3DFX_VOODOO4 0x0007 -#endif -#ifndef PCI_DEVICE_ID_3DFX_VOODOO3_3000 /* Voodoo3 3000 */ -#define PCI_DEVICE_ID_3DFX_VOODOO3_3000 0x0005 -#endif -#ifndef PCI_DEVICE_ID_3DFX_VOODOO3_2000 /* Voodoo3 3000 */ -#define PCI_DEVICE_ID_3DFX_VOODOO3_2000 0x0004 -#endif -#ifndef PCI_DEVICE_ID_3DFX_BANSHEE -#define PCI_DEVICE_ID_3DFX_BANSHEE 0x0003 -#endif - -static drm_pci_list_t DRM(idlist)[] = { - { PCI_VENDOR_ID_3DFX, PCI_DEVICE_ID_3DFX_BANSHEE }, - { PCI_VENDOR_ID_3DFX, PCI_DEVICE_ID_3DFX_VOODOO3_2000 }, - { PCI_VENDOR_ID_3DFX, PCI_DEVICE_ID_3DFX_VOODOO3_3000 }, - { PCI_VENDOR_ID_3DFX, PCI_DEVICE_ID_3DFX_VOODOO4 }, - { PCI_VENDOR_ID_3DFX, PCI_DEVICE_ID_3DFX_VOODOO5 }, - { 0, 0 } -}; - -#define DRIVER_CARD_LIST DRM(idlist) - - #include "drm_auth.h" #include "drm_bufs.h" #include "drm_context.h" --- diff/drivers/char/keyboard.c 2004-02-18 08:54:09.000000000 +0000 +++ source/drivers/char/keyboard.c 2004-04-21 10:46:51.477756304 +0100 @@ -1066,6 +1066,9 @@ void kbd_keycode(unsigned int keycode, i } if (sysrq_down && down && !rep) { handle_sysrq(kbd_sysrq_xlate[keycode], regs, tty); +#ifdef CONFIG_KGDB_SYSRQ + sysrq_down = 0; /* in case we miss the "up" event */ +#endif return; } #endif --- diff/drivers/char/mem.c 2004-04-21 10:45:34.131514720 +0100 +++ source/drivers/char/mem.c 2004-04-21 10:46:51.478756152 +0100 @@ -26,7 +26,6 @@ #include #include -#include #ifdef CONFIG_IA64 # include @@ -39,6 +38,7 @@ extern void fbmem_init(void); extern void tapechar_init(void); #endif +#ifdef pgprot_noncached /* * Architectures vary in how they handle caching for addresses * outside of main memory. @@ -64,7 +64,8 @@ static inline int uncached_access(struct && addr >= __pa(high_memory); #elif defined(CONFIG_IA64) /* - * On ia64, we ignore O_SYNC because we cannot tolerate memory attribute aliases. + * On ia64, we ignore O_SYNC because we cannot tolerate memory + * attribute aliases. */ return !(efi_mem_attributes(addr) & EFI_MEMORY_WB); #elif defined(CONFIG_PPC64) @@ -77,14 +78,15 @@ static inline int uncached_access(struct return !page_is_ram(addr); #else /* - * Accessing memory above the top the kernel knows about or through a file pointer - * that was marked O_SYNC will be done non-cached. + * Accessing memory above the top the kernel knows about or through a + * file pointer that was marked O_SYNC will be done non-cached. */ if (file->f_flags & O_SYNC) return 1; return addr >= __pa(high_memory); #endif } +#endif #ifndef ARCH_HAS_VALID_PHYS_ADDR_RANGE static inline int valid_phys_addr_range(unsigned long addr, size_t *count) @@ -181,28 +183,24 @@ static ssize_t write_mem(struct file * f return do_write_mem(__va(p), p, buf, count, ppos); } -static int mmap_mem(struct file * file, struct vm_area_struct * vma) +static int mmap_mem(struct file *file, struct vm_area_struct *vma) { unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; - int uncached; - uncached = uncached_access(file, offset); #ifdef pgprot_noncached - if (uncached) + if (uncached_access(file, offset)) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); #endif - /* Don't try to swap out physical pages.. 
*/ - vma->vm_flags |= VM_RESERVED; - /* - * Don't dump addresses that are not real memory to a core file. + * Don't try to swap out physical pages.. + * And treat /dev/mem mappings as "IO" regions: they may not + * describe valid pageframes. */ - if (uncached) - vma->vm_flags |= VM_IO; + vma->vm_flags |= VM_RESERVED|VM_IO; - if (remap_page_range(vma, vma->vm_start, offset, vma->vm_end-vma->vm_start, - vma->vm_page_prot)) + if (remap_page_range(vma, vma->vm_start, offset, + vma->vm_end-vma->vm_start, vma->vm_page_prot)) return -EAGAIN; return 0; } --- diff/drivers/char/sysrq.c 2004-02-09 10:36:10.000000000 +0000 +++ source/drivers/char/sysrq.c 2004-04-21 10:46:51.485755088 +0100 @@ -35,6 +35,25 @@ #include #include +#ifdef CONFIG_KGDB_SYSRQ + +#define GDB_OP &kgdb_op +static void kgdb_sysrq(int key, struct pt_regs *pt_regs, struct tty_struct *tty) +{ + printk("kgdb sysrq\n"); + breakpoint(); +} + +static struct sysrq_key_op kgdb_op = { + .handler = kgdb_sysrq, + .help_msg = "kGdb|Fgdb", + .action_msg = "Debug breakpoint\n", +}; + +#else +#define GDB_OP NULL +#endif + extern void reset_vc(unsigned int); @@ -238,8 +257,8 @@ static struct sysrq_key_op *sysrq_key_ta /* c */ NULL, /* d */ NULL, /* e */ &sysrq_term_op, -/* f */ NULL, -/* g */ NULL, +/* f */ GDB_OP, +/* g */ GDB_OP, /* h */ NULL, /* i */ &sysrq_kill_op, /* j */ NULL, --- diff/drivers/char/watchdog/Kconfig 2004-04-05 12:57:07.000000000 +0100 +++ source/drivers/char/watchdog/Kconfig 2004-04-21 10:46:51.497753264 +0100 @@ -143,19 +143,6 @@ config ALIM7101_WDT Most people will say N. -config AMD7XX_TCO - tristate "AMD 766/768 TCO Timer/Watchdog" - depends on WATCHDOG && X86 && PCI - help - This is the driver for the hardware watchdog built in to the - AMD 766/768 chipsets. - This watchdog simply watches your kernel to make sure it doesn't - freeze, and if it does, it reboots your computer after a certain - amount of time. - - You can compile this driver directly into the kernel, or use - it as a module. The module will be called amd7xx_tco. 
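[Editor's note on the sysrq hook above: the kgdb patch shows the whole pattern for adding a sysrq key in this tree - fill in a struct sysrq_key_op and wire it into sysrq_key_table[]. A minimal kernel-style sketch in the same shape follows; the handler name, messages, and chosen key are illustrative, not from the patch. Later kernels also expose register_sysrq_key() so the table need not be edited directly.]

/* Sketch of a custom sysrq hook, modelled on kgdb_op above. */
#include <linux/sysrq.h>
#include <linux/kernel.h>

static void example_sysrq(int key, struct pt_regs *pt_regs,
                          struct tty_struct *tty)
{
	printk(KERN_INFO "sysrq: example handler fired for key '%c'\n", key);
}

static struct sysrq_key_op example_op = {
	.handler    = example_sysrq,
	.help_msg   = "eXample",        /* shown in the SysRq HELP line */
	.action_msg = "Example action", /* printed when the key fires */
};

/* In this tree the op is installed by editing sysrq_key_table[] directly,
 * e.g. replacing a NULL slot with &example_op, exactly as the patch does
 * with GDB_OP for the 'f' and 'g' slots. */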
- config SC520_WDT tristate "AMD Elan SC520 processor Watchdog" depends on WATCHDOG && X86 --- diff/drivers/char/watchdog/Makefile 2004-03-11 10:20:23.000000000 +0000 +++ source/drivers/char/watchdog/Makefile 2004-04-21 10:46:51.498753112 +0100 @@ -32,7 +32,6 @@ obj-$(CONFIG_ALIM1535_WDT) += alim1535_w obj-$(CONFIG_SC1200_WDT) += sc1200wdt.o obj-$(CONFIG_WAFER_WDT) += wafer5823wdt.o obj-$(CONFIG_CPU5_WDT) += cpu5wdt.o -obj-$(CONFIG_AMD7XX_TCO) += amd7xx_tco.o obj-$(CONFIG_INDYDOG) += indydog.o obj-$(CONFIG_PCIPCWATCHDOG) += pcwd_pci.o obj-$(CONFIG_USBPCWATCHDOG) += pcwd_usb.o --- diff/drivers/cpufreq/cpufreq_userspace.c 2004-02-18 08:54:09.000000000 +0000 +++ source/drivers/cpufreq/cpufreq_userspace.c 2004-04-21 10:46:51.498753112 +0100 @@ -167,7 +167,7 @@ cpufreq_procctl(ctl_table *ctl, int writ void __user *buffer, size_t *lenp) { char buf[16], *p; - int cpu = (int) ctl->extra1; + int cpu = (long) ctl->extra1; int len, left = *lenp; if (!left || (filp->f_pos && !write) || !cpu_online(cpu)) { @@ -205,7 +205,7 @@ cpufreq_sysctl(ctl_table *table, int __u void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, void **context) { - int cpu = (int) table->extra1; + int cpu = (long) table->extra1; if (!cpu_online(cpu)) return -EINVAL; --- diff/drivers/ide/ide-probe.c 2004-04-21 10:45:34.233499216 +0100 +++ source/drivers/ide/ide-probe.c 2004-04-21 10:46:51.500752808 +0100 @@ -918,8 +918,12 @@ static int ide_init_queue(ide_drive_t *d q->queuedata = HWGROUP(drive); blk_queue_segment_boundary(q, 0xffff); - if (!hwif->rqsize) - hwif->rqsize = hwif->no_lba48 ? 256 : 65536; + if (!hwif->rqsize) { + if (hwif->max_rqsize) + hwif->rqsize = hwif->max_rqsize(drive); + else + hwif->rqsize = hwif->no_lba48 ? 256 : 65536; + } if (hwif->rqsize < max_sectors) max_sectors = hwif->rqsize; blk_queue_max_sectors(q, max_sectors); --- diff/drivers/ide/ide.c 2004-04-21 10:45:34.236498760 +0100 +++ source/drivers/ide/ide.c 2004-04-21 10:46:51.500752808 +0100 @@ -606,18 +606,24 @@ EXPORT_SYMBOL(ide_hwif_release_regions); * Unregister restores the hwif structures to the default state. * This is raving bonkers. */ - -void ide_unregister (unsigned int index) +/* XXX(cw) this needs to be able to return an exit code here and the + * callers need to check for this. 
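[Editor's note on the cpufreq cast fix above: ctl->extra1 is a void * used to smuggle a CPU index, so it must be narrowed through long, which is pointer-sized on Linux's LP64 ports, rather than int, which silently truncates on 64-bit targets and draws a pointer-to-int warning. A small userspace illustration follows; the function and variable names are mine.]

/* cookie_cast.c - why a CPU index stored in a void * comes back out
 * through (long), not (int), on LP64 systems. */
#include <stdio.h>

static void show(void *extra1)
{
	/* Correct on LP64: long has the same width as a pointer. */
	long cpu_ok = (long) extra1;

	/* What the old code effectively did: the value survives only
	 * because small indices fit in 32 bits; the cast itself warns. */
	int cpu_bad = (int) (long) extra1;

	printf("as long: %ld, as int: %d\n", cpu_ok, cpu_bad);
}

int main(void)
{
	show((void *) 3L);	/* store CPU 3 directly in the pointer */
	return 0;
}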
*/ +void ide_unregister(unsigned int index) { ide_drive_t *drive; ide_hwif_t *hwif, *g; ide_hwgroup_t *hwgroup; int irq_count = 0, unit, i; - ide_hwif_t old_hwif; + ide_hwif_t *old_hwif; + + BUG_ON(index >= MAX_HWIFS); + + old_hwif = kmalloc(sizeof(*old_hwif), GFP_KERNEL|__GFP_NOFAIL); + if (!old_hwif) { + printk(KERN_ERR "ide_unregister: unable to allocate memory\n"); + return; + } - if (index >= MAX_HWIFS) - BUG(); - BUG_ON(in_interrupt()); BUG_ON(irqs_disabled()); down(&ide_cfg_sem); @@ -763,122 +769,124 @@ void ide_unregister (unsigned int index) hwif->dma_prdtable = 0; } - old_hwif = *hwif; + *old_hwif = *hwif; init_hwif_data(hwif, index); /* restore hwif data to pristine status */ init_hwif_default(hwif, index); - hwif->hwgroup = old_hwif.hwgroup; + hwif->hwgroup = old_hwif->hwgroup; - hwif->gendev.parent = old_hwif.gendev.parent; + hwif->gendev.parent = old_hwif->gendev.parent; - hwif->proc = old_hwif.proc; + hwif->proc = old_hwif->proc; - hwif->major = old_hwif.major; -// hwif->index = old_hwif.index; -// hwif->channel = old_hwif.channel; - hwif->straight8 = old_hwif.straight8; - hwif->bus_state = old_hwif.bus_state; + hwif->major = old_hwif->major; +// hwif->index = old_hwif->index; +// hwif->channel = old_hwif->channel; + hwif->straight8 = old_hwif->straight8; + hwif->bus_state = old_hwif->bus_state; - hwif->atapi_dma = old_hwif.atapi_dma; - hwif->ultra_mask = old_hwif.ultra_mask; - hwif->mwdma_mask = old_hwif.mwdma_mask; - hwif->swdma_mask = old_hwif.swdma_mask; + hwif->atapi_dma = old_hwif->atapi_dma; + hwif->ultra_mask = old_hwif->ultra_mask; + hwif->mwdma_mask = old_hwif->mwdma_mask; + hwif->swdma_mask = old_hwif->swdma_mask; - hwif->chipset = old_hwif.chipset; - hwif->hold = old_hwif.hold; + hwif->chipset = old_hwif->chipset; + hwif->hold = old_hwif->hold; #ifdef CONFIG_BLK_DEV_IDEPCI - hwif->pci_dev = old_hwif.pci_dev; - hwif->cds = old_hwif.cds; + hwif->pci_dev = old_hwif->pci_dev; + hwif->cds = old_hwif->cds; #endif /* CONFIG_BLK_DEV_IDEPCI */ #if 0 - hwif->hwifops = old_hwif.hwifops; + hwif->hwifops = old_hwif->hwifops; #else - hwif->identify = old_hwif.identify; - hwif->tuneproc = old_hwif.tuneproc; - hwif->speedproc = old_hwif.speedproc; - hwif->selectproc = old_hwif.selectproc; - hwif->reset_poll = old_hwif.reset_poll; - hwif->pre_reset = old_hwif.pre_reset; - hwif->resetproc = old_hwif.resetproc; - hwif->intrproc = old_hwif.intrproc; - hwif->maskproc = old_hwif.maskproc; - hwif->quirkproc = old_hwif.quirkproc; - hwif->busproc = old_hwif.busproc; + hwif->identify = old_hwif->identify; + hwif->tuneproc = old_hwif->tuneproc; + hwif->speedproc = old_hwif->speedproc; + hwif->selectproc = old_hwif->selectproc; + hwif->reset_poll = old_hwif->reset_poll; + hwif->pre_reset = old_hwif->pre_reset; + hwif->resetproc = old_hwif->resetproc; + hwif->intrproc = old_hwif->intrproc; + hwif->maskproc = old_hwif->maskproc; + hwif->quirkproc = old_hwif->quirkproc; + hwif->busproc = old_hwif->busproc; #endif #if 0 - hwif->pioops = old_hwif.pioops; + hwif->pioops = old_hwif->pioops; #else - hwif->ata_input_data = old_hwif.ata_input_data; - hwif->ata_output_data = old_hwif.ata_output_data; - hwif->atapi_input_bytes = old_hwif.atapi_input_bytes; - hwif->atapi_output_bytes = old_hwif.atapi_output_bytes; + hwif->ata_input_data = old_hwif->ata_input_data; + hwif->ata_output_data = old_hwif->ata_output_data; + hwif->atapi_input_bytes = old_hwif->atapi_input_bytes; + hwif->atapi_output_bytes = old_hwif->atapi_output_bytes; #endif #if 0 - hwif->dmaops = old_hwif.dmaops; + hwif->dmaops = 
old_hwif->dmaops; #else - hwif->ide_dma_read = old_hwif.ide_dma_read; - hwif->ide_dma_write = old_hwif.ide_dma_write; - hwif->ide_dma_begin = old_hwif.ide_dma_begin; - hwif->ide_dma_end = old_hwif.ide_dma_end; - hwif->ide_dma_check = old_hwif.ide_dma_check; - hwif->ide_dma_on = old_hwif.ide_dma_on; - hwif->ide_dma_off_quietly = old_hwif.ide_dma_off_quietly; - hwif->ide_dma_test_irq = old_hwif.ide_dma_test_irq; - hwif->ide_dma_host_on = old_hwif.ide_dma_host_on; - hwif->ide_dma_host_off = old_hwif.ide_dma_host_off; - hwif->ide_dma_verbose = old_hwif.ide_dma_verbose; - hwif->ide_dma_lostirq = old_hwif.ide_dma_lostirq; - hwif->ide_dma_timeout = old_hwif.ide_dma_timeout; + hwif->ide_dma_read = old_hwif->ide_dma_read; + hwif->ide_dma_write = old_hwif->ide_dma_write; + hwif->ide_dma_begin = old_hwif->ide_dma_begin; + hwif->ide_dma_end = old_hwif->ide_dma_end; + hwif->ide_dma_check = old_hwif->ide_dma_check; + hwif->ide_dma_on = old_hwif->ide_dma_on; + hwif->ide_dma_off_quietly = old_hwif->ide_dma_off_quietly; + hwif->ide_dma_test_irq = old_hwif->ide_dma_test_irq; + hwif->ide_dma_host_on = old_hwif->ide_dma_host_on; + hwif->ide_dma_host_off = old_hwif->ide_dma_host_off; + hwif->ide_dma_verbose = old_hwif->ide_dma_verbose; + hwif->ide_dma_lostirq = old_hwif->ide_dma_lostirq; + hwif->ide_dma_timeout = old_hwif->ide_dma_timeout; #endif #if 0 - hwif->iops = old_hwif.iops; + hwif->iops = old_hwif->iops; #else - hwif->OUTB = old_hwif.OUTB; - hwif->OUTBSYNC = old_hwif.OUTBSYNC; - hwif->OUTW = old_hwif.OUTW; - hwif->OUTL = old_hwif.OUTL; - hwif->OUTSW = old_hwif.OUTSW; - hwif->OUTSL = old_hwif.OUTSL; - - hwif->INB = old_hwif.INB; - hwif->INW = old_hwif.INW; - hwif->INL = old_hwif.INL; - hwif->INSW = old_hwif.INSW; - hwif->INSL = old_hwif.INSL; -#endif - - hwif->mmio = old_hwif.mmio; - hwif->rqsize = old_hwif.rqsize; - hwif->no_lba48 = old_hwif.no_lba48; + hwif->OUTB = old_hwif->OUTB; + hwif->OUTBSYNC = old_hwif->OUTBSYNC; + hwif->OUTW = old_hwif->OUTW; + hwif->OUTL = old_hwif->OUTL; + hwif->OUTSW = old_hwif->OUTSW; + hwif->OUTSL = old_hwif->OUTSL; + + hwif->INB = old_hwif->INB; + hwif->INW = old_hwif->INW; + hwif->INL = old_hwif->INL; + hwif->INSW = old_hwif->INSW; + hwif->INSL = old_hwif->INSL; +#endif + + hwif->mmio = old_hwif->mmio; + hwif->rqsize = old_hwif->rqsize; + hwif->no_lba48 = old_hwif->no_lba48; #ifndef CONFIG_BLK_DEV_IDECS - hwif->irq = old_hwif.irq; + hwif->irq = old_hwif->irq; #endif /* CONFIG_BLK_DEV_IDECS */ - hwif->dma_base = old_hwif.dma_base; - hwif->dma_master = old_hwif.dma_master; - hwif->dma_command = old_hwif.dma_command; - hwif->dma_vendor1 = old_hwif.dma_vendor1; - hwif->dma_status = old_hwif.dma_status; - hwif->dma_vendor3 = old_hwif.dma_vendor3; - hwif->dma_prdtable = old_hwif.dma_prdtable; - - hwif->dma_extra = old_hwif.dma_extra; - hwif->config_data = old_hwif.config_data; - hwif->select_data = old_hwif.select_data; - hwif->autodma = old_hwif.autodma; - hwif->udma_four = old_hwif.udma_four; - hwif->no_dsc = old_hwif.no_dsc; + hwif->dma_base = old_hwif->dma_base; + hwif->dma_master = old_hwif->dma_master; + hwif->dma_command = old_hwif->dma_command; + hwif->dma_vendor1 = old_hwif->dma_vendor1; + hwif->dma_status = old_hwif->dma_status; + hwif->dma_vendor3 = old_hwif->dma_vendor3; + hwif->dma_prdtable = old_hwif->dma_prdtable; + + hwif->dma_extra = old_hwif->dma_extra; + hwif->config_data = old_hwif->config_data; + hwif->select_data = old_hwif->select_data; + hwif->autodma = old_hwif->autodma; + hwif->udma_four = old_hwif->udma_four; + hwif->no_dsc = old_hwif->no_dsc; - 
hwif->hwif_data = old_hwif.hwif_data; + hwif->hwif_data = old_hwif->hwif_data; abort: spin_unlock_irq(&ide_lock); up(&ide_cfg_sem); + + kfree(old_hwif); } EXPORT_SYMBOL(ide_unregister); --- diff/drivers/ide/pci/siimage.c 2004-04-05 12:57:07.000000000 +0100 +++ source/drivers/ide/pci/siimage.c 2004-04-21 10:46:51.506751896 +0100 @@ -203,13 +203,12 @@ static byte siimage_ratemask (ide_drive_ else pci_read_config_byte(hwif->pci_dev, 0x8A, &scsc); - if(is_sata(hwif)) - { - if(strstr(drive->id->model, "Maxtor")) + if (is_sata(hwif)) { + if (strstr(drive->id->model, "Maxtor 4D060H3")) return 3; return 4; } - + if ((scsc & 0x30) == 0x10) /* 133 */ mode = 4; else if ((scsc & 0x30) == 0x20) /* 2xPCI */ @@ -1046,25 +1045,34 @@ static void __init init_mmio_iops_siimag hwif->mmio = 2; } -static int is_dev_seagate_sata(ide_drive_t *drive) +/* TODO firmware versions should be added - eric */ +static const char * sil_blacklist [] = { + "ST320012AS", + "ST330013AS", + "ST340017AS", + "ST360015AS", + "ST380023AS", + "ST3120023AS", + "ST340014ASL", + "ST360014ASL", + "ST380011ASL", + "ST3120022ASL", + "ST3160021ASL", +}; + +static unsigned int siimage_sata_max_rqsize(ide_drive_t *drive) { const char *s = &drive->id->model[0]; - unsigned len; + unsigned int n; - if (!drive->present) - return 0; - - len = strnlen(s, sizeof(drive->id->model)); - - if ((len > 4) && (!memcmp(s, "ST", 2))) { - if ((!memcmp(s + len - 2, "AS", 2)) || - (!memcmp(s + len - 3, "ASL", 3))) { - printk(KERN_INFO "%s: applying pessimistic Seagate " - "errata fix\n", drive->name); - return 1; + for (n = 0; n < ARRAY_SIZE(sil_blacklist); n++) + if (!memcmp(sil_blacklist[n], s, strlen(sil_blacklist[n]))) { + printk(KERN_INFO "%s: applying Seagate errata fix\n", + drive->name); + return 15; } - } - return 0; + + return 128; } /** @@ -1087,9 +1095,10 @@ static void __init init_iops_siimage (id hwif->hwif_data = 0; - hwif->rqsize = 128; - if (is_sata(hwif) && is_dev_seagate_sata(&hwif->drives[0])) - hwif->rqsize = 15; + if (is_sata(hwif) && (class_rev <= 0x01)) + hwif->max_rqsize = siimage_sata_max_rqsize; + else + hwif->rqsize = 128; if (pci_get_drvdata(dev) == NULL) return; --- diff/drivers/input/Kconfig 2003-09-30 15:46:14.000000000 +0100 +++ source/drivers/input/Kconfig 2004-04-21 10:46:51.507751744 +0100 @@ -41,9 +41,16 @@ config INPUT_MOUSEDEV module will be called mousedev. config INPUT_MOUSEDEV_PSAUX - bool "Provide legacy /dev/psaux device" if EMBEDDED + bool "Provide legacy /dev/psaux device" default y depends on INPUT_MOUSEDEV + ---help--- + Say Y here if you want your mouse to also be accessible as a char device + 10:1 - /dev/psaux. The data available through /dev/psaux is exactly + the same as the data from /dev/input/mice. + + If unsure, say Y.
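[Editor's note on the ide.c rework above: the change is purely about stack usage - ide_hwif_t is far too large to copy onto the small kernel stack, so the temporary snapshot is moved to kmalloc() and released with the kfree() added before the function returns. A condensed userspace sketch of the same pattern follows; the struct and field names are illustrative stand-ins.]

/* heap_shadow.c - replace "big_t tmp = *obj" on the stack with a
 * heap-allocated shadow copy, as ide_unregister() now does. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct big {			/* stands in for ide_hwif_t, kilobytes big */
	char state[4096];
	int irq;
};

static int reinit_preserving(struct big *obj)
{
	struct big *old;

	old = malloc(sizeof(*old));	/* kmalloc(GFP_KERNEL) in the patch */
	if (!old)
		return -1;
	*old = *obj;			/* snapshot before reinitialising */

	memset(obj, 0, sizeof(*obj));	/* init_hwif_data() equivalent */
	obj->irq = old->irq;		/* selectively restore fields */

	free(old);			/* the matching kfree() */
	return 0;
}

int main(void)
{
	struct big b = { .irq = 14 };

	printf("%d, irq=%d\n", reinit_preserving(&b), b.irq);
	return 0;
}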
+ config INPUT_MOUSEDEV_SCREEN_X int "Horizontal screen resolution" --- diff/drivers/input/keyboard/atkbd.c 2004-04-05 12:57:08.000000000 +0100 +++ source/drivers/input/keyboard/atkbd.c 2004-04-21 10:46:51.507751744 +0100 @@ -26,7 +26,6 @@ #include #include #include -#include MODULE_AUTHOR("Vojtech Pavlik "); MODULE_DESCRIPTION("AT and PS/2 keyboard driver"); @@ -81,13 +80,13 @@ static unsigned char atkbd_set2_keycode[ 82, 83, 80, 76, 77, 72, 1, 69, 87, 78, 81, 74, 55, 73, 70, 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 217,100,255, 0, 97,165, 0, 0,156, 0, 0, 0, 0, 0, 0,125, - 173,114, 0,113, 0, 0, 0,126,128, 0, 0,140, 0, 0, 0,127, + 217,100,255, 0, 97,165,172, 0,156, 0, 0, 0, 0, 0,187,125, + 173,114, 0,113, 0, 0,189,126,128, 0, 0,140, 0, 0, 0,127, 159, 0,115, 0,164, 0, 0,116,158, 0,150,166, 0, 0, 0,142, 157, 0, 0, 0, 0, 0, 0, 0,155, 0, 98, 0, 0,163, 0, 0, 226, 0, 0, 0, 0, 0, 0, 0, 0,255, 96, 0, 0, 0,143, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,107, 0,105,102, 0, 0,112, - 110,111,108,112,106,103, 0,119, 0,118,109, 0, 99,104,119, 0, + 110,111,108,112,106,103,171,119, 0,118,109, 0, 99,104,119, 0, 0, 0, 0, 65, 99, }; @@ -173,22 +172,24 @@ struct atkbd { unsigned char keycode[512]; struct input_dev dev; struct serio *serio; - struct timer_list timer; + char name[64]; char phys[32]; + unsigned short id; + unsigned char set; + unsigned int translated:1; + unsigned int extra:1; + unsigned int write:1; + unsigned char cmdbuf[4]; unsigned char cmdcnt; - unsigned char set; - unsigned char extra; - unsigned char release; - int lastkey; volatile signed char ack; unsigned char emul; - unsigned short id; - unsigned char write; - unsigned char translated; - unsigned char resend; - unsigned char bat_xl; + unsigned int resend:1; + unsigned int release:1; + unsigned int bat_xl:1; + unsigned int enabled:1; + unsigned int last; unsigned long time; }; @@ -248,6 +249,9 @@ static irqreturn_t atkbd_interrupt(struc goto out; } + if (!atkbd->enabled) + goto out; + if (atkbd->translated) { if (atkbd->emul || @@ -300,15 +304,20 @@ static irqreturn_t atkbd_interrupt(struc case ATKBD_KEY_NULL: break; case ATKBD_KEY_UNKNOWN: - printk(KERN_WARNING "atkbd.c: Unknown key %s (%s set %d, code %#x on %s).\n", - atkbd->release ? "released" : "pressed", - atkbd->translated ? "translated" : "raw", - atkbd->set, code, serio->phys); - if (atkbd->translated && atkbd->set == 2 && code == 0x7a) - printk(KERN_WARNING "atkbd.c: This is an XFree86 bug. It shouldn't access" - " hardware directly.\n"); - else - printk(KERN_WARNING "atkbd.c: Use 'setkeycodes %s%02x <keycode>' to make it known.\n", code & 0x80 ? "e0" : "", code & 0x7f); + if (data == ATKBD_RET_ACK || data == ATKBD_RET_NAK) { + printk(KERN_WARNING "atkbd.c: Spurious %s on %s. Some program, " + "like XFree86, might be trying to access hardware directly.\n", + data == ATKBD_RET_ACK ? "ACK" : "NAK", serio->phys); + } else { + printk(KERN_WARNING "atkbd.c: Unknown key %s " + "(%s set %d, code %#x on %s).\n", + atkbd->release ? "released" : "pressed", + atkbd->translated ? "translated" : "raw", + atkbd->set, code, serio->phys); + printk(KERN_WARNING "atkbd.c: Use 'setkeycodes %s%02x <keycode>' " + "to make it known.\n", + code & 0x80 ?
"e0" : "", code & 0x7f); + } break; case ATKBD_SCR_1: scroll = 1 - atkbd->release * 2; @@ -745,6 +754,8 @@ static void atkbd_connect(struct serio * atkbd->id = 0xab00; } + atkbd->enabled = 1; + if (atkbd->extra) { atkbd->dev.ledbit[0] |= BIT(LED_COMPOSE) | BIT(LED_SUSPEND) | BIT(LED_SLEEP) | BIT(LED_MUTE) | BIT(LED_MISC); sprintf(atkbd->name, "AT Set 2 Extra keyboard"); @@ -809,12 +820,12 @@ static int atkbd_reconnect(struct serio param[0] = (test_bit(LED_SCROLLL, atkbd->dev.led) ? 1 : 0) | (test_bit(LED_NUML, atkbd->dev.led) ? 2 : 0) | (test_bit(LED_CAPSL, atkbd->dev.led) ? 4 : 0); - + if (atkbd_probe(atkbd)) return -1; if (atkbd->set != atkbd_set_3(atkbd)) return -1; - + atkbd_enable(atkbd); if (atkbd_command(atkbd, param, ATKBD_CMD_SETLEDS)) --- diff/drivers/input/keyboard/lkkbd.c 2004-04-05 12:57:08.000000000 +0100 +++ source/drivers/input/keyboard/lkkbd.c 2004-04-21 10:46:51.508751592 +0100 @@ -12,7 +12,7 @@ * adaptor). * * DISCLAUNER: This works for _me_. If you break anything by using the - * information given below, I will _not_ be lieable! + * information given below, I will _not_ be liable! * * RJ11 pinout: To DB9: Or DB25: * 1 - RxD <----> Pin 3 (TxD) <-> Pin 2 (TxD) @@ -39,18 +39,18 @@ /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * + * * Should you need to contact me, the author, you can do so either by * email or by paper mail: * Jan-Benedict Glaw, Lilienstraße 16, 33790 Hörste (near Halle/Westf.), @@ -527,9 +527,7 @@ lkkbd_connect (struct serio *serio, stru if ((serio->type & SERIO_TYPE) != SERIO_RS232) return; - if (!(serio->type & SERIO_PROTO)) - return; - if ((serio->type & SERIO_PROTO) && (serio->type & SERIO_PROTO) != SERIO_LKKBD) + if ((serio->type & SERIO_PROTO) != SERIO_LKKBD) return; if (!(lk = kmalloc (sizeof (struct lkkbd), GFP_KERNEL))) --- diff/drivers/input/mouse/logips2pp.c 2004-02-09 10:36:10.000000000 +0000 +++ source/drivers/input/mouse/logips2pp.c 2004-04-21 10:46:51.509751440 +0100 @@ -63,7 +63,6 @@ void ps2pp_process_packet(struct psmouse packet[0] &= 0x0f; packet[1] = 0; packet[2] = 0; - } } @@ -76,18 +75,9 @@ void ps2pp_process_packet(struct psmouse static int ps2pp_cmd(struct psmouse *psmouse, unsigned char *param, unsigned char command) { - unsigned char d; - int i; - - if (psmouse_command(psmouse, NULL, PSMOUSE_CMD_SETSCALE11)) + if (psmouse_sliced_command(psmouse, command)) return -1; - for (i = 6; i >= 0; i -= 2) { - d = (command >> i) & 3; - if(psmouse_command(psmouse, &d, PSMOUSE_CMD_SETRES)) - return -1; - } - if (psmouse_command(psmouse, param, PSMOUSE_CMD_POLL)) return -1; @@ -99,7 +89,7 @@ static int ps2pp_cmd(struct psmouse *psm * enabled if we do nothing to it. 
Of course I put this in because I want it * disabled :P * 1 - enabled (if previously disabled, also default) - * 0/2 - disabled + * 0/2 - disabled */ static void ps2pp_set_smartscroll(struct psmouse *psmouse) @@ -113,14 +103,11 @@ static void ps2pp_set_smartscroll(struct psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); - if (psmouse_smartscroll == 1) - param[0] = 1; - else - if (psmouse_smartscroll > 2) - return; - - /* else leave param[0] == 0 to disable */ - psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); + if (psmouse_smartscroll < 2) { + /* 0 - disabled, 1 - enabled */ + param[0] = psmouse_smartscroll; + psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); + } } /* @@ -138,111 +125,128 @@ void ps2pp_set_800dpi(struct psmouse *ps psmouse_command(psmouse, ¶m, PSMOUSE_CMD_SETRES); } + +static int is_model_in_list(unsigned char model, int *model_list) +{ + int i; + + for (i = 0; model_list[i] != -1; i++) + if (model == model_list[i]) + return 1; + return 0; +} + /* - * Detect the exact model and features of a PS2++ or PS2T++ Logitech mouse or - * touchpad. + * Set up input device's properties based on the detected mouse model. */ -static int ps2pp_detect_model(struct psmouse *psmouse, unsigned char *param) +static void ps2pp_set_properties(struct psmouse *psmouse, unsigned char protocol, + unsigned char model, unsigned char buttons) { - int i; static int logitech_4btn[] = { 12, 40, 41, 42, 43, 52, 73, 80, -1 }; static int logitech_wheel[] = { 52, 53, 75, 76, 80, 81, 83, 88, 112, -1 }; - static int logitech_ps2pp[] = { 12, 13, 40, 41, 42, 43, 50, 51, 52, 53, 73, 75, - 76, 80, 81, 83, 88, 96, 97, 112, -1 }; static int logitech_mx[] = { 61, 112, -1 }; psmouse->vendor = "Logitech"; - psmouse->model = ((param[0] >> 4) & 0x07) | ((param[0] << 3) & 0x78); + psmouse->model = model; - if (param[1] < 3) + if (buttons < 3) clear_bit(BTN_MIDDLE, psmouse->dev.keybit); - if (param[1] < 2) + if (buttons < 2) clear_bit(BTN_RIGHT, psmouse->dev.keybit); - psmouse->type = PSMOUSE_PS2; + if (protocol == PSMOUSE_PS2PP) { - for (i = 0; logitech_ps2pp[i] != -1; i++) - if (logitech_ps2pp[i] == psmouse->model) - psmouse->type = PSMOUSE_PS2PP; - - if (psmouse->type == PSMOUSE_PS2PP) { - - for (i = 0; logitech_4btn[i] != -1; i++) - if (logitech_4btn[i] == psmouse->model) - set_bit(BTN_SIDE, psmouse->dev.keybit); - - for (i = 0; logitech_wheel[i] != -1; i++) - if (logitech_wheel[i] == psmouse->model) { - set_bit(REL_WHEEL, psmouse->dev.relbit); - psmouse->name = "Wheel Mouse"; - } + if (is_model_in_list(model, logitech_4btn)) + set_bit(BTN_SIDE, psmouse->dev.keybit); + + if (is_model_in_list(model, logitech_wheel)) { + set_bit(REL_WHEEL, psmouse->dev.relbit); + psmouse->name = "Wheel Mouse"; + } + + if (is_model_in_list(model, logitech_mx)) { + set_bit(BTN_SIDE, psmouse->dev.keybit); + set_bit(BTN_EXTRA, psmouse->dev.keybit); + set_bit(BTN_BACK, psmouse->dev.keybit); + set_bit(BTN_FORWARD, psmouse->dev.keybit); + set_bit(BTN_TASK, psmouse->dev.keybit); + psmouse->name = "MX Mouse"; + } + } + + if (protocol == PSMOUSE_PS2TPP) { + set_bit(REL_WHEEL, psmouse->dev.relbit); + set_bit(REL_HWHEEL, psmouse->dev.relbit); + psmouse->name = "TouchPad 3"; + } +} - for (i = 0; logitech_mx[i] != -1; i++) - if (logitech_mx[i] == psmouse->model) { - set_bit(BTN_SIDE, psmouse->dev.keybit); - set_bit(BTN_EXTRA, psmouse->dev.keybit); - set_bit(BTN_BACK, psmouse->dev.keybit); - set_bit(BTN_FORWARD, psmouse->dev.keybit); - set_bit(BTN_TASK, psmouse->dev.keybit); - 
psmouse->name = "MX Mouse"; - } /* - * Do Logitech PS2++ / PS2T++ magic init. + * Logitech magic init. Detect whether the mouse is a Logitech one + * and its exact model and try turning on extended protocol for ones + * that support it. */ - if (psmouse->model == 97) { /* TouchPad 3 */ +int ps2pp_init(struct psmouse *psmouse, int set_properties) +{ + static int logitech_ps2pp[] = { 12, 13, 40, 41, 42, 43, 50, 51, 52, 53, 73, 75, + 76, 80, 81, 83, 88, 96, 97, 112, -1 }; + unsigned char param[4]; + unsigned char protocol = PSMOUSE_PS2; + unsigned char model, buttons; - set_bit(REL_WHEEL, psmouse->dev.relbit); - set_bit(REL_HWHEEL, psmouse->dev.relbit); + param[0] = 0; + psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); + psmouse_command(psmouse, NULL, PSMOUSE_CMD_SETSCALE11); + psmouse_command(psmouse, NULL, PSMOUSE_CMD_SETSCALE11); + psmouse_command(psmouse, NULL, PSMOUSE_CMD_SETSCALE11); + param[1] = 0; + psmouse_command(psmouse, param, PSMOUSE_CMD_GETINFO); - param[0] = 0x11; param[1] = 0x04; param[2] = 0x68; /* Unprotect RAM */ + if (param[1] != 0) { + model = ((param[0] >> 4) & 0x07) | ((param[0] << 3) & 0x78); + buttons = param[1]; +/* + * Do Logitech PS2++ / PS2T++ magic init. + */ + if (model == 97) { /* Touch Pad 3 */ + + /* Unprotect RAM */ + param[0] = 0x11; param[1] = 0x04; param[2] = 0x68; psmouse_command(psmouse, param, 0x30d1); - param[0] = 0x11; param[1] = 0x05; param[2] = 0x0b; /* Enable features */ + /* Enable features */ + param[0] = 0x11; param[1] = 0x05; param[2] = 0x0b; psmouse_command(psmouse, param, 0x30d1); - param[0] = 0x11; param[1] = 0x09; param[2] = 0xc3; /* Enable PS2++ */ + /* Enable PS2++ */ + param[0] = 0x11; param[1] = 0x09; param[2] = 0xc3; psmouse_command(psmouse, param, 0x30d1); param[0] = 0; if (!psmouse_command(psmouse, param, 0x13d1) && - param[0] == 0x06 && param[1] == 0x00 && param[2] == 0x14) { - psmouse->name = "TouchPad 3"; - return PSMOUSE_PS2TPP; + param[0] == 0x06 && param[1] == 0x00 && param[2] == 0x14) { + protocol = PSMOUSE_PS2TPP; } - } else { + } else if (is_model_in_list(model, logitech_ps2pp)) { param[0] = param[1] = param[2] = 0; ps2pp_cmd(psmouse, param, 0x39); /* Magic knock */ ps2pp_cmd(psmouse, param, 0xDB); - if ((param[0] & 0x78) == 0x48 && (param[1] & 0xf3) == 0xc2 && - (param[2] & 3) == ((param[1] >> 2) & 3)) { - ps2pp_set_smartscroll(psmouse); - return PSMOUSE_PS2PP; + if ((param[0] & 0x78) == 0x48 && + (param[1] & 0xf3) == 0xc2 && + (param[2] & 0x03) == ((param[1] >> 2) & 3)) { + ps2pp_set_smartscroll(psmouse); + protocol = PSMOUSE_PS2PP; } } - } - - return 0; -} - -/* - * Logitech magic init. - */ -int ps2pp_detect(struct psmouse *psmouse) -{ - unsigned char param[4]; - param[0] = 0; - psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); - psmouse_command(psmouse, NULL, PSMOUSE_CMD_SETSCALE11); - psmouse_command(psmouse, NULL, PSMOUSE_CMD_SETSCALE11); - psmouse_command(psmouse, NULL, PSMOUSE_CMD_SETSCALE11); - param[1] = 0; - psmouse_command(psmouse, param, PSMOUSE_CMD_GETINFO); + if (set_properties) + ps2pp_set_properties(psmouse, protocol, model, buttons); + } - return param[1] != 0 ? 
ps2pp_detect_model(psmouse, param) : 0; + return protocol; } --- diff/drivers/input/mouse/logips2pp.h 2004-01-19 10:22:56.000000000 +0000 +++ source/drivers/input/mouse/logips2pp.h 2004-04-21 10:46:51.509751440 +0100 @@ -10,8 +10,9 @@ #ifndef _LOGIPS2PP_H #define _LOGIPS2PP_H -struct psmouse; + void ps2pp_process_packet(struct psmouse *psmouse); void ps2pp_set_800dpi(struct psmouse *psmouse); -int ps2pp_detect(struct psmouse *psmouse); +int ps2pp_init(struct psmouse *psmouse, int set_properties); + #endif --- diff/drivers/input/mouse/psmouse-base.c 2004-04-05 12:57:08.000000000 +0100 +++ source/drivers/input/mouse/psmouse-base.c 2004-04-21 10:46:51.510751288 +0100 @@ -43,9 +43,9 @@ int psmouse_smartscroll = 1; module_param_named(smartscroll, psmouse_smartscroll, bool, 0); MODULE_PARM_DESC(smartscroll, "Logitech Smartscroll autorepeat, 1 = enabled (default), 0 = disabled."); -unsigned int psmouse_resetafter; +static unsigned int psmouse_resetafter; module_param_named(resetafter, psmouse_resetafter, uint, 0); -MODULE_PARM_DESC(resetafter, "Reset Synaptics Touchpad after so many bad packets (0 = never)."); +MODULE_PARM_DESC(resetafter, "Reset device after so many bad packets (0 = never)."); __obsolete_setup("psmouse_noext"); __obsolete_setup("psmouse_resolution="); @@ -56,15 +56,22 @@ __obsolete_setup("psmouse_rate="); static char *psmouse_protocols[] = { "None", "PS/2", "PS2++", "PS2T++", "GenPS/2", "ImPS/2", "ImExPS/2", "SynPS/2"}; /* - * psmouse_process_packet() analyzes the PS/2 mouse packet contents and - * reports relevant events to the input module. + * psmouse_process_byte() analyzes the PS/2 data stream and reports + * relevant events to the input module once full packet has arrived. */ -static void psmouse_process_packet(struct psmouse *psmouse, struct pt_regs *regs) +static psmouse_ret_t psmouse_process_byte(struct psmouse *psmouse, struct pt_regs *regs) { struct input_dev *dev = &psmouse->dev; unsigned char *packet = psmouse->packet; + if (psmouse->pktcnt < 3 + (psmouse->type >= PSMOUSE_GENPS)) + return PSMOUSE_GOOD_DATA; + +/* + * Full packet accumulated, process it + */ + input_regs(dev, regs); /* @@ -112,6 +119,8 @@ static void psmouse_process_packet(struc input_report_rel(dev, REL_Y, packet[2] ? (int) ((packet[0] << 3) & 0x100) - (int) packet[2] : 0); input_sync(dev); + + return PSMOUSE_FULL_PACKET; } /* @@ -123,6 +132,7 @@ static irqreturn_t psmouse_interrupt(str unsigned char data, unsigned int flags, struct pt_regs *regs) { struct psmouse *psmouse = serio->private; + psmouse_ret_t rc; if (psmouse->state == PSMOUSE_IGNORE) goto out; @@ -180,7 +190,7 @@ static irqreturn_t psmouse_interrupt(str if (psmouse->pktcnt == 2) { if (psmouse->packet[1] == PSMOUSE_RET_ID) { psmouse->state = PSMOUSE_IGNORE; - serio_rescan(serio); + serio_reconnect(serio); goto out; } if (psmouse->type == PSMOUSE_SYNAPTICS) { @@ -193,19 +203,32 @@ static irqreturn_t psmouse_interrupt(str } } - if (psmouse->type == PSMOUSE_SYNAPTICS) { - /* - * The synaptics driver has its own resync logic, - * so it needs to receive all bytes one at a time. 
- */ - synaptics_process_byte(psmouse, regs); - goto out; - } + rc = psmouse->protocol_handler(psmouse, regs); - if (psmouse->pktcnt == 3 + (psmouse->type >= PSMOUSE_GENPS)) { - psmouse_process_packet(psmouse, regs); - psmouse->pktcnt = 0; - goto out; + switch (rc) { + case PSMOUSE_BAD_DATA: + printk(KERN_WARNING "psmouse.c: %s at %s lost sync at byte %d\n", + psmouse->name, psmouse->phys, psmouse->pktcnt); + psmouse->pktcnt = 0; + + if (++psmouse->out_of_sync == psmouse_resetafter) { + psmouse->state = PSMOUSE_IGNORE; + printk(KERN_NOTICE "psmouse.c: issuing reconnect request\n"); + serio_reconnect(psmouse->serio); + } + break; + + case PSMOUSE_FULL_PACKET: + psmouse->pktcnt = 0; + if (psmouse->out_of_sync) { + psmouse->out_of_sync = 0; + printk(KERN_NOTICE "psmouse.c: %s at %s - driver resynched.\n", + psmouse->name, psmouse->phys); + } + break; + + case PSMOUSE_GOOD_DATA: + break; } out: return IRQ_HANDLED; @@ -289,6 +312,30 @@ int psmouse_command(struct psmouse *psmo /* + * psmouse_sliced_command() sends an extended PS/2 command to the mouse + * using sliced syntax, understood by advanced devices, such as Logitech + * or Synaptics touchpads. The command is encoded as: + * 0xE6 0xE8 rr 0xE8 ss 0xE8 tt 0xE8 uu where (rr*64)+(ss*16)+(tt*4)+uu + * is the command. + */ +int psmouse_sliced_command(struct psmouse *psmouse, unsigned char command) +{ + int i; + + if (psmouse_command(psmouse, NULL, PSMOUSE_CMD_SETSCALE11)) + return -1; + + for (i = 6; i >= 0; i -= 2) { + unsigned char d = (command >> i) & 3; + if (psmouse_command(psmouse, &d, PSMOUSE_CMD_SETRES)) + return -1; + } + + return 0; +} + + +/* * psmouse_reset() resets the mouse into power-on state. */ int psmouse_reset(struct psmouse *psmouse) @@ -363,23 +410,23 @@ static int im_explorer_detect(struct psm * the mouse may have. */ -static int psmouse_extensions(struct psmouse *psmouse) +static int psmouse_extensions(struct psmouse *psmouse, + unsigned int max_proto, int set_properties) { int synaptics_hardware = 0; - psmouse->vendor = "Generic"; - psmouse->name = "Mouse"; - psmouse->model = 0; - /* * Try Synaptics TouchPad */ - if (psmouse_max_proto > PSMOUSE_PS2 && synaptics_detect(psmouse)) { + if (max_proto > PSMOUSE_PS2 && synaptics_detect(psmouse)) { synaptics_hardware = 1; - psmouse->vendor = "Synaptics"; - psmouse->name = "TouchPad"; - if (psmouse_max_proto > PSMOUSE_IMEX) { + if (set_properties) { + psmouse->vendor = "Synaptics"; + psmouse->name = "TouchPad"; + } + + if (max_proto > PSMOUSE_IMEX) { if (synaptics_init(psmouse) == 0) return PSMOUSE_SYNAPTICS; /* @@ -387,7 +434,7 @@ static int psmouse_extensions(struct psm * Unfortunately Logitech/Genius probes confuse some firmware versions so * we'll have to skip them. 
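[Editor's note on psmouse_sliced_command() above: the comment's formula (rr*64)+(ss*16)+(tt*4)+uu just says the command byte is split MSB-first into four 2-bit values, each delivered through a SETRES after an initial SETSCALE11. A tiny standalone check of that encoding follows; the choice of 0xDB (the PS2++ status request used in ps2pp_init above) is mine.]

/* sliced.c - reproduce the 2-bit slicing used by psmouse_sliced_command() */
#include <stdio.h>

int main(void)
{
	unsigned char command = 0xDB;	/* 3*64 + 1*16 + 2*4 + 3 = 0xDB */
	int i;

	printf("0xE6");			/* SETSCALE11 */
	for (i = 6; i >= 0; i -= 2)	/* MSB-first 2-bit chunks */
		printf(" 0xE8 %d", (command >> i) & 3);
	printf("\n");			/* -> 0xE6 0xE8 3 0xE8 1 0xE8 2 0xE8 3 */
	return 0;
}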
*/ - psmouse_max_proto = PSMOUSE_IMEX; + max_proto = PSMOUSE_IMEX; } /* * Make sure that touchpad is in relative mode, gestures (taps) are enabled @@ -395,35 +442,45 @@ static int psmouse_extensions(struct psm synaptics_reset(psmouse); } - if (psmouse_max_proto > PSMOUSE_IMEX && genius_detect(psmouse)) { - set_bit(BTN_EXTRA, psmouse->dev.keybit); - set_bit(BTN_SIDE, psmouse->dev.keybit); - set_bit(REL_WHEEL, psmouse->dev.relbit); + if (max_proto > PSMOUSE_IMEX && genius_detect(psmouse)) { + + if (set_properties) { + set_bit(BTN_EXTRA, psmouse->dev.keybit); + set_bit(BTN_SIDE, psmouse->dev.keybit); + set_bit(REL_WHEEL, psmouse->dev.relbit); + psmouse->vendor = "Genius"; + psmouse->name = "Wheel Mouse"; + } - psmouse->vendor = "Genius"; - psmouse->name = "Wheel Mouse"; return PSMOUSE_GENPS; } - if (psmouse_max_proto > PSMOUSE_IMEX) { - int type = ps2pp_detect(psmouse); - if (type) + if (max_proto > PSMOUSE_IMEX) { + int type = ps2pp_init(psmouse, set_properties); + if (type > PSMOUSE_PS2) return type; } - if (psmouse_max_proto >= PSMOUSE_IMPS && intellimouse_detect(psmouse)) { - set_bit(REL_WHEEL, psmouse->dev.relbit); + if (max_proto >= PSMOUSE_IMPS && intellimouse_detect(psmouse)) { - if (psmouse_max_proto >= PSMOUSE_IMEX && - im_explorer_detect(psmouse)) { - set_bit(BTN_SIDE, psmouse->dev.keybit); - set_bit(BTN_EXTRA, psmouse->dev.keybit); + if (set_properties) { + set_bit(REL_WHEEL, psmouse->dev.relbit); + if (!psmouse->name) + psmouse->name = "Wheel Mouse"; + } + + if (max_proto >= PSMOUSE_IMEX && im_explorer_detect(psmouse)) { + + if (set_properties) { + set_bit(BTN_SIDE, psmouse->dev.keybit); + set_bit(BTN_EXTRA, psmouse->dev.keybit); + if (!psmouse->name) + psmouse->name = "Explorer Mouse"; + } - psmouse->name = "Explorer Mouse"; return PSMOUSE_IMEX; } - psmouse->name = "Wheel Mouse"; return PSMOUSE_IMPS; } @@ -473,12 +530,7 @@ static int psmouse_probe(struct psmouse if (psmouse_command(psmouse, NULL, PSMOUSE_CMD_RESET_DIS)) printk(KERN_WARNING "psmouse.c: Failed to reset mouse on %s\n", psmouse->serio->phys); -/* - * And here we try to determine if it has any extensions over the - * basic PS/2 3-button mouse.
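[Editor's note on the protocol_handler rework above: each protocol now supplies a handler that inspects the bytes accumulated so far and answers one of three things, while psmouse_interrupt() centralises the resync and reset-after-N-bad-packets bookkeeping. A minimal handler in the same shape as psmouse_process_byte() follows; the 3-byte packet length matches the plain PS/2 case, but the sync-bit rule here is a simplified stand-in, not the driver's exact check.]

/* handler_sketch.c - the tri-state byte classifier behind psmouse_ret_t */
#include <stdio.h>

typedef enum {
	PSMOUSE_BAD_DATA,	/* stream lost sync - caller may reconnect */
	PSMOUSE_GOOD_DATA,	/* byte accepted, packet not complete yet */
	PSMOUSE_FULL_PACKET	/* whole packet accumulated - process it */
} psmouse_ret_t;

#define PKT_LEN 3		/* plain PS/2 packets are three bytes */

static psmouse_ret_t handle_byte(unsigned char *pkt, int cnt)
{
	if (cnt == 1 && !(pkt[0] & 0x08))	/* simplified sync check */
		return PSMOUSE_BAD_DATA;
	if (cnt < PKT_LEN)
		return PSMOUSE_GOOD_DATA;
	return PSMOUSE_FULL_PACKET;		/* caller resets cnt to 0 */
}

int main(void)
{
	unsigned char stream[] = { 0x09, 0x02, 0x03, 0x01 };
	unsigned char pkt[PKT_LEN];
	int cnt = 0;

	for (unsigned int i = 0; i < sizeof(stream); i++) {
		pkt[cnt++] = stream[i];
		switch (handle_byte(pkt, cnt)) {
		case PSMOUSE_BAD_DATA:    printf("resync\n"); cnt = 0; break;
		case PSMOUSE_FULL_PACKET: printf("packet\n"); cnt = 0; break;
		case PSMOUSE_GOOD_DATA:   break;
		}
	}
	return 0;
}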
- */ - - return psmouse->type = psmouse_extensions(psmouse); + return 0; } /* @@ -616,7 +668,6 @@ static void psmouse_connect(struct serio psmouse->dev.evbit[0] = BIT(EV_KEY) | BIT(EV_REL); psmouse->dev.keybit[LONG(BTN_MOUSE)] = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT); psmouse->dev.relbit[0] = BIT(REL_X) | BIT(REL_Y); - psmouse->state = PSMOUSE_CMD_MODE; psmouse->serio = serio; psmouse->dev.private = psmouse; @@ -628,13 +679,21 @@ static void psmouse_connect(struct serio return; } - if (psmouse_probe(psmouse) <= 0) { + if (psmouse_probe(psmouse) < 0) { serio_close(serio); kfree(psmouse); serio->private = NULL; return; } + psmouse->type = psmouse_extensions(psmouse, psmouse_max_proto, 1); + if (!psmouse->vendor) + psmouse->vendor = "Generic"; + if (!psmouse->name) + psmouse->name = "Mouse"; + if (!psmouse->protocol_handler) + psmouse->protocol_handler = psmouse_process_byte; + sprintf(psmouse->devname, "%s %s %s", psmouse_protocols[psmouse->type], psmouse->vendor, psmouse->name); sprintf(psmouse->phys, "%s/input0", @@ -668,27 +727,24 @@ static int psmouse_reconnect(struct seri { struct psmouse *psmouse = serio->private; struct serio_dev *dev = serio->dev; - int old_type; if (!dev || !psmouse) { printk(KERN_DEBUG "psmouse: reconnect request, but serio is disconnected, ignoring...\n"); return -1; } - old_type = psmouse->type; - psmouse->state = PSMOUSE_CMD_MODE; - psmouse->type = psmouse->acking = psmouse->cmdcnt = psmouse->pktcnt = 0; + psmouse->acking = psmouse->cmdcnt = psmouse->pktcnt = psmouse->out_of_sync = 0; if (psmouse->reconnect) { if (psmouse->reconnect(psmouse)) return -1; - } else if (psmouse_probe(psmouse) != old_type) + } else if (psmouse_probe(psmouse) < 0 || + psmouse->type != psmouse_extensions(psmouse, psmouse_max_proto, 0)) return -1; /* ok, the device type (and capabilities) match the old one, * we can continue using it, complete intialization */ - psmouse->type = old_type; psmouse_initialize(psmouse); if (psmouse->ptport) { --- diff/drivers/input/mouse/psmouse.h 2004-04-05 12:57:08.000000000 +0100 +++ source/drivers/input/mouse/psmouse.h 2004-04-21 10:46:51.510751288 +0100 @@ -22,6 +22,13 @@ #define PSMOUSE_ACTIVATED 1 #define PSMOUSE_IGNORE 2 +/* psmouse protocol handler return codes */ +typedef enum { + PSMOUSE_BAD_DATA, + PSMOUSE_GOOD_DATA, + PSMOUSE_FULL_PACKET +} psmouse_ret_t; + struct psmouse; struct psmouse_ptport { @@ -45,6 +52,7 @@ struct psmouse { unsigned char type; unsigned char model; unsigned long last; + unsigned long out_of_sync; unsigned char state; char acking; volatile char ack; @@ -52,6 +60,7 @@ struct psmouse { char devname[64]; char phys[32]; + psmouse_ret_t (*protocol_handler)(struct psmouse *psmouse, struct pt_regs *regs); int (*reconnect)(struct psmouse *psmouse); void (*disconnect)(struct psmouse *psmouse); }; @@ -65,10 +74,10 @@ struct psmouse { #define PSMOUSE_SYNAPTICS 7 int psmouse_command(struct psmouse *psmouse, unsigned char *param, int command); +int psmouse_sliced_command(struct psmouse *psmouse, unsigned char command); int psmouse_reset(struct psmouse *psmouse); extern int psmouse_smartscroll; extern unsigned int psmouse_rate; -extern unsigned int psmouse_resetafter; #endif /* _PSMOUSE_H */ --- diff/drivers/input/mouse/synaptics.c 2004-04-05 12:57:08.000000000 +0100 +++ source/drivers/input/mouse/synaptics.c 2004-04-21 10:46:51.511751136 +0100 @@ -44,33 +44,11 @@ ****************************************************************************/ /* - * Use the Synaptics extended ps/2 syntax to write a special command byte. 
- * special command: 0xE8 rr 0xE8 ss 0xE8 tt 0xE8 uu where (rr*64)+(ss*16)+(tt*4)+uu - * is the command. A 0xF3 or 0xE9 must follow (see synaptics_send_cmd - * and synaptics_mode_cmd) - */ -static int synaptics_special_cmd(struct psmouse *psmouse, unsigned char command) -{ - int i; - - if (psmouse_command(psmouse, NULL, PSMOUSE_CMD_SETSCALE11)) - return -1; - - for (i = 6; i >= 0; i -= 2) { - unsigned char d = (command >> i) & 3; - if (psmouse_command(psmouse, &d, PSMOUSE_CMD_SETRES)) - return -1; - } - - return 0; -} - -/* * Send a command to the synpatics touchpad by special commands */ static int synaptics_send_cmd(struct psmouse *psmouse, unsigned char c, unsigned char *param) { - if (synaptics_special_cmd(psmouse, c)) + if (psmouse_sliced_command(psmouse, c)) return -1; if (psmouse_command(psmouse, param, PSMOUSE_CMD_GETINFO)) return -1; @@ -84,7 +62,7 @@ static int synaptics_mode_cmd(struct psm { unsigned char param[1]; - if (synaptics_special_cmd(psmouse, mode)) + if (psmouse_sliced_command(psmouse, mode)) return -1; param[0] = SYN_PS_SET_MODE2; if (psmouse_command(psmouse, param, PSMOUSE_CMD_SETRATE)) @@ -118,17 +96,31 @@ static int synaptics_capability(struct p if (synaptics_send_cmd(psmouse, SYN_QUE_CAPABILITIES, cap)) return -1; - priv->capabilities = (cap[0]<<16) | (cap[1]<<8) | cap[2]; + priv->capabilities = (cap[0] << 16) | (cap[1] << 8) | cap[2]; priv->ext_cap = 0; if (!SYN_CAP_VALID(priv->capabilities)) return -1; - if (SYN_EXT_CAP_REQUESTS(priv->capabilities)) { + /* + * Unless capExtended is set the rest of the flags should be ignored + */ + if (!SYN_CAP_EXTENDED(priv->capabilities)) + priv->capabilities = 0; + + if (SYN_EXT_CAP_REQUESTS(priv->capabilities) >= 1) { if (synaptics_send_cmd(psmouse, SYN_QUE_EXT_CAPAB, cap)) { printk(KERN_ERR "Synaptics claims to have extended capabilities," " but I'm not able to read them."); - } else - priv->ext_cap = (cap[0]<<16) | (cap[1]<<8) | cap[2]; + } else { + priv->ext_cap = (cap[0] << 16) | (cap[1] << 8) | cap[2]; + + /* + * if nExtBtn is greater than 8 it should be considered + * invalid and treated as 0 + */ + if (SYN_CAP_MULTI_BUTTON_NO(priv->ext_cap) > 8) + priv->ext_cap &= 0xff0fff; + } } return 0; } @@ -167,11 +159,12 @@ static void print_ident(struct synaptics if (SYN_CAP_EXTENDED(priv->capabilities)) { printk(KERN_INFO " Touchpad has extended capability bits\n"); - if (SYN_CAP_MULTI_BUTTON_NO(priv->ext_cap) && - SYN_CAP_MULTI_BUTTON_NO(priv->ext_cap) <= 8) + if (SYN_CAP_MULTI_BUTTON_NO(priv->ext_cap)) printk(KERN_INFO " -> %d multi-buttons, i.e. 
besides standard buttons\n", (int)(SYN_CAP_MULTI_BUTTON_NO(priv->ext_cap))); - else if (SYN_CAP_FOUR_BUTTON(priv->capabilities)) + if (SYN_CAP_MIDDLE_BUTTON(priv->capabilities)) + printk(KERN_INFO " -> middle button\n"); + if (SYN_CAP_FOUR_BUTTON(priv->capabilities)) printk(KERN_INFO " -> four buttons\n"); if (SYN_CAP_MULTIFINGER(priv->capabilities)) printk(KERN_INFO " -> multifinger detection\n"); @@ -233,7 +226,7 @@ static int synaptics_pt_write(struct ser struct psmouse *parent = port->driver; char rate_param = SYN_PS_CLIENT_CMD; /* indicates that we want pass-through port */ - if (synaptics_special_cmd(parent, c)) + if (psmouse_sliced_command(parent, c)) return -1; if (psmouse_command(parent, &rate_param, PSMOUSE_CMD_SETRATE)) return -1; @@ -297,158 +290,6 @@ static void synaptics_pt_create(struct p } /***************************************************************************** - * Driver initialization/cleanup functions - ****************************************************************************/ - -static inline void set_abs_params(struct input_dev *dev, int axis, int min, int max, int fuzz, int flat) -{ - dev->absmin[axis] = min; - dev->absmax[axis] = max; - dev->absfuzz[axis] = fuzz; - dev->absflat[axis] = flat; - - set_bit(axis, dev->absbit); -} - -static void set_input_params(struct input_dev *dev, struct synaptics_data *priv) -{ - set_bit(EV_ABS, dev->evbit); - set_abs_params(dev, ABS_X, XMIN_NOMINAL, XMAX_NOMINAL, 0, 0); - set_abs_params(dev, ABS_Y, YMIN_NOMINAL, YMAX_NOMINAL, 0, 0); - set_abs_params(dev, ABS_PRESSURE, 0, 255, 0, 0); - set_bit(ABS_TOOL_WIDTH, dev->absbit); - - set_bit(EV_KEY, dev->evbit); - set_bit(BTN_TOUCH, dev->keybit); - set_bit(BTN_TOOL_FINGER, dev->keybit); - set_bit(BTN_TOOL_DOUBLETAP, dev->keybit); - set_bit(BTN_TOOL_TRIPLETAP, dev->keybit); - - set_bit(BTN_LEFT, dev->keybit); - set_bit(BTN_RIGHT, dev->keybit); - set_bit(BTN_FORWARD, dev->keybit); - set_bit(BTN_BACK, dev->keybit); - if (SYN_CAP_MULTI_BUTTON_NO(priv->ext_cap)) { - switch (SYN_CAP_MULTI_BUTTON_NO(priv->ext_cap) & ~0x01) { - default: - /* - * if nExtBtn is greater than 8 it should be considered - * invalid and treated as 0 - */ - break; - case 8: - set_bit(BTN_7, dev->keybit); - set_bit(BTN_6, dev->keybit); - case 6: - set_bit(BTN_5, dev->keybit); - set_bit(BTN_4, dev->keybit); - case 4: - set_bit(BTN_3, dev->keybit); - set_bit(BTN_2, dev->keybit); - case 2: - set_bit(BTN_1, dev->keybit); - set_bit(BTN_0, dev->keybit); - break; - } - } - - clear_bit(EV_REL, dev->evbit); - clear_bit(REL_X, dev->relbit); - clear_bit(REL_Y, dev->relbit); -} - -void synaptics_reset(struct psmouse *psmouse) -{ - /* reset touchpad back to relative mode, gestures enabled */ - synaptics_mode_cmd(psmouse, 0); -} - -static void synaptics_disconnect(struct psmouse *psmouse) -{ - synaptics_reset(psmouse); - kfree(psmouse->private); -} - -static int synaptics_reconnect(struct psmouse *psmouse) -{ - struct synaptics_data *priv = psmouse->private; - struct synaptics_data old_priv = *priv; - - if (!synaptics_detect(psmouse)) - return -1; - - if (synaptics_query_hardware(psmouse)) { - printk(KERN_ERR "Unable to query Synaptics hardware.\n"); - return -1; - } - - if (old_priv.identity != priv->identity || - old_priv.model_id != priv->model_id || - old_priv.capabilities != priv->capabilities || - old_priv.ext_cap != priv->ext_cap) - return -1; - - if (synaptics_set_mode(psmouse, 0)) { - printk(KERN_ERR "Unable to initialize Synaptics hardware.\n"); - return -1; - } - - return 0; -} - -int synaptics_detect(struct psmouse 
*psmouse) -{ - unsigned char param[4]; - - param[0] = 0; - - psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); - psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); - psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); - psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); - psmouse_command(psmouse, param, PSMOUSE_CMD_GETINFO); - - return param[1] == 0x47; -} - -int synaptics_init(struct psmouse *psmouse) -{ - struct synaptics_data *priv; - - psmouse->private = priv = kmalloc(sizeof(struct synaptics_data), GFP_KERNEL); - if (!priv) - return -1; - memset(priv, 0, sizeof(struct synaptics_data)); - - if (synaptics_query_hardware(psmouse)) { - printk(KERN_ERR "Unable to query Synaptics hardware.\n"); - goto init_fail; - } - - if (synaptics_set_mode(psmouse, 0)) { - printk(KERN_ERR "Unable to initialize Synaptics hardware.\n"); - goto init_fail; - } - - priv->pkt_type = SYN_MODEL_NEWABS(priv->model_id) ? SYN_NEWABS : SYN_OLDABS; - - if (SYN_CAP_EXTENDED(priv->capabilities) && SYN_CAP_PASS_THROUGH(priv->capabilities)) - synaptics_pt_create(psmouse); - - print_ident(priv); - set_input_params(&psmouse->dev, priv); - - psmouse->disconnect = synaptics_disconnect; - psmouse->reconnect = synaptics_reconnect; - - return 0; - - init_fail: - kfree(priv); - return -1; -} - -/***************************************************************************** * Functions to interpret the absolute mode packets ****************************************************************************/ @@ -471,17 +312,17 @@ static void synaptics_parse_hw_state(uns hw->left = (buf[0] & 0x01) ? 1 : 0; hw->right = (buf[0] & 0x02) ? 1 : 0; - if (SYN_CAP_EXTENDED(priv->capabilities) && - (SYN_CAP_FOUR_BUTTON(priv->capabilities))) { - hw->up = ((buf[3] & 0x01)) ? 1 : 0; - if (hw->left) - hw->up = !hw->up; - hw->down = ((buf[3] & 0x02)) ? 1 : 0; - if (hw->right) - hw->down = !hw->down; + + if (SYN_CAP_MIDDLE_BUTTON(priv->capabilities)) + hw->middle = ((buf[0] ^ buf[3]) & 0x01) ? 1 : 0; + + if (SYN_CAP_FOUR_BUTTON(priv->capabilities)) { + hw->up = ((buf[0] ^ buf[3]) & 0x01) ? 1 : 0; + hw->down = ((buf[0] ^ buf[3]) & 0x02) ? 1 : 0; } + if (SYN_CAP_MULTI_BUTTON_NO(priv->ext_cap) && - ((buf[3] & 2) ? !hw->right : hw->right)) { + ((buf[0] ^ buf[3]) & 0x02)) { switch (SYN_CAP_MULTI_BUTTON_NO(priv->ext_cap) & ~0x01) { default: /* @@ -490,17 +331,17 @@ static void synaptics_parse_hw_state(uns */ break; case 8: - hw->b7 = ((buf[5] & 0x08)) ? 1 : 0; - hw->b6 = ((buf[4] & 0x08)) ? 1 : 0; + hw->ext_buttons |= ((buf[5] & 0x08)) ? 0x80 : 0; + hw->ext_buttons |= ((buf[4] & 0x08)) ? 0x40 : 0; case 6: - hw->b5 = ((buf[5] & 0x04)) ? 1 : 0; - hw->b4 = ((buf[4] & 0x04)) ? 1 : 0; + hw->ext_buttons |= ((buf[5] & 0x04)) ? 0x20 : 0; + hw->ext_buttons |= ((buf[4] & 0x04)) ? 0x10 : 0; case 4: - hw->b3 = ((buf[5] & 0x02)) ? 1 : 0; - hw->b2 = ((buf[4] & 0x02)) ? 1 : 0; + hw->ext_buttons |= ((buf[5] & 0x02)) ? 0x08 : 0; + hw->ext_buttons |= ((buf[4] & 0x02)) ? 0x04 : 0; case 2: - hw->b1 = ((buf[5] & 0x01)) ? 1 : 0; - hw->b0 = ((buf[4] & 0x01)) ? 1 : 0; + hw->ext_buttons |= ((buf[5] & 0x01)) ? 0x02 : 0; + hw->ext_buttons |= ((buf[4] & 0x01)) ? 
0x01 : 0; } } } else { @@ -525,6 +366,7 @@ static void synaptics_process_packet(str struct synaptics_hw_state hw; int num_fingers; int finger_width; + int i; synaptics_parse_hw_state(psmouse->packet, priv, &hw); @@ -570,32 +412,20 @@ static void synaptics_process_packet(str input_report_key(dev, BTN_TOOL_DOUBLETAP, num_fingers == 2); input_report_key(dev, BTN_TOOL_TRIPLETAP, num_fingers == 3); - input_report_key(dev, BTN_LEFT, hw.left); - input_report_key(dev, BTN_RIGHT, hw.right); - input_report_key(dev, BTN_FORWARD, hw.up); - input_report_key(dev, BTN_BACK, hw.down); - if (SYN_CAP_MULTI_BUTTON_NO(priv->ext_cap)) - switch(SYN_CAP_MULTI_BUTTON_NO(priv->ext_cap) & ~0x01) { - default: - /* - * if nExtBtn is greater than 8 it should be considered - * invalid and treated as 0 - */ - break; - case 8: - input_report_key(dev, BTN_7, hw.b7); - input_report_key(dev, BTN_6, hw.b6); - case 6: - input_report_key(dev, BTN_5, hw.b5); - input_report_key(dev, BTN_4, hw.b4); - case 4: - input_report_key(dev, BTN_3, hw.b3); - input_report_key(dev, BTN_2, hw.b2); - case 2: - input_report_key(dev, BTN_1, hw.b1); - input_report_key(dev, BTN_0, hw.b0); - break; - } + input_report_key(dev, BTN_LEFT, hw.left); + input_report_key(dev, BTN_RIGHT, hw.right); + + if (SYN_CAP_MIDDLE_BUTTON(priv->capabilities)) + input_report_key(dev, BTN_MIDDLE, hw.middle); + + if (SYN_CAP_FOUR_BUTTON(priv->capabilities)) { + input_report_key(dev, BTN_FORWARD, hw.up); + input_report_key(dev, BTN_BACK, hw.down); + } + + for (i = 0; i < SYN_CAP_MULTI_BUTTON_NO(priv->ext_cap); i++) + input_report_key(dev, BTN_0 + i, hw.ext_buttons & (1 << i)); + input_sync(dev); } @@ -607,6 +437,9 @@ static int synaptics_validate_byte(unsig static unsigned char oldabs_mask[] = { 0xC0, 0x60, 0x00, 0xC0, 0x60 }; static unsigned char oldabs_rslt[] = { 0xC0, 0x00, 0x00, 0x80, 0x00 }; + if (idx < 0 || idx > 4) + return 0; + switch (pkt_type) { case SYN_NEWABS: case SYN_NEWABS_RELAXED: @@ -637,7 +470,7 @@ static unsigned char synaptics_detect_pk return SYN_NEWABS_STRICT; } -void synaptics_process_byte(struct psmouse *psmouse, struct pt_regs *regs) +static psmouse_ret_t synaptics_process_byte(struct psmouse *psmouse, struct pt_regs *regs) { struct input_dev *dev = &psmouse->dev; struct synaptics_data *priv = psmouse->private; @@ -645,11 +478,6 @@ void synaptics_process_byte(struct psmou input_regs(dev, regs); if (psmouse->pktcnt >= 6) { /* Full packet received */ - if (priv->out_of_sync) { - priv->out_of_sync = 0; - printk(KERN_NOTICE "Synaptics driver resynced.\n"); - } - if (unlikely(priv->pkt_type == SYN_NEWABS)) priv->pkt_type = synaptics_detect_pkt_type(psmouse); @@ -657,16 +485,153 @@ void synaptics_process_byte(struct psmou synaptics_pass_pt_packet(&psmouse->ptport->serio, psmouse->packet); else synaptics_process_packet(psmouse); - psmouse->pktcnt = 0; - } else if (psmouse->pktcnt && - !synaptics_validate_byte(psmouse->packet, psmouse->pktcnt - 1, priv->pkt_type)) { - printk(KERN_WARNING "Synaptics driver lost sync at byte %d\n", psmouse->pktcnt); - psmouse->pktcnt = 0; - if (++priv->out_of_sync == psmouse_resetafter) { - psmouse->state = PSMOUSE_IGNORE; - printk(KERN_NOTICE "synaptics: issuing reconnect request\n"); - serio_reconnect(psmouse->serio); - } + return PSMOUSE_FULL_PACKET; + } + + return synaptics_validate_byte(psmouse->packet, psmouse->pktcnt - 1, priv->pkt_type) ? 
+ PSMOUSE_GOOD_DATA : PSMOUSE_BAD_DATA; +} + +/***************************************************************************** + * Driver initialization/cleanup functions + ****************************************************************************/ + +static inline void set_abs_params(struct input_dev *dev, int axis, int min, int max, int fuzz, int flat) +{ + dev->absmin[axis] = min; + dev->absmax[axis] = max; + dev->absfuzz[axis] = fuzz; + dev->absflat[axis] = flat; + + set_bit(axis, dev->absbit); +} + +static void set_input_params(struct input_dev *dev, struct synaptics_data *priv) +{ + int i; + + set_bit(EV_ABS, dev->evbit); + set_abs_params(dev, ABS_X, XMIN_NOMINAL, XMAX_NOMINAL, 0, 0); + set_abs_params(dev, ABS_Y, YMIN_NOMINAL, YMAX_NOMINAL, 0, 0); + set_abs_params(dev, ABS_PRESSURE, 0, 255, 0, 0); + set_bit(ABS_TOOL_WIDTH, dev->absbit); + + set_bit(EV_KEY, dev->evbit); + set_bit(BTN_TOUCH, dev->keybit); + set_bit(BTN_TOOL_FINGER, dev->keybit); + set_bit(BTN_TOOL_DOUBLETAP, dev->keybit); + set_bit(BTN_TOOL_TRIPLETAP, dev->keybit); + + set_bit(BTN_LEFT, dev->keybit); + set_bit(BTN_RIGHT, dev->keybit); + + if (SYN_CAP_MIDDLE_BUTTON(priv->capabilities)) + set_bit(BTN_MIDDLE, dev->keybit); + + if (SYN_CAP_FOUR_BUTTON(priv->capabilities)) { + set_bit(BTN_FORWARD, dev->keybit); + set_bit(BTN_BACK, dev->keybit); + } + + for (i = 0; i < SYN_CAP_MULTI_BUTTON_NO(priv->ext_cap); i++) + set_bit(BTN_0 + i, dev->keybit); + + clear_bit(EV_REL, dev->evbit); + clear_bit(REL_X, dev->relbit); + clear_bit(REL_Y, dev->relbit); +} + +void synaptics_reset(struct psmouse *psmouse) +{ + /* reset touchpad back to relative mode, gestures enabled */ + synaptics_mode_cmd(psmouse, 0); +} + +static void synaptics_disconnect(struct psmouse *psmouse) +{ + synaptics_reset(psmouse); + kfree(psmouse->private); +} + +static int synaptics_reconnect(struct psmouse *psmouse) +{ + struct synaptics_data *priv = psmouse->private; + struct synaptics_data old_priv = *priv; + + if (!synaptics_detect(psmouse)) + return -1; + + if (synaptics_query_hardware(psmouse)) { + printk(KERN_ERR "Unable to query Synaptics hardware.\n"); + return -1; + } + + if (old_priv.identity != priv->identity || + old_priv.model_id != priv->model_id || + old_priv.capabilities != priv->capabilities || + old_priv.ext_cap != priv->ext_cap) + return -1; + + if (synaptics_set_mode(psmouse, 0)) { + printk(KERN_ERR "Unable to initialize Synaptics hardware.\n"); + return -1; + } + + return 0; +} + +int synaptics_detect(struct psmouse *psmouse) +{ + unsigned char param[4]; + + param[0] = 0; + + psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); + psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); + psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); + psmouse_command(psmouse, param, PSMOUSE_CMD_SETRES); + psmouse_command(psmouse, param, PSMOUSE_CMD_GETINFO); + + return param[1] == 0x47; +} + +int synaptics_init(struct psmouse *psmouse) +{ + struct synaptics_data *priv; + + psmouse->private = priv = kmalloc(sizeof(struct synaptics_data), GFP_KERNEL); + if (!priv) + return -1; + memset(priv, 0, sizeof(struct synaptics_data)); + + if (synaptics_query_hardware(psmouse)) { + printk(KERN_ERR "Unable to query Synaptics hardware.\n"); + goto init_fail; + } + + if (synaptics_set_mode(psmouse, 0)) { + printk(KERN_ERR "Unable to initialize Synaptics hardware.\n"); + goto init_fail; } + + priv->pkt_type = SYN_MODEL_NEWABS(priv->model_id) ? 
SYN_NEWABS : SYN_OLDABS; + + if (SYN_CAP_PASS_THROUGH(priv->capabilities)) + synaptics_pt_create(psmouse); + + print_ident(priv); + set_input_params(&psmouse->dev, priv); + + psmouse->protocol_handler = synaptics_process_byte; + psmouse->disconnect = synaptics_disconnect; + psmouse->reconnect = synaptics_reconnect; + + return 0; + + init_fail: + kfree(priv); + return -1; } + + --- diff/drivers/input/mouse/synaptics.h 2004-04-05 12:57:08.000000000 +0100 +++ source/drivers/input/mouse/synaptics.h 2004-04-21 10:46:51.512750984 +0100 @@ -9,7 +9,6 @@ #ifndef _SYNAPTICS_H #define _SYNAPTICS_H -extern void synaptics_process_byte(struct psmouse *psmouse, struct pt_regs *regs); extern int synaptics_detect(struct psmouse *psmouse); extern int synaptics_init(struct psmouse *psmouse); extern void synaptics_reset(struct psmouse *psmouse); @@ -44,13 +43,14 @@ extern void synaptics_reset(struct psmou /* synaptics capability bits */ #define SYN_CAP_EXTENDED(c) ((c) & (1 << 23)) +#define SYN_CAP_MIDDLE_BUTTON(c) ((c) & (1 << 18)) #define SYN_CAP_PASS_THROUGH(c) ((c) & (1 << 7)) #define SYN_CAP_SLEEP(c) ((c) & (1 << 4)) #define SYN_CAP_FOUR_BUTTON(c) ((c) & (1 << 3)) #define SYN_CAP_MULTIFINGER(c) ((c) & (1 << 1)) #define SYN_CAP_PALMDETECT(c) ((c) & (1 << 0)) #define SYN_CAP_VALID(c) ((((c) & 0x00ff00) >> 8) == 0x47) -#define SYN_EXT_CAP_REQUESTS(c) ((((c) & 0x700000) >> 20) == 1) +#define SYN_EXT_CAP_REQUESTS(c) (((c) & 0x700000) >> 20) #define SYN_CAP_MULTI_BUTTON_NO(ec) (((ec) & 0x00f000) >> 12) /* synaptics modes query bits */ @@ -86,18 +86,12 @@ struct synaptics_hw_state { int y; int z; int w; - int left; - int right; - int up; - int down; - int b0; - int b1; - int b2; - int b3; - int b4; - int b5; - int b6; - int b7; + unsigned int left:1; + unsigned int right:1; + unsigned int middle:1; + unsigned int up:1; + unsigned int down:1; + unsigned char ext_buttons; }; struct synaptics_data { @@ -108,7 +102,6 @@ struct synaptics_data { unsigned long int identity; /* Identification */ /* Data for normal processing */ - unsigned int out_of_sync; /* # of packets out of sync */ int old_w; /* Previous w value */ unsigned char pkt_type; /* packet type - old, new, etc */ }; --- diff/drivers/input/serio/i8042.c 2004-04-05 12:57:08.000000000 +0100 +++ source/drivers/input/serio/i8042.c 2004-04-21 10:46:51.512750984 +0100 @@ -532,8 +532,8 @@ static int __init i8042_check_mux(struct return -1; /* Workaround for broken chips which seem to support MUX, but in reality don't. */ - /* They all report version 12.10 */ - if (mux_version == 0xCA) + /* They all report version 10.12 */ + if (mux_version == 0xAC) return -1; printk(KERN_INFO "i8042.c: Detected active multiplexing controller, rev %d.%d.\n", --- diff/drivers/input/serio/serio.c 2004-04-21 10:45:34.289490704 +0100 +++ source/drivers/input/serio/serio.c 2004-04-21 10:46:51.513750832 +0100 @@ -195,9 +195,6 @@ irqreturn_t serio_interrupt(struct serio ret = serio->dev->interrupt(serio, data, flags, regs); } else { if (!flags) { - if ((serio->type == SERIO_8042 || - serio->type == SERIO_8042_XL) && (data != 0xaa)) - return ret; serio_rescan(serio); ret = IRQ_HANDLED; } --- diff/drivers/input/tsdev.c 2004-02-09 10:36:10.000000000 +0000 +++ source/drivers/input/tsdev.c 2004-04-21 10:46:51.514750680 +0100 @@ -3,9 +3,19 @@ * * Copyright (c) 2001 "Crazy" james Simmons * - * Input driver to Touchscreen device driver module. + * Compaq touchscreen protocol driver. 
The protocol emulated by this driver + is obsolete; for new programs use the tslib library which can read directly + from evdev and perform dejittering, variance filtering and calibration - + all in user space, not at kernel level. The purpose of this driver is + to allow the use of newer input drivers with old applications that use the + old /dev/h3600_ts and /dev/h3600_tsraw devices. * * Sponsored by Transvirtual Technology + * + * 09-Apr-2004: Andrew Zabolotny + * Fixed to actually work, not just output random numbers. + * Added support for both h3600_ts and h3600_tsraw protocol + * emulation. */ /* @@ -29,6 +39,8 @@ #define TSDEV_MINOR_BASE 128 #define TSDEV_MINORS 32 +/* First 16 devices are h3600_ts compatible; second 16 are h3600_tsraw */ +#define TSDEV_MINOR_MASK 15 #define TSDEV_BUFFER_SIZE 64 #include @@ -52,48 +64,84 @@ #define CONFIG_INPUT_TSDEV_SCREEN_Y 320 #endif +/* This driver emulates both protocols of the old h3600_ts and h3600_tsraw + * devices. The first one must output X/Y data in 'cooked' format, i.e. + * filtered, dejittered and calibrated. The second device just outputs raw + * data received from the hardware. + * + * This driver doesn't support filtering and dejittering; it supports only + * calibration. Filtering and dejittering must be done in the low-level + * driver, if needed, because it may gain additional benefits from knowing + * the low-level details, the nature of noise and so on. + * + * The driver precomputes a calibration matrix given the initial xres and + * yres values (quite inaccurate for most touchscreens) that will result + * in a more or less expected range of output values. The driver supports + * the TS_SET_CAL ioctl, which will replace the calibration matrix with a + * new one, presumably generated from the values taken from the raw device. + */ + MODULE_AUTHOR("James Simmons "); MODULE_DESCRIPTION("Input driver to touchscreen converter"); MODULE_LICENSE("GPL"); static int xres = CONFIG_INPUT_TSDEV_SCREEN_X; module_param(xres, uint, 0); -MODULE_PARM_DESC(xres, "Horizontal screen resolution"); +MODULE_PARM_DESC(xres, "Horizontal screen resolution (can be negative for X-mirror)"); static int yres = CONFIG_INPUT_TSDEV_SCREEN_Y; module_param(yres, uint, 0); -MODULE_PARM_DESC(yres, "Vertical screen resolution"); +MODULE_PARM_DESC(yres, "Vertical screen resolution (can be negative for Y-mirror)"); + +/* From Compaq's Touch Screen Specification version 0.2 (draft) */ +struct ts_event { + short pressure; + short x; + short y; + short millisecs; +}; + +struct ts_calibration { + int xscale; + int xtrans; + int yscale; + int ytrans; + int xyswap; +}; struct tsdev { int exist; int open; int minor; - char name[16]; + char name[8]; wait_queue_head_t wait; struct list_head list; struct input_handle handle; + int x, y, pressure; + struct ts_calibration cal; }; -/* From Compaq's Touch Screen Specification version 0.2 (draft) */ -typedef struct { - short pressure; - short x; - short y; - short millisecs; -} TS_EVENT; struct tsdev_list { struct fasync_struct *fasync; struct list_head node; struct tsdev *tsdev; int head, tail; - int oldx, oldy, pendown; - TS_EVENT event[TSDEV_BUFFER_SIZE]; + struct ts_event event[TSDEV_BUFFER_SIZE]; + int raw; }; +/* The following ioctl codes are defined ONLY for backward compatibility. + * Don't use tsdev for new development; use the tslib library instead. + * Touchscreen calibration is a fully userspace task.
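+ * + * The transform applied by the cooked (non-raw) device is x' = ((x * cal.xscale) >> 8) + cal.xtrans, and likewise for y. As a worked example with assumed values (no particular panel implied): a touchscreen reporting raw 0..1023 on X with xres = 240 gets the precomputed xscale = (240 << 8) / 1024 = 60 and xtrans = 0, so a raw reading of 1023 maps to (1023 * 60) >> 8 = 239, keeping cooked output within 0..xres-1.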
+ */ +/* Use 'f' as magic number */ +#define IOC_H3600_TS_MAGIC 'f' +#define TS_GET_CAL _IOR(IOC_H3600_TS_MAGIC, 10, struct ts_calibration) +#define TS_SET_CAL _IOW(IOC_H3600_TS_MAGIC, 11, struct ts_calibration) + static struct input_handler tsdev_handler; -static struct tsdev *tsdev_table[TSDEV_MINORS]; +static struct tsdev *tsdev_table[TSDEV_MINORS/2]; static int tsdev_fasync(int fd, struct file *file, int on) { @@ -109,13 +157,16 @@ static int tsdev_open(struct inode *inod int i = iminor(inode) - TSDEV_MINOR_BASE; struct tsdev_list *list; - if (i >= TSDEV_MINORS || !tsdev_table[i]) + if (i >= TSDEV_MINORS || !tsdev_table[i & TSDEV_MINOR_MASK]) return -ENODEV; if (!(list = kmalloc(sizeof(struct tsdev_list), GFP_KERNEL))) return -ENOMEM; memset(list, 0, sizeof(struct tsdev_list)); + list->raw = (i >= TSDEV_MINORS/2) ? 1 : 0; + + i &= TSDEV_MINOR_MASK; list->tsdev = tsdev_table[i]; list_add_tail(&list->node, &tsdev_table[i]->list); file->private_data = list; @@ -169,11 +220,13 @@ static ssize_t tsdev_read(struct file *f if (!list->tsdev->exist) return -ENODEV; - while (list->head != list->tail && retval + sizeof(TS_EVENT) <= count) { - if (copy_to_user (buffer + retval, list->event + list->tail, sizeof(TS_EVENT))) + while (list->head != list->tail && + retval + sizeof (struct ts_event) <= count) { + if (copy_to_user (buffer + retval, list->event + list->tail, + sizeof (struct ts_event))) return -EFAULT; list->tail = (list->tail + 1) & (TSDEV_BUFFER_SIZE - 1); - retval += sizeof(TS_EVENT); + retval += sizeof (struct ts_event); } return retval; @@ -193,22 +246,27 @@ static unsigned int tsdev_poll(struct fi static int tsdev_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { -/* struct tsdev_list *list = file->private_data; - struct tsdev *evdev = list->tsdev; - struct input_dev *dev = tsdev->handle.dev; - int retval; - + struct tsdev *tsdev = list->tsdev; + int retval = 0; + switch (cmd) { - case HHEHE: - return 0; - case hjff: - return 0; - default: - return 0; + case TS_GET_CAL: + if (copy_to_user ((void *)arg, &tsdev->cal, + sizeof (struct ts_calibration))) + retval = -EFAULT; + break; + case TS_SET_CAL: + if (copy_from_user (&tsdev->cal, (void *)arg, + sizeof (struct ts_calibration))) + retval = -EFAULT; + break; + default: + retval = -EINVAL; + break; } -*/ - return -EINVAL; + + return retval; } struct file_operations tsdev_fops = { @@ -227,82 +285,85 @@ static void tsdev_event(struct input_han struct tsdev *tsdev = handle->private; struct tsdev_list *list; struct timeval time; - int size; - list_for_each_entry(list, &tsdev->list, node) { - switch (type) { - case EV_ABS: - switch (code) { - case ABS_X: - if (!list->pendown) - return; - size = handle->dev->absmax[ABS_X] - handle->dev->absmin[ABS_X]; - if (size > 0) - list->oldx = ((value - handle->dev->absmin[ABS_X]) * xres / size); - else - list->oldx = ((value - handle->dev->absmin[ABS_X])); - break; - case ABS_Y: - if (!list->pendown) - return; - size = handle->dev->absmax[ABS_Y] - handle->dev->absmin[ABS_Y]; - if (size > 0) - list->oldy = ((value - handle->dev->absmin[ABS_Y]) * yres / size); - else - list->oldy = ((value - handle->dev->absmin[ABS_Y])); - break; - case ABS_PRESSURE: - list->pendown = ((value > handle->dev-> absmin[ABS_PRESSURE])) ? 
- value - handle->dev->absmin[ABS_PRESSURE] : 0; - break; - } + switch (type) { + case EV_ABS: + switch (code) { + case ABS_X: + tsdev->x = value; break; + case ABS_Y: + tsdev->y = value; + break; + case ABS_PRESSURE: + if (value > handle->dev->absmax[ABS_PRESSURE]) + value = handle->dev->absmax[ABS_PRESSURE]; + value -= handle->dev->absmin[ABS_PRESSURE]; + if (value < 0) + value = 0; + tsdev->pressure = value; + break; + } + break; - case EV_REL: - switch (code) { - case REL_X: - if (!list->pendown) - return; - list->oldx += value; - if (list->oldx < 0) - list->oldx = 0; - else if (list->oldx > xres) - list->oldx = xres; + case EV_REL: + switch (code) { + case REL_X: + tsdev->x += value; + if (tsdev->x < 0) + tsdev->x = 0; + else if (tsdev->x > xres) + tsdev->x = xres; + break; + case REL_Y: + tsdev->y += value; + if (tsdev->y < 0) + tsdev->y = 0; + else if (tsdev->y > yres) + tsdev->y = yres; + break; + } + break; + + case EV_KEY: + if (code == BTN_TOUCH || code == BTN_MOUSE) { + switch (value) { + case 0: + tsdev->pressure = 0; break; - case REL_Y: - if (!list->pendown) - return; - list->oldy += value; - if (list->oldy < 0) - list->oldy = 0; - else if (list->oldy > xres) - list->oldy = xres; + case 1: + if (!tsdev->pressure) + tsdev->pressure = 1; break; } - break; - - case EV_KEY: - if (code == BTN_TOUCH || code == BTN_MOUSE) { - switch (value) { - case 0: - list->pendown = 0; - break; - case 1: - if (!list->pendown) - list->pendown = 1; - break; - case 2: - return; - } - } else - return; - break; } + break; + } + + if (type != EV_SYN || code != SYN_REPORT) + return; + + list_for_each_entry(list, &tsdev->list, node) { + int x, y, tmp; + do_gettimeofday(&time); list->event[list->head].millisecs = time.tv_usec / 100; - list->event[list->head].pressure = list->pendown; - list->event[list->head].x = list->oldx; - list->event[list->head].y = list->oldy; + list->event[list->head].pressure = tsdev->pressure; + + x = tsdev->x; + y = tsdev->y; + + /* Calibration */ + if (!list->raw) { + x = ((x * tsdev->cal.xscale) >> 8) + tsdev->cal.xtrans; + y = ((y * tsdev->cal.yscale) >> 8) + tsdev->cal.ytrans; + if (tsdev->cal.xyswap) { + tmp = x; x = y; y = tmp; + } + } + + list->event[list->head].x = x; + list->event[list->head].y = y; list->head = (list->head + 1) & (TSDEV_BUFFER_SIZE - 1); kill_fasync(&list->fasync, SIGIO, POLL_IN); } @@ -314,11 +375,11 @@ static struct input_handle *tsdev_connec struct input_device_id *id) { struct tsdev *tsdev; - int minor; + int minor, delta; - for (minor = 0; minor < TSDEV_MINORS && tsdev_table[minor]; + for (minor = 0; minor < TSDEV_MINORS/2 && tsdev_table[minor]; minor++); - if (minor == TSDEV_MINORS) { + if (minor >= TSDEV_MINORS/2) { printk(KERN_ERR "tsdev: You have way too many touchscreens\n"); return NULL; @@ -340,10 +401,25 @@ static struct input_handle *tsdev_connec tsdev->handle.handler = handler; tsdev->handle.private = tsdev; + /* Precompute the rough calibration matrix */ + delta = dev->absmax [ABS_X] - dev->absmin [ABS_X] + 1; + if (delta == 0) + delta = 1; + tsdev->cal.xscale = (xres << 8) / delta; + tsdev->cal.xtrans = - ((dev->absmin [ABS_X] * tsdev->cal.xscale) >> 8); + + delta = dev->absmax [ABS_Y] - dev->absmin [ABS_Y] + 1; + if (delta == 0) + delta = 1; + tsdev->cal.yscale = (yres << 8) / delta; + tsdev->cal.ytrans = - ((dev->absmin [ABS_Y] * tsdev->cal.yscale) >> 8); + tsdev_table[minor] = tsdev; - + devfs_mk_cdev(MKDEV(INPUT_MAJOR, TSDEV_MINOR_BASE + minor), S_IFCHR|S_IRUGO|S_IWUSR, "input/ts%d", minor); + devfs_mk_cdev(MKDEV(INPUT_MAJOR, 
TSDEV_MINOR_BASE + minor + TSDEV_MINORS/2), + S_IFCHR|S_IRUGO|S_IWUSR, "input/tsraw%d", minor); class_simple_device_add(input_class, MKDEV(INPUT_MAJOR, TSDEV_MINOR_BASE + minor), dev->dev, "ts%d", minor); @@ -362,6 +438,7 @@ static void tsdev_disconnect(struct inpu wake_up_interruptible(&tsdev->wait); } else tsdev_free(tsdev); + devfs_remove("input/tsraw%d", tsdev->minor); } static struct input_device_id tsdev_ids[] = { @@ -379,6 +456,12 @@ static struct input_device_id tsdev_ids[ .absbit = { BIT(ABS_X) | BIT(ABS_Y) }, },/* A tablet like device, at least touch detection, two absolute axes */ + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, + .evbit = { BIT(EV_ABS) }, + .absbit = { BIT(ABS_X) | BIT(ABS_Y) | BIT(ABS_PRESSURE) }, + },/* A tablet like device with several gradations of pressure */ + {},/* Terminating entry */ }; --- diff/drivers/md/dm.c 2004-04-21 10:45:34.302488728 +0100 +++ source/drivers/md/dm.c 2004-04-21 10:46:51.515750528 +0100 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -46,6 +47,7 @@ struct target_io { */ #define DMF_BLOCK_IO 0 #define DMF_SUSPENDED 1 +#define DMF_FS_LOCKED 2 struct mapped_device { struct rw_semaphore lock; @@ -80,6 +82,11 @@ struct mapped_device { */ uint32_t event_nr; wait_queue_head_t eventq; + + /* + * freeze/thaw support requires holding onto a super block + */ + struct super_block *frozen_sb; }; #define MIN_IOS 256 @@ -885,6 +892,52 @@ int dm_swap_table(struct mapped_device * } /* + * Functions to lock and unlock any filesystem running on the + * device. + */ +static int __lock_fs(struct mapped_device *md) +{ + struct block_device *bdev; + + if (test_and_set_bit(DMF_FS_LOCKED, &md->flags)) + return 0; + + bdev = bdget_disk(md->disk, 0); + if (!bdev) { + DMWARN("bdget failed in __lock_fs"); + return -ENOMEM; + } + + WARN_ON(md->frozen_sb); + md->frozen_sb = freeze_bdev(bdev); + /* don't bdput right now, we don't want the bdev + * to go away while it is locked. We'll bdput + * in __unlock_fs + */ + return 0; +} + +static int __unlock_fs(struct mapped_device *md) +{ + struct block_device *bdev; + + if (!test_and_clear_bit(DMF_FS_LOCKED, &md->flags)) + return 0; + + bdev = bdget_disk(md->disk, 0); + if (!bdev) { + DMWARN("bdget failed in __unlock_fs"); + return -ENOMEM; + } + + thaw_bdev(bdev, md->frozen_sb); + md->frozen_sb = NULL; + bdput(bdev); + bdput(bdev); + return 0; +} + +/* * We need to be able to change a mapping table under a mounted * filesystem. For example we might want to move some data in * the background. Before the table can be swapped with @@ -896,13 +949,27 @@ int dm_suspend(struct mapped_device *md) struct dm_table *map; DECLARE_WAITQUEUE(wait, current); - down_write(&md->lock); + /* Flush I/O to the device. */ + down_read(&md->lock); + if (test_bit(DMF_BLOCK_IO, &md->flags)) { + up_read(&md->lock); + return -EINVAL; + } + + __lock_fs(md); + up_read(&md->lock); /* * First we set the BLOCK_IO flag so no more ios will be * mapped. */ + down_write(&md->lock); if (test_bit(DMF_BLOCK_IO, &md->flags)) { + /* + * If we get here we know another thread is + * trying to suspend as well, so we leave the fs + * locked for this thread. + */ up_write(&md->lock); return -EINVAL; } @@ -937,6 +1004,7 @@ int dm_suspend(struct mapped_device *md) /* were we interrupted ?
*/ if (atomic_read(&md->pending)) { + __unlock_fs(md); clear_bit(DMF_BLOCK_IO, &md->flags); up_write(&md->lock); return -EINTR; @@ -974,6 +1042,7 @@ int dm_resume(struct mapped_device *md) def = bio_list_get(&md->deferred); __flush_deferred_io(md, def); up_write(&md->lock); + __unlock_fs(md); dm_table_unplug_all(map); dm_table_put(map); --- diff/drivers/net/8139too.c 2004-04-21 10:45:34.383476416 +0100 +++ source/drivers/net/8139too.c 2004-04-21 10:46:51.516750376 +0100 @@ -1677,11 +1677,17 @@ static void rtl8139_tx_timeout (struct n u8 tmp8; unsigned long flags; - DPRINTK ("%s: Transmit timeout, status %2.2x %4.4x " - "media %2.2x.\n", dev->name, - RTL_R8 (ChipCmd), - RTL_R16 (IntrStatus), - RTL_R8 (MediaStatus)); + printk (KERN_DEBUG "%s: Transmit timeout, status %2.2x %4.4x %4.4x " + "media %2.2x.\n", dev->name, RTL_R8 (ChipCmd), + RTL_R16(IntrStatus), RTL_R16(IntrMask), RTL_R8(MediaStatus)); + /* Emit info to figure out what went wrong. */ + printk (KERN_DEBUG "%s: Tx queue start entry %ld dirty entry %ld.\n", + dev->name, tp->cur_tx, tp->dirty_tx); + for (i = 0; i < NUM_TX_DESC; i++) + printk (KERN_DEBUG "%s: Tx descriptor %d is %8.8lx.%s\n", + dev->name, i, RTL_R32 (TxStatus0 + (i * 4)), + i == tp->dirty_tx % NUM_TX_DESC ? + " (queue head)" : ""); tp->xstats.tx_timeouts++; @@ -1694,15 +1700,6 @@ static void rtl8139_tx_timeout (struct n /* Disable interrupts by clearing the interrupt mask. */ RTL_W16 (IntrMask, 0x0000); - /* Emit info to figure out what went wrong. */ - printk (KERN_DEBUG "%s: Tx queue start entry %ld dirty entry %ld.\n", - dev->name, tp->cur_tx, tp->dirty_tx); - for (i = 0; i < NUM_TX_DESC; i++) - printk (KERN_DEBUG "%s: Tx descriptor %d is %8.8lx.%s\n", - dev->name, i, RTL_R32 (TxStatus0 + (i * 4)), - i == tp->dirty_tx % NUM_TX_DESC ? - " (queue head)" : ""); - /* Stop a shared interrupt from scavenging while we are. 
*/ spin_lock_irqsave (&tp->lock, flags); rtl8139_tx_clear (tp); @@ -1714,7 +1711,6 @@ static void rtl8139_tx_timeout (struct n netif_wake_queue (dev); } spin_unlock(&tp->rx_lock); - } --- diff/drivers/net/82596.c 2004-04-05 12:57:08.000000000 +0100 +++ source/drivers/net/82596.c 2004-04-21 10:46:51.517750224 +0100 @@ -58,7 +58,6 @@ #include #include #include -#include static char version[] __initdata = "82596.c $Revision: 1.5 $\n"; --- diff/drivers/net/Makefile 2004-04-21 10:45:34.386475960 +0100 +++ source/drivers/net/Makefile 2004-04-21 10:46:51.523749312 +0100 @@ -189,4 +189,6 @@ obj-$(CONFIG_NET_TULIP) += tulip/ obj-$(CONFIG_HAMRADIO) += hamradio/ obj-$(CONFIG_IRDA) += irda/ +# Must come after all NICs that might use them obj-$(CONFIG_NETCONSOLE) += netconsole.o +obj-$(CONFIG_KGDB) += kgdb_eth.o --- diff/drivers/net/acenic.c 2004-03-11 10:20:25.000000000 +0000 +++ source/drivers/net/acenic.c 2004-04-21 10:46:51.519749920 +0100 @@ -131,7 +131,6 @@ #define PCI_DEVICE_ID_SGI_ACENIC 0x0009 #endif -#if LINUX_VERSION_CODE >= 0x20400 static struct pci_device_id acenic_pci_tbl[] = { { PCI_VENDOR_ID_ALTEON, PCI_DEVICE_ID_ALTEON_ACENIC_FIBRE, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_NETWORK_ETHERNET << 8, 0xffff00, }, @@ -156,37 +155,6 @@ static struct pci_device_id acenic_pci_t { } }; MODULE_DEVICE_TABLE(pci, acenic_pci_tbl); -#endif - - -#ifndef MODULE_LICENSE -#define MODULE_LICENSE(a) -#endif - -#ifndef wmb -#define wmb() mb() -#endif - -#ifndef __exit -#define __exit -#endif - -#ifndef __devinit -#define __devinit __init -#endif - -#ifndef SMP_CACHE_BYTES -#define SMP_CACHE_BYTES L1_CACHE_BYTES -#endif - -#ifndef SET_MODULE_OWNER -#define SET_MODULE_OWNER(dev) do{} while(0) -#define ACE_MOD_INC_USE_COUNT MOD_INC_USE_COUNT -#define ACE_MOD_DEC_USE_COUNT MOD_DEC_USE_COUNT -#else -#define ACE_MOD_INC_USE_COUNT do{} while(0) -#define ACE_MOD_DEC_USE_COUNT do{} while(0) -#endif #ifndef SET_NETDEV_DEV #define SET_NETDEV_DEV(net, pdev) do{} while(0) @@ -198,151 +166,8 @@ MODULE_DEVICE_TABLE(pci, acenic_pci_tbl) #define ace_sync_irq(irq) synchronize_irq() #endif -#if LINUX_VERSION_CODE < 0x2051e -#define local_irq_save(flags) do{__save_flags(flags) ; \ - __cli();} while(0) -#define local_irq_restore(flags) __restore_flags(flags) -#endif - -#if (LINUX_VERSION_CODE < 0x02030d) -#define pci_resource_start(dev, bar) dev->base_address[bar] -#elif (LINUX_VERSION_CODE < 0x02032c) -#define pci_resource_start(dev, bar) dev->resource[bar].start -#endif - -#if (LINUX_VERSION_CODE < 0x02030e) -#define net_device device -#endif - - -#if (LINUX_VERSION_CODE < 0x02032a) -typedef u32 dma_addr_t; - -static inline void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, - dma_addr_t *dma_handle) -{ - void *virt_ptr; - - virt_ptr = kmalloc(size, GFP_KERNEL); - if (!virt_ptr) - return NULL; - *dma_handle = virt_to_bus(virt_ptr); - return virt_ptr; -} - -#define pci_free_consistent(cookie, size, ptr, dma_ptr) kfree(ptr) -#define pci_map_page(cookie, page, off, size, dir) \ - virt_to_bus(page_address(page)+(off)) -#define pci_unmap_page(cookie, address, size, dir) -#define pci_set_dma_mask(dev, mask) \ - (((u64)(mask) & 0xffffffff00000000) == 0 ? 0 : -EIO) -#define pci_dma_supported(dev, mask) \ - (((u64)(mask) & 0xffffffff00000000) == 0 ? 1 : 0) - -#elif (LINUX_VERSION_CODE < 0x02040d) - -/* - * 2.4.13 introduced pci_map_page()/pci_unmap_page() - for 2.4.12 and prior, - * fall back on pci_map_single()/pci_unnmap_single(). 
- * - * We are guaranteed that the page is mapped at this point since - * pci_map_page() is only used upon valid struct skb's. - */ -static inline dma_addr_t -pci_map_page(struct pci_dev *cookie, struct page *page, unsigned long off, - size_t size, int dir) -{ - void *page_virt; - - page_virt = page_address(page); - if (!page_virt) - BUG(); - return pci_map_single(cookie, (page_virt + off), size, dir); -} -#define pci_unmap_page(cookie, dma_addr, size, dir) \ - pci_unmap_single(cookie, dma_addr, size, dir) -#endif - -#if (LINUX_VERSION_CODE < 0x020412) -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) -#define pci_unmap_addr(PTR, ADDR_NAME) 0 -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do{} while(0) -#define pci_unmap_len(PTR, LEN_NAME) 0 -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do{} while(0) -#endif - - -#if (LINUX_VERSION_CODE < 0x02032b) -/* - * SoftNet - * - * For pre-softnet kernels we need to tell the upper layer not to - * re-enter start_xmit() while we are in there. However softnet - * guarantees not to enter while we are in there so there is no need - * to do the netif_stop_queue() dance unless the transmit queue really - * gets stuck. This should also improve performance according to tests - * done by Aman Singla. - */ -#define dev_kfree_skb_irq(a) dev_kfree_skb(a) -#define netif_wake_queue(dev) clear_bit(0, &dev->tbusy) -#define netif_stop_queue(dev) set_bit(0, &dev->tbusy) -#define late_stop_netif_stop_queue(dev) do{} while(0) -#define early_stop_netif_stop_queue(dev) test_and_set_bit(0,&dev->tbusy) -#define early_stop_netif_wake_queue(dev) netif_wake_queue(dev) - -static inline void netif_start_queue(struct net_device *dev) -{ - dev->tbusy = 0; - dev->interrupt = 0; - dev->start = 1; -} - -#define ace_mark_net_bh() mark_bh(NET_BH) -#define netif_queue_stopped(dev) dev->tbusy -#define netif_running(dev) dev->start -#define ace_if_down(dev) do{dev->start = 0;} while(0) - -#define tasklet_struct tq_struct -static inline void tasklet_schedule(struct tasklet_struct *tasklet) -{ - queue_task(tasklet, &tq_immediate); - mark_bh(IMMEDIATE_BH); -} - -static inline void tasklet_init(struct tasklet_struct *tasklet, - void (*func)(unsigned long), - unsigned long data) -{ - tasklet->next = NULL; - tasklet->sync = 0; - tasklet->routine = (void (*)(void *))func; - tasklet->data = (void *)data; -} -#define tasklet_kill(tasklet) do{} while(0) -#else -#define late_stop_netif_stop_queue(dev) netif_stop_queue(dev) -#define early_stop_netif_stop_queue(dev) 0 -#define early_stop_netif_wake_queue(dev) do{} while(0) -#define ace_mark_net_bh() do{} while(0) -#define ace_if_down(dev) do{} while(0) -#endif - -#if (LINUX_VERSION_CODE >= 0x02031b) -#define NEW_NETINIT -#define ACE_PROBE_ARG void -#else -#define ACE_PROBE_ARG struct net_device *dev -#endif - -#ifndef min_t -#define min_t(type,a,b) (((a)<(b))?(a):(b)) -#endif - -#ifndef ARCH_HAS_PREFETCHW -#ifndef prefetchw -#define prefetchw(x) do{} while(0) -#endif +#ifndef offset_in_page +#define offset_in_page(ptr) ((unsigned long)(ptr) & ~PAGE_MASK) #endif #define ACE_MAX_MOD_PARMS 8 @@ -595,407 +420,323 @@ static int max_rx_desc[ACE_MAX_MOD_PARMS static int tx_ratio[ACE_MAX_MOD_PARMS]; static int dis_pci_mem_inval[ACE_MAX_MOD_PARMS] = {1, 1, 1, 1, 1, 1, 1, 1}; +MODULE_AUTHOR("Jes Sorensen "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("AceNIC/3C985/GA620 Gigabit Ethernet driver"); +MODULE_PARM(link, "1-" __MODULE_STRING(8) "i"); +MODULE_PARM(trace, "1-" __MODULE_STRING(8) "i"); +MODULE_PARM(tx_coal_tick, 
"1-" __MODULE_STRING(8) "i"); +MODULE_PARM(max_tx_desc, "1-" __MODULE_STRING(8) "i"); +MODULE_PARM(rx_coal_tick, "1-" __MODULE_STRING(8) "i"); +MODULE_PARM(max_rx_desc, "1-" __MODULE_STRING(8) "i"); +MODULE_PARM(tx_ratio, "1-" __MODULE_STRING(8) "i"); +MODULE_PARM_DESC(link, "AceNIC/3C985/NetGear link state"); +MODULE_PARM_DESC(trace, "AceNIC/3C985/NetGear firmware trace level"); +MODULE_PARM_DESC(tx_coal_tick, "AceNIC/3C985/GA620 max clock ticks to wait from first tx descriptor arrives"); +MODULE_PARM_DESC(max_tx_desc, "AceNIC/3C985/GA620 max number of transmit descriptors to wait"); +MODULE_PARM_DESC(rx_coal_tick, "AceNIC/3C985/GA620 max clock ticks to wait from first rx descriptor arrives"); +MODULE_PARM_DESC(max_rx_desc, "AceNIC/3C985/GA620 max number of receive descriptors to wait"); +MODULE_PARM_DESC(tx_ratio, "AceNIC/3C985/GA620 ratio of NIC memory used for TX/RX descriptors (range 0-63)"); + + static char version[] __initdata = "acenic.c: v0.92 08/05/2002 Jes Sorensen, linux-acenic@SunSITE.dk\n" " http://home.cern.ch/~jes/gige/acenic.html\n"; -static struct net_device *root_dev; - -static int probed __initdata = 0; - - -int __devinit acenic_probe (ACE_PROBE_ARG) +static int __devinit acenic_probe_one(struct pci_dev *pdev, + const struct pci_device_id *id) { -#ifdef NEW_NETINIT struct net_device *dev; -#endif struct ace_private *ap; - struct pci_dev *pdev = NULL; - int boards_found = 0; - int version_disp; - - if (probed) - return -ENODEV; - probed++; - - version_disp = 0; - - while ((pdev = pci_find_class(PCI_CLASS_NETWORK_ETHERNET<<8, pdev))) { - - if (!((pdev->vendor == PCI_VENDOR_ID_ALTEON) && - ((pdev->device == PCI_DEVICE_ID_ALTEON_ACENIC_FIBRE) || - (pdev->device == PCI_DEVICE_ID_ALTEON_ACENIC_COPPER)))&& - !((pdev->vendor == PCI_VENDOR_ID_3COM) && - (pdev->device == PCI_DEVICE_ID_3COM_3C985)) && - !((pdev->vendor == PCI_VENDOR_ID_NETGEAR) && - ((pdev->device == PCI_DEVICE_ID_NETGEAR_GA620) || - (pdev->device == PCI_DEVICE_ID_NETGEAR_GA620T))) && - /* - * Farallon used the DEC vendor ID on their cards by - * mistake for a while - */ - !((pdev->vendor == PCI_VENDOR_ID_DEC) && - (pdev->device == PCI_DEVICE_ID_FARALLON_PN9000SX)) && - !((pdev->vendor == PCI_VENDOR_ID_ALTEON) && - (pdev->device == PCI_DEVICE_ID_FARALLON_PN9100T)) && - !((pdev->vendor == PCI_VENDOR_ID_SGI) && - (pdev->device == PCI_DEVICE_ID_SGI_ACENIC))) - continue; - - dev = alloc_etherdev(sizeof(struct ace_private)); - if (dev == NULL) { - printk(KERN_ERR "acenic: Unable to allocate " - "net_device structure!\n"); - break; - } + static int boards_found; - SET_MODULE_OWNER(dev); - SET_NETDEV_DEV(dev, &pdev->dev); + dev = alloc_etherdev(sizeof(struct ace_private)); + if (dev == NULL) { + printk(KERN_ERR "acenic: Unable to allocate " + "net_device structure!\n"); + return -ENOMEM; + } - ap = dev->priv; - ap->pdev = pdev; + SET_MODULE_OWNER(dev); + SET_NETDEV_DEV(dev, &pdev->dev); - dev->open = &ace_open; - dev->hard_start_xmit = &ace_start_xmit; - dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; + ap = dev->priv; + ap->pdev = pdev; + + dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; #if ACENIC_DO_VLAN - dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX; - dev->vlan_rx_register = ace_vlan_rx_register; - dev->vlan_rx_kill_vid = ace_vlan_rx_kill_vid; -#endif - if (1) { - static void ace_watchdog(struct net_device *dev); - dev->tx_timeout = &ace_watchdog; - dev->watchdog_timeo = 5*HZ; - } - dev->stop = &ace_close; - dev->get_stats = &ace_get_stats; - dev->set_multicast_list = &ace_set_multicast_list; - 
dev->do_ioctl = &ace_ioctl; - dev->set_mac_address = &ace_set_mac_addr; - dev->change_mtu = &ace_change_mtu; + dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX; + dev->vlan_rx_register = ace_vlan_rx_register; + dev->vlan_rx_kill_vid = ace_vlan_rx_kill_vid; +#endif + if (1) { + static void ace_watchdog(struct net_device *dev); + dev->tx_timeout = &ace_watchdog; + dev->watchdog_timeo = 5*HZ; + } - /* display version info if adapter is found */ - if (!version_disp) - { - /* set display flag to TRUE so that */ - /* we only display this string ONCE */ - version_disp = 1; - printk(version); - } + dev->open = &ace_open; + dev->stop = &ace_close; + dev->hard_start_xmit = &ace_start_xmit; + dev->get_stats = &ace_get_stats; + dev->set_multicast_list = &ace_set_multicast_list; + dev->do_ioctl = &ace_ioctl; + dev->set_mac_address = &ace_set_mac_addr; + dev->change_mtu = &ace_change_mtu; - if (pci_enable_device(pdev)) { - free_netdev(dev); - continue; - } + /* we only display this string ONCE */ + if (!boards_found) + printk(version); - /* - * Enable master mode before we start playing with the - * pci_command word since pci_set_master() will modify - * it. - */ - pci_set_master(pdev); + if (pci_enable_device(pdev)) + goto fail_free_netdev; - pci_read_config_word(pdev, PCI_COMMAND, &ap->pci_command); + /* + * Enable master mode before we start playing with the + * pci_command word since pci_set_master() will modify + * it. + */ + pci_set_master(pdev); - /* OpenFirmware on Mac's does not set this - DOH.. */ - if (!(ap->pci_command & PCI_COMMAND_MEMORY)) { - printk(KERN_INFO "%s: Enabling PCI Memory Mapped " - "access - was not enabled by BIOS/Firmware\n", - dev->name); - ap->pci_command = ap->pci_command | PCI_COMMAND_MEMORY; - pci_write_config_word(ap->pdev, PCI_COMMAND, - ap->pci_command); - wmb(); - } + pci_read_config_word(pdev, PCI_COMMAND, &ap->pci_command); - pci_read_config_byte(pdev, PCI_LATENCY_TIMER, - &ap->pci_latency); - if (ap->pci_latency <= 0x40) { - ap->pci_latency = 0x40; - pci_write_config_byte(pdev, PCI_LATENCY_TIMER, - ap->pci_latency); - } + /* OpenFirmware on Mac's does not set this - DOH.. */ + if (!(ap->pci_command & PCI_COMMAND_MEMORY)) { + printk(KERN_INFO "%s: Enabling PCI Memory Mapped " + "access - was not enabled by BIOS/Firmware\n", + dev->name); + ap->pci_command = ap->pci_command | PCI_COMMAND_MEMORY; + pci_write_config_word(ap->pdev, PCI_COMMAND, + ap->pci_command); + wmb(); + } - /* - * Remap the regs into kernel space - this is abuse of - * dev->base_addr since it was means for I/O port - * addresses but who gives a damn. 
- */ - dev->base_addr = pci_resource_start(pdev, 0); - ap->regs = (struct ace_regs *)ioremap(dev->base_addr, 0x4000); - if (!ap->regs) { - printk(KERN_ERR "%s: Unable to map I/O register, " - "AceNIC %i will be disabled.\n", - dev->name, boards_found); - break; - } + pci_read_config_byte(pdev, PCI_LATENCY_TIMER, &ap->pci_latency); + if (ap->pci_latency <= 0x40) { + ap->pci_latency = 0x40; + pci_write_config_byte(pdev, PCI_LATENCY_TIMER, ap->pci_latency); + } - switch(pdev->vendor) { - case PCI_VENDOR_ID_ALTEON: - if (pdev->device == PCI_DEVICE_ID_FARALLON_PN9100T) { - strncpy(ap->name, "Farallon PN9100-T " - "Gigabit Ethernet", sizeof (ap->name)); - printk(KERN_INFO "%s: Farallon PN9100-T ", - dev->name); - } else { - strncpy(ap->name, "AceNIC Gigabit Ethernet", - sizeof (ap->name)); - printk(KERN_INFO "%s: Alteon AceNIC ", - dev->name); - } - break; - case PCI_VENDOR_ID_3COM: - strncpy(ap->name, "3Com 3C985 Gigabit Ethernet", - sizeof (ap->name)); - printk(KERN_INFO "%s: 3Com 3C985 ", dev->name); - break; - case PCI_VENDOR_ID_NETGEAR: - strncpy(ap->name, "NetGear GA620 Gigabit Ethernet", - sizeof (ap->name)); - printk(KERN_INFO "%s: NetGear GA620 ", dev->name); - break; - case PCI_VENDOR_ID_DEC: - if (pdev->device == PCI_DEVICE_ID_FARALLON_PN9000SX) { - strncpy(ap->name, "Farallon PN9000-SX " - "Gigabit Ethernet", sizeof (ap->name)); - printk(KERN_INFO "%s: Farallon PN9000-SX ", - dev->name); - break; - } - case PCI_VENDOR_ID_SGI: - strncpy(ap->name, "SGI AceNIC Gigabit Ethernet", + /* + * Remap the regs into kernel space - this is abuse of + * dev->base_addr since it was means for I/O port + * addresses but who gives a damn. + */ + dev->base_addr = pci_resource_start(pdev, 0); + ap->regs = (struct ace_regs *)ioremap(dev->base_addr, 0x4000); + if (!ap->regs) { + printk(KERN_ERR "%s: Unable to map I/O register, " + "AceNIC %i will be disabled.\n", + dev->name, boards_found); + goto fail_free_netdev; + } + + switch(pdev->vendor) { + case PCI_VENDOR_ID_ALTEON: + if (pdev->device == PCI_DEVICE_ID_FARALLON_PN9100T) { + strncpy(ap->name, "Farallon PN9100-T " + "Gigabit Ethernet", sizeof (ap->name)); + printk(KERN_INFO "%s: Farallon PN9100-T ", + dev->name); + } else { + strncpy(ap->name, "AceNIC Gigabit Ethernet", sizeof (ap->name)); - printk(KERN_INFO "%s: SGI AceNIC ", dev->name); - break; - default: - strncpy(ap->name, "Unknown AceNIC based Gigabit " - "Ethernet", sizeof (ap->name)); - printk(KERN_INFO "%s: Unknown AceNIC ", dev->name); + printk(KERN_INFO "%s: Alteon AceNIC ", + dev->name); + } + break; + case PCI_VENDOR_ID_3COM: + strncpy(ap->name, "3Com 3C985 Gigabit Ethernet", + sizeof (ap->name)); + printk(KERN_INFO "%s: 3Com 3C985 ", dev->name); + break; + case PCI_VENDOR_ID_NETGEAR: + strncpy(ap->name, "NetGear GA620 Gigabit Ethernet", + sizeof (ap->name)); + printk(KERN_INFO "%s: NetGear GA620 ", dev->name); + break; + case PCI_VENDOR_ID_DEC: + if (pdev->device == PCI_DEVICE_ID_FARALLON_PN9000SX) { + strncpy(ap->name, "Farallon PN9000-SX " + "Gigabit Ethernet", sizeof (ap->name)); + printk(KERN_INFO "%s: Farallon PN9000-SX ", + dev->name); break; } - ap->name [sizeof (ap->name) - 1] = '\0'; - printk("Gigabit Ethernet at 0x%08lx, ", dev->base_addr); + case PCI_VENDOR_ID_SGI: + strncpy(ap->name, "SGI AceNIC Gigabit Ethernet", + sizeof (ap->name)); + printk(KERN_INFO "%s: SGI AceNIC ", dev->name); + break; + default: + strncpy(ap->name, "Unknown AceNIC based Gigabit " + "Ethernet", sizeof (ap->name)); + printk(KERN_INFO "%s: Unknown AceNIC ", dev->name); + break; + } + + ap->name [sizeof 
(ap->name) - 1] = '\0'; + printk("Gigabit Ethernet at 0x%08lx, ", dev->base_addr); #ifdef __sparc__ - printk("irq %s\n", __irq_itoa(pdev->irq)); + printk("irq %s\n", __irq_itoa(pdev->irq)); #else - printk("irq %i\n", pdev->irq); + printk("irq %i\n", pdev->irq); #endif #ifdef CONFIG_ACENIC_OMIT_TIGON_I - if ((readl(&ap->regs->HostCtrl) >> 28) == 4) { - printk(KERN_ERR "%s: Driver compiled without Tigon I" - " support - NIC disabled\n", dev->name); - ace_init_cleanup(dev); - free_netdev(dev); - continue; - } + if ((readl(&ap->regs->HostCtrl) >> 28) == 4) { + printk(KERN_ERR "%s: Driver compiled without Tigon I" + " support - NIC disabled\n", dev->name); + goto fail_uninit; + } #endif - if (ace_allocate_descriptors(dev)) { - /* - * ace_allocate_descriptors() calls - * ace_init_cleanup() on error. - */ - free_netdev(dev); - continue; - } + if (ace_allocate_descriptors(dev)) + goto fail_free_netdev; #ifdef MODULE - if (boards_found >= ACE_MAX_MOD_PARMS) - ap->board_idx = BOARD_IDX_OVERFLOW; - else - ap->board_idx = boards_found; + if (boards_found >= ACE_MAX_MOD_PARMS) + ap->board_idx = BOARD_IDX_OVERFLOW; + else + ap->board_idx = boards_found; #else - ap->board_idx = BOARD_IDX_STATIC; + ap->board_idx = BOARD_IDX_STATIC; #endif - if (ace_init(dev)) { - /* - * ace_init() calls ace_init_cleanup() on error. - */ - free_netdev(dev); - continue; - } + if (ace_init(dev)) + goto fail_free_netdev; - if (register_netdev(dev)) { - printk(KERN_ERR "acenic: device registration failed\n"); - ace_init_cleanup(dev); - free_netdev(dev); - continue; - } - - if (ap->pci_using_dac) - dev->features |= NETIF_F_HIGHDMA; - - boards_found++; + if (register_netdev(dev)) { + printk(KERN_ERR "acenic: device registration failed\n"); + goto fail_uninit; } - /* - * If we're at this point we're going through ace_probe() for - * the first time. Return success (0) if we've initialized 1 - * or more boards. Otherwise, return failure (-ENODEV). 
- */ - - if (boards_found > 0) - return 0; - else - return -ENODEV; -} + if (ap->pci_using_dac) + dev->features |= NETIF_F_HIGHDMA; + pci_set_drvdata(pdev, dev); -#ifdef MODULE -MODULE_AUTHOR("Jes Sorensen "); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("AceNIC/3C985/GA620 Gigabit Ethernet driver"); -MODULE_PARM(link, "1-" __MODULE_STRING(8) "i"); -MODULE_PARM(trace, "1-" __MODULE_STRING(8) "i"); -MODULE_PARM(tx_coal_tick, "1-" __MODULE_STRING(8) "i"); -MODULE_PARM(max_tx_desc, "1-" __MODULE_STRING(8) "i"); -MODULE_PARM(rx_coal_tick, "1-" __MODULE_STRING(8) "i"); -MODULE_PARM(max_rx_desc, "1-" __MODULE_STRING(8) "i"); -MODULE_PARM(tx_ratio, "1-" __MODULE_STRING(8) "i"); -MODULE_PARM_DESC(link, "AceNIC/3C985/NetGear link state"); -MODULE_PARM_DESC(trace, "AceNIC/3C985/NetGear firmware trace level"); -MODULE_PARM_DESC(tx_coal_tick, "AceNIC/3C985/GA620 max clock ticks to wait from first tx descriptor arrives"); -MODULE_PARM_DESC(max_tx_desc, "AceNIC/3C985/GA620 max number of transmit descriptors to wait"); -MODULE_PARM_DESC(rx_coal_tick, "AceNIC/3C985/GA620 max clock ticks to wait from first rx descriptor arrives"); -MODULE_PARM_DESC(max_rx_desc, "AceNIC/3C985/GA620 max number of receive descriptors to wait"); -MODULE_PARM_DESC(tx_ratio, "AceNIC/3C985/GA620 ratio of NIC memory used for TX/RX descriptors (range 0-63)"); -#endif + boards_found++; + return 0; + fail_uninit: + ace_init_cleanup(dev); + fail_free_netdev: + free_netdev(dev); + return -ENODEV; +} -static void __exit ace_module_cleanup(void) +static void __devexit acenic_remove_one(struct pci_dev *pdev) { - struct ace_private *ap; - struct ace_regs *regs; - struct net_device *next; + struct net_device *dev = pci_get_drvdata(pdev); + struct ace_private *ap = dev->priv; + struct ace_regs *regs = ap->regs; short i; - while (root_dev) { - ap = root_dev->priv; - next = ap->next; - unregister_netdev(root_dev); + unregister_netdev(dev); - regs = ap->regs; - - writel(readl(®s->CpuCtrl) | CPU_HALT, ®s->CpuCtrl); - if (ap->version >= 2) - writel(readl(®s->CpuBCtrl) | CPU_HALT, - ®s->CpuBCtrl); - /* - * This clears any pending interrupts - */ - writel(1, ®s->Mb0Lo); - readl(®s->CpuCtrl); /* flush */ + writel(readl(®s->CpuCtrl) | CPU_HALT, ®s->CpuCtrl); + if (ap->version >= 2) + writel(readl(®s->CpuBCtrl) | CPU_HALT, ®s->CpuBCtrl); + + /* + * This clears any pending interrupts + */ + writel(1, ®s->Mb0Lo); + readl(®s->CpuCtrl); /* flush */ - /* - * Make sure no other CPUs are processing interrupts - * on the card before the buffers are being released. - * Otherwise one might experience some `interesting' - * effects. - * - * Then release the RX buffers - jumbo buffers were - * already released in ace_close(). - */ - ace_sync_irq(root_dev->irq); + /* + * Make sure no other CPUs are processing interrupts + * on the card before the buffers are being released. + * Otherwise one might experience some `interesting' + * effects. + * + * Then release the RX buffers - jumbo buffers were + * already released in ace_close(). 
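+ * + * (Ordering sketch: the CPU_HALT writes above stop the NIC's onboard CPUs, the Mb0Lo write clears anything already latched, and ace_sync_irq() below waits for any handler still running on another CPU; only after that is it safe to unmap and free the skbs.)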
+ */ + ace_sync_irq(dev->irq); - for (i = 0; i < RX_STD_RING_ENTRIES; i++) { - struct sk_buff *skb = ap->skb->rx_std_skbuff[i].skb; + for (i = 0; i < RX_STD_RING_ENTRIES; i++) { + struct sk_buff *skb = ap->skb->rx_std_skbuff[i].skb; - if (skb) { - struct ring_info *ringp; - dma_addr_t mapping; + if (skb) { + struct ring_info *ringp; + dma_addr_t mapping; - ringp = &ap->skb->rx_std_skbuff[i]; - mapping = pci_unmap_addr(ringp, mapping); - pci_unmap_page(ap->pdev, mapping, - ACE_STD_BUFSIZE - (2 + 16), - PCI_DMA_FROMDEVICE); + ringp = &ap->skb->rx_std_skbuff[i]; + mapping = pci_unmap_addr(ringp, mapping); + pci_unmap_page(ap->pdev, mapping, + ACE_STD_BUFSIZE - (2 + 16), + PCI_DMA_FROMDEVICE); - ap->rx_std_ring[i].size = 0; - ap->skb->rx_std_skbuff[i].skb = NULL; - dev_kfree_skb(skb); - } - } - if (ap->version >= 2) { - for (i = 0; i < RX_MINI_RING_ENTRIES; i++) { - struct sk_buff *skb = ap->skb->rx_mini_skbuff[i].skb; - - if (skb) { - struct ring_info *ringp; - dma_addr_t mapping; - - ringp = &ap->skb->rx_mini_skbuff[i]; - mapping = pci_unmap_addr(ringp,mapping); - pci_unmap_page(ap->pdev, mapping, - ACE_MINI_BUFSIZE - (2 + 16), - PCI_DMA_FROMDEVICE); - - ap->rx_mini_ring[i].size = 0; - ap->skb->rx_mini_skbuff[i].skb = NULL; - dev_kfree_skb(skb); - } - } + ap->rx_std_ring[i].size = 0; + ap->skb->rx_std_skbuff[i].skb = NULL; + dev_kfree_skb(skb); } - for (i = 0; i < RX_JUMBO_RING_ENTRIES; i++) { - struct sk_buff *skb = ap->skb->rx_jumbo_skbuff[i].skb; + } + + if (ap->version >= 2) { + for (i = 0; i < RX_MINI_RING_ENTRIES; i++) { + struct sk_buff *skb = ap->skb->rx_mini_skbuff[i].skb; + if (skb) { struct ring_info *ringp; dma_addr_t mapping; - ringp = &ap->skb->rx_jumbo_skbuff[i]; - mapping = pci_unmap_addr(ringp, mapping); + ringp = &ap->skb->rx_mini_skbuff[i]; + mapping = pci_unmap_addr(ringp,mapping); pci_unmap_page(ap->pdev, mapping, - ACE_JUMBO_BUFSIZE - (2 + 16), + ACE_MINI_BUFSIZE - (2 + 16), PCI_DMA_FROMDEVICE); - ap->rx_jumbo_ring[i].size = 0; - ap->skb->rx_jumbo_skbuff[i].skb = NULL; + ap->rx_mini_ring[i].size = 0; + ap->skb->rx_mini_skbuff[i].skb = NULL; dev_kfree_skb(skb); } } - - ace_init_cleanup(root_dev); - free_netdev(root_dev); - root_dev = next; } -} + for (i = 0; i < RX_JUMBO_RING_ENTRIES; i++) { + struct sk_buff *skb = ap->skb->rx_jumbo_skbuff[i].skb; + if (skb) { + struct ring_info *ringp; + dma_addr_t mapping; -int __init ace_module_init(void) -{ - int status; + ringp = &ap->skb->rx_jumbo_skbuff[i]; + mapping = pci_unmap_addr(ringp, mapping); + pci_unmap_page(ap->pdev, mapping, + ACE_JUMBO_BUFSIZE - (2 + 16), + PCI_DMA_FROMDEVICE); - root_dev = NULL; + ap->rx_jumbo_ring[i].size = 0; + ap->skb->rx_jumbo_skbuff[i].skb = NULL; + dev_kfree_skb(skb); + } + } -#ifdef NEW_NETINIT - status = acenic_probe(); -#else - status = acenic_probe(NULL); -#endif - return status; + ace_init_cleanup(dev); + free_netdev(dev); } +static struct pci_driver acenic_pci_driver = { + .name = "acenic", + .id_table = acenic_pci_tbl, + .probe = acenic_probe_one, + .remove = __devexit_p(acenic_remove_one), +}; -#if (LINUX_VERSION_CODE < 0x02032a) -#ifdef MODULE -int init_module(void) +static int __init acenic_init(void) { - return ace_module_init(); + return pci_module_init(&acenic_pci_driver); } - -void cleanup_module(void) +static void __exit acenic_exit(void) { - ace_module_cleanup(); + pci_unregister_driver(&acenic_pci_driver); } -#endif -#else -module_init(ace_module_init); -module_exit(ace_module_cleanup); -#endif +module_init(acenic_init); +module_exit(acenic_exit); static void 
ace_free_descriptors(struct net_device *dev) { @@ -1462,13 +1203,6 @@ static int __init ace_init(struct net_de } else dev->irq = pdev->irq; - /* - * Register the device here to be able to catch allocated - * interrupt handlers in case the firmware doesn't come up. - */ - ap->next = root_dev; - root_dev = dev; - #ifdef INDEX_DEBUG spin_lock_init(&ap->debug_lock); ap->last_tx = ACE_TX_RING_ENTRIES(ap) - 1; @@ -2642,8 +2376,6 @@ static int ace_open(struct net_device *d netif_start_queue(dev); - ACE_MOD_INC_USE_COUNT; - /* * Setup the bottom half rx ring refill handler */ @@ -2660,8 +2392,6 @@ static int ace_close(struct net_device * unsigned long flags; short i; - ace_if_down(dev); - /* * Without (or before) releasing irq and stopping hardware, this * is an absolute non-sense, by the way. It will be reset instantly @@ -2733,7 +2463,6 @@ static int ace_close(struct net_device * ace_unmask_irq(dev); local_irq_restore(flags); - ACE_MOD_DEC_USE_COUNT; return 0; } @@ -2790,12 +2519,6 @@ static int ace_start_xmit(struct sk_buff struct tx_desc *desc; u32 idx, flagsize; - /* - * This only happens with pre-softnet, ie. 2.2.x kernels. - */ - if (early_stop_netif_stop_queue(dev)) - return 1; - restart: idx = ap->tx_prd; --- diff/drivers/net/epic100.c 2004-04-05 12:57:08.000000000 +0100 +++ source/drivers/net/epic100.c 2004-04-21 10:46:51.521749616 +0100 @@ -96,9 +96,9 @@ static int rx_copybreak; Making the Tx ring too large decreases the effectiveness of channel bonding and packet priority. There are no ill effects from too-large receive rings. */ -#define TX_RING_SIZE 16 -#define TX_QUEUE_LEN 10 /* Limit ring entries actually used. */ -#define RX_RING_SIZE 32 +#define TX_RING_SIZE 256 +#define TX_QUEUE_LEN 240 /* Limit ring entries actually used. */ +#define RX_RING_SIZE 256 #define TX_TOTAL_SIZE TX_RING_SIZE*sizeof(struct epic_tx_desc) #define RX_TOTAL_SIZE RX_RING_SIZE*sizeof(struct epic_rx_desc) @@ -292,6 +292,12 @@ enum CommandBits { StopTxDMA=0x20, StopRxDMA=0x40, RestartTx=0x80, }; +#define EpicRemoved 0xffffffff /* Chip failed or removed (CardBus) */ + +#define EpicNapiEvent (TxEmpty | TxDone | \ + RxDone | RxStarted | RxEarlyWarn | RxOverflow | RxFull) +#define EpicNormalEvent (0x0000ffff & ~EpicNapiEvent) + static u16 media2miictl[16] = { 0, 0x0C00, 0x0C00, 0x2000, 0x0100, 0x2100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; @@ -330,9 +336,12 @@ struct epic_private { /* Ring pointers. */ spinlock_t lock; /* Group with Tx control cache line. */ + spinlock_t napi_lock; + unsigned int reschedule_in_poll; unsigned int cur_tx, dirty_tx; unsigned int cur_rx, dirty_rx; + u32 irq_mask; unsigned int rx_buf_sz; /* Based on MTU+slack. */ struct pci_dev *pci_dev; /* PCI bus location. 
*/ @@ -359,7 +368,8 @@ static void epic_timer(unsigned long dat static void epic_tx_timeout(struct net_device *dev); static void epic_init_ring(struct net_device *dev); static int epic_start_xmit(struct sk_buff *skb, struct net_device *dev); -static int epic_rx(struct net_device *dev); +static int epic_rx(struct net_device *dev, int budget); +static int epic_poll(struct net_device *dev, int *budget); static irqreturn_t epic_interrupt(int irq, void *dev_instance, struct pt_regs *regs); static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static struct ethtool_ops netdev_ethtool_ops; @@ -378,7 +388,7 @@ static int __devinit epic_init_one (stru int irq; struct net_device *dev; struct epic_private *ep; - int i, option = 0, duplex = 0; + int i, ret, option = 0, duplex = 0; void *ring_space; dma_addr_t ring_dma; @@ -392,29 +402,33 @@ static int __devinit epic_init_one (stru card_idx++; - i = pci_enable_device(pdev); - if (i) - return i; + ret = pci_enable_device(pdev); + if (ret) + goto out; irq = pdev->irq; if (pci_resource_len(pdev, 0) < pci_id_tbl[chip_idx].io_size) { printk (KERN_ERR "card %d: no PCI region space\n", card_idx); - return -ENODEV; + ret = -ENODEV; + goto err_out_disable; } pci_set_master(pdev); + ret = pci_request_regions(pdev, DRV_NAME); + if (ret < 0) + goto err_out_disable; + + ret = -ENOMEM; + dev = alloc_etherdev(sizeof (*ep)); if (!dev) { printk (KERN_ERR "card %d: no memory for eth device\n", card_idx); - return -ENOMEM; + goto err_out_free_res; } SET_MODULE_OWNER(dev); SET_NETDEV_DEV(dev, &pdev->dev); - if (pci_request_regions(pdev, DRV_NAME)) - goto err_out_free_netdev; - #ifdef USE_IO_OPS ioaddr = pci_resource_start (pdev, 0); #else @@ -422,7 +436,7 @@ static int __devinit epic_init_one (stru ioaddr = (long) ioremap (ioaddr, pci_resource_len (pdev, 1)); if (!ioaddr) { printk (KERN_ERR DRV_NAME " %d: ioremap failed\n", card_idx); - goto err_out_free_res; + goto err_out_free_netdev; } #endif @@ -459,7 +473,9 @@ static int __devinit epic_init_one (stru dev->base_addr = ioaddr; dev->irq = irq; - spin_lock_init (&ep->lock); + spin_lock_init(&ep->lock); + spin_lock_init(&ep->napi_lock); + ep->reschedule_in_poll = 0; /* Bring the chip out of low-power mode. */ outl(0x4200, ioaddr + GENCTL); @@ -489,6 +505,9 @@ static int __devinit epic_init_one (stru ep->pci_dev = pdev; ep->chip_id = chip_idx; ep->chip_flags = pci_id_tbl[chip_idx].drv_flags; + ep->irq_mask = + (ep->chip_flags & TYPE2_INTR ? PCIBusErr175 : PCIBusErr170) + | CntFull | TxUnderrun | EpicNapiEvent; /* Find the connected MII xcvrs. 
Doing this in open() would allow detecting external xcvrs later, but @@ -543,10 +562,12 @@ static int __devinit epic_init_one (stru dev->ethtool_ops = &netdev_ethtool_ops; dev->watchdog_timeo = TX_TIMEOUT; dev->tx_timeout = &epic_tx_timeout; + dev->poll = epic_poll; + dev->weight = 64; - i = register_netdev(dev); - if (i) - goto err_out_unmap_tx; + ret = register_netdev(dev); + if (ret < 0) + goto err_out_unmap_rx; printk(KERN_INFO "%s: %s at %#lx, IRQ %d, ", dev->name, pci_id_tbl[chip_idx].name, ioaddr, dev->irq); @@ -554,19 +575,24 @@ static int __devinit epic_init_one (stru printk("%2.2x:", dev->dev_addr[i]); printk("%2.2x.\n", dev->dev_addr[i]); - return 0; +out: + return ret; +err_out_unmap_rx: + pci_free_consistent(pdev, RX_TOTAL_SIZE, ep->rx_ring, ep->rx_ring_dma); err_out_unmap_tx: pci_free_consistent(pdev, TX_TOTAL_SIZE, ep->tx_ring, ep->tx_ring_dma); err_out_iounmap: #ifndef USE_IO_OPS iounmap(ioaddr); -err_out_free_res: -#endif - pci_release_regions(pdev); err_out_free_netdev: +#endif free_netdev(dev); - return -ENODEV; +err_out_free_res: + pci_release_regions(pdev); +err_out_disable: + pci_disable_device(pdev); + goto out; } /* Serial EEPROM section. */ @@ -592,6 +618,36 @@ err_out_free_netdev: #define EE_READ256_CMD (6 << 8) #define EE_ERASE_CMD (7 << 6) +static void epic_disable_int(struct net_device *dev, struct epic_private *ep) +{ + long ioaddr = dev->base_addr; + + outl(0x00000000, ioaddr + INTMASK); +} + +static inline void __epic_pci_commit(long ioaddr) +{ +#ifndef USE_IO_OPS + inl(ioaddr + INTMASK); +#endif +} + +static void epic_napi_irq_off(struct net_device *dev, struct epic_private *ep) +{ + long ioaddr = dev->base_addr; + + outl(ep->irq_mask & ~EpicNapiEvent, ioaddr + INTMASK); + __epic_pci_commit(ioaddr); +} + +static void epic_napi_irq_on(struct net_device *dev, struct epic_private *ep) +{ + long ioaddr = dev->base_addr; + + /* No need to commit possible posted write */ + outl(ep->irq_mask | EpicNapiEvent, ioaddr + INTMASK); +} + static int __devinit read_eeprom(long ioaddr, int location) { int i; @@ -752,9 +808,8 @@ static int epic_open(struct net_device * /* Enable interrupts by setting the interrupt mask. */ outl((ep->chip_flags & TYPE2_INTR ? PCIBusErr175 : PCIBusErr170) - | CntFull | TxUnderrun | TxDone | TxEmpty - | RxError | RxOverflow | RxFull | RxHeader | RxDone, - ioaddr + INTMASK); + | CntFull | TxUnderrun + | RxError | RxHeader | EpicNapiEvent, ioaddr + INTMASK); if (debug > 1) printk(KERN_DEBUG "%s: epic_open() ioaddr %lx IRQ %d status %4.4x " @@ -795,7 +850,7 @@ static void epic_pause(struct net_device } /* Remove the packets on the Rx queue. */ - epic_rx(dev); + epic_rx(dev, RX_RING_SIZE); } static void epic_restart(struct net_device *dev) @@ -841,9 +896,9 @@ static void epic_restart(struct net_devi /* Enable interrupts by setting the interrupt mask. */ outl((ep->chip_flags & TYPE2_INTR ? PCIBusErr175 : PCIBusErr170) - | CntFull | TxUnderrun | TxDone | TxEmpty - | RxError | RxOverflow | RxFull | RxHeader | RxDone, - ioaddr + INTMASK); + | CntFull | TxUnderrun + | RxError | RxHeader | EpicNapiEvent, ioaddr + INTMASK); + printk(KERN_DEBUG "%s: epic_restart() done, cmd status %4.4x, ctl %4.4x" " interrupt %4.4x.\n", dev->name, (int)inl(ioaddr + COMMAND), (int)inl(ioaddr + GENCTL), @@ -929,7 +984,6 @@ static void epic_init_ring(struct net_de int i; ep->tx_full = 0; - ep->lock = (spinlock_t) SPIN_LOCK_UNLOCKED; ep->dirty_tx = ep->cur_tx = 0; ep->cur_rx = ep->dirty_rx = 0; ep->rx_buf_sz = (dev->mtu <= 1500 ? 
PKT_BUF_SZ : dev->mtu + 32); @@ -1029,6 +1083,76 @@ static int epic_start_xmit(struct sk_buf return 0; } +static void epic_tx_error(struct net_device *dev, struct epic_private *ep, + int status) +{ + struct net_device_stats *stats = &ep->stats; + +#ifndef final_version + /* There was an major error, log it. */ + if (debug > 1) + printk(KERN_DEBUG "%s: Transmit error, Tx status %8.8x.\n", + dev->name, status); +#endif + stats->tx_errors++; + if (status & 0x1050) + stats->tx_aborted_errors++; + if (status & 0x0008) + stats->tx_carrier_errors++; + if (status & 0x0040) + stats->tx_window_errors++; + if (status & 0x0010) + stats->tx_fifo_errors++; +} + +static void epic_tx(struct net_device *dev, struct epic_private *ep) +{ + unsigned int dirty_tx, cur_tx; + + /* + * Note: if this lock becomes a problem we can narrow the locked + * region at the cost of occasionally grabbing the lock more times. + */ + cur_tx = ep->cur_tx; + for (dirty_tx = ep->dirty_tx; cur_tx - dirty_tx > 0; dirty_tx++) { + struct sk_buff *skb; + int entry = dirty_tx % TX_RING_SIZE; + int txstatus = le32_to_cpu(ep->tx_ring[entry].txstatus); + + if (txstatus & DescOwn) + break; /* It still hasn't been Txed */ + + if (likely(txstatus & 0x0001)) { + ep->stats.collisions += (txstatus >> 8) & 15; + ep->stats.tx_packets++; + ep->stats.tx_bytes += ep->tx_skbuff[entry]->len; + } else + epic_tx_error(dev, ep, txstatus); + + /* Free the original skb. */ + skb = ep->tx_skbuff[entry]; + pci_unmap_single(ep->pci_dev, ep->tx_ring[entry].bufaddr, + skb->len, PCI_DMA_TODEVICE); + dev_kfree_skb_irq(skb); + ep->tx_skbuff[entry] = 0; + } + +#ifndef final_version + if (cur_tx - dirty_tx > TX_RING_SIZE) { + printk(KERN_WARNING + "%s: Out-of-sync dirty pointer, %d vs. %d, full=%d.\n", + dev->name, dirty_tx, cur_tx, ep->tx_full); + dirty_tx += TX_RING_SIZE; + } +#endif + ep->dirty_tx = dirty_tx; + if (ep->tx_full && cur_tx - dirty_tx < TX_QUEUE_LEN - 4) { + /* The ring is no longer full, allow new TX entries. */ + ep->tx_full = 0; + netif_wake_queue(dev); + } +} + /* The interrupt handler does all of the Rx thread work and cleans up after the Tx thread. */ static irqreturn_t epic_interrupt(int irq, void *dev_instance, struct pt_regs *regs) @@ -1042,7 +1166,7 @@ static irqreturn_t epic_interrupt(int ir do { status = inl(ioaddr + INTSTAT); /* Acknowledge all of the current interrupt sources ASAP. */ - outl(status & 0x00007fff, ioaddr + INTSTAT); + outl(status & EpicNormalEvent, ioaddr + INTSTAT); if (debug > 4) printk(KERN_DEBUG "%s: Interrupt, status=%#8.8x new " @@ -1053,74 +1177,21 @@ static irqreturn_t epic_interrupt(int ir break; handled = 1; - if (status & (RxDone | RxStarted | RxEarlyWarn | RxOverflow)) - epic_rx(dev); - - if (status & (TxEmpty | TxDone)) { - unsigned int dirty_tx, cur_tx; - - /* Note: if this lock becomes a problem we can narrow the locked - region at the cost of occasionally grabbing the lock more - times. */ - spin_lock(&ep->lock); - cur_tx = ep->cur_tx; - dirty_tx = ep->dirty_tx; - for (; cur_tx - dirty_tx > 0; dirty_tx++) { - struct sk_buff *skb; - int entry = dirty_tx % TX_RING_SIZE; - int txstatus = le32_to_cpu(ep->tx_ring[entry].txstatus); - - if (txstatus & DescOwn) - break; /* It still hasn't been Txed */ - - if ( ! (txstatus & 0x0001)) { - /* There was an major error, log it. 
*/ -#ifndef final_version - if (debug > 1) - printk(KERN_DEBUG "%s: Transmit error, Tx status %8.8x.\n", - dev->name, txstatus); -#endif - ep->stats.tx_errors++; - if (txstatus & 0x1050) ep->stats.tx_aborted_errors++; - if (txstatus & 0x0008) ep->stats.tx_carrier_errors++; - if (txstatus & 0x0040) ep->stats.tx_window_errors++; - if (txstatus & 0x0010) ep->stats.tx_fifo_errors++; - } else { - ep->stats.collisions += (txstatus >> 8) & 15; - ep->stats.tx_packets++; - ep->stats.tx_bytes += ep->tx_skbuff[entry]->len; - } - - /* Free the original skb. */ - skb = ep->tx_skbuff[entry]; - pci_unmap_single(ep->pci_dev, ep->tx_ring[entry].bufaddr, - skb->len, PCI_DMA_TODEVICE); - dev_kfree_skb_irq(skb); - ep->tx_skbuff[entry] = 0; - } - -#ifndef final_version - if (cur_tx - dirty_tx > TX_RING_SIZE) { - printk(KERN_WARNING "%s: Out-of-sync dirty pointer, %d vs. %d, full=%d.\n", - dev->name, dirty_tx, cur_tx, ep->tx_full); - dirty_tx += TX_RING_SIZE; - } -#endif - ep->dirty_tx = dirty_tx; - if (ep->tx_full - && cur_tx - dirty_tx < TX_QUEUE_LEN - 4) { - /* The ring is no longer full, allow new TX entries. */ - ep->tx_full = 0; - spin_unlock(&ep->lock); - netif_wake_queue(dev); + if ((status & EpicNapiEvent) && !ep->reschedule_in_poll) { + spin_lock(&ep->napi_lock); + if (netif_rx_schedule_prep(dev)) { + epic_napi_irq_off(dev, ep); + __netif_rx_schedule(dev); } else - spin_unlock(&ep->lock); + ep->reschedule_in_poll++; + spin_unlock(&ep->napi_lock); } + status &= ~EpicNapiEvent; /* Check uncommon events all at once. */ - if (status & (CntFull | TxUnderrun | RxOverflow | RxFull | - PCIBusErr170 | PCIBusErr175)) { - if (status == 0xffffffff) /* Chip failed or removed (CardBus). */ + if (status & + (CntFull | TxUnderrun | PCIBusErr170 | PCIBusErr175)) { + if (status == EpicRemoved) break; /* Always update the error counts to avoid overhead later. */ ep->stats.rx_missed_errors += inb(ioaddr + MPCNT); @@ -1133,11 +1204,6 @@ static irqreturn_t epic_interrupt(int ir /* Restart the transmit process. */ outl(RestartTx, ioaddr + COMMAND); } - if (status & RxOverflow) { /* Missed a Rx frame. */ - ep->stats.rx_errors++; - } - if (status & (RxOverflow | RxFull)) - outw(RxQueued, ioaddr + COMMAND); if (status & PCIBusErr170) { printk(KERN_ERR "%s: PCI Bus Error! EPIC status %4.4x.\n", dev->name, status); @@ -1147,6 +1213,8 @@ static irqreturn_t epic_interrupt(int ir /* Clear all error sources. */ outl(status & 0x7f18, ioaddr + INTSTAT); } + if (!(status & EpicNormalEvent)) + break; if (--boguscnt < 0) { printk(KERN_ERR "%s: Too much work at interrupt, " "IntrStatus=0x%8.8x.\n", @@ -1164,7 +1232,7 @@ static irqreturn_t epic_interrupt(int ir return IRQ_RETVAL(handled); } -static int epic_rx(struct net_device *dev) +static int epic_rx(struct net_device *dev, int budget) { struct epic_private *ep = dev->priv; int entry = ep->cur_rx % RX_RING_SIZE; @@ -1174,6 +1242,10 @@ static int epic_rx(struct net_device *de if (debug > 4) printk(KERN_DEBUG " In epic_rx(), entry %d %8.8x.\n", entry, ep->rx_ring[entry].rxstatus); + + if (rx_work_limit > budget) + rx_work_limit = budget; + /* If we own the next entry, it's a new packet. Send it up. 
@@ -1164,7 +1232,7 @@ static irqreturn_t epic_interrupt(int ir
 	return IRQ_RETVAL(handled);
 }
 
-static int epic_rx(struct net_device *dev)
+static int epic_rx(struct net_device *dev, int budget)
 {
 	struct epic_private *ep = dev->priv;
 	int entry = ep->cur_rx % RX_RING_SIZE;
@@ -1174,6 +1242,10 @@ static int epic_rx(struct net_device *de
 	if (debug > 4)
 		printk(KERN_DEBUG " In epic_rx(), entry %d %8.8x.\n", entry,
 		       ep->rx_ring[entry].rxstatus);
+
+	if (rx_work_limit > budget)
+		rx_work_limit = budget;
+
 	/* If we own the next entry, it's a new packet. Send it up. */
 	while ((ep->rx_ring[entry].rxstatus & cpu_to_le32(DescOwn)) == 0) {
 		int status = le32_to_cpu(ep->rx_ring[entry].rxstatus);
@@ -1234,7 +1306,7 @@ static int epic_rx(struct net_device *de
 				ep->rx_skbuff[entry] = NULL;
 			}
 			skb->protocol = eth_type_trans(skb, dev);
-			netif_rx(skb);
+			netif_receive_skb(skb);
 			dev->last_rx = jiffies;
 			ep->stats.rx_packets++;
 			ep->stats.rx_bytes += pkt_len;
@@ -1262,6 +1334,61 @@ static int epic_rx(struct net_device *de
 	return work_done;
 }
 
+static void epic_rx_err(struct net_device *dev, struct epic_private *ep)
+{
+	long ioaddr = dev->base_addr;
+	int status;
+
+	status = inl(ioaddr + INTSTAT);
+
+	if (status == EpicRemoved)
+		return;
+	if (status & RxOverflow)	/* Missed a Rx frame. */
+		ep->stats.rx_errors++;
+	if (status & (RxOverflow | RxFull))
+		outw(RxQueued, ioaddr + COMMAND);
+}
+
+static int epic_poll(struct net_device *dev, int *budget)
+{
+	struct epic_private *ep = dev->priv;
+	int work_done, orig_budget;
+	long ioaddr = dev->base_addr;
+
+	orig_budget = (*budget > dev->quota) ? dev->quota : *budget;
+
+rx_action:
+
+	epic_tx(dev, ep);
+
+	work_done = epic_rx(dev, *budget);
+
+	epic_rx_err(dev, ep);
+
+	*budget -= work_done;
+	dev->quota -= work_done;
+
+	if (netif_running(dev) && (work_done < orig_budget)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&ep->napi_lock, flags);
+
+		if (ep->reschedule_in_poll) {
+			ep->reschedule_in_poll--;
+			spin_unlock_irqrestore(&ep->napi_lock, flags);
+			goto rx_action;
+		}
+
+		outl(EpicNapiEvent, ioaddr + INTSTAT);
+		epic_napi_irq_on(dev, ep);
+		__netif_rx_complete(dev);
+
+		spin_unlock_irqrestore(&ep->napi_lock, flags);
+	}
+
+	return (work_done >= orig_budget);
+}
+
 static int epic_close(struct net_device *dev)
 {
 	long ioaddr = dev->base_addr;
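epic_poll() above follows the standard 2.6 dev->poll() contract: consume at most min(*budget, dev->quota) packets, decrement both counters by the work actually done, and only re-enable interrupts and leave the poll list once a pass finishes under budget. A condensed sketch of that contract (nic_poll, nic_rx and nic_enable_irqs are hypothetical names):

	/* sketch only: the 2.6 dev->poll() budget contract */
	static int nic_poll(struct net_device *dev, int *budget)
	{
		int quota = min(*budget, dev->quota);
		int work = nic_rx(dev, quota);		/* receive up to quota */

		*budget -= work;
		dev->quota -= work;

		if (work < quota) {			/* RX ring drained */
			netif_rx_complete(dev);		/* leave the poll list */
			nic_enable_irqs(dev);		/* re-arm the device */
			return 0;			/* done */
		}
		return 1;				/* poll me again */
	}

epic100 additionally rechecks reschedule_in_poll under napi_lock before completing, closing the race where the interrupt handler saw the poll as already scheduled just as it was finishing.
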
@@ -1276,9 +1403,13 @@ static int epic_close(struct net_device
 		       dev->name, (int)inl(ioaddr + INTSTAT));
 
 	del_timer_sync(&ep->timer);
-	epic_pause(dev);
+
+	epic_disable_int(dev, ep);
+
 	free_irq(dev->irq, dev);
+
+	epic_pause(dev);
+
 	/* Free all the skbuffs in the Rx queue. */
 	for (i = 0; i < RX_RING_SIZE; i++) {
 		skb = ep->rx_skbuff[i];
@@ -1476,6 +1607,7 @@ static void __devexit epic_remove_one (s
 #endif
 	pci_release_regions(pdev);
 	free_netdev(dev);
+	pci_disable_device(pdev);
 	pci_set_drvdata(pdev, NULL);
 	/* pci_power_off(pdev, -1); */
 }
--- diff/drivers/net/lasi_82596.c	2004-04-05 12:57:08.000000000 +0100
+++ source/drivers/net/lasi_82596.c	2004-04-21 10:46:51.522749464 +0100
@@ -87,7 +87,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
--- diff/drivers/net/macsonic.c	2004-02-18 08:54:10.000000000 +0000
+++ source/drivers/net/macsonic.c	2004-04-21 10:46:51.523749312 +0100
@@ -53,7 +53,6 @@
 #include
 #include
 #include
-#include
 
 #define SREGS_PAD(n)	u16 n;
--- diff/drivers/net/pcmcia/3c574_cs.c	2004-04-05 12:57:08.000000000 +0100
+++ source/drivers/net/pcmcia/3c574_cs.c	2004-04-21 10:46:51.524749160 +0100
@@ -384,6 +384,8 @@ static void tc574_detach(dev_link_t *lin
 #define CS_CHECK(fn, ret) \
 do { last_fn = (fn); if ((last_ret = (ret)) != 0) goto cs_failed; } while (0)
 
+static char *ram_split[] = {"5:3", "3:1", "1:1", "3:5"};
+
 static void tc574_config(dev_link_t *link)
 {
 	client_handle_t handle = link->handle;
@@ -396,6 +398,7 @@ static void tc574_config(dev_link_t *lin
 	ioaddr_t ioaddr;
 	u16 *phys_addr;
 	char *cardname;
+	union wn3_config config;
 
 	phys_addr = (u16 *)dev->dev_addr;
 
@@ -431,15 +434,7 @@ static void tc574_config(dev_link_t *lin
 	dev->irq = link->irq.AssignedIRQ;
 	dev->base_addr = link->io.BasePort1;
 
-	if (register_netdev(dev) != 0) {
-		printk(KERN_NOTICE "3c574_cs: register_netdev() failed\n");
-		goto failed;
-	}
-
 	ioaddr = dev->base_addr;
-	strcpy(lp->node.dev_name, dev->name);
-	link->dev = &lp->node;
-	link->state &= ~DEV_CONFIG_PENDING;
 
 	/* The 3c574 normally uses an EEPROM for configuration info, including
 	   the hardware address. The future products may include a modem chip
@@ -467,24 +462,14 @@ static void tc574_config(dev_link_t *lin
 	} else
 		cardname = "3Com 3c574";
 
-	printk(KERN_INFO "%s: %s at io %#3lx, irq %d, hw_addr ",
-	       dev->name, cardname, dev->base_addr, dev->irq);
-
-	for (i = 0; i < 6; i++)
-		printk("%02X%s", dev->dev_addr[i], ((i<5) ? ":" : ".\n"));
-
 	{
-		u_char mcr, *ram_split[] = {"5:3", "3:1", "1:1", "3:5"};
-		union wn3_config config;
+		u_char mcr;
 		outw(2<<11, ioaddr + RunnerRdCtrl);
 		mcr = inb(ioaddr + 2);
 		outw(0<<11, ioaddr + RunnerRdCtrl);
 		printk(KERN_INFO "  ASIC rev %d,", mcr>>3);
 		EL3WINDOW(3);
 		config.i = inl(ioaddr + Wn3_Config);
-		printk(" %dK FIFO split %s Rx:Tx, %sMII interface.\n",
-		       8 << config.u.ram_size, ram_split[config.u.ram_split],
-		       config.u.autoselect ? "autoselect " : "");
 		lp->default_media = config.u.xcvr;
 		lp->autoselect = config.u.autoselect;
 	}
@@ -531,6 +516,25 @@ static void tc574_config(dev_link_t *lin
 		}
 	}
 
+	link->state &= ~DEV_CONFIG_PENDING;
+	link->dev = &lp->node;
+
+	if (register_netdev(dev) != 0) {
+		printk(KERN_NOTICE "3c574_cs: register_netdev() failed\n");
+		link->dev = NULL;
+		goto failed;
+	}
+
+	strcpy(lp->node.dev_name, dev->name);
+
+	printk(KERN_INFO "%s: %s at io %#3lx, irq %d, hw_addr ",
+	       dev->name, cardname, dev->base_addr, dev->irq);
+	for (i = 0; i < 6; i++)
+		printk("%02X%s", dev->dev_addr[i], ((i<5) ? ":" : ".\n"));
+	printk(" %dK FIFO split %s Rx:Tx, %sMII interface.\n",
+	       8 << config.u.ram_size, ram_split[config.u.ram_split],
+	       config.u.autoselect ? "autoselect " : "");
+
 	return;
 
 cs_failed:
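The 3c574 hunk above is the first of several near-identical conversions in this patch (3c589, axnet, com20020, fmvj18x, ibmtr, nmclan, smc91c92, xirc2ps follow): do all hardware probing first, publish link->dev, call register_netdev() last, and roll link->dev back if registration fails, so dev->name is only used once it is valid. The common skeleton, sketched with placeholder names (example_cs_config, dev, local are hypothetical):

	/* sketch only: the registration ordering these drivers converge on */
	static void example_cs_config(dev_link_t *link)
	{
		/* ... all I/O, IRQ and EEPROM setup happens first ... */

		link->dev = &local->node;
		link->state &= ~DEV_CONFIG_PENDING;

		if (register_netdev(dev) != 0) {
			printk(KERN_NOTICE "example_cs: register_netdev() failed\n");
			link->dev = NULL;	/* nothing registered to undo */
			goto failed;
		}

		/* dev->name is assigned by register_netdev(); safe to log now */
		strcpy(local->node.dev_name, dev->name);
		printk(KERN_INFO "%s: port %#3lx, irq %d\n",
		       dev->name, dev->base_addr, dev->irq);
	failed:
		return;
	}
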
"autoselect " : ""); + return; cs_failed: --- diff/drivers/net/pcmcia/3c589_cs.c 2004-04-21 10:45:34.432468968 +0100 +++ source/drivers/net/pcmcia/3c589_cs.c 2004-04-21 10:46:51.525749008 +0100 @@ -308,7 +308,7 @@ static void tc589_config(dev_link_t *lin tuple_t tuple; cisparse_t parse; u16 buf[32], *phys_addr; - int last_fn, last_ret, i, j, multi = 0; + int last_fn, last_ret, i, j, multi = 0, fifo; ioaddr_t ioaddr; char *ram_split[] = {"5:3", "3:1", "1:1", "3:5"}; @@ -357,11 +357,6 @@ static void tc589_config(dev_link_t *lin dev->irq = link->irq.AssignedIRQ; dev->base_addr = link->io.BasePort1; - if (register_netdev(dev) != 0) { - printk(KERN_ERR "3c589_cs: register_netdev() failed\n"); - goto failed; - } - ioaddr = dev->base_addr; EL3WINDOW(0); @@ -382,13 +377,10 @@ static void tc589_config(dev_link_t *lin } } - strcpy(lp->node.dev_name, dev->name); - link->dev = &lp->node; - link->state &= ~DEV_CONFIG_PENDING; - /* The address and resource configuration register aren't loaded from the EEPROM and *must* be set to 0 and IRQ3 for the PCMCIA version. */ outw(0x3f00, ioaddr + 8); + fifo = inl(ioaddr); /* The if_port symbol can be set when the module is loaded */ if ((if_port >= 0) && (if_port <= 3)) @@ -396,14 +388,24 @@ static void tc589_config(dev_link_t *lin else printk(KERN_ERR "3c589_cs: invalid if_port requested\n"); + link->dev = &lp->node; + link->state &= ~DEV_CONFIG_PENDING; + + if (register_netdev(dev) != 0) { + printk(KERN_ERR "3c589_cs: register_netdev() failed\n"); + link->dev = NULL; + goto failed; + } + + strcpy(lp->node.dev_name, dev->name); + printk(KERN_INFO "%s: 3Com 3c%s, io %#3lx, irq %d, hw_addr ", dev->name, (multi ? "562" : "589"), dev->base_addr, dev->irq); for (i = 0; i < 6; i++) printk("%02X%s", dev->dev_addr[i], ((i<5) ? ":" : "\n")); - i = inl(ioaddr); printk(KERN_INFO " %dK FIFO split %s Rx:Tx, %s xcvr\n", - (i & 7) ? 32 : 8, ram_split[(i >> 16) & 3], + (fifo & 7) ? 32 : 8, ram_split[(fifo >> 16) & 3], if_names[dev->if_port]); return; --- diff/drivers/net/pcmcia/axnet_cs.c 2004-02-18 08:54:10.000000000 +0000 +++ source/drivers/net/pcmcia/axnet_cs.c 2004-04-21 10:46:51.526748856 +0100 @@ -430,19 +430,11 @@ static void axnet_config(dev_link_t *lin ei_status.block_input = &block_input; ei_status.block_output = &block_output; - strcpy(info->node.dev_name, dev->name); - if (inb(dev->base_addr + AXNET_TEST) != 0) info->flags |= IS_AX88790; else info->flags |= IS_AX88190; - printk(KERN_INFO "%s: Asix AX88%d90: io %#3lx, irq %d, hw_addr ", - dev->name, ((info->flags & IS_AX88790) ? 7 : 1), - dev->base_addr, dev->irq); - for (i = 0; i < 6; i++) - printk("%02X%s", dev->dev_addr[i], ((i<5) ? ":" : "\n")); - if (info->flags & IS_AX88790) outb(0x10, dev->base_addr + AXNET_GPIO); /* select Internal PHY */ @@ -463,19 +455,27 @@ static void axnet_config(dev_link_t *lin } info->phy_id = (i < 32) ? i : -1; - if (i < 32) { - DEBUG(0, " MII transceiver at index %d, status %x.\n", i, j); - } else { - printk(KERN_NOTICE " No MII transceivers found!\n"); - } + link->dev = &info->node; + link->state &= ~DEV_CONFIG_PENDING; if (register_netdev(dev) != 0) { printk(KERN_NOTICE "axnet_cs: register_netdev() failed\n"); + link->dev = NULL; goto failed; } - link->dev = &info->node; - link->state &= ~DEV_CONFIG_PENDING; + strcpy(info->node.dev_name, dev->name); + + printk(KERN_INFO "%s: Asix AX88%d90: io %#3lx, irq %d, hw_addr ", + dev->name, ((info->flags & IS_AX88790) ? 7 : 1), + dev->base_addr, dev->irq); + for (i = 0; i < 6; i++) + printk("%02X%s", dev->dev_addr[i], ((i<5) ? 
":" : "\n")); + if (info->phy_id != -1) { + DEBUG(0, " MII transceiver at index %d, status %x.\n", info->phy_id, j); + } else { + printk(KERN_NOTICE " No MII transceivers found!\n"); + } return; cs_failed: --- diff/drivers/net/pcmcia/com20020_cs.c 2004-04-05 12:57:08.000000000 +0100 +++ source/drivers/net/pcmcia/com20020_cs.c 2004-04-21 10:46:51.526748856 +0100 @@ -141,7 +141,6 @@ static dev_link_t *dev_list; typedef struct com20020_dev_t { struct net_device *dev; - int dev_configured; dev_node_t node; } com20020_dev_t; @@ -277,13 +276,10 @@ static void com20020_detach(dev_link_t * dev = info->dev; if (dev) { - if (info->dev_configured) + if (link->dev) { DEBUG(1,"unregister...\n"); - if (netif_running(dev)) - dev->stop(dev); - unregister_netdev(dev); /* @@ -398,17 +394,18 @@ static void com20020_config(dev_link_t * lp->card_name = "PCMCIA COM20020"; lp->card_flags = ARC_CAN_10MBIT; /* pretend all of them can 10Mbit */ + link->dev = &info->node; + link->state &= ~DEV_CONFIG_PENDING; + i = com20020_found(dev, 0); /* calls register_netdev */ if (i != 0) { DEBUG(1,KERN_NOTICE "com20020_cs: com20020_found() failed\n"); + link->dev = NULL; goto failed; } - info->dev_configured = 1; strcpy(info->node.dev_name, dev->name); - link->dev = &info->node; - link->state &= ~DEV_CONFIG_PENDING; DEBUG(1,KERN_INFO "%s: port %#3lx, irq %d\n", dev->name, dev->base_addr, dev->irq); --- diff/drivers/net/pcmcia/fmvj18x_cs.c 2004-04-05 12:57:08.000000000 +0100 +++ source/drivers/net/pcmcia/fmvj18x_cs.c 2004-04-21 10:46:51.527748704 +0100 @@ -510,10 +510,6 @@ static void fmvj18x_config(dev_link_t *l CS_CHECK(RequestConfiguration, pcmcia_request_configuration(link->handle, &link->conf)); dev->irq = link->irq.AssignedIRQ; dev->base_addr = link->io.BasePort1; - if (register_netdev(dev) != 0) { - printk(KERN_NOTICE "fmvj18x_cs: register_netdev() failed\n"); - goto failed; - } if (link->io.BasePort2 != 0) fmvj18x_setup_mfc(link); @@ -575,7 +571,6 @@ static void fmvj18x_config(dev_link_t *l /* Read MACID from Buggy CIS */ if (fmvj18x_get_hwinfo(link, tuple.TupleData) == -1) { printk(KERN_NOTICE "fmvj18x_cs: unable to read hardware net address.\n"); - unregister_netdev(dev); goto failed; } for (i = 0 ; i < 6; i++) { @@ -592,10 +587,18 @@ static void fmvj18x_config(dev_link_t *l break; } - strcpy(lp->node.dev_name, dev->name); + lp->cardtype = cardtype; link->dev = &lp->node; + link->state &= ~DEV_CONFIG_PENDING; + + if (register_netdev(dev) != 0) { + printk(KERN_NOTICE "fmvj18x_cs: register_netdev() failed\n"); + link->dev = NULL; + goto failed; + } + + strcpy(lp->node.dev_name, dev->name); - lp->cardtype = cardtype; /* print current configuration */ printk(KERN_INFO "%s: %s, sram %s, port %#3lx, irq %d, hw_addr ", dev->name, card_name, sram_config == 0 ? "4K TX*2" : "8K TX*2", @@ -603,7 +606,6 @@ static void fmvj18x_config(dev_link_t *l for (i = 0; i < 6; i++) printk("%02X%s", dev->dev_addr[i], ((i<5) ? ":" : "\n")); - link->state &= ~DEV_CONFIG_PENDING; return; cs_failed: --- diff/drivers/net/pcmcia/ibmtr_cs.c 2004-04-21 10:45:34.432468968 +0100 +++ source/drivers/net/pcmcia/ibmtr_cs.c 2004-04-21 10:46:51.527748704 +0100 @@ -317,13 +317,10 @@ static void ibmtr_config(dev_link_t *lin /* Try PRIMARY card at 0xA20-0xA23 */ link->io.BasePort1 = 0xA20; i = pcmcia_request_io(link->handle, &link->io); - if (i == CS_SUCCESS) { - memcpy(info->node.dev_name, "tr0\0", 4); - } else { + if (i != CS_SUCCESS) { /* Couldn't get 0xA20-0xA23. Try ALTERNATE at 0xA24-0xA27. 
--- diff/drivers/net/pcmcia/ibmtr_cs.c	2004-04-21 10:45:34.432468968 +0100
+++ source/drivers/net/pcmcia/ibmtr_cs.c	2004-04-21 10:46:51.527748704 +0100
@@ -317,13 +317,10 @@ static void ibmtr_config(dev_link_t *lin
     /* Try PRIMARY card at 0xA20-0xA23 */
     link->io.BasePort1 = 0xA20;
     i = pcmcia_request_io(link->handle, &link->io);
-    if (i == CS_SUCCESS) {
-	memcpy(info->node.dev_name, "tr0\0", 4);
-    } else {
+    if (i != CS_SUCCESS) {
	/* Couldn't get 0xA20-0xA23. Try ALTERNATE at 0xA24-0xA27. */
 	link->io.BasePort1 = 0xA24;
 	CS_CHECK(RequestIO, pcmcia_request_io(link->handle, &link->io));
-	memcpy(info->node.dev_name, "tr1\0", 4);
     }
     dev->base_addr = link->io.BasePort1;
@@ -367,15 +364,17 @@ static void ibmtr_config(dev_link_t *lin
        Adapters Technical Reference"  SC30-3585 for this info.  */
     ibmtr_hw_setup(dev, mmiobase);
 
+    link->dev = &info->node;
+    link->state &= ~DEV_CONFIG_PENDING;
+
     i = ibmtr_probe_card(dev);
-
     if (i != 0) {
 	printk(KERN_NOTICE "ibmtr_cs: register_netdev() failed\n");
+	link->dev = NULL;
 	goto failed;
     }
 
-    link->dev = &info->node;
-    link->state &= ~DEV_CONFIG_PENDING;
+    strcpy(info->node.dev_name, dev->name);
 
     printk(KERN_INFO "%s: port %#3lx, irq %d,",
            dev->name, dev->base_addr, dev->irq);
--- diff/drivers/net/pcmcia/nmclan_cs.c	2004-04-21 10:45:34.434468664 +0100
+++ source/drivers/net/pcmcia/nmclan_cs.c	2004-04-21 10:46:51.528748552 +0100
@@ -734,11 +734,6 @@ static void nmclan_config(dev_link_t *li
   CS_CHECK(RequestConfiguration, pcmcia_request_configuration(handle, &link->conf));
   dev->irq = link->irq.AssignedIRQ;
   dev->base_addr = link->io.BasePort1;
-  i = register_netdev(dev);
-  if (i != 0) {
-    printk(KERN_NOTICE "nmclan_cs: register_netdev() failed\n");
-    goto failed;
-  }
 
   ioaddr = dev->base_addr;
 
@@ -777,10 +772,18 @@ static void nmclan_config(dev_link_t *li
   else
     printk(KERN_NOTICE "nmclan_cs: invalid if_port requested\n");
 
-  strcpy(lp->node.dev_name, dev->name);
   link->dev = &lp->node;
   link->state &= ~DEV_CONFIG_PENDING;
 
+  i = register_netdev(dev);
+  if (i != 0) {
+    printk(KERN_NOTICE "nmclan_cs: register_netdev() failed\n");
+    link->dev = NULL;
+    goto failed;
+  }
+
+  strcpy(lp->node.dev_name, dev->name);
+
   printk(KERN_INFO "%s: nmclan: port %#3lx, irq %d, %s port, hw_addr ",
	 dev->name, dev->base_addr, dev->irq, if_names[dev->if_port]);
   for (i = 0; i < 6; i++)
--- diff/drivers/net/pcmcia/smc91c92_cs.c	2004-04-05 12:57:08.000000000 +0100
+++ source/drivers/net/pcmcia/smc91c92_cs.c	2004-04-21 10:46:51.530748248 +0100
@@ -901,6 +901,7 @@ static void smc91c92_config(dev_link_t *
     char *name;
     int i, j, rev;
     ioaddr_t ioaddr;
+    u_long mir;
 
     DEBUG(0, "smc91c92_config(0x%p)\n", link);
 
@@ -952,11 +953,6 @@ static void smc91c92_config(dev_link_t *
     else
 	printk(KERN_NOTICE "smc91c92_cs: invalid if_port requested\n");
 
-    if (register_netdev(dev) != 0) {
-	printk(KERN_ERR "smc91c92_cs: register_netdev() failed\n");
-	goto config_undo;
-    }
-
     switch (smc->manfid) {
     case MANFID_OSITECH:
     case MANFID_PSION:
@@ -977,8 +973,6 @@ static void smc91c92_config(dev_link_t *
 	goto config_undo;
     }
 
-    strcpy(smc->node.dev_name, dev->name);
-    link->dev = &smc->node;
     smc->duplex = 0;
     smc->rx_ovrn = 0;
 
@@ -993,25 +987,16 @@ static void smc91c92_config(dev_link_t *
     case 8: name = "100-FD"; break;
     case 9: name = "110"; break;
     }
-    printk(KERN_INFO "%s: smc91c%s rev %d: io %#3lx, irq %d, "
-	   "hw_addr ", dev->name, name, (rev & 0x0f), dev->base_addr,
-	   dev->irq);
-    for (i = 0; i < 6; i++)
-	printk("%02X%s", dev->dev_addr[i], ((i<5) ? ":" : "\n"));
 
     ioaddr = dev->base_addr;
     if (rev > 0) {
-	u_long mir, mcr;
+	u_long mcr;
 	SMC_SELECT_BANK(0);
 	mir = inw(ioaddr + MEMINFO) & 0xff;
 	if (mir == 0xff) mir++;
 	/* Get scale factor for memory size */
 	mcr = ((rev >> 4) > 3) ? inw(ioaddr + MEMCFG) : 0x0200;
 	mir *= 128 * (1<<((mcr >> 9) & 7));
-	if (mir & 0x3ff)
-	    printk(KERN_INFO "  %lu byte", mir);
-	else
-	    printk(KERN_INFO "  %lu kb", mir>>10);
 	SMC_SELECT_BANK(1);
 	smc->cfg = inw(ioaddr + CONFIG) & ~CFG_AUI_SELECT;
 	smc->cfg |= CFG_NO_WAIT | CFG_16BIT | CFG_STATIC;
@@ -1019,9 +1004,8 @@ static void smc91c92_config(dev_link_t *
 	    smc->cfg |= CFG_IRQ_SEL_1 | CFG_IRQ_SEL_0;
 	if ((rev >> 4) >= 7)
 	    smc->cfg |= CFG_MII_SELECT;
-	printk(" buffer, %s xcvr\n", (smc->cfg & CFG_MII_SELECT) ?
-	       "MII" : if_names[dev->if_port]);
-    }
+    } else
+	mir = 0;
 
     if (smc->cfg & CFG_MII_SELECT) {
 	SMC_SELECT_BANK(3);
@@ -1031,16 +1015,45 @@ static void smc91c92_config(dev_link_t *
 	    if ((j != 0) && (j != 0xffff)) break;
 	}
 	smc->mii_if.phy_id = (i < 32) ? i : -1;
-	if (i < 32) {
-	    DEBUG(0, "  MII transceiver at index %d, status %x.\n", i, j);
-	} else {
-	    printk(KERN_NOTICE "  No MII transceivers found!\n");
-	}
 	SMC_SELECT_BANK(0);
     }
 
+    link->dev = &smc->node;
     link->state &= ~DEV_CONFIG_PENDING;
+
+    if (register_netdev(dev) != 0) {
+	printk(KERN_ERR "smc91c92_cs: register_netdev() failed\n");
+	link->dev = NULL;
+	goto config_undo;
+    }
+
+    strcpy(smc->node.dev_name, dev->name);
+
+    printk(KERN_INFO "%s: smc91c%s rev %d: io %#3lx, irq %d, "
+	   "hw_addr ", dev->name, name, (rev & 0x0f), dev->base_addr,
+	   dev->irq);
+    for (i = 0; i < 6; i++)
+	printk("%02X%s", dev->dev_addr[i], ((i<5) ? ":" : "\n"));
+
+    if (rev > 0) {
+	if (mir & 0x3ff)
+	    printk(KERN_INFO "  %lu byte", mir);
+	else
+	    printk(KERN_INFO "  %lu kb", mir>>10);
+	printk(" buffer, %s xcvr\n", (smc->cfg & CFG_MII_SELECT) ?
+	       "MII" : if_names[dev->if_port]);
+    }
+
+    if (smc->cfg & CFG_MII_SELECT) {
+	if (smc->mii_if.phy_id != -1) {
+	    DEBUG(0, "  MII transceiver at index %d, status %x.\n",
+		  smc->mii_if.phy_id, j);
+	} else {
+	    printk(KERN_NOTICE "  No MII transceivers found!\n");
+	}
+    }
+
     return;
 
 config_undo:
--- diff/drivers/net/pcmcia/xirc2ps_cs.c	2004-04-21 10:45:34.435468512 +0100
+++ source/drivers/net/pcmcia/xirc2ps_cs.c	2004-04-21 10:46:51.531748096 +0100
@@ -1114,17 +1114,20 @@ xirc2ps_config(dev_link_t * link)
     /* we can now register the device with the net subsystem */
     dev->irq = link->irq.AssignedIRQ;
     dev->base_addr = link->io.BasePort1;
+
+    if (local->dingo)
+	do_reset(dev, 1); /* a kludge to make the cem56 work */
+
+    link->dev = &local->node;
+    link->state &= ~DEV_CONFIG_PENDING;
+
     if ((err=register_netdev(dev))) {
 	printk(KNOT_XIRC "register_netdev() failed\n");
+	link->dev = NULL;
 	goto config_error;
     }
 
     strcpy(local->node.dev_name, dev->name);
-    link->dev = &local->node;
-    link->state &= ~DEV_CONFIG_PENDING;
-
-    if (local->dingo)
-	do_reset(dev, 1); /* a kludge to make the cem56 work */
 
     /* give some infos about the hardware */
     printk(KERN_INFO "%s: %s: port %#3lx, irq %d, hwaddr",
--- diff/drivers/net/r8169.c	2004-04-21 10:45:34.441467600 +0100
+++ source/drivers/net/r8169.c	2004-04-21 10:46:51.532747944 +0100
@@ -47,6 +47,9 @@ VERSION 1.2 <2002/11/30>
 
 #include
 
+#define DMA_64BIT_MASK	0xffffffffffffffffULL
+#define DMA_32BIT_MASK	0xffffffffULL
+
 #define RTL8169_VERSION "1.2"
 #define MODULENAME "r8169"
 #define RTL8169_DRIVER_NAME   MODULENAME " Gigabit Ethernet driver " RTL8169_VERSION
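r8169 defines DMA_64BIT_MASK and DMA_32BIT_MASK locally here because this kernel generation did not yet ship them in a common header. Their typical probe-time use, sketched against the pci_set_dma_mask() API of the era (example_probe_dma is a hypothetical helper, not from the patch):

	/* sketch only: falling back from 64-bit to 32-bit DMA addressing */
	static int example_probe_dma(struct pci_dev *pdev)
	{
		if (pci_set_dma_mask(pdev, DMA_64BIT_MASK) == 0)
			return 0;	/* device may DMA above 4GB */

		if (pci_set_dma_mask(pdev, DMA_32BIT_MASK) == 0)
			return 0;	/* classic 32-bit DMA */

		printk(KERN_ERR "no usable DMA configuration\n");
		return -EIO;
	}
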
--- diff/drivers/net/sk98lin/skvpd.c	2004-03-11 10:20:26.000000000 +0000
+++ source/drivers/net/sk98lin/skvpd.c	2004-04-21 10:46:51.532747944 +0100
@@ -468,6 +468,17 @@ SK_IOC	IoC)	/* IO Context */
 
 	pAC->vpd.vpd_size = vpd_size;
 
+	/* Asus K8V Se Deluxe bugfix. Correct VPD content */
+	/* MBo April 2004 */
+	if (((unsigned char)pAC->vpd.vpd_buf[0x3f] == 0x38) &&
+	    ((unsigned char)pAC->vpd.vpd_buf[0x40] == 0x3c) &&
+	    ((unsigned char)pAC->vpd.vpd_buf[0x41] == 0x45)) {
+		printk("sk98lin: Asus mainboard with buggy VPD? "
+		       "Correcting data.\n");
+		pAC->vpd.vpd_buf[0x40] = 0x38;
+	}
+
+	/* find the end tag of the RO area */
 	if (!(r = vpd_find_para(pAC, VPD_RV, &rp))) {
 		SK_DBG_MSG(pAC, SK_DBGMOD_VPD, SK_DBGCAT_ERR | SK_DBGCAT_FATAL,
--- diff/drivers/net/sun3lance.c	2004-04-05 12:57:08.000000000 +0100
+++ source/drivers/net/sun3lance.c	2004-04-21 10:46:51.533747792 +0100
@@ -42,7 +42,6 @@ static char *version = "sun3lance.c: v1.
 #include
 #include
 #include
-#include
 #include
 #include
 #include
--- diff/drivers/net/tulip/tulip_core.c	2004-04-21 10:45:34.470463192 +0100
+++ source/drivers/net/tulip/tulip_core.c	2004-04-21 10:46:51.534747640 +0100
@@ -1513,7 +1513,7 @@ static int __devinit tulip_init_one (str
 		}
 	}
 	/* Lite-On boards have the address byte-swapped. */
-	if ((dev->dev_addr[0] == 0xA0 || dev->dev_addr[0] == 0xC0)
+	if ((dev->dev_addr[0] == 0xA0 || dev->dev_addr[0] == 0xC0 || dev->dev_addr[0] == 0x02)
 	    && dev->dev_addr[1] == 0x00)
 		for (i = 0; i < 6; i+=2) {
 			char tmp = dev->dev_addr[i];
--- diff/drivers/parisc/ccio-dma.c	2004-04-05 12:57:08.000000000 +0100
+++ source/drivers/parisc/ccio-dma.c	2004-04-21 10:46:51.535747488 +0100
@@ -44,7 +44,6 @@
 #include
 #include	/* for L1_CACHE_BYTES */
 #include
-#include
 #include
 #include
 #include
--- diff/drivers/parisc/ccio-rm-dma.c	2004-04-05 12:57:08.000000000 +0100
+++ source/drivers/parisc/ccio-rm-dma.c	2004-04-21 10:46:51.535747488 +0100
@@ -40,7 +40,6 @@
 #include
 #include
-#include
 #include
 #include
--- diff/drivers/pci/pci.c	2004-04-21 10:45:34.498458936 +0100
+++ source/drivers/pci/pci.c	2004-04-21 10:46:51.536747336 +0100
@@ -247,6 +247,8 @@ pci_set_power_state(struct pci_dev *dev,
 	int pm;
 	u16 pmcsr;
 
+	might_sleep();
+
 	/* bound the state we're entering */
 	if (state > 3)
 		state = 3;
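The might_sleep() added to pci_set_power_state() is purely a debugging annotation: in debug builds it complains when the function is entered from atomic context, catching callers before they would actually block. Roughly (a sketch of the 2.6-era expansion; the real macro lives in include/linux/kernel.h):

	/* rough sketch of what the annotation expands to in debug builds */
	#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
	# define might_sleep()	__might_sleep(__FILE__, __LINE__)
	#else
	# define might_sleep()	do { } while (0)
	#endif
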
--- diff/drivers/sbus/char/openprom.c	2003-05-21 11:50:15.000000000 +0100
+++ source/drivers/sbus/char/openprom.c	2004-04-21 10:46:51.536747336 +0100
@@ -149,7 +149,6 @@ static int openprom_sunos_ioctl(struct i
 	char buffer[OPROMMAXPARAM+1], *buf;
 	struct openpromio *opp;
 	int bufsize, len, error = 0;
-	extern char saved_command_line[];
 	static int cnt;
 
 	if (cmd == OPROMSETOPT)
--- diff/drivers/scsi/3w-xxxx.c	2003-09-30 15:46:16.000000000 +0100
+++ source/drivers/scsi/3w-xxxx.c	2004-04-21 10:46:51.548745512 +0100
@@ -6,7 +6,7 @@
    Arnaldo Carvalho de Melo
    Brad Strand
 
-   Copyright (C) 1999-2003 3ware Inc.
+   Copyright (C) 1999-2004 3ware Inc.
 
    Kernel compatiblity By:	Andre Hedrick
   Non-Copyright (C) 2000	Andre Hedrick
@@ -179,6 +179,8 @@
    1.02.00.036 - Increase character ioctl timeout to 60 seconds.
    1.02.00.037 - Fix tw_ioctl() to handle all non-data ATA passthru cmds
                  for 'smartmontools' support.
+   1.26.00.038 - Roll driver minor version to 26 to denote kernel 2.6.
+                 Add support for cmds_per_lun module parameter.
 */
 
 #include
@@ -205,6 +207,7 @@ MODULE_LICENSE("GPL");
 #include
 #include
 #include
+#include
 #include
 #include
@@ -242,10 +245,15 @@ static struct file_operations tw_fops =
 };
 
 /* Globals */
-char *tw_driver_version="1.02.00.037";
+char *tw_driver_version="1.26.00.038";
 TW_Device_Extension *tw_device_extension_list[TW_MAX_SLOT];
 int tw_device_extension_count = 0;
 static int twe_major = -1;
+static int cmds_per_lun;
+
+/* Module parameters */
+module_param(cmds_per_lun, int, 0);
+MODULE_PARM_DESC(cmds_per_lun, "Maximum commands per LUN");
 
 /* Functions */
 
@@ -1141,14 +1149,6 @@ int tw_findcards(Scsi_Host_Template *tw_
 		/* Set card status as online */
 		tw_dev->online = 1;
 
-#ifdef CONFIG_3W_XXXX_CMD_PER_LUN
-		tw_host->cmd_per_lun = CONFIG_3W_XXXX_CMD_PER_LUN;
-		if (tw_host->cmd_per_lun > TW_MAX_CMDS_PER_LUN)
-			tw_host->cmd_per_lun = TW_MAX_CMDS_PER_LUN;
-#else
-		/* Use SHT cmd_per_lun here */
-		tw_host->cmd_per_lun = TW_MAX_CMDS_PER_LUN;
-#endif
 		tw_dev->free_head = TW_Q_START;
 		tw_dev->free_tail = TW_Q_START;
 		tw_dev->free_wrap = TW_Q_LENGTH - 1;
@@ -3386,13 +3386,13 @@ int tw_slave_configure(Scsi_Device *SDpt
 
 	dprintk(KERN_WARNING "3w-xxxx: tw_slave_configure()\n");
 
-#ifdef CONFIG_3W_XXXX_CMD_PER_LUN
-	max_cmds = CONFIG_3W_XXXX_CMD_PER_LUN;
-	if (max_cmds > TW_MAX_CMDS_PER_LUN)
+	if (cmds_per_lun) {
+		max_cmds = cmds_per_lun;
+		if (max_cmds > TW_MAX_CMDS_PER_LUN)
+			max_cmds = TW_MAX_CMDS_PER_LUN;
+	} else {
 		max_cmds = TW_MAX_CMDS_PER_LUN;
-#else
-	max_cmds = TW_MAX_CMDS_PER_LUN;
-#endif
+	}
 	scsi_adjust_queue_depth(SDptr, MSG_ORDERED_TAG, max_cmds);
 
 	return 0;
@@ -3488,6 +3488,7 @@ static Scsi_Host_Template driver_templat
 	.eh_abort_handler	= tw_scsi_eh_abort,
 	.eh_host_reset_handler	= tw_scsi_eh_reset,
 	.bios_param		= tw_scsi_biosparam,
+	.slave_configure	= tw_slave_configure,
 	.can_queue		= TW_Q_LENGTH-2,
 	.this_id		= -1,
 	.sg_tablesize		= TW_MAX_SGL_LENGTH,
--- diff/drivers/scsi/3w-xxxx.h	2003-09-30 15:46:16.000000000 +0100
+++ source/drivers/scsi/3w-xxxx.h	2004-04-21 10:46:51.549745360 +0100
@@ -6,7 +6,7 @@
   Arnaldo Carvalho de Melo
   Brad Strand
 
-   Copyright (C) 1999-2003 3ware Inc.
+   Copyright (C) 1999-2004 3ware Inc.
 
   Kernel compatiblity By:	Andre Hedrick
   Non-Copyright (C) 2000	Andre Hedrick
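The 3w-xxxx change swaps the compile-time CONFIG_3W_XXXX_CMD_PER_LUN for a runtime cmds_per_lun module parameter, clamped against TW_MAX_CMDS_PER_LUN in tw_slave_configure(). The general pattern it adopts, sketched (clamp_cmds is a hypothetical helper; the driver inlines the same logic):

	/* sketch only: the runtime-parameter pattern the driver adopts
	 * (loaded as, e.g.: modprobe 3w-xxxx cmds_per_lun=16) */
	static int cmds_per_lun;	/* 0 means "use the driver default" */
	module_param(cmds_per_lun, int, 0);
	MODULE_PARM_DESC(cmds_per_lun, "Maximum commands per LUN");

	static int clamp_cmds(int requested, int hard_max)
	{
		if (requested <= 0)
			return hard_max;	/* unset: fall back to default */
		return requested > hard_max ? hard_max : requested;
	}
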
--- diff/drivers/scsi/advansys.c	2004-02-18 08:54:11.000000000 +0000
+++ source/drivers/scsi/advansys.c	2004-04-21 10:46:51.573741712 +0100
@@ -4619,7 +4619,7 @@ advansys_detect(Scsi_Host_Template *tpnt
 
                 ASC_DBG1(1, "advansys_detect: probing I/O port 0x%x...\n",
                     iop);
-                if (check_region(iop, ASC_IOADR_GAP) != 0) {
+                if (!request_region(iop, ASC_IOADR_GAP,"advansys")) {
                     printk(
"AdvanSys SCSI: specified I/O Port 0x%X is busy\n", iop);
                     /* Don't try this I/O port twice. */
@@ -4630,6 +4630,7 @@ advansys_detect(Scsi_Host_Template *tpnt
"AdvanSys SCSI: specified I/O Port 0x%X has no adapter\n", iop);
                     /* Don't try this I/O port twice. */
                     asc_ioport[ioport] = 0;
+                    release_region(iop, ASC_IOADR_GAP);
                     goto ioport_try_again;
                 } else {
                     /*
@@ -4647,6 +4648,7 @@ advansys_detect(Scsi_Host_Template *tpnt
                          * 'ioport' past this board.
                          */
                         ioport++;
+                        release_region(iop,ASC_IOADR_GAP);
                         goto ioport_try_again;
                     }
                 }
@@ -10003,9 +10005,9 @@ AscSearchIOPortAddr11(
     }
     for (; i < ASC_IOADR_TABLE_MAX_IX; i++) {
         iop_base = _asc_def_iop_base[i];
-        if (check_region(iop_base, ASC_IOADR_GAP) != 0) {
+        if (!request_region(iop_base, ASC_IOADR_GAP, "advansys")) {
             ASC_DBG1(1,
-                "AscSearchIOPortAddr11: check_region() failed I/O port 0x%x\n",
+                "AscSearchIOPortAddr11: request_region() failed I/O port 0x%x\n",
                 iop_base);
             continue;
        }
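The advansys conversion replaces check_region() with request_region() because the check-then-claim sequence was racy: two drivers could both see a port range as free and then both claim it. Requesting atomically, and releasing on every bail-out path as the hunks above add, closes that window. The idiom (adapter_present is a hypothetical probe helper):

	/* sketch only: atomic claim instead of check-then-claim */
	if (!request_region(iop, ASC_IOADR_GAP, "advansys")) {
		/* someone else owns the range; skip this port */
		return -EBUSY;
	}
	if (!adapter_present(iop)) {
		release_region(iop, ASC_IOADR_GAP);	/* undo on every exit path */
		return -ENODEV;
	}
	/* ... proceed; keep the region held for the device's lifetime ... */
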
--- diff/drivers/scsi/aic7xxx/aic7770_osm.c	2004-01-19 10:22:58.000000000 +0000
+++ source/drivers/scsi/aic7xxx/aic7770_osm.c	2004-04-21 10:46:51.576741256 +0100
@@ -73,7 +73,7 @@ typedef void *aic7770_dev_t;
 static int aic7770_linux_config(struct aic7770_identity *entry,
				aic7770_dev_t dev, u_int eisaBase);
 
-void
+int
 ahc_linux_eisa_init(void)
 {
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
@@ -82,7 +82,7 @@ ahc_linux_eisa_init(void)
 	int i;
 
 	if (aic7xxx_probe_eisa_vl == 0)
-		return;
+		return -ENODEV;
 
 	/*
	 * Linux requires the EISA IDs to be specified in
@@ -93,7 +93,7 @@ ahc_linux_eisa_init(void)
 	    (ahc_num_aic7770_devs + 1), M_DEVBUF, M_NOWAIT);
 
 	if (aic7770_driver.id_table == NULL)
-		return;
+		return -ENOMEM;
 
 	for (eid = (struct eisa_device_id *)aic7770_driver.id_table,
 	     id = aic7770_ident_table, i = 0;
@@ -109,15 +109,16 @@ ahc_linux_eisa_init(void)
 	}
 	eid->sig[0] = 0;
 
-	eisa_driver_register(&aic7770_driver);
+	return eisa_driver_register(&aic7770_driver);
 #else /* LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) */
 	struct aic7770_identity *entry;
 	u_int  slot;
 	u_int  eisaBase;
 	u_int  i;
+	int ret = -ENODEV;
 
 	if (aic7xxx_probe_eisa_vl == 0)
-		return;
+		return ret;
 
 	eisaBase = 0x1000 + AHC_EISA_SLOT_OFFSET;
 	for (slot = 1; slot < NUMSLOTS; eisaBase+=0x1000, slot++) {
@@ -146,9 +147,12 @@ ahc_linux_eisa_init(void)
 			continue;  /* no EISA card in slot */
 
 		entry = aic7770_find_device(eisa_id);
-		if (entry != NULL)
+		if (entry != NULL) {
 			aic7770_linux_config(entry, NULL, eisaBase);
+			ret = 0;
+		}
 	}
+	return ret;
 #endif
 }
 
@@ -156,13 +160,8 @@
 void
 ahc_linux_eisa_exit(void)
 {
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
-	if (aic7xxx_probe_eisa_vl == 0)
-		return;
-
-	if (aic7770_driver.id_table != NULL) {
-		eisa_driver_unregister(&aic7770_driver);
-		free(aic7770_driver.id_table, M_DEVBUF);
-	}
+	eisa_driver_unregister(&aic7770_driver);
+	free(aic7770_driver.id_table, M_DEVBUF);
 #endif
 }
--- diff/drivers/scsi/aic7xxx/aic79xx_osm.c	2004-03-11 10:20:27.000000000 +0000
+++ source/drivers/scsi/aic7xxx/aic79xx_osm.c	2004-04-21 10:46:51.592738824 +0100
@@ -2591,6 +2591,7 @@ ahd_linux_dv_thread(void *data)
 	sprintf(current->comm, "ahd_dv_%d", ahd->unit);
 #else
 	daemonize("ahd_dv_%d", ahd->unit);
+	current->flags |= PF_FREEZE;
 #endif
 	unlock_kernel();
--- diff/drivers/scsi/aic7xxx/aic7xxx_osm.c	2004-03-11 10:20:27.000000000 +0000
+++ source/drivers/scsi/aic7xxx/aic7xxx_osm.c	2004-04-21 10:46:51.603737152 +0100
@@ -892,18 +892,25 @@ ahc_linux_detect(Scsi_Host_Template *tem
 	ahc_list_lockinit();
 
 #ifdef CONFIG_PCI
-	ahc_linux_pci_init();
+	found = ahc_linux_pci_init();
+	if (found)
+		goto out;
 #endif
 
 #ifdef CONFIG_EISA
-	ahc_linux_eisa_init();
+	found = ahc_linux_eisa_init();
+	if (found) {
+#ifdef CONFIG_PCI
+		ahc_linux_pci_exit();
+#endif
+		goto out;
+	}
 #endif
 
 	/*
	 * Register with the SCSI layer all
	 * controllers we've found.
	 */
-	found = 0;
 	TAILQ_FOREACH(ahc, &ahc_tailq, links) {
 
 		if (ahc_linux_register_host(ahc, template) == 0)
@@ -913,6 +920,8 @@ ahc_linux_detect(Scsi_Host_Template *tem
 	spin_lock_irq(&io_request_lock);
 #endif
 	aic7xxx_detect_complete++;
+
+out:
 	return (found);
 }
 
@@ -2296,6 +2305,7 @@ ahc_linux_dv_thread(void *data)
 	sprintf(current->comm, "ahc_dv_%d", ahc->unit);
 #else
 	daemonize("ahc_dv_%d", ahc->unit);
+	current->flags |= PF_FREEZE;
 #endif
 	unlock_kernel();
 
@@ -3969,11 +3979,10 @@ ahc_linux_alloc_device(struct ahc_softc
 }
 
 static void
-ahc_linux_free_device(struct ahc_softc *ahc, struct ahc_linux_device *dev)
+__ahc_linux_free_device(struct ahc_softc *ahc, struct ahc_linux_device *dev)
 {
 	struct ahc_linux_target *targ;
 
-	del_timer_sync(&dev->timer);
 	targ = dev->target;
 	targ->devices[dev->lun] = NULL;
 	free(dev, M_DEVBUF);
@@ -3983,6 +3992,13 @@ ahc_linux_free_device(struct ahc_softc *
 		ahc_linux_free_target(ahc, targ);
 }
 
+static void
+ahc_linux_free_device(struct ahc_softc *ahc, struct ahc_linux_device *dev)
+{
+	del_timer_sync(&dev->timer);
+	__ahc_linux_free_device(ahc, dev);
+}
+
 void
 ahc_send_async(struct ahc_softc *ahc, char channel,
	       u_int target, u_int lun, ac_code code, void *arg)
@@ -4693,7 +4709,7 @@ ahc_linux_dev_timed_unfreeze(u_long arg)
 		ahc_linux_run_device_queue(ahc, dev);
 	if (TAILQ_EMPTY(&dev->busyq)
 	 && dev->active == 0)
-		ahc_linux_free_device(ahc, dev);
+		__ahc_linux_free_device(ahc, dev);
 	ahc_unlock(ahc, &s);
 }
 
@@ -5067,11 +5083,17 @@ ahc_platform_dump_card_state(struct ahc_
 	}
 }
 
+static void __exit ahc_linux_exit(void);
+
 static int __init
 ahc_linux_init(void)
 {
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
-	return (ahc_linux_detect(&aic7xxx_driver_template) ? 0 : -ENODEV);
+	int rc = ahc_linux_detect(&aic7xxx_driver_template);
+	if (rc)
+		return rc;
+	ahc_linux_exit();
+	return -ENODEV;
 #else
 	scsi_register_module(MODULE_SCSI_HA, &aic7xxx_driver_template);
 	if (aic7xxx_driver_template.present == 0) {
--- diff/drivers/scsi/aic7xxx/aic7xxx_osm.h	2004-01-19 10:22:58.000000000 +0000
+++ source/drivers/scsi/aic7xxx/aic7xxx_osm.h	2004-04-21 10:46:51.607736544 +0100
@@ -840,7 +840,7 @@ typedef enum
 
 #ifdef CONFIG_EISA
 extern uint32_t aic7xxx_probe_eisa_vl;
-void ahc_linux_eisa_init(void);
+int ahc_linux_eisa_init(void);
 void ahc_linux_eisa_exit(void);
 int aic7770_map_registers(struct ahc_softc *ahc,
			  u_int port);
--- diff/drivers/scsi/libata-scsi.c	2004-04-21 10:45:34.743421696 +0100
+++ source/drivers/scsi/libata-scsi.c	2004-04-21 10:46:51.608736392 +0100
@@ -169,6 +169,23 @@ int ata_scsi_slave_config(struct scsi_de
 		sdev->use_10_for_ms = 1;
 	blk_queue_max_phys_segments(sdev->request_queue, LIBATA_MAX_PRD);
 
+	if (sdev->id < ATA_MAX_DEVICES) {
+		struct ata_port *ap;
+		struct ata_device *dev;
+
+		ap = (struct ata_port *) &sdev->host->hostdata[0];
+		dev = &ap->device[sdev->id];
+
+		if (dev->flags & ATA_DFLAG_LBA48) {
+			sdev->host->max_sectors = 65534;
+			blk_queue_max_sectors(sdev->request_queue, 65534);
+			printk(KERN_INFO "ata%u: dev %u max request 32MB (lba48)\n",
+			       ap->id, sdev->id);
+		} else
+			printk(KERN_INFO "ata%u: dev %u max request 128K\n",
+			       ap->id, sdev->id);
+	}
+
 	return 0;	/* scsi layer doesn't check return value, sigh */
 }
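The libata hunk raises the per-request limit for LBA48 drives: 65534 sectors of 512 bytes each fits the 16-bit lba48 sector-count field and comes to just under 32MB, versus the default 255-sector (~128K) limit of 28-bit ATA. A standalone check of the arithmetic:

	/* sketch only: why 65534 sectors is advertised as a ~32MB limit */
	#include <stdio.h>

	int main(void)
	{
		unsigned long sectors = 65534;	/* max for the 16-bit lba48 count */
		printf("%lu bytes (~%lu MB)\n",
		       sectors * 512, (sectors * 512) >> 20);	/* 33553408, ~31 MB */
		return 0;
	}
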
--- diff/drivers/scsi/sata_promise.c	2004-04-21 10:45:34.910396312 +0100
+++ source/drivers/scsi/sata_promise.c	2004-04-21 10:46:51.608736392 +0100
@@ -1289,7 +1289,7 @@ static void pdc20621_put_to_dimm(struct
 		readl(mmio + PDC_DIMM_WINDOW_CTLR);
 		offset -= (idx * window_size);
 		idx++;
-		dist = ((long) (window_size - (offset + size))) >= 0 ? size :
+		dist = ((s32)(window_size - (offset + size))) >= 0 ? size :
 			(long) (window_size - offset);
 		memcpy_toio((char *) (dimm_mmio + offset / 4), (char *) psource, dist);
 		writel(0x01, mmio + PDC_GENERAL_CTLR);
--- diff/drivers/serial/8250.c	2004-04-21 10:45:34.976386280 +0100
+++ source/drivers/serial/8250.c	2004-04-21 10:46:51.610736088 +0100
@@ -832,7 +832,7 @@ receive_chars(struct uart_8250_port *up,
 		if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) {
 			tty->flip.work.func((void *)tty);
 			if (tty->flip.count >= TTY_FLIPBUF_SIZE)
-				return; // if TTY_DONT_FLIP is set
+				return; /* if TTY_DONT_FLIP is set */
 		}
 		ch = serial_inp(up, UART_RX);
 		*tty->flip.char_buf_ptr = ch;
@@ -1193,12 +1193,21 @@ static void serial8250_break_ctl(struct
 	spin_unlock_irqrestore(&up->port.lock, flags);
 }
 
+#ifdef CONFIG_KGDB
+static int kgdb_irq = -1;
+#endif
+
 static int serial8250_startup(struct uart_port *port)
 {
 	struct uart_8250_port *up = (struct uart_8250_port *)port;
 	unsigned long flags;
 	int retval;
 
+#ifdef CONFIG_KGDB
+	if (up->port.irq == kgdb_irq)
+		return -EBUSY;
+#endif
+
 	up->capabilities = uart_config[up->port.type].flags;
 	up->mcr = 0;
 
@@ -1888,6 +1897,10 @@ static void __init serial8250_register_p
 	for (i = 0; i < UART_NR; i++) {
 		struct uart_8250_port *up = &serial8250_ports[i];
 
+#ifdef CONFIG_KGDB
+		if (up->port.irq == kgdb_irq)
+			up->port.kgdb = 1;
+#endif
 		up->port.line = i;
 		up->port.ops = &serial8250_pops;
 		init_timer(&up->timer);
@@ -2171,6 +2184,31 @@ void serial8250_resume_port(int line)
 	uart_resume_port(&serial8250_reg, &serial8250_ports[line].port);
 }
 
+#ifdef CONFIG_KGDB
+/*
+ * Find all the ports using the given irq and shut them down.
+ * Result should be that the irq will be released.
+ */
+void shutdown_for_kgdb(struct async_struct * info)
+{
+	int irq = info->state->irq;
+	struct uart_8250_port *up;
+	int ttyS;
+
+	kgdb_irq = irq;			/* save for later init */
+	for (ttyS = 0; ttyS < UART_NR; ttyS++){
+		up = &serial8250_ports[ttyS];
+		if (up->port.irq == irq && (irq_lists + irq)->head) {
+#ifdef CONFIG_DEBUG_SPINLOCK	/* ugly business... */
+			if(up->port.lock.magic != SPINLOCK_MAGIC)
+				spin_lock_init(&up->port.lock);
+#endif
+			serial8250_shutdown(&up->port);
+		}
+	}
+}
+#endif	/* CONFIG_KGDB */
+
 static int __init serial8250_init(void)
 {
 	int ret, i;
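The 8250/serial_core pairing works as follows: shutdown_for_kgdb() records the debugger's IRQ in kgdb_irq and shuts down any 8250 port already using it; from then on serial8250_startup() refuses that port with -EBUSY, and (in the serial_core hunk just below) uart_configure_port() skips it via the new port->kgdb flag, so the serial console code can never take the line back from the debugger. In outline (a sketch restating the startup-side check, not new driver code):

	/* sketch only: how the KGDB hooks cooperate; kgdb_irq is the
	 * file-scope variable the patch adds to 8250.c */
	static int startup_outline(struct uart_port *port)
	{
		if (port->irq == kgdb_irq)
			return -EBUSY;	/* the debugger owns this IRQ */
		/* ... normal 8250 startup continues ... */
		return 0;
	}
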
--- diff/drivers/serial/serial_core.c	2004-04-21 10:45:35.016380200 +0100
+++ source/drivers/serial/serial_core.c	2004-04-21 10:46:51.611735936 +0100
@@ -1985,6 +1985,11 @@ uart_configure_port(struct uart_driver *
 {
 	unsigned int flags;
 
+#ifdef CONFIG_KGDB
+	if (port->kgdb)
+		return;
+#endif
+
 	/*
	 * If there isn't a port here, don't do anything further.
	 */
--- diff/drivers/usb/gadget/rndis.c	2004-04-21 10:45:35.052374728 +0100
+++ source/drivers/usb/gadget/rndis.c	2004-04-21 10:46:51.612735784 +0100
@@ -746,19 +746,23 @@ static int gen_ndis_set_resp (u8 configN
 	rndis_set_cmplt_type	*resp;
 	int 			i, retval = -ENOTSUPP;
 	struct rndis_config_parameter	*param;
-
-	if (!r) return -ENOMEM;
+	u8 *cp;
+
+	if (!r)
+		return -ENOMEM;
 	resp = (rndis_set_cmplt_type *) r->buf;
-
-	if (!resp) return -ENOMEM;
-
+	if (!resp)
+		return -ENOMEM;
+
+	cp = (u8 *)resp;
+
 	switch (OID) {
 	case OID_GEN_CURRENT_PACKET_FILTER:
 		DEBUG("%s: OID_GEN_CURRENT_PACKET_FILTER\n", __FUNCTION__);
-		currentFilter2devFlags ((u32) ((u8 *) resp + 28),
+		currentFilter2devFlags(cp[28],
 					rndis_per_dev_params [configNr].dev);
 		retval = 0;
-		if ((u32) ((u8 *) resp + 28))
+		if (cp[28])
 			rndis_per_dev_params [configNr].state = RNDIS_INITIALIZED;
 		else
 			rndis_per_dev_params [configNr].state = RNDIS_UNINITIALIZED;
--- diff/drivers/usb/host/ehci-hcd.c	2004-04-21 10:45:35.057373968 +0100
+++ source/drivers/usb/host/ehci-hcd.c	2004-04-21 10:46:51.612735784 +0100
@@ -577,7 +577,8 @@ static void ehci_stop (struct usb_hcd *h
 
 	/* root hub is shut down separately (first, when possible) */
 	spin_lock_irq (&ehci->lock);
-	ehci_work (ehci, NULL);
+	if (ehci->async)
+		ehci_work (ehci, NULL);
 	spin_unlock_irq (&ehci->lock);
 	ehci_mem_cleanup (ehci);
--- diff/drivers/video/Kconfig	2004-04-21 10:45:35.091368800 +0100
+++ source/drivers/video/Kconfig	2004-04-21 10:46:51.648730312 +0100
@@ -308,6 +308,14 @@ config FB_HGA
	  As this card technology is 15 years old, most people will answer N
	  here.
 
+config FB_HGA_ACCEL
+	bool "Hercules mono Acceleration functions (EXPERIMENTAL)"
+	depends on FB_HGA
+	---help---
+	  This will compile the Hercules mono graphics with
+	  acceleration functions.
+
+
 config VIDEO_SELECT
	bool
	depends on FB && X86
@@ -772,6 +780,14 @@ config FB_TRIDENT
	  To compile this driver as a module, choose M here: the
	  module will be called tridentfb.
 
+config FB_TRIDENT_ACCEL
+	bool "Trident Acceleration functions (EXPERIMENTAL)"
+	depends on FB_TRIDENT
+	---help---
+	  This will compile the Trident frame buffer device with
+	  acceleration functions.
+
+
 config FB_PM3
	tristate "Permedia3 support"
	depends on FB && PCI && BROKEN
--- diff/drivers/video/aty/radeon_accel.c	2004-02-18 08:54:12.000000000 +0000
+++ source/drivers/video/aty/radeon_accel.c	2004-04-21 10:46:51.618734872 +0100
@@ -53,6 +53,18 @@ void radeonfb_fillrect(struct fb_info *i
 static void radeonfb_prim_copyarea(struct radeonfb_info *rinfo,
				   const struct fb_copyarea *area)
 {
+	int xdir, ydir;
+	u32 sx, sy, dx, dy, w, h;
+
+	w = area->width; h = area->height;
+	dx = area->dx; dy = area->dy;
+	sx = area->sx; sy = area->sy;
+	xdir = sx - dx;
+	ydir = sy - dy;
+
+	if ( xdir < 0 ) { sx += w-1; dx += w-1; }
+	if ( ydir < 0 ) { sy += h-1; dy += h-1; }
+
 	radeon_fifo_wait(3);
 	OUTREG(DP_GUI_MASTER_CNTL, rinfo->dp_gui_master_cntl /* i.e. GMC_DST_32BPP */
@@ -60,12 +72,13 @@ static void radeonfb_prim_copyarea(struc
		| ROP3_S
		| DP_SRC_RECT );
 	OUTREG(DP_WRITE_MSK, 0xffffffff);
-	OUTREG(DP_CNTL, (DST_X_LEFT_TO_RIGHT | DST_Y_TOP_TO_BOTTOM));
+	OUTREG(DP_CNTL, (xdir>=0 ? DST_X_LEFT_TO_RIGHT : 0)
+			| (ydir>=0 ? DST_Y_TOP_TO_BOTTOM : 0));
 
 	radeon_fifo_wait(3);
-	OUTREG(SRC_Y_X, (area->sy << 16) | area->sx);
-	OUTREG(DST_Y_X, (area->dy << 16) | area->dx);
-	OUTREG(DST_HEIGHT_WIDTH, (area->height << 16) | area->width);
+	OUTREG(SRC_Y_X, (sy << 16) | sx);
+	OUTREG(DST_Y_X, (dy << 16) | dx);
+	OUTREG(DST_HEIGHT_WIDTH, (h << 16) | w);
 }
--- diff/drivers/video/console/fbcon.c	2004-04-05 12:57:08.000000000 +0100
+++ source/drivers/video/console/fbcon.c	2004-04-21 10:46:51.630733048 +0100
@@ -749,14 +749,6 @@ static void fbcon_set_display(struct vc_
 		vc->vc_cols = nr_cols;
 		vc->vc_rows = nr_rows;
 	}
-	p->vrows = info->var.yres_virtual / vc->vc_font.height;
-	if(info->var.yres > (vc->vc_font.height * (vc->vc_rows + 1))) {
-		p->vrows -= (info->var.yres - (vc->vc_font.height * vc->vc_rows)) / vc->vc_font.height;
-	}
-	if ((info->var.yres % vc->vc_font.height) &&
-	    (info->var.yres_virtual % vc->vc_font.height <
-	     info->var.yres % vc->vc_font.height))
-		p->vrows--;
 	vc->vc_can_do_color = info->var.bits_per_pixel != 1;
 	vc->vc_complement_mask = vc->vc_can_do_color ? 0x7700 : 0x0800;
 	if (charcnt == 256) {
@@ -767,7 +759,7 @@ static void fbcon_set_display(struct vc_
 		vc->vc_complement_mask <<= 1;
 	}
 
-	if (!init) {
+	if (logo) {
 		if (vc->vc_cols != nr_cols || vc->vc_rows != nr_rows)
 			vc_resize(vc->vc_num, nr_cols, nr_rows);
 		else if (CON_IS_VISIBLE(vc) &&
@@ -784,9 +776,6 @@ static void fbcon_set_display(struct vc_
 			vc->vc_pos += logo_lines * vc->vc_size_row;
 			kfree(save);
 		}
-	}
-
-	if (logo) {
 		if (logo_lines > vc->vc_bottom) {
 			logo_shown = -1;
 			printk(KERN_INFO
@@ -1570,6 +1559,8 @@ static int fbcon_resize(struct vc_data *
 	p->vrows = var.yres_virtual/fh;
 	if (var.yres > (fh * (height + 1)))
 		p->vrows -= (var.yres - (fh * height)) / fh;
+	if ((var.yres % fh) && (var.yres_virtual % fh < var.yres % fh))
+		p->vrows--;
 	return 0;
 }
 
@@ -1839,15 +1830,6 @@ static int fbcon_do_set_font(struct vc_d
 	if (resize) {
 		/* reset wrap/pan */
 		info->var.xoffset = info->var.yoffset = p->yscroll = 0;
-		p->vrows = info->var.yres_virtual / h;
-
-#if 0          /* INCOMPLETE - let the console gurus handle this */
-		if(info->var.yres > (h * (vc->vc_rows + 1))
-			p->vrows -= (info->var.yres - (h * vc->vc_rows)) / h;
-#endif
-		if ((info->var.yres % h)
-		    && (info->var.yres_virtual % h < info->var.yres % h))
-			p->vrows--;
 		updatescrollmode(p, vc);
 		vc_resize(vc->vc_num, info->var.xres / w, info->var.yres / h);
 		if (CON_IS_VISIBLE(vc) && softback_buf) {
--- diff/drivers/video/console/sticore.c	2004-02-18 08:54:12.000000000 +0000
+++ source/drivers/video/console/sticore.c	2004-04-21 10:46:51.631732896 +0100
@@ -22,7 +22,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
--- diff/drivers/video/fbmem.c	2004-04-21 10:45:35.140361352 +0100
+++ source/drivers/video/fbmem.c	2004-04-21 10:46:51.632732744 +0100
@@ -1013,7 +1013,7 @@ fb_ioctl(struct inode *inode, struct fil
 	struct fb_con2fbmap con2fb;
 #endif
 	struct fb_cmap cmap;
-	int i, rc;
+	int i;
 
 	if (!fb)
 		return -ENODEV;
@@ -1037,7 +1037,7 @@ fb_ioctl(struct inode *inode, struct fil
 	case FBIOPUTCMAP:
 		if (copy_from_user(&cmap, (void *) arg, sizeof(cmap)))
 			return -EFAULT;
-		return (fb_set_cmap(&cmap, 0, info));
+		return (fb_set_cmap(&cmap, 1, info));
 	case FBIOGETCMAP:
 		if (copy_from_user(&cmap, (void *) arg, sizeof(cmap)))
 			return -EFAULT;
@@ -1055,9 +1055,9 @@ fb_ioctl(struct inode *inode, struct fil
 		return 0;
 	case FBIO_CURSOR:
 		acquire_console_sem();
-		rc = fb_cursor(info, (struct fb_cursor *) arg);
+		i = fb_cursor(info, (struct fb_cursor *) arg);
 		release_console_sem();
-		return rc;
+		return i;
 #ifdef CONFIG_FRAMEBUFFER_CONSOLE
 	case FBIOGET_CON2FBMAP:
 		if (copy_from_user(&con2fb, (void *)arg, sizeof(con2fb)))
@@ -1286,6 +1286,7 @@ register_framebuffer(struct fb_info *fb_
 			fb_info->pixmap.size = FBPIXMAPSIZE;
 			fb_info->pixmap.buf_align = 1;
 			fb_info->pixmap.scan_align = 1;
+			fb_info->pixmap.access_align = 4;
 			fb_info->pixmap.flags = FB_PIXMAP_DEFAULT;
 		}
 	}
@@ -1301,6 +1302,7 @@ register_framebuffer(struct fb_info *fb_
 			fb_info->sprite.size = FBPIXMAPSIZE;
 			fb_info->sprite.buf_align = 1;
 			fb_info->sprite.scan_align = 1;
+			fb_info->sprite.access_align = 4;
 			fb_info->sprite.flags = FB_PIXMAP_DEFAULT;
 		}
 	}
--- diff/drivers/video/hgafb.c	2004-01-19 10:22:59.000000000 +0000
+++ source/drivers/video/hgafb.c	2004-04-21 10:46:51.641731376 +0100
@@ -448,6 +448,10 @@ static int hgafb_blank(int blank_mode, s
 	return 0;
 }
 
+/*
+ * Accel functions
+ */
+#ifdef CONFIG_FB_HGA_ACCEL
 static void hgafb_fillrect(struct fb_info *info, const struct fb_fillrect *rect)
 {
 	u_int rows, y;
@@ -510,6 +514,11 @@ static void hgafb_imageblit(struct fb_in
 		*dest = d;
 	}
 }
+#else /* !CONFIG_FB_HGA_ACCEL */
+#define hgafb_fillrect cfb_fillrect
+#define hgafb_copyarea cfb_copyarea
+#define hgafb_imageblit cfb_imageblit
+#endif /* CONFIG_FB_HGA_ACCEL */
 
 static struct fb_ops hgafb_ops = {
@@ -519,9 +528,9 @@ static struct fb_ops hgafb_ops = {
 	.fb_setcolreg	= hgafb_setcolreg,
 	.fb_pan_display	= hgafb_pan_display,
 	.fb_blank	= hgafb_blank,
-	.fb_fillrect	= cfb_fillrect, //hgafb_fillrect,
-	.fb_copyarea	= cfb_copyarea, //hgafb_copyarea,
-	.fb_imageblit	= cfb_imageblit,//hgafb_imageblit,
+	.fb_fillrect	= hgafb_fillrect,
+	.fb_copyarea	= hgafb_copyarea,
+	.fb_imageblit	= hgafb_imageblit,
 };
 
 /* ------------------------------------------------------------------------- *
--- diff/drivers/video/imsttfb.c	2003-10-27 09:20:38.000000000 +0000
+++ source/drivers/video/imsttfb.c	2004-04-21 10:46:51.648730312 +0100
@@ -1084,6 +1084,7 @@ imsttfb_copyarea(struct fb_info *info, c
 		while(read_reg_le32(par->dc_regs, SSTATUS) & 0x40);
 }
 
+#if 0
 static int
 imsttfb_load_cursor_image(struct imstt_par *par, int width, int height, __u8 fgc)
 {
@@ -1191,7 +1192,6 @@ imstt_set_cursor(struct imstt_par *par,
 	}
 }
 
-#if 0
 static int
 imsttfb_cursor(struct fb_info *info, struct fb_cursor *cursor)
 {
--- diff/drivers/video/tridentfb.c	2004-02-09 10:36:12.000000000 +0000
+++ source/drivers/video/tridentfb.c	2004-04-21 10:46:51.654729400 +0100
@@ -450,7 +450,7 @@ static struct accel_switch accel_image =
 /*
  * Accel functions called by the upper layers
  */
-
+#ifdef CONFIG_FB_TRIDENT_ACCEL
 static void tridentfb_fillrect(struct fb_info * info, const struct fb_fillrect *fr)
 {
 	int bpp = info->var.bits_per_pixel;
@@ -474,6 +474,11 @@ static void tridentfb_copyarea(struct fb
 	acc->copy_rect(ca->sx,ca->sy,ca->dx,ca->dy,ca->width,ca->height);
 	acc->wait_engine();
 }
+#else /* !CONFIG_FB_TRIDENT_ACCEL */
+#define tridentfb_fillrect cfb_fillrect
+#define tridentfb_copyarea cfb_copyarea
+#endif /* CONFIG_FB_TRIDENT_ACCEL */
+
 
 /*
  * Hardware access functions
@@ -1265,10 +1270,8 @@ static struct fb_ops tridentfb_ops = {
 	.fb_blank = tridentfb_blank,
 	.fb_check_var = tridentfb_check_var,
 	.fb_set_par = tridentfb_set_par,
-//	.fb_fillrect = tridentfb_fillrect,
-//	.fb_copyarea= tridentfb_copyarea,
-	.fb_fillrect = cfb_fillrect,
-	.fb_copyarea= cfb_copyarea,
+	.fb_fillrect = tridentfb_fillrect,
+	.fb_copyarea= tridentfb_copyarea,
 	.fb_imageblit = cfb_imageblit,
 	.fb_cursor = soft_cursor,
 };
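hgafb and tridentfb adopt the same compile-time switch: when the new FB_*_ACCEL option is off, the fb_ops table still names the driver's own fillrect/copyarea entries, but those names are simply #defined to the generic cfb_* software routines, so the ops table itself needs no #ifdefs. The pattern, reduced to a sketch (foofb is a placeholder name):

	/* sketch only: one ops table, two bindings */
	#ifdef CONFIG_FB_FOO_ACCEL
	static void foofb_fillrect(struct fb_info *info,
				   const struct fb_fillrect *rect)
	{
		/* hardware-accelerated implementation */
	}
	#else
	#define foofb_fillrect cfb_fillrect	/* generic software fallback */
	#endif

	static struct fb_ops foofb_ops = {
		.fb_fillrect = foofb_fillrect,	/* resolves either way */
	};
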
--- diff/fs/Kconfig	2004-04-21 10:45:35.162358008 +0100
+++ source/fs/Kconfig	2004-04-21 10:46:51.704721800 +0100
@@ -468,7 +468,7 @@ config AUTOFS4_FS
	  automounter (amd), which is a pure user space daemon.
 
	  To use the automounter you need the user-space tools from
-	  ; you also
+	  ; you also
	  want to answer Y to "NFS file system support", below.
 
	  To compile this support as a module, choose M here: the module will be
--- diff/fs/aio.c	2004-04-05 12:57:08.000000000 +0100
+++ source/fs/aio.c	2004-04-21 10:46:51.654729400 +0100
@@ -64,14 +64,9 @@ static void aio_kick_handler(void *);
 static int __init aio_setup(void)
 {
 	kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb),
-				0, SLAB_HWCACHE_ALIGN, NULL, NULL);
-	if (!kiocb_cachep)
-		panic("unable to create kiocb cache\n");
-
+				0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 	kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx),
-				0, SLAB_HWCACHE_ALIGN, NULL, NULL);
-	if (!kioctx_cachep)
-		panic("unable to create kioctx cache");
+				0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	aio_wq = create_workqueue("aio");
--- diff/fs/autofs4/autofs_i.h	2003-09-30 15:46:18.000000000 +0100
+++ source/fs/autofs4/autofs_i.h	2004-04-21 10:46:51.655729248 +0100
@@ -25,6 +25,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
@@ -82,7 +84,7 @@ struct autofs_wait_queue {
 	char *name;
 	/* This is for status reporting upon return */
 	int status;
-	int wait_ctr;
+	atomic_t wait_ctr;
 };
 
 #define AUTOFS_SBI_MAGIC 0x6d4a556d
@@ -93,7 +95,10 @@ struct autofs_sb_info {
 	pid_t oz_pgrp;
 	int catatonic;
 	int version;
+	int sub_version;
 	unsigned long exp_timeout;
+	int reghost_enabled;
+	int needs_reghost;
 	struct super_block *sb;
 	struct autofs_wait_queue *queues; /* Wait queue pointer */
 };
@@ -125,6 +130,12 @@ static inline int autofs4_ispending(stru
 		(inf != NULL && inf->flags & AUTOFS_INF_EXPIRING);
 }
 
+static inline void autofs4_copy_atime(struct file *src, struct file *dst)
+{
+	dst->f_dentry->d_inode->i_atime = src->f_dentry->d_inode->i_atime;
+	return;
+}
+
 struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *);
 struct autofs_info *autofs4_init_inf(struct autofs_sb_info *, mode_t mode);
 void autofs4_free_ino(struct autofs_info *);
@@ -141,6 +152,7 @@ int autofs4_expire_multi(struct super_bl
 extern struct inode_operations autofs4_symlink_inode_operations;
 extern struct inode_operations autofs4_dir_inode_operations;
 extern struct inode_operations autofs4_root_inode_operations;
+extern struct file_operations autofs4_dir_operations;
 extern struct file_operations autofs4_root_operations;
 
 /* Initializing function */
@@ -157,6 +169,24 @@ enum autofs_notify
 	NFY_EXPIRE
 };
 
-int autofs4_wait(struct autofs_sb_info *,struct qstr *, enum autofs_notify);
+int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
 int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
+
+static inline int simple_positive(struct dentry *dentry)
+{
+	return dentry->d_inode && !d_unhashed(dentry);
+}
+
+static inline int simple_empty_nolock(struct dentry *dentry)
+{
+	struct dentry *child;
+	int ret = 0;
+
+	list_for_each_entry(child, &dentry->d_subdirs, d_child)
+		if (simple_positive(child))
+			goto out;
+	ret = 1;
+out:
+	return ret;
+}
--- diff/fs/autofs4/expire.c	2003-08-20 14:16:32.000000000 +0100
+++ source/fs/autofs4/expire.c	2004-04-21 10:46:51.656729096 +0100
@@ -4,6 +4,7 @@
 *
 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
 * Copyright 1999-2000 Jeremy Fitzhardinge
+ * Copyright 2001-2003 Ian Kent
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
@@ -12,139 +13,239 @@
 * ------------------------------------------------------------------------- */
 
 #include "autofs_i.h"
-#include
 
-/*
- * Determine if a subtree of the namespace is busy.
- *
- * mnt is the mount tree under the autofs mountpoint
- */
-static inline int is_vfsmnt_tree_busy(struct vfsmount *mnt)
+static unsigned long now;
+
+/* Check if a dentry can be expired; return 1 if it can, else return 0 */
+static inline int autofs4_can_expire(struct dentry *dentry,
+				     unsigned long timeout, int do_now)
 {
-	struct vfsmount *this_parent = mnt;
-	struct list_head *next;
-	int count;
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 
-	count = atomic_read(&mnt->mnt_count) - 1;
+	/* dentry in the process of being deleted */
+	if (ino == NULL)
+		return 0;
+
+	/* No point expiring a pending mount */
+	if (dentry->d_flags & DCACHE_AUTOFS_PENDING)
+		return 0;
+
+	if (!do_now) {
+		/* Too young to die */
+		if (time_after(ino->last_used + timeout, now))
+			return 0;
+
+		/* update last_used here :-
+		   - obviously makes sense if it is in use now
+		   - less obviously, prevents rapid-fire expire
+		     attempts if expire fails the first time */
+		ino->last_used = now;
+	}
+	return 1;
+}
+
+/* Sorry I can't solve the problem without using counts either */
+static int autofs4_may_umount(struct vfsmount *mnt)
+{
+	struct list_head *next;
+	struct vfsmount *this_parent = mnt;
+	int actual_refs;
+	int minimum_refs;
+
+	spin_lock(&vfsmount_lock);
+	actual_refs = atomic_read(&mnt->mnt_count);
+	minimum_refs = 2;
 
 repeat:
 	next = this_parent->mnt_mounts.next;
-	DPRINTK(("is_vfsmnt_tree_busy: mnt=%p, this_parent=%p, next=%p\n",
-		 mnt, this_parent, next));
 resume:
-	for( ; next != &this_parent->mnt_mounts; next = next->next) {
-		struct vfsmount *p = list_entry(next, struct vfsmount,
-						mnt_child);
-
-		/* -1 for struct vfs_mount's normal count,
-		   -1 to compensate for child's reference to parent */
-		count += atomic_read(&p->mnt_count) - 1 - 1;
+	while (next != &this_parent->mnt_mounts) {
+		struct vfsmount *p = list_entry(next, struct vfsmount, mnt_child);
+
+		next = next->next;
 
-		DPRINTK(("is_vfsmnt_tree_busy: p=%p, count now %d\n",
-			 p, count));
+		actual_refs += atomic_read(&p->mnt_count);
+		minimum_refs += 2;
 
-		if (!list_empty(&p->mnt_mounts)) {
+		if ( !list_empty(&p->mnt_mounts) ) {
 			this_parent = p;
 			goto repeat;
 		}
-		/* root is busy if any leaf is busy */
-		if (atomic_read(&p->mnt_count) > 1)
-			return 1;
 	}
 
-	/* All done at this level ... ascend and resume the search. */
 	if (this_parent != mnt) {
-		next = this_parent->mnt_child.next;
+		next = this_parent->mnt_child.next;
 		this_parent = this_parent->mnt_parent;
 		goto resume;
 	}
+	spin_unlock(&vfsmount_lock);
 
-	DPRINTK(("is_vfsmnt_tree_busy: count=%d\n", count));
-	return count != 0; /* remaining users? */
+	DPRINTK(("autofs4_may_umount: done actual = %d, minimum = %d\n",
+		 actual_refs, minimum_refs));
+
+	return actual_refs > minimum_refs;
 }
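autofs4_may_umount() decides expirability by walking every vfsmount in the tree and comparing the references actually held (actual_refs) against the references the mount topology itself accounts for (minimum_refs: two per mount, for the mount itself and its parent/child linkage); any surplus means an open file or working directory is pinning the tree. The accounting, reduced to its core (tree_is_busy is a hypothetical standalone restatement):

	/* sketch only: busy iff more refs exist than the topology explains */
	static int tree_is_busy(struct vfsmount *mnt)
	{
		int actual_refs = atomic_read(&mnt->mnt_count);
		int minimum_refs = 2;

		/* for each submount p reached while walking mnt_mounts:
		 *     actual_refs  += atomic_read(&p->mnt_count);
		 *     minimum_refs += 2;
		 */

		return actual_refs > minimum_refs;
	}
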
*/ + DPRINTK(("autofs4_may_umount: done actual = %d, minimum = %d\n", + actual_refs, minimum_refs)); + + return actual_refs > minimum_refs; } -/* Traverse a dentry's list of vfsmounts and return the number of - non-busy mounts */ -static int check_vfsmnt(struct vfsmount *mnt, struct dentry *dentry) +/* Check a mount point for busyness return 1 if not busy, otherwise */ +static int autofs4_check_mount(struct vfsmount *mnt, struct dentry *dentry) { - int ret = dentry->d_mounted; - struct vfsmount *vfs = lookup_mnt(mnt, dentry); + int status = 0; - if (vfs) { - mntput(vfs); - if (is_vfsmnt_tree_busy(vfs)) - ret--; - } - DPRINTK(("check_vfsmnt: ret=%d\n", ret)); - return ret; + DPRINTK(("autofs4_check_mount: dentry %p %.*s\n", + dentry, (int)dentry->d_name.len, dentry->d_name.name)); + + mntget(mnt); + dget(dentry); + + if (!follow_down(&mnt, &dentry)) + goto done; + + while (d_mountpoint(dentry) && follow_down(&mnt, &dentry)) + ; + + /* This is an autofs submount, we can't expire it */ + if (is_autofs4_dentry(dentry)) + goto done; + + /* The big question */ + if (autofs4_may_umount(mnt) == 0) + status = 1; +done: + DPRINTK(("autofs4_check_mount: returning = %d\n", status)); + mntput(mnt); + dput(dentry); + return status; } -/* Check dentry tree for busyness. If a dentry appears to be busy - because it is a mountpoint, check to see if the mounted - filesystem is busy. */ -static int is_tree_busy(struct vfsmount *topmnt, struct dentry *top) +/* Check a directory tree of mount points for busyness + * The tree is not busy iff no mountpoints are busy + * Return 1 if the tree is busy or 0 otherwise + */ +static int autofs4_check_tree(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) { - struct dentry *this_parent; + struct dentry *this_parent = top; struct list_head *next; - int count; - count = atomic_read(&top->d_count); - - DPRINTK(("is_tree_busy: top=%p initial count=%d\n", - top, count)); - this_parent = top; - - if (is_autofs4_dentry(top)) { - count--; - DPRINTK(("is_tree_busy: autofs; count=%d\n", count)); - } + DPRINTK(("autofs4_check_tree: parent %p %.*s\n", + top, (int)top->d_name.len, top->d_name.name)); - if (d_mountpoint(top)) - count -= check_vfsmnt(topmnt, top); + /* Negative dentry - give up */ + if (!simple_positive(top)) + return 0; + + /* Timeout of a tree mount is determined by its top dentry */ + if (!autofs4_can_expire(top, timeout, do_now)) + return 0; - repeat: + spin_lock(&dcache_lock); +repeat: next = this_parent->d_subdirs.next; - resume: +resume: while (next != &this_parent->d_subdirs) { - int adj = 0; - struct dentry *dentry = list_entry(next, struct dentry, - d_child); + struct dentry *dentry = list_entry(next, struct dentry, d_child); + + /* Negative dentry - give up */ + if (!simple_positive(dentry)) { + next = next->next; + continue; + } + + DPRINTK(("autofs4_check_tree: dentry %p %.*s\n", + dentry, (int)dentry->d_name.len, dentry->d_name.name)); + + if (!simple_empty_nolock(dentry)) { + this_parent = dentry; + goto repeat; + } + + dentry = dget(dentry); + spin_unlock(&dcache_lock); + + if (d_mountpoint(dentry)) { + /* First busy => tree busy */ + if (!autofs4_check_mount(mnt, dentry)) { + dput(dentry); + return 0; + } + } + + dput(dentry); + spin_lock(&dcache_lock); next = next->next; + } + + if (this_parent != top) { + next = this_parent->d_child.next; + this_parent = this_parent->d_parent; + goto resume; + } + spin_unlock(&dcache_lock); - count += atomic_read(&dentry->d_count) - 1; + return 1; +} + +struct dentry 
+struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
+				    struct dentry *parent,
+				    unsigned long timeout,
+				    int do_now)
+{
+	struct dentry *this_parent = parent;
+	struct list_head *next;
+
+	DPRINTK(("autofs4_check_leaves: parent %p %.*s\n",
+		 parent, (int)parent->d_name.len, parent->d_name.name));
 
-		if (d_mountpoint(dentry))
-			adj += check_vfsmnt(topmnt, dentry);
+	spin_lock(&dcache_lock);
+repeat:
+	next = this_parent->d_subdirs.next;
+resume:
+	while (next != &this_parent->d_subdirs) {
+		struct dentry *dentry = list_entry(next, struct dentry, d_child);
 
-		if (is_autofs4_dentry(dentry)) {
-			adj++;
-			DPRINTK(("is_tree_busy: autofs; adj=%d\n",
-				 adj));
+		/* Negative dentry - give up */
+		if (!simple_positive(dentry)) {
+			next = next->next;
+			continue;
 		}
 
-		count -= adj;
+		DPRINTK(("autofs4_check_leaves: dentry %p %.*s\n",
+			 dentry, (int)dentry->d_name.len, dentry->d_name.name));
 
 		if (!list_empty(&dentry->d_subdirs)) {
 			this_parent = dentry;
 			goto repeat;
 		}
 
-		if (atomic_read(&dentry->d_count) != adj) {
-			DPRINTK(("is_tree_busy: busy leaf (d_count=%d adj=%d)\n",
-				 atomic_read(&dentry->d_count), adj));
-			return 1;
+		dentry = dget(dentry);
+		spin_unlock(&dcache_lock);
+
+		if (d_mountpoint(dentry)) {
+			/* Can we expire this guy */
+			if (!autofs4_can_expire(dentry, timeout, do_now))
+				goto cont;
+
+			/* Can we umount this guy */
+			if (autofs4_check_mount(mnt, dentry))
+				return dentry;
+		}
+cont:
+		dput(dentry);
+		spin_lock(&dcache_lock);
+		next = next->next;
 	}
 
-	/* All done at this level ... ascend and resume the search. */
 	if (this_parent != parent) {
-		next = this_parent->d_child.next;
+		next = this_parent->d_child.next;
 		this_parent = this_parent->d_parent;
 		goto resume;
 	}
+	spin_unlock(&dcache_lock);
 
-	DPRINTK(("is_tree_busy: count=%d\n", count));
-	return count != 0; /* remaining users? */
+	return NULL;
 }
 
 /*
@@ -156,61 +257,86 @@ static int is_tree_busy(struct vfsmount
 static struct dentry *autofs4_expire(struct super_block *sb,
				     struct vfsmount *mnt,
				     struct autofs_sb_info *sbi,
-				     int do_now)
+				     int how)
 {
-	unsigned long now = jiffies;
 	unsigned long timeout;
 	struct dentry *root = sb->s_root;
-	struct list_head *tmp;
+	struct dentry *expired = NULL;
+	struct list_head *next;
+	int do_now = how & AUTOFS_EXP_IMMEDIATE;
+	int exp_leaves = how & AUTOFS_EXP_LEAVES;
 
-	if (!sbi->exp_timeout || !root)
+	if ( !sbi->exp_timeout || !root )
 		return NULL;
 
+	now = jiffies;
 	timeout = sbi->exp_timeout;
 
 	spin_lock(&dcache_lock);
-	for(tmp = root->d_subdirs.next;
-	    tmp != &root->d_subdirs;
-	    tmp = tmp->next) {
-		struct autofs_info *ino;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+	next = root->d_subdirs.next;
 
-		if (dentry->d_inode == NULL)
+	/* On exit from the loop expire is set to a dgot dentry
	 * to expire or it's NULL */
+	while ( next != &root->d_subdirs ) {
+		struct dentry *dentry = list_entry(next, struct dentry, d_child);
+
+		/* Negative dentry - give up */
+		if ( !simple_positive(dentry) ) {
+			next = next->next;
 			continue;
+		}
 
-		ino = autofs4_dentry_ino(dentry);
+		dentry = dget(dentry);
+		spin_unlock(&dcache_lock);
 
-		if (ino == NULL) {
-			/* dentry in the process of being deleted */
-			continue;
+		/* Case 1: indirect mount or top level direct mount */
+		if (d_mountpoint(dentry)) {
+			DPRINTK(("autofs4_expire: checking mountpoint %p %.*s\n",
+				 dentry, (int)dentry->d_name.len, dentry->d_name.name));
+
+			/* Can we expire this guy */
+			if (!autofs4_can_expire(dentry, timeout, do_now))
+				goto next;
+
+			/* Can we umount this guy */
+			if (autofs4_check_mount(mnt, dentry)) {
+				expired = dentry;
+				break;
+			}
+			goto next;
 		}
 
-		/* No point expiring a pending mount */
-		if (dentry->d_flags & DCACHE_AUTOFS_PENDING)
-			continue;
+		if ( simple_empty(dentry) )
+			goto next;
 
-		if (!do_now) {
-			/* Too young to die */
-			if (time_after(ino->last_used + timeout, now))
-				continue;
-
-			/* update last_used here :-
-			   - obviously makes sense if it is in use now
-			   - less obviously, prevents rapid-fire expire
			     attempts if expire fails the first time */
-			ino->last_used = now;
+		/* Case 2: tree mount, expire iff entire tree is not busy */
+		if (!exp_leaves) {
+			if (autofs4_check_tree(mnt, dentry, timeout, do_now)) {
+				expired = dentry;
+				break;
+			}
+		/* Case 3: direct mount, expire individual leaves */
+		} else {
+			expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
+			if (expired) {
+				dput(dentry);
+				break;
+			}
 		}
-		if (!is_tree_busy(mnt, dentry)) {
-			DPRINTK(("autofs_expire: returning %p %.*s\n",
-				 dentry, (int)dentry->d_name.len, dentry->d_name.name));
-			/* Start from here next time */
-			list_del(&root->d_subdirs);
-			list_add(&root->d_subdirs, &dentry->d_child);
-			dget(dentry);
-			spin_unlock(&dcache_lock);
+next:
+		dput(dentry);
+		spin_lock(&dcache_lock);
+		next = next->next;
+	}
 
-			return dentry;
-		}
+	if ( expired ) {
+		DPRINTK(("autofs4_expire: returning %p %.*s\n",
+			 expired, (int)expired->d_name.len, expired->d_name.name));
+		spin_lock(&dcache_lock);
+		list_del(&expired->d_parent->d_subdirs);
+		list_add(&expired->d_parent->d_subdirs, &expired->d_child);
+		spin_unlock(&dcache_lock);
+		return expired;
 	}
 	spin_unlock(&dcache_lock);
@@ -263,7 +389,7 @@ int autofs4_expire_multi(struct super_bl
 
 		/* This is synchronous because it makes the daemon a
		   little easier */
 		de_info->flags |= AUTOFS_INF_EXPIRING;
-		ret = autofs4_wait(sbi, &dentry->d_name, NFY_EXPIRE);
dentry, NFY_EXPIRE); de_info->flags &= ~AUTOFS_INF_EXPIRING; dput(dentry); } --- diff/fs/autofs4/inode.c 2004-03-11 10:20:28.000000000 +0000 +++ source/fs/autofs4/inode.c 2004-04-21 10:46:51.656729096 +0100 @@ -187,6 +187,7 @@ int autofs4_fill_super(struct super_bloc struct file * pipe; int pipefd; struct autofs_sb_info *sbi; + struct autofs_info *ino; int minproto, maxproto; sbi = (struct autofs_sb_info *) kmalloc(sizeof(*sbi), GFP_KERNEL); @@ -203,6 +204,7 @@ int autofs4_fill_super(struct super_bloc sbi->oz_pgrp = process_group(current); sbi->sb = s; sbi->version = 0; + sbi->sub_version = 0; sbi->queues = NULL; s->s_blocksize = 1024; s->s_blocksize_bits = 10; @@ -212,7 +214,11 @@ int autofs4_fill_super(struct super_bloc /* * Get the root inode and dentry, but defer checking for errors. */ - root_inode = autofs4_get_inode(s, autofs4_mkroot(sbi)); + ino = autofs4_mkroot(sbi); + if (!ino) + goto fail_free; + root_inode = autofs4_get_inode(s, ino); + kfree(ino); if (!root_inode) goto fail_free; @@ -244,6 +250,7 @@ int autofs4_fill_super(struct super_bloc } sbi->version = maxproto > AUTOFS_MAX_PROTO_VERSION ? AUTOFS_MAX_PROTO_VERSION : maxproto; + sbi->sub_version = AUTOFS_PROTO_SUBVERSION; DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, sbi->oz_pgrp)); pipe = fget(pipefd); @@ -305,7 +312,7 @@ struct inode *autofs4_get_inode(struct s if (S_ISDIR(inf->mode)) { inode->i_nlink = 2; inode->i_op = &autofs4_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + inode->i_fop = &autofs4_dir_operations; } else if (S_ISLNK(inf->mode)) { inode->i_size = inf->size; inode->i_op = &autofs4_symlink_inode_operations; --- diff/fs/autofs4/root.c 2003-09-30 15:46:18.000000000 +0100 +++ source/fs/autofs4/root.c 2004-04-21 10:46:51.657728944 +0100 @@ -4,6 +4,7 @@ * * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * Copyright 1999-2000 Jeremy Fitzhardinge + * Copyright 2001-2003 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your @@ -24,17 +25,28 @@ static int autofs4_dir_unlink(struct ino static int autofs4_dir_rmdir(struct inode *,struct dentry *); static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); +static int autofs4_dir_open(struct inode *inode, struct file *file); +static int autofs4_dir_close(struct inode *inode, struct file *file); +static int autofs4_dir_readdir(struct file * filp, void * dirent, filldir_t filldir); +static int autofs4_root_readdir(struct file * filp, void * dirent, filldir_t filldir); static struct dentry *autofs4_root_lookup(struct inode *,struct dentry *, struct nameidata *); +static int autofs4_dcache_readdir(struct file *, void *, filldir_t); struct file_operations autofs4_root_operations = { .open = dcache_dir_open, .release = dcache_dir_close, - .llseek = dcache_dir_lseek, .read = generic_read_dir, - .readdir = dcache_readdir, + .readdir = autofs4_root_readdir, .ioctl = autofs4_root_ioctl, }; +struct file_operations autofs4_dir_operations = { + .open = autofs4_dir_open, + .release = autofs4_dir_close, + .read = generic_read_dir, + .readdir = autofs4_dir_readdir, +}; + struct inode_operations autofs4_root_inode_operations = { .lookup = autofs4_root_lookup, .unlink = autofs4_dir_unlink, @@ -51,6 +63,30 @@ struct inode_operations autofs4_dir_inod .rmdir = autofs4_dir_rmdir, }; +static int autofs4_root_readdir(struct file *file, void *dirent, + filldir_t 
filldir) +{ + struct autofs_sb_info *sbi = autofs4_sbi(file->f_dentry->d_sb); + int oz_mode = autofs4_oz_mode(sbi); + + DPRINTK(("autofs4_root_readdir called, filp->f_pos = %lld\n", + file->f_pos)); + + /* + * Don't set reghost flag if: + * 1) f_pos is larger than zero -- we've already been here. + * 2) we haven't even enabled reghosting in the 1st place. + * 3) this is the daemon doing a readdir + */ + if (oz_mode && file->f_pos == 0 && sbi->reghost_enabled) + sbi->needs_reghost = 1; + + DPRINTK(("autofs4_root_readdir: needs_reghost = %d\n", + sbi->needs_reghost)); + + return autofs4_dcache_readdir(file, dirent, filldir); +} + /* Update usage from here to top of tree, so that scan of top-level directories will give a useful result */ static void autofs4_update_usage(struct dentry *dentry) @@ -67,9 +103,191 @@ static void autofs4_update_usage(struct } } +/* + * From 2.4 kernel readdir.c + */ +static int autofs4_dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + int i; + struct dentry *dentry = filp->f_dentry; + + i = filp->f_pos; + switch (i) { + case 0: + if (filldir(dirent, ".", 1, i, dentry->d_inode->i_ino, DT_DIR) < 0) + break; + i++; + filp->f_pos++; + /* fallthrough */ + case 1: + if (filldir(dirent, "..", 2, i, dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) + break; + i++; + filp->f_pos++; + /* fallthrough */ + default: { + struct list_head *list; + int j = i-2; + + spin_lock(&dcache_lock); + list = dentry->d_subdirs.next; + + for (;;) { + if (list == &dentry->d_subdirs) { + spin_unlock(&dcache_lock); + return 0; + } + if (!j) + break; + j--; + list = list->next; + } + + while(1) { + struct dentry *de = list_entry(list, struct dentry, d_child); + + if (!d_unhashed(de) && de->d_inode) { + spin_unlock(&dcache_lock); + if (filldir(dirent, de->d_name.name, de->d_name.len, filp->f_pos, de->d_inode->i_ino, DT_UNKNOWN) < 0) + break; + spin_lock(&dcache_lock); + } + filp->f_pos++; + list = list->next; + if (list != &dentry->d_subdirs) + continue; + spin_unlock(&dcache_lock); + break; + } + } + } + return 0; +} + +static int autofs4_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *dentry = file->f_dentry; + struct vfsmount *mnt = file->f_vfsmnt; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + int status; + + DPRINTK(("autofs4_dir_open: file=%p dentry=%p %.*s\n", + file, dentry, dentry->d_name.len, dentry->d_name.name)); + + if (autofs4_oz_mode(sbi)) + goto out; + + if (autofs4_ispending(dentry)) { + DPRINTK(("autofs4_dir_open: dentry busy\n")); + return -EBUSY; + } + + if (!d_mountpoint(dentry) && dentry->d_op && dentry->d_op->d_revalidate) { + struct nameidata nd; + int empty; + + /* In case there are stale directory dentrys from a failed mount */ + spin_lock(&dcache_lock); + empty = list_empty(&dentry->d_subdirs); + spin_unlock(&dcache_lock); + + if (!empty) + d_invalidate(dentry); + + nd.flags = LOOKUP_DIRECTORY; + status = (dentry->d_op->d_revalidate)(dentry, &nd); + + if (!status) + return -ENOENT; + } + + if (d_mountpoint(dentry)) { + struct file *fp = NULL; + struct vfsmount *fp_mnt = mntget(mnt); + struct dentry *fp_dentry = dget(dentry); + + while (follow_down(&fp_mnt, &fp_dentry) && d_mountpoint(fp_dentry)); + + fp = dentry_open(fp_dentry, fp_mnt, file->f_flags); + status = PTR_ERR(fp); + if (IS_ERR(fp)) { + file->private_data = NULL; + return status; + } + file->private_data = fp; + } +out: + return 0; +} + +static int autofs4_dir_close(struct inode *inode, struct file *file) +{ + struct dentry *dentry = file->f_dentry; + 
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + DPRINTK(("autofs4_dir_close: file=%p dentry=%p %.*s\n", + file, dentry, dentry->d_name.len, dentry->d_name.name)); + + if (autofs4_oz_mode(sbi)) + goto out; + + if (autofs4_ispending(dentry)) { + DPRINTK(("autofs4_dir_close: dentry busy\n")); + return -EBUSY; + } + + if (d_mountpoint(dentry)) { + struct file *fp = file->private_data; + + if (!fp) + return -ENOENT; + + filp_close(fp, current->files); + file->private_data = NULL; + } +out: + return 0; +} + +static int autofs4_dir_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = file->f_dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + int status; + + DPRINTK(("autofs4_readdir: file=%p dentry=%p %.*s\n", + file, dentry, dentry->d_name.len, dentry->d_name.name)); + + if (autofs4_oz_mode(sbi)) + goto out; + + if (autofs4_ispending(dentry)) { + DPRINTK(("autofs4_readdir: dentry busy\n")); + return -EBUSY; + } + + if (d_mountpoint(dentry)) { + struct file *fp = file->private_data; + + if (!fp) + return -ENOENT; + + if (!fp->f_op || !fp->f_op->readdir) + goto out; + + status = vfs_readdir(fp, filldir, dirent); + file->f_pos = fp->f_pos; + if (status) + autofs4_copy_atime(file, fp); + return status; + } +out: + return autofs4_dcache_readdir(file, dirent, filldir); +} + static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, - struct autofs_sb_info *sbi) + struct autofs_sb_info *sbi, int flags) { struct autofs_info *de_info = autofs4_dentry_ino(dentry); int status = 0; @@ -78,11 +296,10 @@ static int try_to_fill_dentry(struct den when expiration is done to trigger mount request with a new dentry */ if (de_info && (de_info->flags & AUTOFS_INF_EXPIRING)) { - DPRINTK(("try_to_fill_entry: waiting for expire %p name=%.*s, flags&PENDING=%s de_info=%p de_info->flags=%x\n", - dentry, dentry->d_name.len, dentry->d_name.name, - dentry->d_flags & DCACHE_AUTOFS_PENDING?"t":"f", - de_info, de_info?de_info->flags:0)); - status = autofs4_wait(sbi, &dentry->d_name, NFY_NONE); + DPRINTK(("try_to_fill_entry: waiting for expire %p name=%.*s\n", + dentry, dentry->d_name.len, dentry->d_name.name)); + + status = autofs4_wait(sbi, dentry, NFY_NONE); DPRINTK(("try_to_fill_entry: expire done status=%d\n", status)); @@ -93,11 +310,11 @@ static int try_to_fill_dentry(struct den dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode)); /* Wait for a pending mount, triggering one if there isn't one already */ - while(dentry->d_inode == NULL) { - DPRINTK(("try_to_fill_entry: waiting for mount name=%.*s, de_info=%p de_info->flags=%x\n", - dentry->d_name.len, dentry->d_name.name, - de_info, de_info?de_info->flags:0)); - status = autofs4_wait(sbi, &dentry->d_name, NFY_MOUNT); + if (dentry->d_inode == NULL) { + DPRINTK(("try_to_fill_entry: waiting for mount name=%.*s\n", + dentry->d_name.len, dentry->d_name.name)); + + status = autofs4_wait(sbi, dentry, NFY_MOUNT); DPRINTK(("try_to_fill_entry: mount done status=%d\n", status)); @@ -113,19 +330,22 @@ static int try_to_fill_dentry(struct den /* Return a negative dentry, but leave it "pending" */ return 1; } - } + /* Trigger mount for path component or follow link */ + } else if (flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) || + current->link_count) { + DPRINTK(("try_to_fill_entry: waiting for mount name=%.*s\n", + dentry->d_name.len, dentry->d_name.name)); - /* If this is an unused directory that isn't a mount point, - bitch at the daemon and fix it in user space */ - 
spin_lock(&dcache_lock); - if (S_ISDIR(dentry->d_inode->i_mode) && - !d_mountpoint(dentry) && - list_empty(&dentry->d_subdirs)) { - DPRINTK(("try_to_fill_entry: mounting existing dir\n")); - spin_unlock(&dcache_lock); - return autofs4_wait(sbi, &dentry->d_name, NFY_MOUNT) == 0; + dentry->d_flags |= DCACHE_AUTOFS_PENDING; + status = autofs4_wait(sbi, dentry, NFY_MOUNT); + + DPRINTK(("try_to_fill_entry: mount done status=%d\n", status)); + + if (status) { + dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; + return 0; + } } - spin_unlock(&dcache_lock); /* We don't update the usages for the autofs daemon itself, this is necessary for recursive autofs mounts */ @@ -136,25 +356,25 @@ static int try_to_fill_dentry(struct den return 1; } - /* * Revalidate is called on every cache lookup. Some of those * cache lookups may actually happen while the dentry is not * yet completely filled in, and revalidate has to delay such * lookups.. */ -static int autofs4_root_revalidate(struct dentry * dentry, struct nameidata *nd) +static int autofs4_revalidate(struct dentry * dentry, struct nameidata *nd) { struct inode * dir = dentry->d_parent->d_inode; struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); int oz_mode = autofs4_oz_mode(sbi); + int flags = nd ? nd->flags : 0; + int status = 1; /* Pending dentry */ if (autofs4_ispending(dentry)) { - if (autofs4_oz_mode(sbi)) - return 1; - else - return try_to_fill_dentry(dentry, dir->i_sb, sbi); + if (!oz_mode) + status = try_to_fill_dentry(dentry, dir->i_sb, sbi, flags); + return status; } /* Negative dentry.. invalidate if "old" */ @@ -166,13 +386,12 @@ static int autofs4_root_revalidate(struc if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { - DPRINTK(("autofs_root_revalidate: dentry=%p %.*s, emptydir\n", + DPRINTK(("autofs4_root_revalidate: dentry=%p %.*s, emptydir\n", dentry, dentry->d_name.len, dentry->d_name.name)); spin_unlock(&dcache_lock); - if (oz_mode) - return 1; - else - return try_to_fill_dentry(dentry, dir->i_sb, sbi); + if (!oz_mode) + status = try_to_fill_dentry(dentry, dir->i_sb, sbi, flags); + return status; } spin_unlock(&dcache_lock); @@ -183,16 +402,6 @@ static int autofs4_root_revalidate(struc return 1; } -static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - - if (!autofs4_oz_mode(sbi)) - autofs4_update_usage(dentry); - - return 1; -} - static void autofs4_dentry_release(struct dentry *de) { struct autofs_info *inf; @@ -212,7 +421,7 @@ static void autofs4_dentry_release(struc /* For dentries of directories in the root dir */ static struct dentry_operations autofs4_root_dentry_operations = { - .d_revalidate = autofs4_root_revalidate, + .d_revalidate = autofs4_revalidate, .d_release = autofs4_dentry_release, }; @@ -224,11 +433,10 @@ static struct dentry_operations autofs4_ /* Lookups in non-root dirs never find anything - if it's there, it's already in the dcache */ -/* SMP-safe */ static struct dentry *autofs4_dir_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { #if 0 - DPRINTK(("autofs_dir_lookup: ignoring lookup of %.*s/%.*s\n", + DPRINTK(("autofs4_dir_lookup: ignoring lookup of %.*s/%.*s\n", dentry->d_parent->d_name.len, dentry->d_parent->d_name.name, dentry->d_name.len, dentry->d_name.name)); #endif @@ -244,7 +452,7 @@ static struct dentry *autofs4_root_looku struct autofs_sb_info *sbi; int oz_mode; - DPRINTK(("autofs_root_lookup: name = %.*s\n", + DPRINTK(("autofs4_root_lookup: name = 
%.*s\n", dentry->d_name.len, dentry->d_name.name)); if (dentry->d_name.len > NAME_MAX) @@ -252,9 +460,8 @@ static struct dentry *autofs4_root_looku sbi = autofs4_sbi(dir->i_sb); - lock_kernel(); oz_mode = autofs4_oz_mode(sbi); - DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", + DPRINTK(("autofs4_lookup: pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", current->pid, process_group(current), sbi->catatonic, oz_mode)); /* @@ -285,12 +492,16 @@ static struct dentry *autofs4_root_looku * a signal. If so we can force a restart.. */ if (dentry->d_flags & DCACHE_AUTOFS_PENDING) { + /* See if we were interrupted */ if (signal_pending(current)) { - unlock_kernel(); - return ERR_PTR(-ERESTARTNOINTR); + sigset_t *sigset = ¤t->pending.signal; + if (sigismember (sigset, SIGKILL) || + sigismember (sigset, SIGQUIT) || + sigismember (sigset, SIGINT)) { + return ERR_PTR(-ERESTARTNOINTR); + } } } - unlock_kernel(); /* * If this dentry is unhashed, then we shouldn't honour this @@ -313,27 +524,21 @@ static int autofs4_dir_symlink(struct in struct inode *inode; char *cp; - DPRINTK(("autofs_dir_symlink: %s <- %.*s\n", symname, + DPRINTK(("autofs4_dir_symlink: %s <- %.*s\n", symname, dentry->d_name.len, dentry->d_name.name)); - lock_kernel(); - if (!autofs4_oz_mode(sbi)) { - unlock_kernel(); + if (!autofs4_oz_mode(sbi)) return -EACCES; - } ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); - if (ino == NULL) { - unlock_kernel(); + if (ino == NULL) return -ENOSPC; - } ino->size = strlen(symname); ino->u.symlink = cp = kmalloc(ino->size + 1, GFP_KERNEL); if (cp == NULL) { kfree(ino); - unlock_kernel(); return -ENOSPC; } @@ -353,7 +558,6 @@ static int autofs4_dir_symlink(struct in dir->i_mtime = CURRENT_TIME; - unlock_kernel(); return 0; } @@ -370,7 +574,7 @@ static int autofs4_dir_symlink(struct in * If a process is blocked on the dentry waiting for the expire to finish, * it will invalidate the dentry and try to mount with a new one. * - * Also see autofs_dir_rmdir().. + * Also see autofs4_dir_rmdir().. 
*/ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) { @@ -378,11 +582,8 @@ static int autofs4_dir_unlink(struct ino struct autofs_info *ino = autofs4_dentry_ino(dentry); /* This allows root to remove symlinks */ - lock_kernel(); - if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) { - unlock_kernel(); + if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) return -EACCES; - } dput(ino->dentry); @@ -393,8 +594,6 @@ static int autofs4_dir_unlink(struct ino d_drop(dentry); - unlock_kernel(); - return 0; } @@ -403,16 +602,12 @@ static int autofs4_dir_rmdir(struct inod struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); - lock_kernel(); - if (!autofs4_oz_mode(sbi)) { - unlock_kernel(); + if (!autofs4_oz_mode(sbi)) return -EACCES; - } spin_lock(&dcache_lock); if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&dcache_lock); - unlock_kernel(); return -ENOTEMPTY; } __d_drop(dentry); @@ -426,32 +621,24 @@ static int autofs4_dir_rmdir(struct inod if (dir->i_nlink) dir->i_nlink--; - unlock_kernel(); return 0; } - - static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) { struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); struct inode *inode; - lock_kernel(); - if ( !autofs4_oz_mode(sbi) ) { - unlock_kernel(); + if ( !autofs4_oz_mode(sbi) ) return -EACCES; - } - DPRINTK(("autofs_dir_mkdir: dentry %p, creating %.*s\n", + DPRINTK(("autofs4_dir_mkdir: dentry %p, creating %.*s\n", dentry, dentry->d_name.len, dentry->d_name.name)); ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); - if (ino == NULL) { - unlock_kernel(); + if (ino == NULL) return -ENOSPC; - } inode = autofs4_get_inode(dir->i_sb, ino); d_instantiate(dentry, inode); @@ -467,7 +654,6 @@ static int autofs4_dir_mkdir(struct inod dir->i_nlink++; dir->i_mtime = CURRENT_TIME; - unlock_kernel(); return 0; } @@ -496,7 +682,68 @@ static inline int autofs4_get_protover(s return put_user(sbi->version, p); } -/* Identify autofs_dentries - this is so we can tell if there's +/* Return protocol sub version */ +static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int *p) +{ + return put_user(sbi->sub_version, p); +} + +/* + * Tells the daemon whether we need to reghost or not. Also, clears + * the reghost_needed flag. + */ +static inline int autofs4_ask_reghost(struct autofs_sb_info *sbi, int *p) +{ + int status; + + DPRINTK(("autofs_ask_reghost: returning %d\n", sbi->needs_reghost)); + + status = put_user(sbi->needs_reghost, p); + if ( status ) + return status; + + sbi->needs_reghost = 0; + return 0; +} + +/* + * Enable / Disable reghosting ioctl() operation + */ +static inline int autofs4_toggle_reghost(struct autofs_sb_info *sbi, int *p) +{ + int status; + int val; + + status = get_user(val, p); + + DPRINTK(("autofs4_toggle_reghost: reghost = %d\n", val)); + + if (status) + return status; + + /* turn on/off reghosting, with the val */ + sbi->reghost_enabled = val; + return 0; +} + +/* +* Tells the daemon whether it can umount the autofs mount. +*/ +static inline int autofs4_ask_umount(struct vfsmount *mnt, int *p) +{ + int status = 0; + + if (may_umount(mnt) == 0) + status = 1; + + DPRINTK(("autofs_ask_umount: returning %d\n", status)); + + status = put_user(status, p); + + return status; +} + +/* Identify autofs4_dentries - this is so we can tell if there's an extra dentry refcount or not. 
We only hold a refcount on the dentry if its non-negative (ie, d_inode != NULL) */ @@ -517,7 +764,7 @@ static int autofs4_root_ioctl(struct ino { struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); - DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", + DPRINTK(("autofs4_root_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", cmd,arg,sbi,process_group(current))); if ( _IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || @@ -537,9 +784,19 @@ static int autofs4_root_ioctl(struct ino return 0; case AUTOFS_IOC_PROTOVER: /* Get protocol version */ return autofs4_get_protover(sbi, (int *)arg); + case AUTOFS_IOC_PROTOSUBVER: /* Get protocol sub version */ + return autofs4_get_protosubver(sbi, (int *)arg); case AUTOFS_IOC_SETTIMEOUT: return autofs4_get_set_timeout(sbi,(unsigned long *)arg); + case AUTOFS_IOC_TOGGLEREGHOST: + return autofs4_toggle_reghost(sbi, (int *) arg); + case AUTOFS_IOC_ASKREGHOST: + return autofs4_ask_reghost(sbi, (int *) arg); + + case AUTOFS_IOC_ASKUMOUNT: + return autofs4_ask_umount(filp->f_vfsmnt, (int *) arg); + /* return a single thing to expire */ case AUTOFS_IOC_EXPIRE: return autofs4_expire_run(inode->i_sb,filp->f_vfsmnt,sbi, --- diff/fs/autofs4/waitq.c 2003-02-13 11:46:54.000000000 +0000 +++ source/fs/autofs4/waitq.c 2004-04-21 10:46:51.658728792 +0100 @@ -3,6 +3,7 @@ * linux/fs/autofs/waitq.c * * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2001-2003 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your @@ -16,6 +17,8 @@ #include #include "autofs_i.h" +static spinlock_t waitq_lock = SPIN_LOCK_UNLOCKED; + /* We make this a static variable rather than a part of the superblock; it is better if we don't reassign numbers easily even across filesystems */ static autofs_wqt_t autofs4_next_wait_queue = 1; @@ -37,7 +40,7 @@ void autofs4_catatonic_mode(struct autof wq->status = -ENOENT; /* Magic is gone - report failure */ kfree(wq->name); wq->name = NULL; - wake_up(&wq->queue); + wake_up_interruptible(&wq->queue); wq = nwq; } if (sbi->pipe) { @@ -90,7 +93,7 @@ static void autofs4_notify_daemon(struct union autofs_packet_union pkt; size_t pktsz; - DPRINTK(("autofs_notify: wait id = 0x%08lx, name = %.*s, type=%d\n", + DPRINTK(("autofs4_notify_daemon: wait id = 0x%08lx, name = %.*s, type=%d\n", wq->wait_queue_token, wq->len, wq->name, type)); memset(&pkt,0,sizeof pkt); /* For security reasons */ @@ -116,7 +119,7 @@ static void autofs4_notify_daemon(struct memcpy(ep->name, wq->name, wq->len); ep->name[wq->len] = '\0'; } else { - printk("autofs_notify_daemon: bad type %d!\n", type); + printk("autofs4_notify_daemon: bad type %d!\n", type); return; } @@ -124,62 +127,103 @@ static void autofs4_notify_daemon(struct autofs4_catatonic_mode(sbi); } -int autofs4_wait(struct autofs_sb_info *sbi, struct qstr *name, +static int autofs4_getpath(struct autofs_sb_info *sbi, + struct dentry *dentry, char **name) +{ + struct dentry *root = sbi->sb->s_root; + struct dentry *tmp; + char *buf = *name; + char *p; + int len = 0; + + spin_lock(&dcache_lock); + for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) + len += tmp->d_name.len + 1; + + if (--len > NAME_MAX) { + spin_unlock(&dcache_lock); + return 0; + } + + *(buf + len) = '\0'; + p = buf + len - dentry->d_name.len; + strncpy(p, dentry->d_name.name, dentry->d_name.len); + + for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) { + *(--p) = '/'; + p -= tmp->d_name.len; + 
strncpy(p, tmp->d_name.name, tmp->d_name.len); + } + spin_unlock(&dcache_lock); + + return len; +} + +int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, enum autofs_notify notify) { struct autofs_wait_queue *wq; - int status; + char *name; + int len, status; /* In catatonic mode, we don't wait for nobody */ if ( sbi->catatonic ) return -ENOENT; - /* We shouldn't be able to get here, but just in case */ - if ( name->len > NAME_MAX ) + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + + len = autofs4_getpath(sbi, dentry, &name); + if (!len) { + kfree(name); return -ENOENT; + } - for ( wq = sbi->queues ; wq ; wq = wq->next ) { - if ( wq->hash == name->hash && - wq->len == name->len && - wq->name && !memcmp(wq->name,name->name,name->len) ) + spin_lock(&waitq_lock); + for (wq = sbi->queues ; wq ; wq = wq->next) { + if (wq->hash == dentry->d_name.hash && + wq->len == len && + wq->name && !memcmp(wq->name, name, len)) break; } + spin_unlock(&waitq_lock); if ( !wq ) { /* Create a new wait queue */ wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); - if ( !wq ) - return -ENOMEM; - - wq->name = kmalloc(name->len,GFP_KERNEL); - if ( !wq->name ) { - kfree(wq); + if ( !wq ) { + kfree(name); return -ENOMEM; } + + spin_lock(&waitq_lock); wq->wait_queue_token = autofs4_next_wait_queue; if (++autofs4_next_wait_queue == 0) autofs4_next_wait_queue = 1; - init_waitqueue_head(&wq->queue); - wq->hash = name->hash; - wq->len = name->len; - wq->status = -EINTR; /* Status return if interrupted */ - memcpy(wq->name, name->name, name->len); wq->next = sbi->queues; sbi->queues = wq; + spin_unlock(&waitq_lock); + init_waitqueue_head(&wq->queue); + wq->hash = dentry->d_name.hash; + wq->name = name; + wq->len = len; + wq->status = -EINTR; /* Status return if interrupted */ - DPRINTK(("autofs_wait: new wait id = 0x%08lx, name = %.*s, nfy=%d\n", - wq->wait_queue_token, wq->len, wq->name, notify)); + DPRINTK(("autofs4_wait: new wait id = 0x%08lx, name = %.*s, nfy=%d\n", + (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify)); /* autofs4_notify_daemon() may block */ - wq->wait_ctr = 2; + atomic_set(&wq->wait_ctr, 2); if (notify != NFY_NONE) { autofs4_notify_daemon(sbi,wq, - notify == NFY_MOUNT ? autofs_ptype_missing : - autofs_ptype_expire_multi); + notify == NFY_MOUNT ? + autofs_ptype_missing : + autofs_ptype_expire_multi); } } else { - wq->wait_ctr++; - DPRINTK(("autofs_wait: existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", - wq->wait_queue_token, wq->len, wq->name, notify)); + atomic_inc(&wq->wait_ctr); + DPRINTK(("autofs4_wait: existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", + (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify)); } /* wq->name is NULL if and only if the lock is already released */ @@ -204,19 +248,19 @@ int autofs4_wait(struct autofs_sb_info * recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); - interruptible_sleep_on(&wq->queue); + wait_event_interruptible(wq->queue, wq->name == NULL); spin_lock_irqsave(¤t->sighand->siglock, irqflags); current->blocked = oldset; recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); } else { - DPRINTK(("autofs_wait: skipped sleeping\n")); + DPRINTK(("autofs4_wait: skipped sleeping\n")); } status = wq->status; - if (--wq->wait_ctr == 0) /* Are we the last process to need status? */ + if (atomic_dec_and_test(&wq->wait_ctr)) /* Are we the last process to need status? 
*/ kfree(wq); return status; @@ -227,23 +271,28 @@ int autofs4_wait_release(struct autofs_s { struct autofs_wait_queue *wq, **wql; + spin_lock(&waitq_lock); for ( wql = &sbi->queues ; (wq = *wql) ; wql = &wq->next ) { if ( wq->wait_queue_token == wait_queue_token ) break; } - if ( !wq ) + + if ( !wq ) { + spin_unlock(&waitq_lock); return -EINVAL; + } *wql = wq->next; /* Unlink from chain */ + spin_unlock(&waitq_lock); kfree(wq->name); wq->name = NULL; /* Do not wait on this queue */ wq->status = status; - if (--wq->wait_ctr == 0) /* Is anyone still waiting for this guy? */ + if (atomic_dec_and_test(&wq->wait_ctr)) /* Is anyone still waiting for this guy? */ kfree(wq); else - wake_up(&wq->queue); + wake_up_interruptible(&wq->queue); return 0; } --- diff/fs/binfmt_aout.c 2004-04-21 10:45:35.163357856 +0100 +++ source/fs/binfmt_aout.c 2004-04-21 10:46:51.658728792 +0100 @@ -27,7 +27,6 @@ #include #include -#include #include static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); --- diff/fs/binfmt_elf.c 2004-04-21 10:45:35.164357704 +0100 +++ source/fs/binfmt_elf.c 2004-04-21 10:46:51.659728640 +0100 @@ -40,7 +40,6 @@ #include #include -#include #include --- diff/fs/binfmt_flat.c 2004-03-11 10:20:28.000000000 +0000 +++ source/fs/binfmt_flat.c 2004-04-21 10:46:51.660728488 +0100 @@ -40,7 +40,6 @@ #include #include #include -#include #include #include --- diff/fs/binfmt_misc.c 2004-04-21 10:45:35.165357552 +0100 +++ source/fs/binfmt_misc.c 2004-04-21 10:46:51.661728336 +0100 @@ -39,6 +39,8 @@ static int enabled = 1; enum {Enabled, Magic}; #define MISC_FMT_PRESERVE_ARGV0 (1<<31) +#define MISC_FMT_OPEN_BINARY (1<<30) +#define MISC_FMT_CREDENTIALS (1<<29) typedef struct { struct list_head list; @@ -102,10 +104,15 @@ static Node *check_file(struct linux_bin static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) { Node *fmt; - struct file * file; + struct file * interp_file = NULL; + struct file * binary_file = NULL; char iname[BINPRM_BUF_SIZE]; char *iname_addr = iname; int retval; + int fd_binary = -1; + char fd_str[32]; + char * fdsp = fd_str; + int is_open_bin; retval = -ENOEXEC; if (!enabled) @@ -120,33 +127,105 @@ static int load_misc_binary(struct linux if (!fmt) goto _ret; - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; + is_open_bin = (fmt->flags & MISC_FMT_OPEN_BINARY) ? 
1 : 0; + + if (is_open_bin) { + /* if the binary should be opened on behalf of the + * interpreter then keep it open and assign descriptor + * to it */ + fd_binary = get_unused_fd (); + if (fd_binary < 0) { + retval = fd_binary; + goto _ret; + } + snprintf (fd_str, sizeof(fd_str) - 1, "%d", fd_binary); + } else { + allow_write_access (bprm->file); + fput (bprm->file); + bprm->file = NULL; + } /* Build args for interpreter */ if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { remove_arg_zero(bprm); } - retval = copy_strings_kernel(1, &bprm->interp, bprm); - if (retval < 0) goto _ret; - bprm->argc++; - retval = copy_strings_kernel(1, &iname_addr, bprm); - if (retval < 0) goto _ret; - bprm->argc++; + + if (is_open_bin) { + /* make argv[1] be the file descriptor of the binary */ + retval = copy_strings_kernel (1, &fdsp, bprm); + } else { + /* make argv[1] be the path to the binary */ + retval = copy_strings_kernel (1, &bprm->interp, bprm); + } + if (retval < 0) + goto _error; + bprm->argc ++; + retval = copy_strings_kernel (1, &iname_addr, bprm); + if (retval < 0) + goto _error; + bprm->argc ++; bprm->interp = iname; /* for binfmt_script */ - file = open_exec(iname); - retval = PTR_ERR(file); - if (IS_ERR(file)) - goto _ret; - bprm->file = file; + interp_file = open_exec (iname); + retval = PTR_ERR (interp_file); + if (IS_ERR (interp_file)) + goto _error; + + + binary_file = bprm->file; + if (fmt->flags & MISC_FMT_CREDENTIALS) { + /* + * Call prepare_binprm before switching to interpreter's file + * so that all security calculation will be done according to + * binary and not interpreter + */ + retval = prepare_binprm(bprm); + if (retval < 0) + goto _error; + bprm->file = interp_file; + memset(bprm->buf, 0, BINPRM_BUF_SIZE); + retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); + } else { + bprm->file = interp_file; + retval = prepare_binprm (bprm); + } + + if (retval < 0) + goto _error; + + if (is_open_bin) { + /* if the binary is not readable then enforce mm->dumpable=0 + regardless of the interpreter's permissions */ + if (permission (binary_file->f_dentry->d_inode, MAY_READ, NULL)) { + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + } + /* install the binary's fd. it is done at the latest possible point + * because once it is installed it will need to be sys_close()ed + * in case of error. + */ + fd_install (fd_binary, binary_file); + } + + retval = search_binary_handler (bprm, regs); + + if (retval < 0) + goto _error_close_file; - retval = prepare_binprm(bprm); - if (retval >= 0) - retval = search_binary_handler(bprm, regs); _ret: return retval; + +_error_close_file: + if (fd_binary > 0) { + sys_close (fd_binary); + fd_binary = -1; + bprm->file = NULL; + } +_error: + if (fd_binary > 0) + put_unused_fd (fd_binary); + bprm->interp_flags = 0; + goto _ret; + } /* Command parsers */ @@ -191,6 +270,36 @@ static int unquote(char *from) return p - from; } +static inline char * check_special_flags (char * sfs, Node * e) +{ + char * p = sfs; + int cont = 1; + + /* special flags */ + while (cont) { + switch (*p) { + case 'P': + p++; + e->flags |= MISC_FMT_PRESERVE_ARGV0; + break; + case 'O': + p++; + e->flags |= MISC_FMT_OPEN_BINARY; + break; + case 'C': + p++; + /* this flag also implies the + open-binary flag */ + e->flags |= (MISC_FMT_CREDENTIALS | + MISC_FMT_OPEN_BINARY); + break; + default: + cont = 0; + } + } + + return p; +} /* * This registers a new binary format, it recognises the syntax * ':name:type:offset:magic:mask:interpreter:' @@ -293,10 +402,8 @@ static Node *create_entry(const char *bu if (!e->interpreter[0]) goto Einval; - if (*p == 'P') { - p++; - e->flags |= MISC_FMT_PRESERVE_ARGV0; - } + + p = check_special_flags (p, e); if (*p == '\n') p++; @@ -346,6 +453,7 @@ static void entry_status(Node *e, char * { char *dp; char *status = "disabled"; + const char * flags = "flags: "; if (test_bit(Enabled, &e->flags)) status = "enabled"; @@ -357,6 +465,22 @@ static void entry_status(Node *e, char * sprintf(page, "%s\ninterpreter %s\n", status, e->interpreter); dp = page + strlen(page); + + /* print the special flags */ + sprintf (dp, "%s", flags); + dp += strlen (flags); + if (e->flags & MISC_FMT_PRESERVE_ARGV0) { + *dp ++ = 'P'; + } + if (e->flags & MISC_FMT_OPEN_BINARY) { + *dp ++ = 'O'; + } + if (e->flags & MISC_FMT_CREDENTIALS) { + *dp ++ = 'C'; + } + *dp ++ = '\n'; + + if (!test_bit(Magic, &e->flags)) { sprintf(dp, "extension .%s\n", e->magic); } else { --- diff/fs/bio.c 2004-04-05 12:57:08.000000000 +0100 +++ source/fs/bio.c 2004-04-21 10:46:51.661728336 +0100 @@ -808,9 +808,7 @@ static void __init biovec_init_pools(voi size = bp->nr_vecs * sizeof(struct bio_vec); bp->slab = kmem_cache_create(bp->name, size, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!bp->slab) - panic("biovec: can't init slab cache\n"); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); if (i >= scale) pool_entries >>= 1; @@ -825,16 +823,16 @@ static void __init biovec_init_pools(voi static int __init init_bio(void) { bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!bio_slab) - panic("bio: can't create slab cache\n"); - bio_pool = mempool_create(BIO_POOL_SIZE, mempool_alloc_slab, mempool_free_slab, bio_slab); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + bio_pool = mempool_create(BIO_POOL_SIZE, mempool_alloc_slab, + mempool_free_slab, bio_slab); if (!bio_pool) panic("bio: can't create mempool\n"); biovec_init_pools(); - bio_split_pool = mempool_create(BIO_SPLIT_ENTRIES, bio_pair_alloc, bio_pair_free, NULL); + bio_split_pool = mempool_create(BIO_SPLIT_ENTRIES, + bio_pair_alloc, bio_pair_free, NULL); if (!bio_split_pool) panic("bio: can't create split pool\n"); --- diff/fs/block_dev.c 2004-04-21 10:45:35.166357400 +0100 +++ source/fs/block_dev.c 2004-04-21 10:46:51.662728184 +0100 @@ -251,6 +251,7 @@ static void init_once(void * foo, kmem_c { 
memset(bdev, 0, sizeof(*bdev)); sema_init(&bdev->bd_sem, 1); + sema_init(&bdev->bd_mount_sem, 1); INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); inode_init_once(&ei->vfs_inode); @@ -302,14 +303,9 @@ struct super_block *blockdev_superblock; void __init bdev_cache_init(void) { int err; - bdev_cachep = kmem_cache_create("bdev_cache", - sizeof(struct bdev_inode), - 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, - init_once, - NULL); - if (!bdev_cachep) - panic("Cannot create bdev_cache SLAB cache"); + bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), + 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_PANIC, + init_once, NULL); err = register_filesystem(&bd_type); if (err) panic("Cannot register bdev pseudo-fs"); --- diff/fs/buffer.c 2004-04-21 10:45:35.168357096 +0100 +++ source/fs/buffer.c 2004-04-21 10:46:51.663728032 +0100 @@ -227,6 +227,77 @@ int fsync_bdev(struct block_device *bdev return sync_blockdev(bdev); } +/** + * freeze_bdev -- lock a filesystem and force it into a consistent state + * @bdev: blockdevice to lock + * + * This takes the block device bd_mount_sem to make sure no new mounts + * happen on bdev until thaw_bdev() is called. + * If a superblock is found on this device, we take the s_umount semaphore + * on it to make sure nobody unmounts until the snapshot creation is done. + */ +struct super_block *freeze_bdev(struct block_device *bdev) +{ + struct super_block *sb; + + down(&bdev->bd_mount_sem); + sb = get_super(bdev); + if (sb && !(sb->s_flags & MS_RDONLY)) { + sb->s_frozen = SB_FREEZE_WRITE; + wmb(); + + sync_inodes_sb(sb, 0); + DQUOT_SYNC(sb); + + lock_super(sb); + if (sb->s_dirt && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + + sync_blockdev(sb->s_bdev); + sync_inodes_sb(sb, 1); + + sb->s_frozen = SB_FREEZE_TRANS; + wmb(); + + sync_blockdev(sb->s_bdev); + + if (sb->s_op->write_super_lockfs) + sb->s_op->write_super_lockfs(sb); + } + + sync_blockdev(bdev); + return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ +} +EXPORT_SYMBOL(freeze_bdev); + +/** + * thaw_bdev -- unlock filesystem + * @bdev: blockdevice to unlock + * @sb: associated superblock + * + * Unlocks the filesystem and marks it writeable again after freeze_bdev(). + */ +void thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + if (sb) { + BUG_ON(sb->s_bdev != bdev); + + if (sb->s_op->unlockfs) + sb->s_op->unlockfs(sb); + sb->s_frozen = SB_UNFROZEN; + wmb(); + wake_up(&sb->s_wait_unfrozen); + drop_super(sb); + } + + up(&bdev->bd_mount_sem); +} +EXPORT_SYMBOL(thaw_bdev); + /* * sync everything. Start out by waking pdflush, because that writes back * all queues in parallel. 
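
The freeze_bdev()/thaw_bdev() pair added above is intended for snapshot-style users that need to see a quiescent, consistent filesystem. As a minimal sketch of a hypothetical in-kernel caller (the snapshot_locked() wrapper, its take_snapshot() callback and the headers chosen here are illustrative assumptions, not part of this patch):

	#include <linux/fs.h>
	#include <linux/buffer_head.h>

	/* Quiesce the filesystem on bdev, run a driver-supplied copy step,
	 * then let writes continue.  freeze_bdev() blocks new writers and
	 * syncs the filesystem; thaw_bdev() accepts a NULL sb for the case
	 * where nothing was mounted at freeze time. */
	static int snapshot_locked(struct block_device *bdev,
				   int (*take_snapshot)(struct block_device *))
	{
		struct super_block *sb;
		int err;

		sb = freeze_bdev(bdev);
		err = take_snapshot(bdev);
		thaw_bdev(bdev, sb);
		return err;
	}
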
@@ -1755,7 +1826,7 @@ static int __block_write_full_page(struc if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { lock_buffer(bh); } else if (test_set_buffer_locked(bh)) { - __set_page_dirty_nobuffers(page); + redirty_page_for_writepage(wbc, page); continue; } if (test_clear_buffer_dirty(bh)) { @@ -2964,7 +3035,7 @@ void __init buffer_init(void) bh_cachep = kmem_cache_create("buffer_head", sizeof(struct buffer_head), 0, - 0, init_buffer_head, NULL); + SLAB_PANIC, init_buffer_head, NULL); for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++) init_waitqueue_head(&bh_wait_queue_heads[i].wqh); --- diff/fs/cifs/AUTHORS 2003-10-27 09:20:44.000000000 +0000 +++ source/fs/cifs/AUTHORS 2004-04-21 10:46:51.663728032 +0100 @@ -26,5 +26,6 @@ Test case and Bug Report contributors ------------------------------------- Thanks to those in the community who have submitted detailed bug reports and debug of problems they have found: Jochen Dolze, David Blaine, -Rene Scharfe, Martin Josefsson, Alexander Wild and others. +Rene Scharfe, Martin Josefsson, Alexander Wild, Anthony Liguori, +Urban Widmark, Massimiliano Ferrero, Howard Owen and others. --- diff/fs/cifs/CHANGES 2003-10-27 09:20:44.000000000 +0000 +++ source/fs/cifs/CHANGES 2004-04-21 10:46:51.664727880 +0100 @@ -1,3 +1,101 @@ +Version 1.09 +------------ +Fix /proc/fs module unload warning message (that could be logged +to the kernel log). Fix intermittent failure in connectathon +test7 (hardlink count not immediately refreshed in case in which +inode metadata can be incorrectly kept cached when time near zero) + +Version 1.08 +------------ +Allow file_mode and dir_mode (specified at mount time) to be enforced +locally (the server already enforced its own ACLs too) for servers +that do not report the correct mode (do not support the +CIFS Unix Extensions). + +Version 1.07 +------------ +Fix some small memory leaks in some unmount error paths. Fix major leak +of cache pages in readpages causing multiple read oriented stress +testcases (including fsx, and even large file copy) to fail over time. + +Version 1.06 +------------ +Send NTCreateX with ATTR_POSIX if Linux/Unix extensions negotiated with server. +This allows files that differ only in case and improves performance of file +creation and file open to such servers. Fix semaphore conflict which causes +slow delete of open file to Samba (which unfortunately can cause an oplock +break to self while vfs_unlink held i_sem) which can hang for 20 seconds. + +Version 1.05 +------------ +fixes to cifs_readpages for fsx test case + +Version 1.04 +------------ +Fix caching data integrity bug when extending file size especially when no +oplock on file. Fix spurious logging of valid already parsed mount options +that are parsed outside of the cifs vfs such as nosuid. + + +Version 1.03 +------------ +Connect to server when port number override not specified, and tcp port +uninitialized. Reset search to restart at correct file when kernel routine +filldir returns error during large directory searches (readdir). + +Version 1.02 +------------ +Fix caching problem when files opened by multiple clients in which +page cache could contain stale data, and write through did +not occur often enough while file was still open when read ahead +(read oplock) not allowed. Treat "sep=" when first mount option +as an override of comma as the default separator between mount +options. + +Version 1.01 +------------ +Allow passwords longer than 16 bytes. Allow null password string. + +Version 1.00 +------------ +Gracefully clean up failed mounts when attempting to mount to servers such as +Windows 98 that terminate tcp sessions during protocol negotiation. Handle +embedded commas in mount parsing of passwords. + +Version 0.99 +------------ +Invalidate local inode cached pages on oplock break and when last file +instance is closed so that the client does not continue using stale local +copy rather than later modified server copy of file. Do not reconnect +when server drops the tcp session prematurely before negotiate +protocol response. Fix oops in reopen_file when dentry freed. Allow +the support for CIFS Unix Extensions to be disabled via proc interface. + +Version 0.98 +------------ +Fix hang in commit_write during reconnection of open files under heavy load. +Fix unload_nls oops in a mount failure path. Serialize writes to same socket +which also fixes any possible races when cifs signatures are enabled in SMBs +being sent out of signature sequence number order. + +Version 0.97 +------------ +Fix byte range locking bug (endian problem) causing bad offset and +length. + +Version 0.96 +------------ +Fix oops (in send_sig) caused by CIFS unmount code trying to +wake up the demultiplex thread after it had exited. Do not log +error on harmless oplock release of closed handle. + +Version 0.95 +------------ +Fix unsafe global variable usage and password hash failure on gcc 3.3.1 +Fix problem reconnecting secondary mounts to same server after session +failure. Fix invalid dentry - race in mkdir when directory gets created +by another client between the lookup and mkdir. + Version 0.94 ------------ Fix to list processing in reopen_files. Fix reconnection when server hung --- diff/fs/cifs/Makefile 2003-07-11 09:39:50.000000000 +0100 +++ source/fs/cifs/Makefile 2004-04-21 10:46:51.681725296 +0100 @@ -3,4 +3,4 @@ # obj-$(CONFIG_CIFS) += cifs.o -cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o +cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o --- diff/fs/cifs/README 2003-10-27 09:20:44.000000000 +0000 +++ source/fs/cifs/README 2004-04-21 10:46:51.684724840 +0100 @@ -1,30 +1,30 @@ -The CIFS VFS support for Linux supports many advanced network filesystem -features such as heirarchical dfs like namespace, hardlinks, locking and more. -It was designed to comply with the SNIA CIFS Technical Reference (which supersedes -the 1992 X/Open SMB Standard) as well as to perform best practice practical -interoperability with Windows 2000, Windows XP, Samba and equivalent +The CIFS VFS support for Linux supports many advanced network filesystem +features such as hierarchical dfs like namespace, hardlinks, locking and more. +It was designed to comply with the SNIA CIFS Technical Reference (which +supersedes the 1992 X/Open SMB Standard) as well as to perform best practice +practical interoperability with Windows 2000, Windows XP, Samba and equivalent servers. 
-For questions or bug reports please contact sfrench@samba.org (sfrench@us.ibm.com) +For questions or bug reports please contact: + sfrench@samba.org (sfrench@us.ibm.com) Build instructions: ================== For Linux 2.4: -1a) Get the linux kernel source with cifs vfs already in it -from bitkeeper via bk://cifs.bkbits.net/linux-2.4 -or -1b) Get the kernel source (e.g.from http://www.kernel.org) +1) Get the kernel source (e.g.from http://www.kernel.org) and download the cifs vfs source (see the project page at http://us1.samba.org/samba/Linux_CIFS_client.html) and change directory into the top of the kernel directory then patch the kernel (e.g. "patch -p1 < cifs_24.patch") to add the cifs vfs to your kernel configure options if it has not already been added (e.g. current SuSE and UL -users do not need to do not need that patch since the cifs vfs is +users do not need to apply the cifs_24.patch since the cifs vfs is already in the kernel configure menu) and then mkdir linux/fs/cifs and then copy the current cifs vfs files from the cifs download to your kernel build directory e.g. + cp /fs/cifs/* to /fs/cifs + 2) make menuconfig (or make xconfig) 3) select cifs from within the network filesystem choices 4) save and exit @@ -53,56 +53,105 @@ would simply type "make install"). If you do not have the utility mount.cifs (in the Samba 3.0 source tree and on the CIFS VFS web site) copy it to the same directory in which mount.smbfs and -similar files reside (usually /sbin). Although the helper software is required, -mount.cifs is recommended. Eventually the Samba 3.0 utility program "net" -may also be helpful since it may someday provide easier mount syntax for users used -to Windows e.g. - net use -Note that running Winbind on all of your Linux clients is useful in -in mapping Uids and Gids consistently to the proper network user. - -Samba Considerations -==================== -To get the maximum benefit from the CIFS VFS, we recommend using a server that -supports the SNIA CIFS Unix Extensions standard (e.g. Samba 2.2.5 or later or -Samba 3.0) but the CIFS vfs works fine with a wide variety of CIFS servers. -Note that uid, gid and file permissions will display default values if you do -not have a server that supports the Unix extensions for CIFS (such as Samba 2.2.3 or -later). To enable the Unix CIFS Extensions in the Samba server, add the line: +similar files reside (usually /sbin). Although the helper software is not +required, mount.cifs is recommended. Eventually the Samba 3.0 utility program +"net" may also be helpful since it may someday provide easier mount syntax for +users who are used to Windows e.g. net use +Note that running the Winbind pam/nss module (logon service) on all of your +Linux clients is useful in mapping Uids and Gids consistently across the +domain to the proper network user. The mount.cifs mount helper can be +trivially built from Samba 3.0 or later source e.g. by executing: + + gcc samba/source/client/mount.cifs.c -o mount.cifs + +Note that when the mount.cifs utility is run suid (allowing user mounts), +in order to reduce risks, the "nosuid" mount flag is passed in on mount to +disallow execution of an suid program mounted on the remote target. +When mount is executed as root, nosuid is not passed in by default, +and execution of suid programs on the remote target would be enabled +by default. This can be changed, as with nfs and other filesystems, +by simply specifying "nosuid" among the mount options. 
For user mounts, +though, to be able to pass the suid flag to mount requires rebuilding +mount.cifs with the following flag: + + gcc samba/source/client/mount.cifs.c -DCIFS_ALLOW_USR_SUID -o mount.cifs + +There is a corresponding manual page for cifs mounting in the Samba 3.0 and +later source tree in docs/manpages/mount.cifs.8 + +Samba Considerations +==================== +To get the maximum benefit from the CIFS VFS, we recommend using a server that +supports the SNIA CIFS Unix Extensions standard (e.g. Samba 2.2.5 or later or +Samba 3.0) but the CIFS vfs works fine with a wide variety of CIFS servers. +Note that uid, gid and file permissions will display default values if you do +not have a server that supports the Unix extensions for CIFS (such as Samba +2.2.5 or later). To enable the Unix CIFS Extensions in the Samba server, add +the line: + unix extensions = yes -to your smb.conf file on the server. Note that the following smb.conf settings are -also useful (on the Samba server) when the majority of clients are Unix -or Linux: + +to your smb.conf file on the server. Note that the following smb.conf settings +are also useful (on the Samba server) when the majority of clients are Unix or +Linux: + case sensitive = yes - delete readonly = yes -Some administrators also change the "map archive" and the "create mask" parameters -from their default values. Creating special devices (mknod) remotely may require -specifying a mkdev function to Samba. For more information on these see the manual -pages ("man smb.conf") on the Samba server system. Note that the cifs vfs, unlike the -smbfs vfs, does not read the smb.conf on the client system (the few optional settings -are passed in on mount via -o parameters instead). Note that Samba 2.2.7 or later -includes a fix that allows the CIFS VFS to delete open files (required for strict -POSIX compliance). Windows Servers already supported this feature. + delete readonly = yes + +Some administrators also change the "map archive" and the "create mask" +parameters from their default values. Creating special devices (mknod) remotely +may require specifying a mkdev function to Samba. For more information on these +see the manual pages ("man smb.conf") on the Samba server system. Note that the +cifs vfs, unlike the smbfs vfs, does not read the smb.conf on the client system +(the few optional settings are passed in on mount via -o parameters instead). +Note that Samba 2.2.7 or later includes a fix that allows the CIFS VFS to delete +open files (required for strict POSIX compliance). Windows Servers already +supported this feature. Use instructions: ================ -Once the CIFS VFS support is built into the kernel or installed as a module (cifs.o), -you can use mount syntax like the following to access Samba or Windows servers: +Once the CIFS VFS support is built into the kernel or installed as a module +(cifs.o), you can use mount syntax like the following to access Samba or Windows +servers: + mount -t cifs //9.53.216.11/e$ /mnt -o user=myname,pass=mypassword -after -o the following cifs vfs specific options are supported: + +Before -o the option -v may be specified to make the mount.cifs +mount helper display the mount steps more verbosely. +After -o the following commonly used cifs vfs specific options +are supported: + user= pass= domain= -TCP names (in addition to ip addresses) will be available when the mount helper -(mount.cifs) is complete + +Other cifs mount options are described below. Use of TCP names (in addition to +ip addresses) is available if the mount helper (mount.cifs) is installed. If +you do not trust the server to which you are mounted, or if you do not have +cifs signing enabled (and the physical network is insecure), consider use +of the standard mount options "noexec" and "nosuid" to reduce the risk of +running an altered binary on your local system (downloaded from a hostile server +or altered by a hostile router). + +When using the mount helper mount.cifs, passwords may be specified via alternate +mechanisms, instead of specifying them after -o using the normal "pass=" syntax +on the command line: +1) By including it in a credential file. Specify credentials=filename as one +of the mount options. Credential files contain two lines + username=someuser + password=your_password +2) By specifying the password in the PASSWD environment variable (similarly +the user name can be taken from the USER environment variable). + +If no password is provided, mount.cifs will prompt for password entry. Restrictions ============ -Servers must support the NTLM SMB dialect (which is the most recent, supported by Samba -and Windows NT, 2000 and XP and many other SMB/CIFS servers) and servers must support -either "pure-TCP" (port 445 TCP/IP CIFS connections) or RFC 1001/1002 support for -"Netbios-Over-TCP/IP." Neither of these is likely to be a problem as most servers -support this. IPv6 support is planned for the future. +Servers must support the NTLM SMB dialect (which is the most recent, supported +by Samba and Windows NT, 2000 and XP and many other SMB/CIFS servers) and +servers must support either "pure-TCP" (port 445 TCP/IP CIFS connections) or RFC +1001/1002 support for "Netbios-Over-TCP/IP." Neither of these is likely to be a +problem as most servers support this. IPv6 support is planned for the future. CIFS VFS Mount Options ====================== @@ -141,54 +190,91 @@ A partial list of the supported mount op ro mount network share read-only version used to distinguish different versions of the mount helper utility (not typically needed) + sep if first mount option (after the -o), overrides + the comma as the separator between the mount + parms. e.g. + -o user=myname,password=mypassword,domain=mydom + could be passed instead with period as the separator by + -o sep=.user=myname.password=mypassword.domain=mydom + this might be useful when comma is contained within username + or password or domain. This option is less important + when the cifs mount helper mount.cifs (version 1.1 or later) + is used. + nosuid Do not allow remote executables with the suid bit + set to be executed. This is only meaningful for mounts + to servers such as Samba which support the CIFS Unix Extensions. + If you do not trust the servers in your network (your mount + targets) it is recommended that you specify this option for + greater security. + suid Allow remote files on this mountpoint with suid enabled to + be executed (default for mounts when executed as root, + nosuid is default for user mounts). + credentials (only valid when the cifs mount helper, mount.cifs, is + installed). Specifies the name of the credential file which + will be read to obtain the userid and password. 
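
To make the option syntax above concrete, the sketch below mounts a share by calling mount(2) directly rather than through the mount.cifs helper. The server name, address, share and credentials are placeholders, and the assumption that a raw kernel mount wants unc= and ip= in the option string (details mount.cifs normally fills in) should be checked against the helper source:

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* ip= and unc= are normally assembled by mount.cifs;
		 * all values here are placeholders */
		const char *opts = "unc=\\\\server\\share,ip=192.168.1.10,"
				   "user=myname,pass=mypassword,domain=mydom";

		if (mount("//server/share", "/mnt/cifs", "cifs",
			  MS_NOSUID | MS_NODEV, opts)) {
			perror("mount");
			return 1;
		}
		return 0;
	}
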
Misc /proc/fs/cifs Flags and Debug Info ======================================= Informational pseudo-files: - DebugData Displays information about active CIFS sessions - SimultaneousOps Counter which holds maximum number of +DebugData Displays information about active CIFS sessions +SimultaneousOps Counter which holds maximum number of simultaneous outstanding SMB/CIFS requests. - Stats Lists summary resource usage information +Stats Lists summary resource usage information Configuration pseudo-files: - MultiuserMount If set to one, more than one CIFS session to +MultiuserMount If set to one, more than one CIFS session to the same server ip address can be established if more than one uid accesses the same mount point and if the uids user/password mapping information is available. (default is 0) - PacketSigningEnabled If set to one, cifs packet signing is enabled +PacketSigningEnabled If set to one, cifs packet signing is enabled and will be used if the server requires it. If set to two, cifs packet signing is required even if the server considers packet signing optional. (default 1) - cifsFYI If set to one, additional debug information is +cifsFYI If set to one, additional debug information is logged to the system error log. (default 0) - ExtendedSecurity If set to one, SPNEGO session establishment +ExtendedSecurity If set to one, SPNEGO session establishment is allowed which enables more advanced secure CIFS session establishment (default 0) - NTLMV2Enabled If set to one, more secure password hashes +NTLMV2Enabled If set to one, more secure password hashes are used when the server supports them and when kerberos is not negotiated (default 0) - traceSMB If set to one, debug information is logged to the +traceSMB If set to one, debug information is logged to the system error log with the start of smb requests and responses (default 0) - LookupCacheEnable If set to one, inode information is kept cached +LookupCacheEnable If set to one, inode information is kept cached for one second improving performance of lookups (default 1) - OplockEnabled If set to one, safe distributed caching enabled. +OplockEnabled If set to one, safe distributed caching enabled. (default 1) +LinuxExtensionsEnabled If set to one then the client will attempt to + use the CIFS "UNIX" extensions which are optional + protocol enhancements that allow CIFS servers + to return accurate UID/GID information as well + as support symbolic links. If you use servers + such as Samba that support the CIFS Unix + extensions but do not want to use symbolic link + support and want to map the uid and gid fields + to values supplied at mount (rather than the + actual values, then set this to zero. (default 1) + +These experimental features and tracing can be enabled by changing flags in +/proc/fs/cifs (after the cifs module has been installed or built into the +kernel, e.g. insmod cifs). To enable a feature set it to 1 e.g. to enable +tracing to the kernel message log type: -These experimental features and tracing can be enabled by changing flags in /proc/fs/cifs -(after the cifs module has been installed or built into the kernel, e.g. insmod cifs). -To enable a feature set it to 1 e.g. to enable tracing to the kernel message log -type: echo 1 > /proc/fs/cifs/cifsFYI + and for more extensive tracing including the start of smb requests and responses + echo 1 > /proc/fs/cifs/traceSMB -Also note that "cat /proc/fs/cifs/DebugData" will display some information about the -active sessions and the shares that are mounted. 
NTLMv2 enablement and packet -signing will not work since they the implementation is not quite complete. Do not enable -these flags unless you are doing specific testing. Enabling extended security works to -Windows 2000 Workstations and XP but not to Windows 2000 server or Samba since it does not -usually send "raw NTLMSSP" (instead it sends NTLMSSP encapsulated in SPNEGO/GSSAPI, which -support is not complete in the CIFS VFS yet). + +Also note that "cat /proc/fs/cifs/DebugData" will display some information about +the active sessions and the shares that are mounted. Note: NTLMv2 enablement +will not work since its implementation is not quite complete yet. +Do not alter these configuration values unless you are doing specific testing. +Enabling extended security works to Windows 2000 Workstations and XP but not to +Windows 2000 server or Samba since it does not usually send "raw NTLMSSP" +(instead it sends NTLMSSP encapsulated in SPNEGO/GSSAPI, which support is not +complete in the CIFS VFS yet). --- diff/fs/cifs/TODO 2003-08-20 14:16:13.000000000 +0100 +++ source/fs/cifs/TODO 2004-04-21 10:46:51.685724688 +0100 @@ -1,4 +1,4 @@ -version 0.8.1 July 4th, 2003 +version 1.0.2 January 29, 2004 A Partial List of Known Problems and Missing Features ===================================================== @@ -27,8 +27,8 @@ used (Kerberos or NTLMSSP). Signing alre f) Directory entry caching relies on a 1 second timer, rather than using FindNotify or equivalent. - (started) -g) There may be a few additional changes that could be done to take advantage -of recent 2.5 kernel improvements in byte-range locking +g) A few byte range testcases fail due to POSIX vs. Windows/CIFS +style byte range lock differences h) quota support @@ -36,8 +36,6 @@ i) support for the Linux 2.5 kernel new which will allow us to expose dos attributes as well as real ACLs -j) finish off the mount helper, mount.cifs - (started) - k) finish writepages support (multi-page write behind for improved performance) and syncpage @@ -56,17 +54,15 @@ p) Improve performance of readpages by s at a time when 8 pages or more are requested. -KNOWN BUGS (updated July 4th, 2003) +KNOWN BUGS (updated January 30, 2004) ==================================== 1) existing symbolic links (Windows reparse points) are recognized but can not be created remotely. They are implemented for Samba and those that support the CIFS Unix extensions but Samba has a bug currently handling symlink text beginning with slash -2) delete of file with read-only attribute set will fail (may be ok) -3) mount helper syntax not quite matching man page -4) follow_link and readdir code does not follow dfs junctions +2) follow_link and readdir code does not follow dfs junctions but recognizes them -5) create of new files to FAT partitions on Windows servers can +3) create of new files to FAT partitions on Windows servers can succeed but still return access denied (appears to be Windows not client problem). NTFS partitions do not have this problem.
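As the README text above notes, the informational pseudo-files are read like any other proc file, for example:

	cat /proc/fs/cifs/DebugData
	cat /proc/fs/cifs/Stats

With the cifs_debug.c change that follows, Stats also reports how many sessions and shares were reconnected after a failure.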
--- diff/fs/cifs/cifs_debug.c 2003-10-27 09:20:44.000000000 +0000 +++ source/fs/cifs/cifs_debug.c 2004-04-21 10:46:51.665727728 +0100 @@ -88,9 +88,26 @@ cifs_debug_data_read(char *buf, char **b i, ses->serverName, ses->serverDomain, atomic_read(&ses->inUse), ses->serverOS, ses->serverNOS, ses->capabilities,ses->status,ses->server->tcpStatus); buf += length; - if(ses->server) + if(ses->server) { buf += sprintf(buf, "\n\tLocal Users To Same Server: %d SecMode: 0x%x", atomic_read(&ses->server->socketUseCount),ses->server->secMode); + + /* length = sprintf(buf, "\nMIDs: \n"); + buf += length; + + spin_lock(&GlobalMid_Lock); + list_for_each(tmp1, &ses->server->pending_mid_q) { + mid_entry = list_entry(tmp1, struct + mid_q_entry, + qhead); + if(mid_entry) { + length = sprintf(buf,"State: %d com: %d pid: %d tsk: %p\n",mid_entry->midState,mid_entry->command,mid_entry->pid,mid_entry->tsk); + buf += length; + } + } + spin_unlock(&GlobalMid_Lock); */ + } + } read_unlock(&GlobalSMBSeslock); sprintf(buf, "\n"); @@ -127,8 +144,10 @@ cifs_debug_data_read(char *buf, char **b buf += sprintf(buf, "\tDISCONNECTED "); } read_unlock(&GlobalSMBSeslock); + length = sprintf(buf, "\n"); buf += length; + *eof = 1; /* BB add code to dump additional info such as TCP session info now */ /* @@ -177,6 +196,9 @@ cifs_stats_read(char *buf, char **beginB item_length = sprintf(buf,"Active Operations (MIDs in use): %d\n",midCount.counter); length += item_length; + buf += item_length; + item_length = sprintf(buf,"%d sessions and %d shares reconnected after failure\n",tcpSesReconnectCount.counter,tconInfoReconnectCount.counter); + length += item_length; return length; } @@ -201,6 +223,8 @@ static read_proc_t packet_signing_enable static write_proc_t packet_signing_enabled_write; static read_proc_t quotaEnabled_read; static write_proc_t quotaEnabled_write; +static read_proc_t linuxExtensionsEnabled_read; +static write_proc_t linuxExtensionsEnabled_write; void cifs_proc_init(void) @@ -213,62 +237,67 @@ cifs_proc_init(void) proc_fs_cifs->owner = THIS_MODULE; create_proc_read_entry("DebugData", 0, proc_fs_cifs, - cifs_debug_data_read, 0); + cifs_debug_data_read, 0); create_proc_read_entry("SimultaneousOps", 0, proc_fs_cifs, - cifs_total_xid_read, 0); + cifs_total_xid_read, 0); create_proc_read_entry("Stats", 0, proc_fs_cifs, - cifs_stats_read, 0); + cifs_stats_read, 0); pde = create_proc_read_entry("cifsFYI", 0, proc_fs_cifs, - cifsFYI_read, 0); + cifsFYI_read, 0); if (pde) pde->write_proc = cifsFYI_write; pde = create_proc_read_entry("traceSMB", 0, proc_fs_cifs, - traceSMB_read, 0); + traceSMB_read, 0); if (pde) pde->write_proc = traceSMB_write; pde = create_proc_read_entry("OplockEnabled", 0, proc_fs_cifs, - oplockEnabled_read, 0); + oplockEnabled_read, 0); if (pde) pde->write_proc = oplockEnabled_write; - pde = create_proc_read_entry("QuotaEnabled", 0, proc_fs_cifs, - quotaEnabled_read, 0); - if (pde) - pde->write_proc = quotaEnabled_write; + pde = create_proc_read_entry("QuotaEnabled", 0, proc_fs_cifs, + quotaEnabled_read, 0); + if (pde) + pde->write_proc = quotaEnabled_write; + + pde = create_proc_read_entry("LinuxExtensionsEnabled", 0, proc_fs_cifs, + linuxExtensionsEnabled_read, 0); + if (pde) + pde->write_proc = linuxExtensionsEnabled_write; pde = create_proc_read_entry("MultiuserMount", 0, proc_fs_cifs, - multiuser_mount_read, 0); + multiuser_mount_read, 0); if (pde) pde->write_proc = multiuser_mount_write; pde = create_proc_read_entry("ExtendedSecurity", 0, proc_fs_cifs, - extended_security_read, 0); + 
extended_security_read, 0); if (pde) pde->write_proc = extended_security_write; pde = - create_proc_read_entry("LookupCacheEnable", 0, proc_fs_cifs, - lookupFlag_read, 0); + create_proc_read_entry("LookupCacheEnabled", 0, proc_fs_cifs, + lookupFlag_read, 0); if (pde) pde->write_proc = lookupFlag_write; pde = create_proc_read_entry("NTLMV2Enabled", 0, proc_fs_cifs, - ntlmv2_enabled_read, 0); + ntlmv2_enabled_read, 0); if (pde) pde->write_proc = ntlmv2_enabled_write; pde = create_proc_read_entry("PacketSigningEnabled", 0, proc_fs_cifs, - packet_signing_enabled_read, 0); + packet_signing_enabled_read, 0); if (pde) pde->write_proc = packet_signing_enabled_write; } @@ -281,14 +310,17 @@ cifs_proc_clean(void) remove_proc_entry("DebugData", proc_fs_cifs); remove_proc_entry("cifsFYI", proc_fs_cifs); - remove_proc_entry("TraceSMB", proc_fs_cifs); + remove_proc_entry("traceSMB", proc_fs_cifs); remove_proc_entry("SimultaneousOps", proc_fs_cifs); - remove_proc_entry("TotalOps", proc_fs_cifs); + remove_proc_entry("Stats", proc_fs_cifs); remove_proc_entry("MultiuserMount", proc_fs_cifs); remove_proc_entry("OplockEnabled", proc_fs_cifs); remove_proc_entry("NTLMV2Enabled",proc_fs_cifs); remove_proc_entry("ExtendedSecurity",proc_fs_cifs); remove_proc_entry("PacketSigningEnabled",proc_fs_cifs); + remove_proc_entry("LinuxExtensionsEnabled",proc_fs_cifs); + remove_proc_entry("QuotaEnabled",proc_fs_cifs); + remove_proc_entry("LookupCacheEnabled",proc_fs_cifs); remove_proc_entry("cifs", proc_root_fs); } @@ -410,6 +442,46 @@ quotaEnabled_write(struct file *file, co return count; } +static int +linuxExtensionsEnabled_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + len = sprintf(page, "%d\n", linuxExtEnabled); +/* could also check if quotas are enabled in kernel + as a whole first */ + len -= off; + *start = page + off; + + if (len > count) + len = count; + else + *eof = 1; + + if (len < 0) + len = 0; + + return len; +} +static int +linuxExtensionsEnabled_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + if (c == '0' || c == 'n' || c == 'N') + linuxExtEnabled = 0; + else if (c == '1' || c == 'y' || c == 'Y') + linuxExtEnabled = 1; + + return count; +} + static int lookupFlag_read(char *page, char **start, off_t off, --- diff/fs/cifs/cifs_unicode.c 2003-07-11 09:39:50.000000000 +0100 +++ source/fs/cifs/cifs_unicode.c 2004-04-21 10:46:51.672726664 +0100 @@ -25,25 +25,6 @@ #include "cifs_debug.h" /* - * NAME: toUpper() - * - * FUNCTION: Upper case ASCII string (in place) using the current codepage - * - */ - -void -toUpper(const struct nls_table *n, char *mixed_string) -{ - unsigned int i; - char temp; - - for (i = 0; i < strlen(mixed_string); i++) { - temp = mixed_string[i]; - mixed_string[i] = n->charset2upper[(int) temp]; - } -} - -/* * NAME: cifs_strfromUCS() * * FUNCTION: Convert little-endian unicode string to character string @@ -104,28 +85,3 @@ cifs_strtoUCS(wchar_t * to, const char * return i; } -/* - * NAME: get_UCSname2() - * - * FUNCTION: Allocate and translate to unicode string - * - */ -/*int -get_UCSname2(struct component_name *uniName, struct dentry *dentry, - struct nls_table *nls_tab) -{ - int length = dentry->d_name.len; - - if (length > 255) - return ENAMETOOLONG; - - uniName->name = kmalloc((length + 1) * sizeof (wchar_t), GFP_KERNEL); - - if (uniName->name == NULL) - return ENOSPC; - - uniName->namlen = cifs_strtoUCS(uniName->name, 
dentry->d_name.name, - length, nls_tab); - - return 0; -} */ --- diff/fs/cifs/cifs_unicode.h 2003-05-21 11:49:46.000000000 +0100 +++ source/fs/cifs/cifs_unicode.h 2004-04-21 10:46:51.672726664 +0100 @@ -58,26 +58,11 @@ extern signed char UniLowerTable[512]; extern struct UniCaseRange UniLowerRange[]; #endif /* UNIUPR_NOLOWER */ -/* - * directory entry argument - */ -struct component_name { - int namlen; - wchar_t *name; -}; - #ifdef __KERNEL__ int cifs_strfromUCS_le(char *, const wchar_t *, int, const struct nls_table *); int cifs_strtoUCS(wchar_t *, const char *, int, const struct nls_table *); - -int cifs_UCSname(struct component_name *, struct dentry *, - const struct nls_table *); - -void toUpper(const struct nls_table *, char *); #endif -#define free_UCSname(COMP) kfree((COMP)->name) - /* * UniStrcat: Concatenate the second string to the first * --- diff/fs/cifs/cifsencrypt.c 2003-10-27 09:20:44.000000000 +0000 +++ source/fs/cifs/cifsencrypt.c 2004-04-21 10:46:51.665727728 +0100 @@ -74,7 +74,7 @@ int cifs_sign_smb(struct smb_hdr * cifs_ rc = cifs_calculate_signature(cifs_pdu, ses->mac_signing_key,smb_signature); if(rc) - memset(cifs_pdu->Signature.SecuritySignature, 0, 8); + memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8); @@ -88,15 +88,15 @@ int cifs_verify_signature(struct smb_hdr char server_response_sig[8]; char what_we_think_sig_should_be[20]; - if((cifs_pdu == NULL) || (mac_key == NULL)) - return -EINVAL; + if((cifs_pdu == NULL) || (mac_key == NULL)) + return -EINVAL; if (cifs_pdu->Command == SMB_COM_NEGOTIATE) return 0; if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) { struct smb_com_lock_req * pSMB = (struct smb_com_lock_req *)cifs_pdu; - if(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE) + if(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE) return 0; } @@ -113,10 +113,10 @@ int cifs_verify_signature(struct smb_hdr its signature against what the server sent */ memcpy(server_response_sig,cifs_pdu->Signature.SecuritySignature,8); - cifs_pdu->Signature.Sequence.SequenceNumber = expected_sequence_number; - cifs_pdu->Signature.Sequence.Reserved = 0; + cifs_pdu->Signature.Sequence.SequenceNumber = expected_sequence_number; + cifs_pdu->Signature.Sequence.Reserved = 0; - rc = cifs_calculate_signature(cifs_pdu, mac_key, + rc = cifs_calculate_signature(cifs_pdu, mac_key, what_we_think_sig_should_be); if(rc) @@ -136,7 +136,7 @@ int cifs_verify_signature(struct smb_hdr int cifs_calculate_mac_key(char * key, const char * rn, const char * password) { char temp_key[16]; - if ((key == NULL) || (rn == NULL) || (password == NULL)) + if ((key == NULL) || (rn == NULL)) return -EINVAL; E_md4hash(password, temp_key); @@ -156,7 +156,7 @@ int CalcNTLMv2_partial_mac_key(struct ci if(ses) return -EINVAL; - E_md4hash(ses->password_with_pad, temp_hash); + E_md4hash(ses->password, temp_hash); hmac_md5_init_limK_to_64(temp_hash, 16, &ctx); user_name_len = strlen(ses->userName); @@ -165,22 +165,21 @@ int CalcNTLMv2_partial_mac_ci dom_name_len = strlen(ses->domainName); if(dom_name_len > MAX_USERNAME_SIZE) return -EINVAL; - - + ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL); - unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL); - + unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL); + + for(i=0;i<user_name_len;i++) ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]]; ucase_buf[i] = 0; - user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf, MAX_USERNAME_SIZE*2, nls_info); + user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf,
MAX_USERNAME_SIZE*2, nls_info); unicode_buf[user_name_len] = 0; user_name_len++; - for(i=0;i<dom_name_len;i++) - ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]]; - ucase_buf[i] = 0; - dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf, MAX_USERNAME_SIZE*2, nls_info); + for(i=0;i<dom_name_len;i++) + ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]]; + ucase_buf[i] = 0; + dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf, MAX_USERNAME_SIZE*2, nls_info); unicode_buf[user_name_len + dom_name_len] = 0; hmac_md5_update((const unsigned char *) unicode_buf, @@ -201,6 +200,5 @@ void CalcNTLMv2_response(const struct ci hmac_md5_update(ses->server->cryptKey,8,&context); /* hmac_md5_update(v2_session_response+16)client thing,8,&context); */ /* BB fix */ - hmac_md5_final(v2_session_response,&context); } --- diff/fs/cifs/cifsfs.c 2004-04-21 10:45:35.168357096 +0100 +++ source/fs/cifs/cifsfs.c 2004-04-21 10:46:51.666727576 +0100 @@ -1,7 +1,7 @@ /* * fs/cifs/cifsfs.c * - * Copyright (C) International Business Machines Corp., 2002,2003 + * Copyright (C) International Business Machines Corp., 2002,2004 * Author(s): Steve French (sfrench@us.ibm.com) * * Common Internet FileSystem (CIFS) client @@ -52,6 +52,7 @@ int cifsERROR = 1; int traceSMB = 0; unsigned int oplockEnabled = 1; unsigned int quotaEnabled = 0; +unsigned int linuxExtEnabled = 1; unsigned int lookupCacheEnabled = 1; unsigned int multiuser_mount = 0; unsigned int extended_security = 0; @@ -82,6 +83,9 @@ cifs_read_super(struct super_block *sb, cifs_sb = CIFS_SB(sb); if(cifs_sb == NULL) return -ENOMEM; + else + memset(cifs_sb,0,sizeof(struct cifs_sb_info)); + rc = cifs_mount(sb, cifs_sb, data, devname); @@ -123,14 +127,15 @@ out_no_root: iput(inode); out_mount_failed: - if(cifs_sb->local_nls) - unload_nls(cifs_sb->local_nls); - if(cifs_sb) + if(cifs_sb) { + if(cifs_sb->local_nls) + unload_nls(cifs_sb->local_nls); kfree(cifs_sb); + } return rc; } -void +static void cifs_put_super(struct super_block *sb) { int rc = 0; @@ -151,7 +156,7 @@ cifs_put_super(struct super_block *sb) return; } -int +static int cifs_statfs(struct super_block *sb, struct kstatfs *buf) { int xid, rc; @@ -186,8 +191,21 @@ cifs_statfs(struct super_block *sb, stru static int cifs_permission(struct inode * inode, int mask, struct nameidata *nd) { - /* the server does permission checks, we do not need to do it here */ - return 0; + struct cifs_sb_info *cifs_sb; + + cifs_sb = CIFS_SB(inode->i_sb); + + if (cifs_sb->tcon->ses->capabilities & CAP_UNIX) { + /* the server supports the Unix-like mode bits and does its + own permission checks, and therefore we do not allow the file + mode to be overriden on these mounts - so do not do perm + check on client side */ + return 0; + } else /* file mode might have been restricted at mount time + on the client (above and beyond ACL on servers) for + servers which do not support setting and viewing mode bits, + so allowing client to check permissions is useful */ + return vfs_permission(inode, mask); } static kmem_cache_t *cifs_inode_cachep; @@ -402,24 +420,58 @@ cifs_get_sb(struct file_system_type *fs_ return sb; } -ssize_t +static ssize_t cifs_read_wrapper(struct file * file, char *read_data, size_t read_size, loff_t * poffset) { - if(CIFS_I(file->f_dentry->d_inode)->clientCanCacheRead) + if(file == NULL) + return -EIO; + else if(file->f_dentry == NULL) + return -EIO; + else if(file->f_dentry->d_inode == NULL) + return -EIO; + + cFYI(1,("In read_wrapper size %zd at %lld",read_size,*poffset)); + if(CIFS_I(file->f_dentry->d_inode)->clientCanCacheRead) { return
generic_file_read(file,read_data,read_size,poffset); - else - return cifs_read(file,read_data,read_size,poffset); + } else { + /* BB do we need to lock inode from here until after invalidate? */ +/* if(file->f_dentry->d_inode->i_mapping) { + filemap_fdatawrite(file->f_dentry->d_inode->i_mapping); + filemap_fdatawait(file->f_dentry->d_inode->i_mapping); + }*/ +/* cifs_revalidate(file->f_dentry);*/ /* BB fixme */ + + /* BB we should make timer configurable - perhaps + by simply calling cifs_revalidate here */ + /* invalidate_remote_inode(file->f_dentry->d_inode);*/ + return generic_file_read(file,read_data,read_size,poffset); + } } -ssize_t +static ssize_t cifs_write_wrapper(struct file * file, const char *write_data, size_t write_size, loff_t * poffset) { - if(CIFS_I(file->f_dentry->d_inode)->clientCanCacheAll) /* check caching for write */ - return generic_file_write(file,write_data, write_size,poffset); - else - return cifs_write(file,write_data,write_size,poffset); + ssize_t written; + + if(file == NULL) + return -EIO; + else if(file->f_dentry == NULL) + return -EIO; + else if(file->f_dentry->d_inode == NULL) + return -EIO; + + cFYI(1,("In write_wrapper size %zd at %lld",write_size,*poffset)); + + /* check whether we can cache writes locally */ + written = generic_file_write(file,write_data,write_size,poffset); + if(!CIFS_I(file->f_dentry->d_inode)->clientCanCacheAll) { + if(file->f_dentry->d_inode->i_mapping) { + filemap_fdatawrite(file->f_dentry->d_inode->i_mapping); + } + } + return written; } @@ -476,8 +528,8 @@ struct inode_operations cifs_symlink_ino }; struct file_operations cifs_file_ops = { - .read = generic_file_read, - .write = generic_file_write, + .read = cifs_read_wrapper, + .write = cifs_write_wrapper, .open = cifs_open, .release = cifs_close, .lock = cifs_lock, @@ -485,12 +537,18 @@ struct file_operations cifs_file_ops = { .flush = cifs_flush, .mmap = cifs_file_mmap, .sendfile = generic_file_sendfile, +#ifdef CIFS_FCNTL + .fcntl = cifs_fcntl, +#endif }; struct file_operations cifs_dir_ops = { .readdir = cifs_readdir, .release = cifs_closedir, .read = generic_read_dir, +#ifdef CIFS_FCNTL + .fcntl = cifs_fcntl, +#endif }; static void @@ -505,7 +563,7 @@ cifs_init_once(void *inode, kmem_cache_t } } -int +static int cifs_init_inodecache(void) { cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", @@ -518,14 +576,14 @@ cifs_init_inodecache(void) return 0; } -void +static void cifs_destroy_inodecache(void) { if (kmem_cache_destroy(cifs_inode_cachep)) printk(KERN_WARNING "cifs_inode_cache: error freeing\n"); } -int +static int cifs_init_request_bufs(void) { cifs_req_cachep = kmem_cache_create("cifs_request", @@ -538,7 +596,7 @@ cifs_init_request_bufs(void) return 0; } -void +static void cifs_destroy_request_bufs(void) { if (kmem_cache_destroy(cifs_req_cachep)) @@ -546,7 +604,7 @@ cifs_destroy_request_bufs(void) "cifs_destroy_request_cache: error not all structures were freed\n"); } -int +static int cifs_init_mids(void) { cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids", @@ -565,7 +623,7 @@ cifs_init_mids(void) return 0; } -void +static void cifs_destroy_mids(void) { if (kmem_cache_destroy(cifs_mid_cachep)) @@ -591,7 +649,7 @@ static int cifs_oplock_thread(void * dum do { set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(39*HZ); + schedule_timeout(1*HZ); spin_lock(&GlobalMid_Lock); if(list_empty(&GlobalOplock_Q)) { spin_unlock(&GlobalMid_Lock); @@ -600,23 +658,41 @@ static int cifs_oplock_thread(void * dum oplock_item = list_entry(GlobalOplock_Q.next, struct 
oplock_q_entry, qhead); if(oplock_item) { + cFYI(1,("found oplock item to write out")); pTcon = oplock_item->tcon; inode = oplock_item->pinode; netfid = oplock_item->netfid; spin_unlock(&GlobalMid_Lock); DeleteOplockQEntry(oplock_item); - if (S_ISREG(inode->i_mode)) + /* can not grab inode sem here since it would + deadlock when oplock received on delete + since vfs_unlink holds the i_sem across + the call */ + /* down(&inode->i_sem);*/ + if (S_ISREG(inode->i_mode)) { rc = filemap_fdatawrite(inode->i_mapping); - else + if(CIFS_I(inode)->clientCanCacheRead == 0) + invalidate_remote_inode(inode); + } else rc = 0; + /* up(&inode->i_sem);*/ if (rc) CIFS_I(inode)->write_behind_rc = rc; cFYI(1,("Oplock flush inode %p rc %d",inode,rc)); - rc = CIFSSMBLock(0, pTcon, netfid, - 0 /* len */ , 0 /* offset */, 0, - 0, LOCKING_ANDX_OPLOCK_RELEASE, - 0 /* wait flag */); - cFYI(1,("Oplock release rc = %d ",rc)); + + /* releasing a stale oplock after recent reconnection + of smb session using a now incorrect file + handle is not a data integrity issue but do + not bother sending an oplock release if session + to server still is disconnected since oplock + already released by the server in that case */ + if(pTcon->tidStatus != CifsNeedReconnect) { + rc = CIFSSMBLock(0, pTcon, netfid, + 0 /* len */ , 0 /* offset */, 0, + 0, LOCKING_ANDX_OPLOCK_RELEASE, + 0 /* wait flag */); + cFYI(1,("Oplock release rc = %d ",rc)); + } } else spin_unlock(&GlobalMid_Lock); } @@ -640,6 +716,9 @@ init_cifs(void) */ atomic_set(&sesInfoAllocCount, 0); atomic_set(&tconInfoAllocCount, 0); + atomic_set(&tcpSesReconnectCount, 0); + atomic_set(&tconInfoReconnectCount, 0); + atomic_set(&bufAllocCount, 0); atomic_set(&midCount, 0); GlobalCurrentXid = 0; --- diff/fs/cifs/cifsfs.h 2003-07-22 18:54:27.000000000 +0100 +++ source/fs/cifs/cifsfs.h 2004-04-21 10:46:51.667727424 +0100 @@ -60,8 +60,6 @@ extern int cifs_getattr(struct vfsmount extern int cifs_setattr(struct dentry *, struct iattr *); extern struct inode_operations cifs_file_inode_ops; -extern void cifs_truncate_file(struct inode *); - extern struct inode_operations cifs_symlink_inode_ops; /* Functions related to files and directories */ @@ -80,6 +78,7 @@ extern int cifs_file_mmap(struct file * extern struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); +extern long cifs_fcntl(int, unsigned int, unsigned long, struct file *); /* Functions related to dir entries */ extern struct dentry_operations cifs_dentry_ops; --- diff/fs/cifs/cifsglob.h 2003-10-27 09:20:44.000000000 +0000 +++ source/fs/cifs/cifsglob.h 2004-04-21 10:46:51.667727424 +0100 @@ -16,6 +16,7 @@ * */ #include +#include #include "cifs_fs_sb.h" /* * The sizes of various internal tables and strings @@ -55,6 +56,10 @@ #define TRUE 1 #endif +#ifndef XATTR_DOS_ATTRIB +#define XATTR_DOS_ATTRIB "user.DOSATTRIB" +#endif + /* * This information is kept on every Server we know about. 
* @@ -83,6 +88,13 @@ enum securityEnum { Kerberos /* Kerberos via SPNEGO */ }; +enum protocolEnum { + IPV4 = 0, + IPV6, + SCTP + /* Netbios frames protocol not supported at this time */ +}; + /* ***************************************************************** * Except the CIFS PDUs themselves all the @@ -94,13 +106,16 @@ struct TCP_Server_Info { char server_Name[SERVER_NAME_LEN_WITH_NULL]; /* 15 chars + X'20'in 16th */ char unicode_server_Name[SERVER_NAME_LEN_WITH_NULL * 2]; /* Unicode version of server_Name */ struct socket *ssocket; - struct sockaddr_in sockAddr; + union { + struct sockaddr_in sockAddr; + struct sockaddr_in6 sockAddr6; + } addr; wait_queue_head_t response_q; struct list_head pending_mid_q; void *Server_NlsInfo; /* BB - placeholder for future NLS info */ unsigned short server_codepage; /* codepage for the server */ unsigned long ip_address; /* IP addr for the server if known */ - unsigned long svType; /* computer type */ + enum protocolEnum protocolType; char versionMajor; char versionMinor; int svlocal:1; /* local server or remote */ @@ -161,7 +176,7 @@ struct cifsSesInfo { char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for tcp names - will ipv6 and sctp addresses fit here?? */ char userName[MAX_USERNAME_SIZE + 1]; char domainName[MAX_USERNAME_SIZE + 1]; - char password_with_pad[CIFS_ENCPWD_SIZE]; + char * password; }; /* @@ -175,13 +190,14 @@ struct cifsTconInfo { struct cifsSesInfo *ses; /* pointer to session associated with */ char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource (in ASCII not UTF) */ char *nativeFileSystem; - __u16 tid; /* The 2 byte transaction id */ + __u16 tid; /* The 2 byte tree id */ __u16 Flags; /* optional support bits */ enum statusEnum tidStatus; - atomic_t useCount; /* how many mounts (explicit or implicit refer to this share */ + atomic_t useCount; /* how many mounts (explicit or implicit) to this share */ FILE_SYSTEM_DEVICE_INFO fsDevInfo; - FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* note file system name may be truncated - but very unlikely */ + FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if file system name truncated */ FILE_SYSTEM_UNIX_INFO fsUnixInfo; + int retry:1; /* BB add field for back pointer to sb struct? 
*/ }; @@ -213,6 +229,7 @@ struct cifsFileInfo { int closePend:1; /* file is marked to close */ int emptyDir:1; int invalidHandle:1; /* file closed via session abend */ + struct semaphore fh_sem; /* prevents reopen race after dead ses*/ char * search_resume_name; unsigned int resume_name_length; __u32 resume_key; @@ -274,6 +291,7 @@ struct oplock_q_entry { #define MID_REQUEST_ALLOCATED 1 #define MID_REQUEST_SUBMITTED 2 #define MID_RESPONSE_RECEIVED 4 +#define MID_RETRY_NEEDED 8 /* session closed while this request out */ struct servers_not_supported { /* @z4a */ struct servers_not_supported *next1; /* @z4a */ @@ -313,7 +331,7 @@ struct servers_not_supported { /* @z4a * * ---------- * sesSem operations on smb session * tconSem operations on tree connection - * i_sem inode operations + * fh_sem file handle reconnection operations * ****************************************************************************/ @@ -358,6 +376,9 @@ GLOBAL_EXTERN char Local_System_Name[15] GLOBAL_EXTERN atomic_t sesInfoAllocCount; GLOBAL_EXTERN atomic_t tconInfoAllocCount; +GLOBAL_EXTERN atomic_t tcpSesReconnectCount; +GLOBAL_EXTERN atomic_t tconInfoReconnectCount; + /* Various Debug counters to remove someday (BB) */ GLOBAL_EXTERN atomic_t bufAllocCount; GLOBAL_EXTERN atomic_t midCount; @@ -374,4 +395,6 @@ GLOBAL_EXTERN unsigned int extended_secu with more secure ntlmssp2 challenge/resp */ GLOBAL_EXTERN unsigned int ntlmv2_support; /* better optional password hash */ GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ +GLOBAL_EXTERN unsigned int linuxExtEnabled; /* enable Linux/Unix CIFS extensions */ + --- diff/fs/cifs/cifspdu.h 2003-08-26 10:00:54.000000000 +0100 +++ source/fs/cifs/cifspdu.h 2004-04-21 10:46:51.668727272 +0100 @@ -727,8 +727,10 @@ typedef struct smb_com_read_rsp { typedef struct locking_andx_range { __u16 Pid; __u16 Pad; - __u64 Offset; - __u64 Length; + __u32 OffsetHigh; + __u32 OffsetLow; + __u32 LengthHigh; + __u32 LengthLow; } LOCKING_ANDX_RANGE; #define LOCKING_ANDX_SHARED_LOCK 0x01 @@ -1101,10 +1103,10 @@ typedef struct smb_com_transaction2_spi_ } TRANSACTION2_SPI_RSP; struct set_file_rename { - __u32 overwrite; /* 1 = overwrite dest */ - __u32 root_fid; /* zero */ + __u32 overwrite; /* 1 = overwrite dest */ + __u32 root_fid; /* zero */ __u32 target_name_len; - char target_name[0]; /* Must be unicode */ + char target_name[0]; /* Must be unicode */ }; struct smb_com_transaction2_sfi_req { --- diff/fs/cifs/cifsproto.h 2003-10-09 09:47:17.000000000 +0100 +++ source/fs/cifs/cifsproto.h 2004-04-21 10:46:51.669727120 +0100 @@ -30,8 +30,8 @@ struct statfs; ***************************************************************** */ -extern struct smb_hdr *buf_get(void); -extern void buf_release(void *); +extern struct smb_hdr *cifs_buf_get(void); +extern void cifs_buf_release(void *); extern int smb_send(struct socket *, struct smb_hdr *, unsigned int /* length */ , struct sockaddr *); extern unsigned int _GetXid(void); @@ -41,7 +41,6 @@ extern void _FreeXid(unsigned int); extern char *build_path_from_dentry(struct dentry *); extern char *build_wildcard_path_from_dentry(struct dentry *direntry); extern void renew_parental_timestamps(struct dentry *direntry); -extern void *kcalloc(size_t mem, int type); extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, struct smb_hdr * /* input */ , struct smb_hdr * /* out */ , @@ -61,12 +60,6 @@ struct oplock_q_entry * AllocOplockQEntr void DeleteOplockQEntry(struct oplock_q_entry *); extern struct timespec 
cifs_NTtimeToUnix(u64 /* utc nanoseconds since 1601 */ ); extern u64 cifs_UnixTimeToNT(struct timespec); -extern void RevUcode_to_Ucode(char *revUnicode, char *UnicodeName); -extern void Ucode_to_RevUcode(char *Unicode, char *revUnicodeName); -extern void RevUcode_to_Ucode_with_Len(char *revUnicode, char *UnicodeName, - int Len); -extern void Ucode_to_RevUcode_with_Len(char *Unicode, char *revUnicodeName, - int Len); extern int cifs_get_inode_info(struct inode **pinode, const unsigned char *search_path, FILE_ALL_INFO * pfile_info, @@ -75,21 +68,9 @@ extern int cifs_get_inode_info_unix(stru const unsigned char *search_path, struct super_block *sb); -extern int reopen_files(struct cifsTconInfo *, struct nls_table *); -extern int setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, +extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, struct nls_table * nls_info); extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses); -extern int CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, - char *ntlm_session_key, const struct nls_table *); -extern int CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, - char *SecurityBlob,int SecurityBlobLength, - const struct nls_table *); -extern int CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, - struct cifsSesInfo *ses, int *ntlmv2_flag, - const struct nls_table *); -extern int CIFSNTLMSSPAuthSessSetup(unsigned int xid, - struct cifsSesInfo *ses, char *ntlm_session_key, - int ntlmv2_flag, const struct nls_table *); extern int CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, const char *tree, struct cifsTconInfo *tcon, @@ -224,7 +205,6 @@ extern void sesInfoFree(struct cifsSesIn extern struct cifsTconInfo *tconInfoAlloc(void); extern void tconInfoFree(struct cifsTconInfo *); -extern int cifs_demultiplex_thread(struct TCP_Server_Info *); extern int cifs_reconnect(struct TCP_Server_Info *server); extern int cifs_sign_smb(struct smb_hdr *, struct cifsSesInfo *,__u32 *); --- diff/fs/cifs/cifssmb.c 2003-10-27 09:20:44.000000000 +0000 +++ source/fs/cifs/cifssmb.c 2004-04-21 10:46:51.671726816 +0100 @@ -21,7 +21,11 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ - /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c */ + /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c */ + /* These are mostly routines that operate on a pathname, or on a tree id */ + /* (mounted volume), but there are eight handle based routines which must be */ + /* treated slightly different for reconnection purposes since we never want */ + /* to reuse a stale file handle and the caller knows the file handle */ #include #include @@ -37,38 +41,108 @@ static struct { int index; char *name; } protocols[] = { - { - CIFS_PROT, "\2NT LM 0.12"}, { - BAD_PROT, "\2"} + {CIFS_PROT, "\2NT LM 0.12"}, + {BAD_PROT, "\2"} }; -int + +/* Mark as invalid, all open files on tree connections since they + were closed when session to server was lost */ +static void mark_open_files_invalid(struct cifsTconInfo * pTcon) +{ + struct cifsFileInfo *open_file = NULL; + struct list_head * tmp; + struct list_head * tmp1; + +/* list all files open on tree connection and mark them invalid */ + write_lock(&GlobalSMBSeslock); + list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { + open_file = list_entry(tmp,struct cifsFileInfo, tlist); + if(open_file) { + open_file->invalidHandle = TRUE; + } + } + write_unlock(&GlobalSMBSeslock); + /* BB Add call to invalidate_inodes(sb) for all 
superblocks mounted to this tcon */ +} + +static int smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, void **request_buf /* returned */ , void **response_buf /* returned */ ) { int rc = 0; - if(tcon && (tcon->tidStatus == CifsNeedReconnect)) { - rc = -EIO; - if(tcon->ses) { - struct nls_table *nls_codepage = load_nls_default(); + /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so + check for tcp and smb session status done differently + for those three - in the calling routine */ + if(tcon) { + if((tcon->ses) && (tcon->ses->server)){ + struct nls_table *nls_codepage; + /* Give Demultiplex thread up to 10 seconds to + reconnect, should be greater than cifs socket + timeout which is 7 seconds */ + while(tcon->ses->server->tcpStatus == CifsNeedReconnect) { + wait_event_interruptible_timeout(tcon->ses->server->response_q, + (tcon->ses->server->tcpStatus == CifsGood), 10 * HZ); + if(tcon->ses->server->tcpStatus == CifsNeedReconnect) { + /* on "soft" mounts we wait once */ + if((tcon->retry == FALSE) || + (tcon->ses->status == CifsExiting)) { + cFYI(1,("gave up waiting on reconnect in smb_init")); + return -EHOSTDOWN; + } /* else "hard" mount - keep retrying until + process is killed or server comes back up */ + } else /* TCP session is reestablished now */ + break; + + } + + nls_codepage = load_nls_default(); + /* need to prevent multiple threads trying to + simultaneously reconnect the same SMB session */ + down(&tcon->ses->sesSem); if(tcon->ses->status == CifsNeedReconnect) - rc = setup_session(0, tcon->ses, nls_codepage); - if(!rc) { + rc = cifs_setup_session(0, tcon->ses, nls_codepage); + if(!rc && (tcon->tidStatus == CifsNeedReconnect)) { + mark_open_files_invalid(tcon); rc = CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); + up(&tcon->ses->sesSem); + if(rc == 0) + atomic_inc(&tconInfoReconnectCount); + cFYI(1, ("reconnect tcon rc = %d", rc)); - if(!rc) - reopen_files(tcon,nls_codepage); + /* Removed call to reopen open files here - + it is safer (and faster) to reopen files + one at a time as needed in read and write */ + + /* Check if handle based operation so we + know whether we can continue or not without + returning to caller to reset file handle */ + switch(smb_command) { + case SMB_COM_READ_ANDX: + case SMB_COM_WRITE_ANDX: + case SMB_COM_CLOSE: + case SMB_COM_FIND_CLOSE2: + case SMB_COM_LOCKING_ANDX: { + unload_nls(nls_codepage); + return -EAGAIN; + } + } + } else { + up(&tcon->ses->sesSem); } unload_nls(nls_codepage); + + } else { + return -EIO; } } if(rc) return rc; - *request_buf = buf_get(); + *request_buf = cifs_buf_get(); if (request_buf == 0) { return -ENOMEM; } @@ -98,7 +172,6 @@ CIFSSMBNegotiate(unsigned int xid, struc rc = -EIO; return rc; } - rc = smb_init(SMB_COM_NEGOTIATE, 0, 0 /* no tcon yet */ , (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -120,11 +193,11 @@ CIFSSMBNegotiate(unsigned int xid, struc if (rc == 0) { server->secMode = pSMBr->SecurityMode; server->secType = NTLM; /* BB override default for NTLMv2 or krb*/ - /* one byte - no need to convert this or EncryptionKeyLen from le,*/ + /* one byte - no need to convert this or EncryptionKeyLen from le,*/ server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); /* probably no need to store and check maxvcs */ server->maxBuf = - min(le32_to_cpu(pSMBr->MaxBufferSize), + min(le32_to_cpu(pSMBr->MaxBufferSize), (__u32) CIFS_MAX_MSGSIZE + MAX_CIFS_HDR_SIZE); server->maxRw = le32_to_cpu(pSMBr->MaxRawSize); cFYI(0, ("Max buf = %d ", ses->server->maxBuf)); @@ -172,7 +245,6 @@ 
CIFSSMBNegotiate(unsigned int xid, struc pSMBr->ByteCount - 16, &server->secType); } - } else server->capabilities &= ~CAP_EXTENDED_SECURITY; if(sign_CIFS_PDUs == FALSE) { @@ -187,7 +259,7 @@ CIFSSMBNegotiate(unsigned int xid, struc } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); return rc; } @@ -218,12 +290,19 @@ CIFSSMBTDis(const int xid, struct cifsTc return -EBUSY; } + /* No need to return error on this operation if tid invalidated and + closed on server already e.g. due to tcp session crashing */ + if(tcon->tidStatus == CifsNeedReconnect) { + up(&tcon->tconSem); + return 0; + } + /* BB remove (from server) list of shares - but with smp safety BB */ /* BB is ses active - do we need to check here - but how? BB */ - if((tcon->ses == 0) || (tcon->ses->server == 0)) { - up(&tcon->tconSem); - return -EIO; - } + if((tcon->ses == 0) || (tcon->ses->server == 0)) { + up(&tcon->tconSem); + return -EIO; + } rc = smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon, (void **) &smb_buffer, (void **) &smb_buffer_response); @@ -237,8 +316,14 @@ CIFSSMBTDis(const int xid, struct cifsTc cFYI(1, (" Tree disconnect failed %d", rc)); if (smb_buffer) - buf_release(smb_buffer); + cifs_buf_release(smb_buffer); up(&tcon->tconSem); + + /* No need to return error on this operation if tid invalidated and + closed on server already e.g. due to tcp session crashing */ + if (rc == -EAGAIN) + rc = 0; + return rc; } @@ -251,9 +336,8 @@ CIFSSMBLogoff(const int xid, struct cifs int length; cFYI(1, ("In SMBLogoff for session disconnect")); - if (ses) - down(&ses->sesSem); /* check this sem more places */ + down(&ses->sesSem); else return -EIO; @@ -266,8 +350,8 @@ CIFSSMBLogoff(const int xid, struct cifs rc = smb_init(SMB_COM_LOGOFF_ANDX, 2, 0 /* no tcon anymore */, (void **) &pSMB, (void **) &smb_buffer_response); - if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; if (rc) { up(&ses->sesSem); @@ -285,8 +369,14 @@ CIFSSMBLogoff(const int xid, struct cifs ses->server->tcpStatus = CifsExiting; } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); up(&ses->sesSem); + + /* if session dead then we do not need to do ulogoff, + since server closed smb session, no sense reporting + error */ + if (rc == -EAGAIN) + rc = 0; return rc; } @@ -300,6 +390,7 @@ CIFSSMBDelFile(const int xid, struct cif int bytes_returned; int name_len; +DelFileRetry: rc = smb_init(SMB_COM_DELETE, 1, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -329,7 +420,10 @@ CIFSSMBDelFile(const int xid, struct cif cFYI(1, ("Error in RMFile = %d", rc)); } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto DelFileRetry; + return rc; } @@ -344,7 +438,7 @@ CIFSSMBRmDir(const int xid, struct cifsT int name_len; cFYI(1, ("In CIFSSMBRmDir")); - +RmDirRetry: rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -372,7 +466,9 @@ CIFSSMBRmDir(const int xid, struct cifsT cFYI(1, ("Error in RMDir = %d", rc)); } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto RmDirRetry; return rc; } @@ -387,7 +483,7 @@ CIFSSMBMkDir(const int xid, struct cifsT int name_len; cFYI(1, ("In CIFSSMBMkDir")); - +MkDirRetry: rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -415,8 +511,9 @@ CIFSSMBMkDir(const int xid, struct cifsT cFYI(1, 
("Error in Mkdir = %d", rc)); } if (pSMB) - buf_release(pSMB); - + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto MkDirRetry; return rc; } @@ -433,6 +530,7 @@ CIFSSMBOpen(const int xid, struct cifsTc int bytes_returned; int name_len; +openRetry: rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -464,8 +562,14 @@ CIFSSMBOpen(const int xid, struct cifsTc } pSMB->DesiredAccess = cpu_to_le32(access_flags); pSMB->AllocationSize = 0; - pSMB->FileAttributes = ATTR_NORMAL; /* XP does not handle ATTR_POSIX_SEMANTICS */ - /*if ((omode & S_IWUGO) == 0) + pSMB->FileAttributes = ATTR_NORMAL; + /* XP does not handle ATTR_POSIX_SEMANTICS */ + /* but it helps speed up case sensitive checks for other + servers such as Samba */ + if (tcon->ses->capabilities & CAP_UNIX) + pSMB->FileAttributes |= ATTR_POSIX_SEMANTICS; + + /* if ((omode & S_IWUGO) == 0) pSMB->FileAttributes |= ATTR_READONLY;*/ /* Above line causes problems due to vfs splitting create into two pieces - need to set mode after file created not while it is @@ -501,8 +605,9 @@ CIFSSMBOpen(const int xid, struct cifsTc } } if (pSMB) - buf_release(pSMB); - + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto openRetry; return rc; } @@ -527,9 +632,9 @@ CIFSSMBRead(const int xid, struct cifsTc if (rc) return rc; - /* tcon and ses pointer are checked in smb_init */ - if (tcon->ses->server == NULL) - return -ECONNABORTED; + /* tcon and ses pointer are checked in smb_init */ + if (tcon->ses->server == NULL) + return -ECONNABORTED; pSMB->AndXCommand = 0xFF; /* none */ pSMB->Fid = netfid; @@ -567,10 +672,13 @@ CIFSSMBRead(const int xid, struct cifsTc } if (pSMB) { if(*buf) - buf_release(pSMB); + cifs_buf_release(pSMB); else *buf = (char *)pSMB; } + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ return rc; } @@ -623,7 +731,10 @@ CIFSSMBWrite(const int xid, struct cifsT *nbytes = le16_to_cpu(pSMBr->Count); if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ return rc; } @@ -639,9 +750,9 @@ CIFSSMBLock(const int xid, struct cifsTc LOCK_RSP *pSMBr = NULL; int bytes_returned; int timeout = 0; + __u64 temp; - cFYI(1, ("In CIFSSMBLock")); - + cFYI(1, ("In CIFSSMBLock - timeout %d numLock %d",waitFlag,numLock)); rc = smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -649,6 +760,12 @@ CIFSSMBLock(const int xid, struct cifsTc if(lockType == LOCKING_ANDX_OPLOCK_RELEASE) { timeout = -1; /* no response expected */ + pSMB->Timeout = 0; + } else if (waitFlag == TRUE) { + timeout = 3; /* blocking operation, no timeout */ + pSMB->Timeout = -1; /* blocking - do not time out */ + } else { + pSMB->Timeout = 0; } pSMB->NumberOfLocks = cpu_to_le32(numLock); @@ -658,8 +775,12 @@ CIFSSMBLock(const int xid, struct cifsTc pSMB->Fid = smb_file_id; /* netfid stays le */ pSMB->Locks[0].Pid = cpu_to_le16(current->tgid); - pSMB->Locks[0].Length = cpu_to_le64(len); - pSMB->Locks[0].Offset = cpu_to_le64(offset); + temp = cpu_to_le64(len); + pSMB->Locks[0].LengthLow = (__u32)(len & 0xFFFFFFFF); + pSMB->Locks[0].LengthHigh = (__u32)(len>>32); + temp = cpu_to_le64(offset); + pSMB->Locks[0].OffsetLow = (__u32)(offset & 0xFFFFFFFF); + pSMB->Locks[0].OffsetHigh = (__u32)(offset>>32); pSMB->ByteCount = sizeof (LOCKING_ANDX_RANGE); pSMB->hdr.smb_buf_length += pSMB->ByteCount; pSMB->ByteCount = 
cpu_to_le16(pSMB->ByteCount); @@ -671,8 +792,10 @@ CIFSSMBLock(const int xid, struct cifsTc cERROR(1, ("Send error in Lock = %d", rc)); } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ return rc; } @@ -685,8 +808,11 @@ CIFSSMBClose(const int xid, struct cifsT int bytes_returned; cFYI(1, ("In CIFSSMBClose")); +/* do not retry on dead session on close */ rc = smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB, (void **) &pSMBr); + if(rc == -EAGAIN) + return 0; if (rc) return rc; @@ -699,7 +825,11 @@ CIFSSMBClose(const int xid, struct cifsT cERROR(1, ("Send error in Close = %d", rc)); } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + /* Since session is dead, file will be closed on server already */ + if(rc == -EAGAIN) + rc = 0; return rc; } @@ -716,7 +846,7 @@ CIFSSMBRename(const int xid, struct cifs int name_len, name_len2; cFYI(1, ("In CIFSSMBRename")); - +renameRetry: rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -766,7 +896,10 @@ CIFSSMBRename(const int xid, struct cifs cFYI(1, ("Send error in rename = %d", rc)); } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto renameRetry; return rc; } @@ -774,44 +907,43 @@ CIFSSMBRename(const int xid, struct cifs int CIFSSMBRenameOpenFile(const int xid,struct cifsTconInfo *pTcon, int netfid, char * target_name, const struct nls_table * nls_codepage) { - struct smb_com_transaction2_sfi_req *pSMB = NULL; - struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; + struct smb_com_transaction2_sfi_req *pSMB = NULL; + struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; struct set_file_rename * rename_info; - char *data_offset; + char *data_offset; char dummy_string[30]; - int rc = 0; - int bytes_returned = 0; + int rc = 0; + int bytes_returned = 0; int len_of_str; - cFYI(1, ("Rename to File by handle")); + cFYI(1, ("Rename to File by handle")); + rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; - rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB, - (void **) &pSMBr); - if (rc) - return rc; - - pSMB->ParameterCount = 6; - pSMB->MaxSetupCount = 0; - pSMB->Reserved = 0; - pSMB->Flags = 0; - pSMB->Timeout = 0; - pSMB->Reserved2 = 0; - pSMB->ParameterOffset = offsetof(struct smb_com_transaction2_sfi_req, - Fid) - 4; - pSMB->DataOffset = pSMB->ParameterOffset + pSMB->ParameterCount; + pSMB->ParameterCount = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + pSMB->ParameterOffset = offsetof(struct smb_com_transaction2_sfi_req, + Fid) - 4; + pSMB->DataOffset = pSMB->ParameterOffset + pSMB->ParameterCount; - data_offset = (char *) (&pSMB->hdr.Protocol) + pSMB->DataOffset; + data_offset = (char *) (&pSMB->hdr.Protocol) + pSMB->DataOffset; rename_info = (struct set_file_rename *) data_offset; - pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB PDU from sess */ - pSMB->SetupCount = 1; - pSMB->Reserved3 = 0; - pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); - pSMB->ByteCount = 3 /* pad */ + pSMB->ParameterCount; - pSMB->ParameterCount = cpu_to_le16(pSMB->ParameterCount); - pSMB->TotalParameterCount = pSMB->ParameterCount; - pSMB->ParameterOffset = cpu_to_le16(pSMB->ParameterOffset); - pSMB->DataOffset = cpu_to_le16(pSMB->DataOffset); + pSMB->MaxParameterCount 
= cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB PDU from sess */ + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + pSMB->ByteCount = 3 /* pad */ + pSMB->ParameterCount; + pSMB->ParameterCount = cpu_to_le16(pSMB->ParameterCount); + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(pSMB->ParameterOffset); + pSMB->DataOffset = cpu_to_le16(pSMB->DataOffset); /* construct random name ".cifs_tmp" */ rename_info->overwrite = cpu_to_le32(1); rename_info->root_fid = 0; @@ -836,11 +968,15 @@ int CIFSSMBRenameOpenFile(const int xid, rc = SendReceive(xid, pTcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1,("Send error in Rename (by file handle) = %d", rc)); + cFYI(1,("Send error in Rename (by file handle) = %d", rc)); } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + return rc; } @@ -859,7 +995,7 @@ CIFSUnixCreateSymLink(const int xid, str int bytes_returned = 0; cFYI(1, ("In Symlink Unix style")); - +createSymLinkRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -929,7 +1065,11 @@ CIFSUnixCreateSymLink(const int xid, str } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto createSymLinkRetry; + return rc; } @@ -947,7 +1087,7 @@ CIFSUnixCreateHardLink(const int xid, st int bytes_returned = 0; cFYI(1, ("In Create Hard link Unix style")); - +createHardLinkRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -1014,7 +1154,10 @@ CIFSUnixCreateHardLink(const int xid, st } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto createHardLinkRetry; + return rc; } @@ -1030,6 +1173,7 @@ CIFSCreateHardLink(const int xid, struct int name_len, name_len2; cFYI(1, ("In CIFSCreateHardLink")); +winCreateHardLinkRetry: rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -1081,7 +1225,9 @@ CIFSCreateHardLink(const int xid, struct cFYI(1, ("Send error in hard link (NT rename) = %d", rc)); } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto winCreateHardLinkRetry; return rc; } @@ -1100,6 +1246,8 @@ CIFSSMBUnixQuerySymLink(const int xid, s int name_len; cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName)); + +querySymLinkRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -1174,7 +1322,9 @@ CIFSSMBUnixQuerySymLink(const int xid, s } } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto querySymLinkRetry; return rc; } @@ -1256,7 +1406,11 @@ CIFSSMBQueryReparseLinkInfo(const int xi } } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + return rc; } @@ -1274,6 +1428,7 @@ CIFSSMBQPathInfo(const int xid, struct c int name_len; cFYI(1, ("In QPathInfo path %s", searchName)); +QPathInfoRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -1334,7 +1489,10 @@ CIFSSMBQPathInfo(const int xid, struct c rc = -ENOMEM; } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto QPathInfoRetry; + return rc; } @@ 
-1352,6 +1510,7 @@ CIFSSMBUnixQPathInfo(const int xid, stru int name_len; cFYI(1, ("In QPathInfo (Unix) the path %s", searchName)); +UnixQPathInfoRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -1413,7 +1572,10 @@ CIFSSMBUnixQPathInfo(const int xid, stru } } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto UnixQPathInfoRetry; + return rc; } @@ -1430,6 +1592,7 @@ CIFSFindSingle(const int xid, struct cif int name_len; cFYI(1, ("In FindUnique")); +findUniqueRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -1487,7 +1650,10 @@ CIFSFindSingle(const int xid, struct cif /* BB fill in */ } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto findUniqueRetry; + return rc; } @@ -1507,6 +1673,7 @@ CIFSFindFirst(const int xid, struct cifs int name_len; cFYI(1, ("In FindFirst")); +findFirstRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -1590,7 +1757,11 @@ CIFSFindFirst(const int xid, struct cifs memcpy(findData, response_data, le16_to_cpu(pSMBr->DataCount)); } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto findFirstRetry; + return rc; } @@ -1608,6 +1779,7 @@ CIFSFindNext(const int xid, struct cifsT int bytes_returned; cFYI(1, ("In FindNext")); + if(resume_file_name == NULL) { return -EIO; } @@ -1690,7 +1862,11 @@ CIFSFindNext(const int xid, struct cifsT memcpy(findData, response_data, le16_to_cpu(pSMBr->DataCount)); } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + return rc; } @@ -1701,10 +1877,14 @@ CIFSFindClose(const int xid, struct cifs FINDCLOSE_REQ *pSMB = NULL; CLOSE_RSP *pSMBr = NULL; int bytes_returned; - cFYI(1, ("In CIFSSMBFindClose")); + cFYI(1, ("In CIFSSMBFindClose")); rc = smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **) &pSMB, (void **) &pSMBr); + /* no sense returning error if session restarted + file handle has been closed */ + if(rc == -EAGAIN) + return 0; if (rc) return rc; @@ -1716,7 +1896,11 @@ CIFSFindClose(const int xid, struct cifs cERROR(1, ("Send error in FindClose = %d", rc)); } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + /* Since session is dead, search handle closed on server already */ + if (rc == -EAGAIN) + rc = 0; return rc; } @@ -1743,7 +1927,7 @@ CIFSGetDFSRefer(const int xid, struct ci cFYI(1, ("In GetDFSRefer the path %s", searchName)); if (ses == NULL) return -ENODEV; - +getDFSRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, 0, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -1874,7 +2058,11 @@ CIFSGetDFSRefer(const int xid, struct ci } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto getDFSRetry; + return rc; } @@ -1890,7 +2078,7 @@ CIFSSMBQFSInfo(const int xid, struct cif int bytes_returned = 0; cFYI(1, ("In QFSInfo")); - +QFSInfoRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -1951,7 +2139,11 @@ CIFSSMBQFSInfo(const int xid, struct cif } } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSInfoRetry; + return rc; } @@ -1967,6 +2159,7 @@ CIFSSMBQFSAttributeInfo(int xid, struct int bytes_returned = 0; cFYI(1, ("In QFSAttributeInfo")); +QFSAttributeRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) 
&pSMBr); if (rc) @@ -2013,7 +2206,11 @@ CIFSSMBQFSAttributeInfo(int xid, struct } } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSAttributeRetry; + return rc; } @@ -2029,7 +2226,7 @@ CIFSSMBQFSDeviceInfo(int xid, struct cif int bytes_returned = 0; cFYI(1, ("In QFSDeviceInfo")); - +QFSDeviceRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -2078,7 +2275,12 @@ CIFSSMBQFSDeviceInfo(int xid, struct cif } } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSDeviceRetry; + + return rc; } @@ -2094,6 +2296,7 @@ CIFSSMBQFSUnixInfo(int xid, struct cifsT int bytes_returned = 0; cFYI(1, ("In QFSUnixInfo")); +QFSUnixRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -2140,7 +2343,12 @@ CIFSSMBQFSUnixInfo(int xid, struct cifsT } } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto QFSUnixRetry; + + return rc; } @@ -2162,7 +2370,7 @@ CIFSSMBSetEOF(int xid, struct cifsTconIn int bytes_returned = 0; cFYI(1, ("In SetEOF")); - +SetEOFRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -2232,7 +2440,11 @@ CIFSSMBSetEOF(int xid, struct cifsTconIn } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetEOFRetry; + return rc; } @@ -2248,8 +2460,7 @@ CIFSSMBSetFileSize(const int xid, struct int bytes_returned = 0; __u32 tmp; - cFYI(1, ("SetFileSize (via SetFileInfo)")); - + cFYI(1, ("SetFileSize (via SetFileInfo) %lld",size)); rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -2318,7 +2529,11 @@ CIFSSMBSetFileSize(const int xid, struct } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + return rc; } @@ -2335,6 +2550,7 @@ CIFSSMBSetTimes(int xid, struct cifsTcon cFYI(1, ("In SetTimes")); +SetTimesRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -2392,7 +2608,11 @@ CIFSSMBSetTimes(int xid, struct cifsTcon } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto SetTimesRetry; + return rc; } @@ -2409,7 +2629,7 @@ CIFSSMBUnixSetPerms(const int xid, struc FILE_UNIX_BASIC_INFO *data_offset; cFYI(1, ("In SetUID/GID/Mode")); - +setPermsRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -2470,6 +2690,8 @@ CIFSSMBUnixSetPerms(const int xid, struc } if (pSMB) - buf_release(pSMB); + cifs_buf_release(pSMB); + if (rc == -EAGAIN) + goto setPermsRetry; return rc; } --- diff/fs/cifs/connect.c 2004-04-05 12:57:08.000000000 +0100 +++ source/fs/cifs/connect.c 2004-04-21 10:46:51.676726056 +0100 @@ -44,7 +44,7 @@ extern void SMBencrypt(unsigned char *pa unsigned char *p24); extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); -extern int inet_addr(char *); +extern int cifs_inet_pton(int, const char *, void *dst); struct smb_vol { char *username; @@ -58,7 +58,8 @@ struct smb_vol { gid_t linux_gid; mode_t file_mode; mode_t dir_mode; - int rw; + int rw:1; + int retry:1; unsigned int rsize; unsigned int wsize; unsigned int sockopt; @@ -66,13 +67,14 @@ struct smb_vol { }; int ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket); +int ipv6_connect(struct sockaddr_in6 *psin_server, struct 
socket **csocket); /* * cifs tcp session reconnection * * mark tcp session as reconnecting so temporarily locked - * mark all smb sessions as reconnecting for tcp session (TBD BB) + * mark all smb sessions as reconnecting for tcp session * reconnect tcp session * wake up waiters on reconnection? - (not needed currently) */ @@ -84,6 +86,7 @@ cifs_reconnect(struct TCP_Server_Info *s struct list_head *tmp; struct cifsSesInfo *ses; struct cifsTconInfo *tcon; + struct mid_q_entry * mid_entry; if(server->tcpStatus == CifsExiting) return rc; @@ -123,13 +126,35 @@ cifs_reconnect(struct TCP_Server_Info *s server->ssocket = NULL; } + spin_lock(&GlobalMid_Lock); + list_for_each(tmp, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct + mid_q_entry, + qhead); + if(mid_entry) { + if(mid_entry->midState == MID_REQUEST_SUBMITTED) { + /* Mark other intransit requests as needing retry so + we do not immediately mark the session bad again + (ie after we reconnect below) as they timeout too */ + mid_entry->midState = MID_RETRY_NEEDED; + } + } + } + spin_unlock(&GlobalMid_Lock); + + while ((server->tcpStatus != CifsExiting) && (server->tcpStatus != CifsGood)) { - rc = ipv4_connect(&server->sockAddr, &server->ssocket); + if(server->protocolType == IPV6) { + rc = ipv6_connect(&server->addr.sockAddr6,&server->ssocket); + } else { + rc = ipv4_connect(&server->addr.sockAddr, &server->ssocket); + } if(rc) { set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(3 * HZ); } else { + atomic_inc(&tcpSesReconnectCount); server->tcpStatus = CifsGood; wake_up(&server->response_q); } @@ -138,7 +163,7 @@ cifs_reconnect(struct TCP_Server_Info *s return rc; } -int +static int cifs_demultiplex_thread(struct TCP_Server_Info *server) { int length; @@ -165,7 +190,7 @@ cifs_demultiplex_thread(struct TCP_Serve while (server->tcpStatus != CifsExiting) { if (smb_buffer == NULL) - smb_buffer = buf_get(); + smb_buffer = cifs_buf_get(); else memset(smb_buffer, 0, sizeof (struct smb_hdr)); @@ -193,6 +218,7 @@ cifs_demultiplex_thread(struct TCP_Serve } else if (server->tcpStatus == CifsNeedReconnect) { cFYI(1,("Reconnecting after server stopped responding")); cifs_reconnect(server); + cFYI(1,("call to reconnect done")); csocket = server->ssocket; continue; } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) { @@ -201,6 +227,15 @@ cifs_demultiplex_thread(struct TCP_Serve tcpStatus CifsNeedReconnect if server hung */ continue; } else if (length <= 0) { + if(server->tcpStatus == CifsNew) { + cFYI(1,("tcp session abended prematurely (after SMBnegprot)")); + /* some servers kill tcp session rather than returning + smb negprot error in which case reconnecting here is + not going to help - return error to mount */ + server->tcpStatus = CifsExiting; + break; + } + cFYI(1,("Reconnecting after unexpected rcvmsg error ")); cifs_reconnect(server); csocket = server->ssocket; @@ -208,6 +243,8 @@ cifs_demultiplex_thread(struct TCP_Serve } pdu_length = 4 + ntohl(smb_buffer->smb_buf_length); + /* Ony read pdu_length after below checks for too short (due + to e.g. 
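Marking every in-flight MID as MID_RETRY_NEEDED before reconnecting, as cifs_reconnect() now does under GlobalMid_Lock, keeps those requests' eventual timeouts from flagging the freshly rebuilt session bad again. A simplified model, where a pthread mutex and an array stand in for the spinlock and pending_mid_q:

#include <pthread.h>
#include <stdio.h>

enum mid_state { MID_FREE, MID_REQUEST_SUBMITTED, MID_RETRY_NEEDED };
struct mid_entry { int mid; enum mid_state state; };

static pthread_mutex_t global_mid_lock = PTHREAD_MUTEX_INITIALIZER;

static void mark_pending_for_retry(struct mid_entry *q, int n)
{
        pthread_mutex_lock(&global_mid_lock);
        for (int i = 0; i < n; i++)
                if (q[i].state == MID_REQUEST_SUBMITTED)
                        q[i].state = MID_RETRY_NEEDED;
        pthread_mutex_unlock(&global_mid_lock);
}

int main(void)
{
        struct mid_entry q[] = { { 1, MID_REQUEST_SUBMITTED }, { 2, MID_FREE } };

        mark_pending_for_retry(q, 2);
        printf("mid 1 state = %d\n", q[0].state);       /* MID_RETRY_NEEDED */
        return 0;
}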
int overflow) and too long ie beyond end of buf */ cFYI(1, ("Peek length rcvd: %d with smb length: %d", length, pdu_length)); temp = (char *) smb_buffer; @@ -228,8 +265,8 @@ cifs_demultiplex_thread(struct TCP_Serve } else if (temp[0] != (char) 0) { cERROR(1, - ("Unknown RFC 1001 frame received not 0x00 nor 0x85")); - cifs_dump_mem(" Received Data is: ", temp, length); + ("Unknown RFC 1001 frame not 0x00 nor 0x85")); + cifs_dump_mem(" Received Data: ", temp, length); cifs_reconnect(server); csocket = server->ssocket; continue; @@ -257,8 +294,9 @@ cifs_demultiplex_thread(struct TCP_Serve length = 0; iov.iov_base = smb_buffer; iov.iov_len = pdu_length; - for (total_read = 0; total_read < pdu_length; total_read += length) { - /* Should improve check for buffer overflow with bad pdu_length */ + for (total_read = 0; + total_read < pdu_length; + total_read += length) { length = sock_recvmsg(csocket, &smb_msg, pdu_length - total_read, 0); if (length == 0) { @@ -286,7 +324,7 @@ cifs_demultiplex_thread(struct TCP_Serve mid_q_entry, qhead); - if (mid_entry->mid == smb_buffer->Mid) { + if ((mid_entry->mid == smb_buffer->Mid) && (mid_entry->midState == MID_REQUEST_SUBMITTED)) { cFYI(1, (" Mid 0x%x matched - waking up ",mid_entry->mid)); task_to_wake = mid_entry->tsk; @@ -302,6 +340,7 @@ cifs_demultiplex_thread(struct TCP_Serve wake_up_process(task_to_wake); } else if (is_valid_oplock_break(smb_buffer) == FALSE) { cERROR(1, ("No task to wake, unknown frame rcvd!")); + cifs_dump_mem("Received Data is: ",temp,sizeof(struct smb_hdr)); } } } else { @@ -316,14 +355,16 @@ cifs_demultiplex_thread(struct TCP_Serve } } } - /* BB add code to lock SMB sessions while releasing */ + + server->tcpStatus = CifsExiting; + server->tsk = NULL; if(server->ssocket) { sock_release(csocket); - server->ssocket = NULL; + server->ssocket = NULL; } set_fs(temp_fs); if (smb_buffer) /* buffer usually freed in free_mid - need to free it on error or exit */ - buf_release(smb_buffer); + cifs_buf_release(smb_buffer); read_lock(&GlobalSMBSeslock); if (list_empty(&server->pending_mid_q)) { @@ -337,39 +378,85 @@ cifs_demultiplex_thread(struct TCP_Serve ses->server = NULL; } } - kfree(server); - } else /* BB need to more gracefully handle the rare negative session - response case because response will be still outstanding */ - cERROR(1, ("Active MIDs in queue while exiting - can not delete mid_q_entries or TCP_Server_Info structure due to pending requests MEMORY LEAK!!")); - /* BB wake up waitors, and/or wait and/or free stale mids and try again? 
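The pdu_length comment above covers two distinct failure modes: a corrupt or hostile length field can wrap the 4 + ntohl(...) sum back to a tiny value, or it can exceed the receive buffer. A sketch of the check; the minimum-frame and buffer-size constants are illustrative assumptions, not the client's actual limits:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define MIN_SMB_PDU     36u             /* assumed smallest sane frame */
#define MAX_SMB_BUF     (64u * 1024)    /* assumed receive buffer size */

static int pdu_length_ok(uint32_t smb_buf_length_field)
{
        uint32_t pdu_length = 4 + ntohl(smb_buf_length_field);

        return pdu_length >= MIN_SMB_PDU && pdu_length <= MAX_SMB_BUF;
}

int main(void)
{
        /* wraps to 2, correctly rejected */
        printf("%d\n", pdu_length_ok(htonl(0xfffffffe)));
        return 0;
}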
BB */ - /* BB Need to fix bug in error path above - perhaps wait until smb requests - time out and then free the tcp per server struct BB */ - read_unlock(&GlobalSMBSeslock); + read_unlock(&GlobalSMBSeslock); + } else { + spin_lock(&GlobalMid_Lock); + list_for_each(tmp, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + if (mid_entry->midState == MID_REQUEST_SUBMITTED) { + cFYI(1, + (" Clearing Mid 0x%x - waking up ",mid_entry->mid)); + task_to_wake = mid_entry->tsk; + if(task_to_wake) { + wake_up_process(task_to_wake); + } + } + } + spin_unlock(&GlobalMid_Lock); + read_unlock(&GlobalSMBSeslock); + set_current_state(TASK_INTERRUPTIBLE); + /* 1/8th of sec should be more than enough time for them to exit */ + schedule_timeout(HZ/8); + } + + if (list_empty(&server->pending_mid_q)) { + /* mpx threads have not exited yet give them + at least the smb send timeout time for long ops */ + cFYI(1, ("Wait for exit from demultiplex thread")); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(46 * HZ); + /* if threads still have not exited they are probably never + coming home not much else we can do but free the memory */ + } + kfree(server); cFYI(1, ("About to exit from demultiplex thread")); return 0; } -int -parse_mount_options(char *options, const char *devname, struct smb_vol *vol) +static void * +cifs_kcalloc(size_t size, int type) +{ + void *addr; + addr = kmalloc(size, type); + if (addr) + memset(addr, 0, size); + return addr; +} + +static int +cifs_parse_mount_options(char *options, const char *devname, struct smb_vol *vol) { char *value; char *data; - int temp_len; + int temp_len, i, j; + char separator[2]; + + separator[0] = ','; + separator[1] = 0; - memset(vol,0,sizeof(struct smb_vol)); vol->linux_uid = current->uid; /* current->euid instead? */ vol->linux_gid = current->gid; vol->dir_mode = S_IRWXUGO; /* 2767 perms indicate mandatory locking support */ vol->file_mode = S_IALLUGO & ~(S_ISUID | S_IXGRP); + /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */ vol->rw = TRUE; if (!options) return 1; - while ((data = strsep(&options, ",")) != NULL) { + if(strncmp(options,"sep=",4) == 0) { + if(options[4] != 0) { + separator[0] = options[4]; + options += 5; + } else { + cFYI(1,("Null separator not allowed")); + } + } + + while ((data = strsep(&options, separator)) != NULL) { if (!*data) continue; if ((value = strchr(data, '=')) != NULL) @@ -389,11 +476,54 @@ parse_mount_options(char *options, const } else if (strnicmp(data, "pass", 4) == 0) { if (!value || !*value) { vol->password = NULL; - } else if (strnlen(value, 17) < 17) { - vol->password = value; + continue; + } + temp_len = strlen(value); + /* removed password length check, NTLM passwords + can be arbitrarily long */ + + /* if comma in password, the string will be + prematurely null terminated. Commas in password are + specified across the cifs mount interface by a double + comma ie ,, and a comma used as in other cases ie ',' + as a parameter delimiter/separator is single and due + to the strsep above is temporarily zeroed. 
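The sep= prefix parsed above exists for the same reason as the password escaping that follows: when a password must contain many commas, switching the option delimiter outright beats escaping each one. A userspace model of the strsep() loop with the overridable separator:

#include <stdio.h>
#include <string.h>

static void parse(char *options)
{
        char separator[2] = ",";
        char *data;

        if (strncmp(options, "sep=", 4) == 0 && options[4]) {
                separator[0] = options[4];
                options += 5;
        }
        while ((data = strsep(&options, separator)) != NULL)
                if (*data)
                        printf("opt: %s\n", data);
}

int main(void)
{
        char buf[] = "sep=;user=fred;pass=a,b;rw";

        parse(buf);     /* the password keeps its commas un-escaped */
        return 0;
}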
*/ + + /* NB: password legally can have multiple commas and + the only illegal character in a password is null */ + + if ((value[temp_len] == 0) && (value[temp_len+1] == separator[0])) { + /* reinsert comma */ + value[temp_len] = separator[0]; + temp_len+=2; /* move after the second comma */ + while(value[temp_len] != 0) { + if((value[temp_len] == separator[0]) && (value[temp_len+1] != separator[0])) { + /* single comma indicating start of next parm */ + break; + } + temp_len++; + } + if(value[temp_len] == 0) { + options = NULL; + } else { + value[temp_len] = 0; + /* move options to point to start of next parm */ + options = value + temp_len + 1; + } + /* go from value to (value + temp_len) condensing double commas to singles */ + vol->password = cifs_kcalloc(temp_len, GFP_KERNEL); + for(i=0,j=0;ipassword[j] = value[i]; + if(value[i] == separator[0] && value[i+1] == separator[0]) { + /* skip second comma */ + i++; + } + } + /* value[temp_len] is zeroed above so + vol->password[temp_len] guaranteed to be null */ } else { - printk(KERN_WARNING "CIFS: password too long\n"); - return 1; + vol->password = cifs_kcalloc(temp_len + 1, GFP_KERNEL); + strcpy(vol->password, value); } } else if (strnicmp(data, "ip", 2) == 0) { if (!value || !*value) { @@ -506,8 +636,29 @@ parse_mount_options(char *options, const /* ignore */ } else if (strnicmp(data, "rw", 2) == 0) { vol->rw = TRUE; + } else if ((strnicmp(data, "suid", 4) == 0) || + (strnicmp(data, "nosuid", 6) == 0) || + (strnicmp(data, "exec", 4) == 0) || + (strnicmp(data, "noexec", 6) == 0) || + (strnicmp(data, "nodev", 5) == 0) || + (strnicmp(data, "dev", 3) == 0)) { + /* The mount tool or mount.cifs helper (if present) + uses these opts to set flags, and the flags are read + by the kernel vfs layer before we get here (ie + before read super) so there is no point trying to + parse these options again and set anything and it + is ok to just ignore them */ + continue; } else if (strnicmp(data, "ro", 2) == 0) { vol->rw = FALSE; + } else if (strnicmp(data, "hard", 4) == 0) { + vol->retry = 1; + } else if (strnicmp(data, "soft", 4) == 0) { + vol->retry = 0; + } else if (strnicmp(data, "nohard", 6) == 0) { + vol->retry = 0; + } else if (strnicmp(data, "nosoft", 6) == 0) { + vol->retry = 1; } else printk(KERN_WARNING "CIFS: Unknown mount option %s\n",data); } @@ -537,8 +688,8 @@ parse_mount_options(char *options, const return 0; } -struct cifsSesInfo * -find_tcp_session(__u32 new_target_ip_addr, +static struct cifsSesInfo * +cifs_find_tcp_session(__u32 new_target_ip_addr, char *userName, struct TCP_Server_Info **psrvTcp) { struct list_head *tmp; @@ -549,7 +700,7 @@ find_tcp_session(__u32 new_target_ip_add list_for_each(tmp, &GlobalSMBSessionList) { ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList); if (ses->server) { - if (ses->server->sockAddr.sin_addr.s_addr == + if (ses->server->addr.sockAddr.sin_addr.s_addr == new_target_ip_addr) { /* BB lock server and tcp session and increment use count here?? */ *psrvTcp = ses->server; /* found a match on the TCP session */ @@ -568,7 +719,7 @@ find_tcp_session(__u32 new_target_ip_add return NULL; } -struct cifsTconInfo * +static struct cifsTconInfo * find_unc(__u32 new_target_ip_addr, char *uncName, char *userName) { struct list_head *tmp; @@ -582,9 +733,9 @@ find_unc(__u32 new_target_ip_addr, char if (tcon->ses->server) { cFYI(1, (" old ip addr: %x == new ip %x ?", - tcon->ses->server->sockAddr.sin_addr. + tcon->ses->server->addr.sockAddr.sin_addr. 
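One line of the condensing loop above was eaten by markup stripping: "for(i=0,j=0;ipassword[j] = value[i];" is evidently "for(i=0,j=0; i<temp_len; i++,j++)" followed by "vol->password[j] = value[i];". The convention it implements: a literal comma in the password is written ",," on the mount command line and condensed back to a single comma while copying into vol->password. A standalone model of that pass, with calloc standing in for cifs_kcalloc:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *unescape_password(const char *v, char sep)
{
        char *out = calloc(strlen(v) + 1, 1);
        size_t i = 0, j = 0;

        if (!out)
                return NULL;
        while (v[i]) {
                out[j++] = v[i];
                if (v[i] == sep && v[i + 1] == sep)
                        i++;            /* skip the second, escaping comma */
                i++;
        }
        return out;                     /* caller frees */
}

int main(void)
{
        char *p = unescape_password("pass,,word", ',');

        printf("%s\n", p ? p : "(oom)");        /* prints: pass,word */
        free(p);
        return 0;
}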
s_addr, new_target_ip_addr)); - if (tcon->ses->server->sockAddr.sin_addr. + if (tcon->ses->server->addr.sockAddr.sin_addr. s_addr == new_target_ip_addr) { /* BB lock tcon and server and tcp session and increment use count here? */ /* found a match on the TCP session */ @@ -628,7 +779,8 @@ connect_to_dfs_path(int xid, struct cifs the helper that resolves tcp names, mount to it, try to tcon to it unmount it if fail */ - /* BB free memory for referrals string BB */ + if(referrals) + kfree(referrals); return rc; } @@ -666,95 +818,11 @@ get_dfs_path(int xid, struct cifsSesInfo return rc; } -int setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, struct nls_table * nls_info) -{ - int rc = 0; - char ntlm_session_key[CIFS_SESSION_KEY_SIZE]; - int ntlmv2_flag = FALSE; - - /* what if server changes its buffer size after dropping the session? */ - if(pSesInfo->server->maxBuf == 0) /* no need to send on reconnect */ - rc = CIFSSMBNegotiate(xid, pSesInfo); - pSesInfo->capabilities = pSesInfo->server->capabilities; - pSesInfo->sequence_number = 0; - if (!rc) { - cFYI(1,("Security Mode: 0x%x Capabilities: 0x%x Time Zone: %d", - pSesInfo->server->secMode, - pSesInfo->server->capabilities, - pSesInfo->server->timeZone)); - if (extended_security - && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) - && (pSesInfo->server->secType == NTLMSSP)) { - cFYI(1, ("New style sesssetup ")); - rc = CIFSSpnegoSessSetup(xid, pSesInfo, - NULL /* security blob */, - 0 /* blob length */, - nls_info); - } else if (extended_security - && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) - && (pSesInfo->server->secType == RawNTLMSSP)) { - cFYI(1, ("NTLMSSP sesssetup ")); - rc = CIFSNTLMSSPNegotiateSessSetup(xid, - pSesInfo, - &ntlmv2_flag, - nls_info); - if (!rc) { - if(ntlmv2_flag) { - char * v2_response; - cFYI(1,("Can use more secure NTLM version 2 password hash")); - CalcNTLMv2_partial_mac_key(pSesInfo, - nls_info); - v2_response = kmalloc(16 + 64 /* blob */, GFP_KERNEL); - if(v2_response) { - CalcNTLMv2_response(pSesInfo,v2_response); -/* cifs_calculate_ntlmv2_mac_key(pSesInfo->mac_signing_key, response, ntlm_session_key, */ - kfree(v2_response); - /* BB Put dummy sig in SessSetup PDU? 
*/ - } else - rc = -ENOMEM; - - } else { - SMBNTencrypt(pSesInfo->password_with_pad, - pSesInfo->server->cryptKey, - ntlm_session_key); - - cifs_calculate_mac_key(pSesInfo->mac_signing_key, - ntlm_session_key, - pSesInfo->password_with_pad); - } - /* for better security the weaker lanman hash not sent - in AuthSessSetup so we no longer calculate it */ - - rc = CIFSNTLMSSPAuthSessSetup(xid, - pSesInfo, - ntlm_session_key, - ntlmv2_flag, - nls_info); - } - } else { /* old style NTLM 0.12 session setup */ - SMBNTencrypt(pSesInfo->password_with_pad, - pSesInfo->server->cryptKey, - ntlm_session_key); - - cifs_calculate_mac_key(pSesInfo->mac_signing_key, - ntlm_session_key, pSesInfo->password_with_pad); - rc = CIFSSessSetup(xid, pSesInfo, - ntlm_session_key, nls_info); - } - if (rc) { - cERROR(1,("Send error in SessSetup = %d",rc)); - } else { - cFYI(1,("CIFS Session Established successfully")); - pSesInfo->status = CifsGood; - } - } - return rc; -} - int ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket) { int rc = 0; + int connected = 0; if(*csocket == NULL) { rc = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, csocket); @@ -769,39 +837,47 @@ ipv4_connect(struct sockaddr_in *psin_se } psin_server->sin_family = AF_INET; + if(psin_server->sin_port) { /* user overrode default port */ rc = (*csocket)->ops->connect(*csocket, (struct sockaddr *) psin_server, sizeof (struct sockaddr_in),0); - if (rc >= 0) { - return rc; - } - } - - /* do not retry on the same port we just failed on */ - if(psin_server->sin_port != htons(CIFS_PORT)) { - psin_server->sin_port = htons(CIFS_PORT); + if (rc >= 0) + connected = 1; + } + + if(!connected) { + /* do not retry on the same port we just failed on */ + if(psin_server->sin_port != htons(CIFS_PORT)) { + psin_server->sin_port = htons(CIFS_PORT); - rc = (*csocket)->ops->connect(*csocket, + rc = (*csocket)->ops->connect(*csocket, (struct sockaddr *) psin_server, sizeof (struct sockaddr_in),0); + if (rc >= 0) + connected = 1; + } } - if (rc < 0) { + if (!connected) { psin_server->sin_port = htons(RFC1001_PORT); rc = (*csocket)->ops->connect(*csocket, (struct sockaddr *) psin_server, sizeof (struct sockaddr_in),0); - if (rc < 0) { - cFYI(1, ("Error connecting to socket. %d", rc)); - sock_release(*csocket); - *csocket = NULL; - return rc; - } + if (rc >= 0) + connected = 1; } + /* give up here - unless we want to retry on different + protocol families some day */ + if (!connected) { + cFYI(1,("Error %d connecting to server via ipv4",rc)); + sock_release(*csocket); + *csocket = NULL; + return rc; + } /* Eventually check for other socket options to change from the default. sock_setsockopt not used because it expects user space buffer */ - (*csocket)->sk->sk_rcvtimeo = 8 * HZ; + (*csocket)->sk->sk_rcvtimeo = 7 * HZ; return rc; } @@ -810,49 +886,63 @@ int ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket) { int rc = 0; + int connected = 0; - rc = sock_create(PF_INET6, SOCK_STREAM, - IPPROTO_TCP /* IPPROTO_IPV6 ? */ , csocket); - if (rc < 0) { - cERROR(1, ("Error creating socket. Aborting operation")); - return rc; + if(*csocket == NULL) { + rc = sock_create(PF_INET6, SOCK_STREAM, IPPROTO_TCP, csocket); + if (rc < 0) { + cERROR(1, ("Error %d creating ipv6 socket",rc)); + *csocket = NULL; + return rc; + } else { + /* BB other socket options to set KEEPALIVE, NODELAY? 
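Both rewritten connect paths now share one shape: try the user-supplied port if there is one, fall back to 445 (CIFS_PORT) unless that is the port that just failed, then to 139 (RFC1001_PORT), and release the socket only when all attempts fail. In outline, with connect_once() as a stub for the ops->connect() call:

#include <stdio.h>
#include <stdint.h>

#define CIFS_PORT       445
#define RFC1001_PORT    139

/* Stub; always fails so every fallback step is exercised. */
static int connect_once(uint16_t port)
{
        printf("trying port %u\n", port);
        return -1;
}

static int connect_with_fallback(uint16_t user_port)
{
        int rc = -1;

        if (user_port)                          /* user overrode default */
                rc = connect_once(user_port);
        if (rc < 0 && user_port != CIFS_PORT)   /* don't retry same port */
                rc = connect_once(CIFS_PORT);
        if (rc < 0)
                rc = connect_once(RFC1001_PORT);
        return rc;      /* caller releases the socket if still negative */
}

int main(void)
{
        connect_with_fallback(0);       /* tries 445, then 139 */
        return 0;
}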
*/ + cFYI(1,("ipv6 Socket created")); + } } psin_server->sin6_family = AF_INET6; + if(psin_server->sin6_port) { /* user overrode default port */ rc = (*csocket)->ops->connect(*csocket, (struct sockaddr *) psin_server, sizeof (struct sockaddr_in6),0); - if (rc >= 0) { - /* BB other socket options to set KEEPALIVE, timeouts? NODELAY? */ - return rc; - } - } + if (rc >= 0) + connected = 1; + } + + if(!connected) { + /* do not retry on the same port we just failed on */ + if(psin_server->sin6_port != htons(CIFS_PORT)) { + psin_server->sin6_port = htons(CIFS_PORT); - /* do not retry on the same port we just failed on */ - if(psin_server->sin6_port != htons(CIFS_PORT)) { - psin_server->sin6_port = htons(CIFS_PORT); - - rc = (*csocket)->ops->connect(*csocket, - (struct sockaddr *) psin_server, - sizeof (struct sockaddr_in6), 0); -/* BB fix the timeout to be shorter above - and check flags */ + rc = (*csocket)->ops->connect(*csocket, + (struct sockaddr *) psin_server, + sizeof (struct sockaddr_in6),0); + if (rc >= 0) + connected = 1; + } } - if (rc < 0) { + if (!connected) { psin_server->sin6_port = htons(RFC1001_PORT); rc = (*csocket)->ops->connect(*csocket, (struct sockaddr *) - psin_server, - sizeof (struct sockaddr_in6), 0); - if (rc < 0) { - cFYI(1, - ("Error connecting to socket (via ipv6). %d", - rc)); - sock_release(*csocket); - *csocket = NULL; - return rc; - } + psin_server, sizeof (struct sockaddr_in6),0); + if (rc >= 0) + connected = 1; } + /* give up here - unless we want to retry on different + protocol families some day */ + if (!connected) { + cFYI(1,("Error %d connecting to server via ipv6",rc)); + sock_release(*csocket); + *csocket = NULL; + return rc; + } + /* Eventually check for other socket options to change from + the default. sock_setsockopt not used because it expects + user space buffer */ + (*csocket)->sk->sk_rcvtimeo = 7 * HZ; + return rc; } @@ -864,7 +954,7 @@ cifs_mount(struct super_block *sb, struc int xid; struct socket *csocket = NULL; struct sockaddr_in sin_server; -/* struct sockaddr_in6 sin_server6; */ + struct sockaddr_in6 sin_server6; struct smb_vol volume_info; struct cifsSesInfo *pSesInfo = NULL; struct cifsSesInfo *existingCifsSes = NULL; @@ -872,11 +962,16 @@ cifs_mount(struct super_block *sb, struc struct TCP_Server_Info *srvTcp = NULL; xid = GetXid(); - cFYI(1, ("Entering cifs_mount. Xid: %d with: %s", xid, mount_data)); - if (parse_mount_options(mount_data, devname, &volume_info)) { +/* cFYI(1, ("Entering cifs_mount. Xid: %d with: %s", xid, mount_data)); */ + + memset(&volume_info,0,sizeof(struct smb_vol)); + + if (cifs_parse_mount_options(mount_data, devname, &volume_info)) { if(volume_info.UNC) kfree(volume_info.UNC); + if(volume_info.password) + kfree(volume_info.password); FreeXid(xid); return -EINVAL; } @@ -888,17 +983,45 @@ cifs_mount(struct super_block *sb, struc cifserror("No username specified "); /* In userspace mount helper we can get user name from alternate locations such as env variables and files on disk */ + if(volume_info.UNC) + kfree(volume_info.UNC); + if(volume_info.password) + kfree(volume_info.password); FreeXid(xid); return -EINVAL; } - if (volume_info.UNC) { - sin_server.sin_addr.s_addr = inet_addr(volume_info.UNCip); - cFYI(1, ("UNC: %s ", volume_info.UNC)); - } else { - /* BB we could connect to the DFS root? but which server do we ask? 
*/ + if (volume_info.UNCip && volume_info.UNC) { + rc = cifs_inet_pton(AF_INET, volume_info.UNCip,&sin_server.sin_addr.s_addr); + + if(rc == 0) { + /* not ipv4 address, try ipv6 */ + rc = cifs_inet_pton(AF_INET6,volume_info.UNCip,&sin_server6.sin6_addr.in6_u); + } + + if(rc != 1) { + /* we failed translating address */ + if(volume_info.UNC) + kfree(volume_info.UNC); + if(volume_info.password) + kfree(volume_info.password); + FreeXid(xid); + return -EINVAL; + } + + cFYI(1, ("UNC: %s ip: %s", volume_info.UNC, volume_info.UNCip)); + /* success */ + rc = 0; + } else if (volume_info.UNCip){ + /* BB using ip addr as server name connect to the DFS root below */ + cERROR(1,("Connecting to DFS root not implemented yet")); + } else /* which servers DFS root would we conect to */ { cERROR(1, ("CIFS mount error: No UNC path (e.g. -o unc=//192.168.1.100/public) specified ")); + if(volume_info.UNC) + kfree(volume_info.UNC); + if(volume_info.password) + kfree(volume_info.password); FreeXid(xid); return -EINVAL; } @@ -911,19 +1034,25 @@ cifs_mount(struct super_block *sb, struc cifs_sb->local_nls = load_nls(volume_info.iocharset); if(cifs_sb->local_nls == NULL) { cERROR(1,("CIFS mount error: iocharset %s not found",volume_info.iocharset)); + if(volume_info.UNC) + kfree(volume_info.UNC); + if(volume_info.password) + kfree(volume_info.password); FreeXid(xid); return -ELIBACC; } } existingCifsSes = - find_tcp_session(sin_server.sin_addr.s_addr, + cifs_find_tcp_session(sin_server.sin_addr.s_addr, volume_info.username, &srvTcp); if (srvTcp) { cFYI(1, ("Existing tcp session with server found ")); } else { /* create socket */ if(volume_info.port) sin_server.sin_port = htons(volume_info.port); + else + sin_server.sin_port = 0; rc = ipv4_connect(&sin_server, &csocket); if (rc < 0) { cERROR(1, @@ -932,6 +1061,8 @@ cifs_mount(struct super_block *sb, struc sock_release(csocket); if(volume_info.UNC) kfree(volume_info.UNC); + if(volume_info.password) + kfree(volume_info.password); FreeXid(xid); return rc; } @@ -942,16 +1073,20 @@ cifs_mount(struct super_block *sb, struc sock_release(csocket); if(volume_info.UNC) kfree(volume_info.UNC); + if(volume_info.password) + kfree(volume_info.password); FreeXid(xid); return rc; } else { memset(srvTcp, 0, sizeof (struct TCP_Server_Info)); - memcpy(&srvTcp->sockAddr, &sin_server, sizeof (struct sockaddr_in)); + memcpy(&srvTcp->addr.sockAddr, &sin_server, sizeof (struct sockaddr_in)); /* BB Add code for ipv6 case too */ srvTcp->ssocket = csocket; + srvTcp->protocolType = IPV4; init_waitqueue_head(&srvTcp->response_q); INIT_LIST_HEAD(&srvTcp->pending_mid_q); - srvTcp->tcpStatus = CifsGood; + srvTcp->tcpStatus = CifsNew; + init_MUTEX(&srvTcp->tcpSem); kernel_thread((void *)(void *)cifs_demultiplex_thread, srvTcp, CLONE_FS | CLONE_FILES | CLONE_VM); } @@ -960,6 +1095,8 @@ cifs_mount(struct super_block *sb, struc if (existingCifsSes) { pSesInfo = existingCifsSes; cFYI(1, ("Existing smb sess found ")); + if(volume_info.password) + kfree(volume_info.password); } else if (!rc) { cFYI(1, ("Existing smb sess not found ")); pSesInfo = sesInfoAlloc(); @@ -973,8 +1110,7 @@ cifs_mount(struct super_block *sb, struc if (!rc){ if (volume_info.password) - strncpy(pSesInfo->password_with_pad, - volume_info.password,CIFS_ENCPWD_SIZE); + pSesInfo->password = volume_info.password; if (volume_info.username) strncpy(pSesInfo->userName, volume_info.username,MAX_USERNAME_SIZE); @@ -982,11 +1118,14 @@ cifs_mount(struct super_block *sb, struc strncpy(pSesInfo->domainName, 
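The address handling above replaces the old inet_addr() extern with cifs_inet_pton() and tries IPv4 first, IPv6 second. Standard userspace inet_pton() shares the return convention (1 on success, 0 when the string is not in that family's presentation format), so the logic models directly:

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
        const char *ip = "fe80::1";     /* e.g. an "ip=" mount option value */
        struct in_addr v4;
        struct in6_addr v6;
        int rc = inet_pton(AF_INET, ip, &v4);

        if (rc == 0)                    /* not IPv4 notation, try IPv6 */
                rc = inet_pton(AF_INET6, ip, &v6);
        if (rc != 1) {
                fprintf(stderr, "failed translating address\n");
                return 1;               /* cifs_mount returns -EINVAL here */
        }
        printf("parsed %s\n", ip);
        return 0;
}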
volume_info.domainname,MAX_USERNAME_SIZE); pSesInfo->linux_uid = volume_info.linux_uid; - - rc = setup_session(xid,pSesInfo, cifs_sb->local_nls); + down(&pSesInfo->sesSem); + rc = cifs_setup_session(xid,pSesInfo, cifs_sb->local_nls); + up(&pSesInfo->sesSem); if(!rc) atomic_inc(&srvTcp->socketUseCount); - } + } else + if(volume_info.password) + kfree(volume_info.password); } /* search for existing tcon to this server share */ @@ -1013,6 +1152,11 @@ cifs_mount(struct super_block *sb, struc volume_info.username); if (tcon) { cFYI(1, ("Found match on UNC path ")); + /* we can have only one retry value for a connection + to a share so for resources mounted more than once + to the same server share the last value passed in + for the retry flag is used */ + tcon->retry = volume_info.retry; } else { tcon = tconInfoAlloc(); if (tcon == NULL) @@ -1039,8 +1183,10 @@ cifs_mount(struct super_block *sb, struc tcon, cifs_sb->local_nls); cFYI(1, ("CIFS Tcon rc = %d", rc)); } - if (!rc) + if (!rc) { atomic_inc(&pSesInfo->inUse); + tcon->retry = volume_info.retry; + } } } } @@ -1064,8 +1210,6 @@ cifs_mount(struct super_block *sb, struc CIFSSMBLogoff(xid, pSesInfo); if(pSesInfo->server->tsk) send_sig(SIGKILL,pSesInfo->server->tsk,1); - else - cFYI(1,("Can not wake captive thread on cleanup of failed mount")); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ / 4); /* give captive thread time to exit */ } else @@ -1091,7 +1235,7 @@ cifs_mount(struct super_block *sb, struc return rc; } -int +static int CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, char session_key[CIFS_SESSION_KEY_SIZE], const struct nls_table *nls_codepage) @@ -1110,7 +1254,7 @@ CIFSSessSetup(unsigned int xid, struct c cFYI(1, ("In sesssetup ")); - smb_buffer = buf_get(); + smb_buffer = cifs_buf_get(); if (smb_buffer == 0) { return -ENOMEM; } @@ -1162,9 +1306,10 @@ CIFSSessSetup(unsigned int xid, struct c } if(user == NULL) bytes_returned = 0; /* skill null user */ - else - bytes_returned = - cifs_strtoUCS((wchar_t *) bcc_ptr, user, 100, nls_codepage); + else + bytes_returned = + cifs_strtoUCS((wchar_t *) bcc_ptr, user, 100, + nls_codepage); bcc_ptr += 2 * bytes_returned; /* convert num 16 bit words to bytes */ bcc_ptr += 2; /* trailing null */ if (domain == NULL) @@ -1257,7 +1402,7 @@ CIFSSessSetup(unsigned int xid, struct c /* We look for obvious messed up bcc or strings in response so we do not go off the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ - ses->serverOS = kcalloc(2 * (len + 1), GFP_KERNEL); + ses->serverOS = cifs_kcalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses->serverOS, (wchar_t *)bcc_ptr, len,nls_codepage); bcc_ptr += 2 * (len + 1); @@ -1268,7 +1413,7 @@ CIFSSessSetup(unsigned int xid, struct c len = UniStrnlen((wchar_t *)bcc_ptr, remaining_words - 1); - ses->serverNOS =kcalloc(2 * (len + 1),GFP_KERNEL); + ses->serverNOS =cifs_kcalloc(2 * (len + 1),GFP_KERNEL); cifs_strfromUCS_le(ses->serverNOS, (wchar_t *)bcc_ptr,len,nls_codepage); bcc_ptr += 2 * (len + 1); @@ -1279,7 +1424,7 @@ CIFSSessSetup(unsigned int xid, struct c len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); /* last string is not always null terminated (for e.g. 
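The string scraping in CIFSSessSetup() above guards against the server quirk its comments call out: the last Unicode string in the response is not always null terminated, so each length is bounded by the words remaining in the byte area before anything is copied. A simplified model, with a 16-bit array and a crude narrowing copy standing in for the UCS-2 buffer, UniStrnlen() and cifs_strfromUCS_le():

#include <stdio.h>
#include <stdlib.h>

static size_t bounded_len16(const unsigned short *s, size_t max_words)
{
        size_t n = 0;

        while (n < max_words && s[n])
                n++;
        return n;       /* may hit max_words with no terminator seen */
}

static char *dup_narrow(const unsigned short *s, size_t max_words)
{
        size_t len = bounded_len16(s, max_words);
        char *out = calloc(len + 1, 1);         /* cifs_kcalloc() in-kernel */

        for (size_t i = 0; out && i < len; i++)
                out[i] = (char)s[i];            /* ASCII-only, for the model */
        return out;
}

int main(void)
{
        unsigned short raw[] = { 'W', 'i', 'n' };       /* unterminated */
        char *os = dup_narrow(raw, 3);

        printf("%s\n", os ? os : "(oom)");
        free(os);
        return 0;
}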
for Windows XP & 2000) */ ses->serverDomain = - kcalloc(2*(len+1),GFP_KERNEL); + cifs_kcalloc(2*(len+1),GFP_KERNEL); cifs_strfromUCS_le(ses->serverDomain, (wchar_t *)bcc_ptr,len,nls_codepage); bcc_ptr += 2 * (len + 1); @@ -1288,20 +1433,20 @@ CIFSSessSetup(unsigned int xid, struct c } /* else no more room so create dummy domain string */ else ses->serverDomain = - kcalloc(2, + cifs_kcalloc(2, GFP_KERNEL); } else { /* no room so create dummy domain and NOS string */ ses->serverDomain = - kcalloc(2, GFP_KERNEL); + cifs_kcalloc(2, GFP_KERNEL); ses->serverNOS = - kcalloc(2, GFP_KERNEL); + cifs_kcalloc(2, GFP_KERNEL); } } else { /* ASCII */ len = strnlen(bcc_ptr, 1024); if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { - ses->serverOS = kcalloc(len + 1,GFP_KERNEL); + ses->serverOS = cifs_kcalloc(len + 1,GFP_KERNEL); strncpy(ses->serverOS,bcc_ptr, len); bcc_ptr += len; @@ -1309,14 +1454,14 @@ CIFSSessSetup(unsigned int xid, struct c bcc_ptr++; len = strnlen(bcc_ptr, 1024); - ses->serverNOS = kcalloc(len + 1,GFP_KERNEL); + ses->serverNOS = cifs_kcalloc(len + 1,GFP_KERNEL); strncpy(ses->serverNOS, bcc_ptr, len); bcc_ptr += len; bcc_ptr[0] = 0; bcc_ptr++; len = strnlen(bcc_ptr, 1024); - ses->serverDomain = kcalloc(len + 1,GFP_KERNEL); + ses->serverDomain = cifs_kcalloc(len + 1,GFP_KERNEL); strncpy(ses->serverDomain, bcc_ptr, len); bcc_ptr += len; bcc_ptr[0] = 0; @@ -1341,12 +1486,12 @@ CIFSSessSetup(unsigned int xid, struct c } if (smb_buffer) - buf_release(smb_buffer); + cifs_buf_release(smb_buffer); return rc; } -int +static int CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, char *SecurityBlob,int SecurityBlobLength, const struct nls_table *nls_codepage) @@ -1365,7 +1510,7 @@ CIFSSpnegoSessSetup(unsigned int xid, st cFYI(1, ("In spnego sesssetup ")); - smb_buffer = buf_get(); + smb_buffer = cifs_buf_get(); if (smb_buffer == 0) { return -ENOMEM; } @@ -1511,7 +1656,7 @@ CIFSSpnegoSessSetup(unsigned int xid, st the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ ses->serverOS = - kcalloc(2 * (len + 1), GFP_KERNEL); + cifs_kcalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses->serverOS, (wchar_t *) bcc_ptr, len, @@ -1525,7 +1670,7 @@ CIFSSpnegoSessSetup(unsigned int xid, st remaining_words - 1); ses->serverNOS = - kcalloc(2 * (len + 1), + cifs_kcalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses->serverNOS, (wchar_t *)bcc_ptr, @@ -1538,7 +1683,7 @@ CIFSSpnegoSessSetup(unsigned int xid, st if (remaining_words > 0) { len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); /* last string is not always null terminated (for e.g. 
for Windows XP & 2000) */ - ses->serverDomain = kcalloc(2*(len+1),GFP_KERNEL); + ses->serverDomain = cifs_kcalloc(2*(len+1),GFP_KERNEL); cifs_strfromUCS_le(ses->serverDomain, (wchar_t *)bcc_ptr, len, @@ -1549,10 +1694,10 @@ CIFSSpnegoSessSetup(unsigned int xid, st } /* else no more room so create dummy domain string */ else ses->serverDomain = - kcalloc(2,GFP_KERNEL); + cifs_kcalloc(2,GFP_KERNEL); } else { /* no room so create dummy domain and NOS string */ - ses->serverDomain = kcalloc(2, GFP_KERNEL); - ses->serverNOS = kcalloc(2, GFP_KERNEL); + ses->serverDomain = cifs_kcalloc(2, GFP_KERNEL); + ses->serverNOS = cifs_kcalloc(2, GFP_KERNEL); } } else { /* ASCII */ @@ -1560,7 +1705,7 @@ CIFSSpnegoSessSetup(unsigned int xid, st if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { - ses->serverOS = kcalloc(len + 1, GFP_KERNEL); + ses->serverOS = cifs_kcalloc(len + 1, GFP_KERNEL); strncpy(ses->serverOS, bcc_ptr, len); bcc_ptr += len; @@ -1568,14 +1713,14 @@ CIFSSpnegoSessSetup(unsigned int xid, st bcc_ptr++; len = strnlen(bcc_ptr, 1024); - ses->serverNOS = kcalloc(len + 1,GFP_KERNEL); + ses->serverNOS = cifs_kcalloc(len + 1,GFP_KERNEL); strncpy(ses->serverNOS, bcc_ptr, len); bcc_ptr += len; bcc_ptr[0] = 0; bcc_ptr++; len = strnlen(bcc_ptr, 1024); - ses->serverDomain = kcalloc(len + 1, GFP_KERNEL); + ses->serverDomain = cifs_kcalloc(len + 1, GFP_KERNEL); strncpy(ses->serverDomain, bcc_ptr, len); bcc_ptr += len; bcc_ptr[0] = 0; @@ -1600,12 +1745,12 @@ CIFSSpnegoSessSetup(unsigned int xid, st } if (smb_buffer) - buf_release(smb_buffer); + cifs_buf_release(smb_buffer); return rc; } -int +static int CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, struct cifsSesInfo *ses, int * pNTLMv2_flag, const struct nls_table *nls_codepage) @@ -1626,7 +1771,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned i cFYI(1, ("In NTLMSSP sesssetup (negotiate) ")); *pNTLMv2_flag = FALSE; - smb_buffer = buf_get(); + smb_buffer = cifs_buf_get(); if (smb_buffer == 0) { return -ENOMEM; } @@ -1822,7 +1967,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned i the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ ses->serverOS = - kcalloc(2 * (len + 1), GFP_KERNEL); + cifs_kcalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses->serverOS, (wchar_t *) bcc_ptr, len, @@ -1837,7 +1982,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned i remaining_words - 1); ses->serverNOS = - kcalloc(2 * (len + 1), + cifs_kcalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses-> serverNOS, @@ -1854,7 +1999,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned i len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); /* last string is not always null terminated (for e.g. 
for Windows XP & 2000) */ ses->serverDomain = - kcalloc(2 * + cifs_kcalloc(2 * (len + 1), GFP_KERNEL); @@ -1880,13 +2025,13 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned i } /* else no more room so create dummy domain string */ else ses->serverDomain = - kcalloc(2, + cifs_kcalloc(2, GFP_KERNEL); } else { /* no room so create dummy domain and NOS string */ ses->serverDomain = - kcalloc(2, GFP_KERNEL); + cifs_kcalloc(2, GFP_KERNEL); ses->serverNOS = - kcalloc(2, GFP_KERNEL); + cifs_kcalloc(2, GFP_KERNEL); } } else { /* ASCII */ len = strnlen(bcc_ptr, 1024); @@ -1894,7 +2039,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned i pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { ses->serverOS = - kcalloc(len + 1, + cifs_kcalloc(len + 1, GFP_KERNEL); strncpy(ses->serverOS, bcc_ptr, len); @@ -1905,7 +2050,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned i len = strnlen(bcc_ptr, 1024); ses->serverNOS = - kcalloc(len + 1, + cifs_kcalloc(len + 1, GFP_KERNEL); strncpy(ses->serverNOS, bcc_ptr, len); bcc_ptr += len; @@ -1914,7 +2059,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned i len = strnlen(bcc_ptr, 1024); ses->serverDomain = - kcalloc(len + 1, + cifs_kcalloc(len + 1, GFP_KERNEL); strncpy(ses->serverDomain, bcc_ptr, len); bcc_ptr += len; @@ -1940,12 +2085,12 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned i } if (smb_buffer) - buf_release(smb_buffer); + cifs_buf_release(smb_buffer); return rc; } -int +static int CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, char *ntlm_session_key, int ntlmv2_flag, const struct nls_table *nls_codepage) @@ -1966,7 +2111,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xi cFYI(1, ("In NTLMSSPSessSetup (Authenticate)")); - smb_buffer = buf_get(); + smb_buffer = cifs_buf_get(); if (smb_buffer == 0) { return -ENOMEM; } @@ -2215,7 +2360,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xi the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ ses->serverOS = - kcalloc(2 * (len + 1), GFP_KERNEL); + cifs_kcalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses->serverOS, (wchar_t *) bcc_ptr, len, @@ -2230,7 +2375,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xi remaining_words - 1); ses->serverNOS = - kcalloc(2 * (len + 1), + cifs_kcalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses-> serverNOS, @@ -2246,7 +2391,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xi len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); /* last string not always null terminated (e.g. 
for Windows XP & 2000) */ ses->serverDomain = - kcalloc(2 * + cifs_kcalloc(2 * (len + 1), GFP_KERNEL); @@ -2271,17 +2416,17 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xi = 0; } /* else no more room so create dummy domain string */ else - ses->serverDomain = kcalloc(2,GFP_KERNEL); + ses->serverDomain = cifs_kcalloc(2,GFP_KERNEL); } else { /* no room so create dummy domain and NOS string */ - ses->serverDomain = kcalloc(2, GFP_KERNEL); - ses->serverNOS = kcalloc(2, GFP_KERNEL); + ses->serverDomain = cifs_kcalloc(2, GFP_KERNEL); + ses->serverNOS = cifs_kcalloc(2, GFP_KERNEL); } } else { /* ASCII */ len = strnlen(bcc_ptr, 1024); if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { - ses->serverOS = kcalloc(len + 1,GFP_KERNEL); + ses->serverOS = cifs_kcalloc(len + 1,GFP_KERNEL); strncpy(ses->serverOS,bcc_ptr, len); bcc_ptr += len; @@ -2289,14 +2434,14 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xi bcc_ptr++; len = strnlen(bcc_ptr, 1024); - ses->serverNOS = kcalloc(len+1,GFP_KERNEL); + ses->serverNOS = cifs_kcalloc(len+1,GFP_KERNEL); strncpy(ses->serverNOS, bcc_ptr, len); bcc_ptr += len; bcc_ptr[0] = 0; bcc_ptr++; len = strnlen(bcc_ptr, 1024); - ses->serverDomain = kcalloc(len+1,GFP_KERNEL); + ses->serverDomain = cifs_kcalloc(len+1,GFP_KERNEL); strncpy(ses->serverDomain, bcc_ptr, len); bcc_ptr += len; bcc_ptr[0] = 0; @@ -2321,7 +2466,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xi } if (smb_buffer) - buf_release(smb_buffer); + cifs_buf_release(smb_buffer); return rc; } @@ -2342,7 +2487,7 @@ CIFSTCon(unsigned int xid, struct cifsSe if (ses == NULL) return -EIO; - smb_buffer = buf_get(); + smb_buffer = cifs_buf_get(); if (smb_buffer == 0) { return -ENOMEM; } @@ -2404,8 +2549,10 @@ CIFSTCon(unsigned int xid, struct cifsSe if (((long) bcc_ptr + (2 * length)) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { + if(tcon->nativeFileSystem) + kfree(tcon->nativeFileSystem); tcon->nativeFileSystem = - kcalloc(length + 2, GFP_KERNEL); + cifs_kcalloc(length + 2, GFP_KERNEL); cifs_strfromUCS_le(tcon->nativeFileSystem, (wchar_t *) bcc_ptr, length, nls_codepage); @@ -2420,8 +2567,10 @@ CIFSTCon(unsigned int xid, struct cifsSe if (((long) bcc_ptr + length) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { + if(tcon->nativeFileSystem) + kfree(tcon->nativeFileSystem); tcon->nativeFileSystem = - kcalloc(length + 1, GFP_KERNEL); + cifs_kcalloc(length + 1, GFP_KERNEL); strncpy(tcon->nativeFileSystem, bcc_ptr, length); } @@ -2435,7 +2584,7 @@ CIFSTCon(unsigned int xid, struct cifsSe } if (smb_buffer) - buf_release(smb_buffer); + cifs_buf_release(smb_buffer); return rc; } @@ -2472,7 +2621,7 @@ cifs_umount(struct super_block *sb, stru } else cFYI(1, ("No session or bad tcon")); } - /* BB future check active count of tcon and then free if needed BB */ + cifs_sb->tcon = NULL; if (ses) { set_current_state(TASK_INTERRUPTIBLE); @@ -2484,3 +2633,100 @@ cifs_umount(struct super_block *sb, stru FreeXid(xid); return rc; /* BB check if we should always return zero here */ } + +int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, + struct nls_table * nls_info) +{ + int rc = 0; + char ntlm_session_key[CIFS_SESSION_KEY_SIZE]; + int ntlmv2_flag = FALSE; + + /* what if server changes its buffer size after dropping the session? 
*/ + if(pSesInfo->server->maxBuf == 0) /* no need to send on reconnect */ { + rc = CIFSSMBNegotiate(xid, pSesInfo); + if(rc == -EAGAIN) /* retry only once on 1st time connection */ { + rc = CIFSSMBNegotiate(xid, pSesInfo); + if(rc == -EAGAIN) + rc = -EHOSTDOWN; + } + if(rc == 0) + pSesInfo->server->tcpStatus = CifsGood; + } + pSesInfo->capabilities = pSesInfo->server->capabilities; + if(linuxExtEnabled == 0) + pSesInfo->capabilities &= (~CAP_UNIX); + pSesInfo->sequence_number = 0; + if (!rc) { + cFYI(1,("Security Mode: 0x%x Capabilities: 0x%x Time Zone: %d", + pSesInfo->server->secMode, + pSesInfo->server->capabilities, + pSesInfo->server->timeZone)); + if (extended_security + && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) + && (pSesInfo->server->secType == NTLMSSP)) { + cFYI(1, ("New style sesssetup ")); + rc = CIFSSpnegoSessSetup(xid, pSesInfo, + NULL /* security blob */, + 0 /* blob length */, + nls_info); + } else if (extended_security + && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) + && (pSesInfo->server->secType == RawNTLMSSP)) { + cFYI(1, ("NTLMSSP sesssetup ")); + rc = CIFSNTLMSSPNegotiateSessSetup(xid, + pSesInfo, + &ntlmv2_flag, + nls_info); + if (!rc) { + if(ntlmv2_flag) { + char * v2_response; + cFYI(1,("Can use more secure NTLM version 2 password hash")); + CalcNTLMv2_partial_mac_key(pSesInfo, + nls_info); + v2_response = kmalloc(16 + 64 /* blob */, GFP_KERNEL); + if(v2_response) { + CalcNTLMv2_response(pSesInfo,v2_response); +/* cifs_calculate_ntlmv2_mac_key(pSesInfo->mac_signing_key, response, ntlm_session_key, */ + kfree(v2_response); + /* BB Put dummy sig in SessSetup PDU? */ + } else + rc = -ENOMEM; + + } else { + SMBNTencrypt(pSesInfo->password, + pSesInfo->server->cryptKey, + ntlm_session_key); + + cifs_calculate_mac_key(pSesInfo->mac_signing_key, + ntlm_session_key, + pSesInfo->password); + } + /* for better security the weaker lanman hash not sent + in AuthSessSetup so we no longer calculate it */ + + rc = CIFSNTLMSSPAuthSessSetup(xid, + pSesInfo, + ntlm_session_key, + ntlmv2_flag, + nls_info); + } + } else { /* old style NTLM 0.12 session setup */ + SMBNTencrypt(pSesInfo->password, + pSesInfo->server->cryptKey, + ntlm_session_key); + + cifs_calculate_mac_key(pSesInfo->mac_signing_key, + ntlm_session_key, pSesInfo->password); + rc = CIFSSessSetup(xid, pSesInfo, + ntlm_session_key, nls_info); + } + if (rc) { + cERROR(1,("Send error in SessSetup = %d",rc)); + } else { + cFYI(1,("CIFS Session Established successfully")); + pSesInfo->status = CifsGood; + } + } + return rc; +} + --- diff/fs/cifs/dir.c 2003-10-27 09:20:44.000000000 +0000 +++ source/fs/cifs/dir.c 2004-04-21 10:46:51.676726056 +0100 @@ -125,7 +125,7 @@ cifs_create(struct inode *inode, struct int rc = -ENOENT; int xid; int oplock = 0; - int desiredAccess = GENERIC_ALL; + int desiredAccess = GENERIC_READ | GENERIC_WRITE; __u16 fileHandle; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; @@ -147,11 +147,15 @@ cifs_create(struct inode *inode, struct cFYI(1,("In create for inode %p dentry->inode %p nd flags = 0x%x for %s",inode, direntry->d_inode, nd->flags,full_path)); if ((nd->intent.open.flags & O_ACCMODE) == O_RDONLY) - desiredAccess = GENERIC_READ; + desiredAccess = GENERIC_READ; else if ((nd->intent.open.flags & O_ACCMODE) == O_WRONLY) desiredAccess = GENERIC_WRITE; - else if ((nd->intent.open.flags & O_ACCMODE) == O_RDWR) - desiredAccess = GENERIC_ALL; + else if ((nd->intent.open.flags & O_ACCMODE) == O_RDWR) { + /* GENERIC_ALL is too much permission to request */ + /* can cause 
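The dir.c change that follows (and is repeated below for file.c) narrows the rights requested on open: GENERIC_ALL on an O_RDWR open can draw a spurious ACCESS_DENIED from servers whose ACLs grant read and write but not the extra rights GENERIC_ALL implies. The mapping in isolation; the GENERIC_* values are the conventional Windows access-mask bits, stated here as an assumption rather than quoted from the client's headers:

#include <fcntl.h>

#define GENERIC_READ    0x80000000u
#define GENERIC_WRITE   0x40000000u

static unsigned int desired_access(int f_flags)
{
        switch (f_flags & O_ACCMODE) {
        case O_RDONLY:
                return GENERIC_READ;
        case O_WRONLY:
                return GENERIC_WRITE;
        default:        /* O_RDWR: read|write, never GENERIC_ALL */
                return GENERIC_READ | GENERIC_WRITE;
        }
}

int main(void)
{
        return desired_access(O_RDWR) == (GENERIC_READ | GENERIC_WRITE) ? 0 : 1;
}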
unnecessary access denied on create */ + /* desiredAccess = GENERIC_ALL; */ + desiredAccess = GENERIC_READ | GENERIC_WRITE; + } if((nd->intent.open.flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) disposition = FILE_CREATE; @@ -220,6 +224,7 @@ cifs_create(struct inode *inode, struct pCifsFile->pInode = newinode; pCifsFile->invalidHandle = FALSE; pCifsFile->closePend = FALSE; + init_MUTEX(&pCifsFile->fh_sem); /* pCifsFile->pfile = file; */ /* put in at open time */ write_lock(&GlobalSMBSeslock); list_add(&pCifsFile->tlist,&pTcon->openFileList); @@ -282,19 +287,19 @@ int cifs_mknod(struct inode *inode, stru full_path, mode, current->euid, current->egid, device_number, cifs_sb->local_nls); if(!rc) { - rc = cifs_get_inode_info_unix(&newinode, full_path, - inode->i_sb); + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb); direntry->d_op = &cifs_dentry_ops; if(rc == 0) d_instantiate(direntry, newinode); } } - if (full_path) - kfree(full_path); - FreeXid(xid); + if (full_path) + kfree(full_path); + FreeXid(xid); - return rc; + return rc; } --- diff/fs/cifs/file.c 2004-04-21 10:45:35.169356944 +0100 +++ source/fs/cifs/file.c 2004-04-21 10:46:51.679725600 +0100 @@ -47,7 +47,7 @@ cifs_open(struct inode *inode, struct fi struct list_head * tmp; char *full_path = NULL; int desiredAccess = 0x20197; - int disposition = FILE_OPEN; + int disposition; __u16 netfid; FILE_ALL_INFO * buf = NULL; @@ -57,27 +57,27 @@ cifs_open(struct inode *inode, struct fi pTcon = cifs_sb->tcon; if (file->f_flags & O_CREAT) { - /* search inode for this file and fill in file->private_data = */ - pCifsInode = CIFS_I(file->f_dentry->d_inode); - read_lock(&GlobalSMBSeslock); - list_for_each(tmp, &pCifsInode->openFileList) { - pCifsFile = list_entry(tmp,struct cifsFileInfo, flist); - if((pCifsFile->pfile == NULL)&& (pCifsFile->pid = current->pid)){ - /* set mode ?? */ - pCifsFile->pfile = file; /* needed for writepage */ - file->private_data = pCifsFile; - break; - } - } - read_unlock(&GlobalSMBSeslock); - if(file->private_data != NULL) { - rc = 0; - FreeXid(xid); - return rc; - } else { - if(file->f_flags & O_EXCL) - cERROR(1,("could not find file instance for new file %p ",file)); - } + /* search inode for this file and fill in file->private_data = */ + pCifsInode = CIFS_I(file->f_dentry->d_inode); + read_lock(&GlobalSMBSeslock); + list_for_each(tmp, &pCifsInode->openFileList) { + pCifsFile = list_entry(tmp,struct cifsFileInfo, flist); + if((pCifsFile->pfile == NULL)&& (pCifsFile->pid = current->pid)){ + /* set mode ?? 
*/ + pCifsFile->pfile = file; /* needed for writepage */ + file->private_data = pCifsFile; + break; + } + } + read_unlock(&GlobalSMBSeslock); + if(file->private_data != NULL) { + rc = 0; + FreeXid(xid); + return rc; + } else { + if(file->f_flags & O_EXCL) + cERROR(1,("could not find file instance for new file %p ",file)); + } } full_path = build_path_from_dentry(file->f_dentry); @@ -87,29 +87,45 @@ cifs_open(struct inode *inode, struct fi desiredAccess = GENERIC_READ; else if ((file->f_flags & O_ACCMODE) == O_WRONLY) desiredAccess = GENERIC_WRITE; - else if ((file->f_flags & O_ACCMODE) == O_RDWR) - desiredAccess = GENERIC_ALL; - -/* BB check other flags carefully to find equivalent NTCreateX flags */ - -/* -#define O_CREAT 0100 -#define O_EXCL 0200 -#define O_NOCTTY 0400 -#define O_TRUNC 01000 -#define O_APPEND 02000 -#define O_NONBLOCK 04000 -#define O_NDELAY O_NONBLOCK -#define O_SYNC 010000 -#define FASYNC 020000 -#define O_DIRECT 040000 -#define O_LARGEFILE 0100000 -#define O_DIRECTORY 0200000 -#define O_NOFOLLOW 0400000 -#define O_ATOMICLOOKUP 01000000 */ - - if (file->f_flags & O_CREAT) - disposition = FILE_OVERWRITE; + else if ((file->f_flags & O_ACCMODE) == O_RDWR) { + /* GENERIC_ALL is too much permission to request */ + /* can cause unnecessary access denied on create */ + /* desiredAccess = GENERIC_ALL; */ + desiredAccess = GENERIC_READ | GENERIC_WRITE; + } + +/********************************************************************* + * open flag mapping table: + * + * POSIX Flag CIFS Disposition + * ---------- ---------------- + * O_CREAT FILE_OPEN_IF + * O_CREAT | O_EXCL FILE_CREATE + * O_CREAT | O_TRUNC FILE_OVERWRITE_IF + * O_TRUNC FILE_OVERWRITE + * none of the above FILE_OPEN + * + * Note that there is not a direct match between disposition + * FILE_SUPERSEDE (ie create whether or not file exists although + * O_CREAT | O_TRUNC is similar but truncates the existing + * file rather than creating a new file as FILE_SUPERSEDE does + * (which uses the attributes / metadata passed in on open call) + *? + *? O_SYNC is a reasonable match to CIFS writethrough flag + *? and the read write flags match reasonably. O_LARGEFILE + *? is irrelevant because largefile support is always used + *? by this client. 
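One thing to flag in the O_CREAT search above: the test (pCifsFile->pid = current->pid) is an assignment, not a comparison, so it is true whenever current->pid is nonzero and the loop adopts the first open file whose pfile is NULL regardless of which process opened it. The intended condition is presumably (pCifsFile->pid == current->pid); the hunk only re-indents the line, so the slip predates this patch.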
Flags O_APPEND, O_DIRECT, O_DIRECTORY, + * O_FASYNC, O_NOFOLLOW, O_NONBLOCK need further investigation + *********************************************************************/ + + if((file->f_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + disposition = FILE_CREATE; + else if((file->f_flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC)) + disposition = FILE_OVERWRITE_IF; + else if((file->f_flags & O_CREAT) == O_CREAT) + disposition = FILE_OPEN_IF; + else + disposition = FILE_OPEN; if (oplockEnabled) oplock = REQ_OPLOCK; @@ -121,7 +137,7 @@ cifs_open(struct inode *inode, struct fi /* Also refresh inode by passing in file_info buf returned by SMBOpen and calling get_inode_info with returned buf (at least helps non-Unix server case */ - buf = kmalloc(sizeof(FILE_ALL_INFO),GFP_KERNEL); + buf = kmalloc(sizeof(FILE_ALL_INFO),GFP_KERNEL); if(buf==0) { if (full_path) kfree(full_path); @@ -134,14 +150,16 @@ cifs_open(struct inode *inode, struct fi cFYI(1, ("cifs_open returned 0x%x ", rc)); cFYI(1, ("oplock: %d ", oplock)); } else { + if(file->private_data) + kfree(file->private_data); file->private_data = - kmalloc(sizeof (struct cifsFileInfo), GFP_KERNEL); + kmalloc(sizeof (struct cifsFileInfo), GFP_KERNEL); if (file->private_data) { - memset(file->private_data, 0, - sizeof (struct cifsFileInfo)); + memset(file->private_data, 0, sizeof(struct cifsFileInfo)); pCifsFile = (struct cifsFileInfo *) file->private_data; pCifsFile->netfid = netfid; pCifsFile->pid = current->pid; + init_MUTEX(&pCifsFile->fh_sem); pCifsFile->pfile = file; /* needed for writepage */ pCifsFile->pInode = inode; pCifsFile->invalidHandle = FALSE; @@ -154,8 +172,27 @@ cifs_open(struct inode *inode, struct fi list_add(&pCifsFile->flist,&pCifsInode->openFileList); write_unlock(&GlobalSMBSeslock); write_unlock(&file->f_owner.lock); + if(pCifsInode->clientCanCacheRead) { + /* we have the inode open somewhere else + no need to discard cache data */ + } else { + if(buf) { + /* BB need same check in cifs_create too? 
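The open-flag table above, as code. The FILE_* constants use the standard NTCreateX disposition values, assumed here rather than quoted from the client's headers. One divergence worth noting: the table lists plain O_TRUNC as FILE_OVERWRITE, but the patch's if/else chain has no O_TRUNC branch and lets that case fall through to FILE_OPEN; the sketch follows the table:

#include <fcntl.h>

#define FILE_OPEN               1
#define FILE_CREATE             2
#define FILE_OPEN_IF            3
#define FILE_OVERWRITE          4
#define FILE_OVERWRITE_IF       5

static int nt_disposition(int f_flags)
{
        if ((f_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
                return FILE_CREATE;             /* must not already exist */
        if ((f_flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
                return FILE_OVERWRITE_IF;       /* create or truncate */
        if (f_flags & O_CREAT)
                return FILE_OPEN_IF;            /* create only if absent */
        if (f_flags & O_TRUNC)
                return FILE_OVERWRITE;          /* per the table, not the code */
        return FILE_OPEN;
}

int main(void)
{
        return nt_disposition(O_CREAT | O_EXCL) == FILE_CREATE ? 0 : 1;
}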
*/ - if (pTcon->ses->capabilities & CAP_UNIX) + /* if not oplocked, invalidate inode pages if mtime + or file size changed */ + struct timespec temp; + temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime)); + if(timespec_equal(&file->f_dentry->d_inode->i_mtime,&temp) && + (file->f_dentry->d_inode->i_size == (loff_t)le64_to_cpu(buf->EndOfFile))) { + cFYI(1,("inode unchanged on server")); + } else { + cFYI(1,("invalidating remote inode since open detected it changed")); + invalidate_remote_inode(file->f_dentry->d_inode); + } + } + } + if (pTcon->ses->capabilities & CAP_UNIX) rc = cifs_get_inode_info_unix(&file->f_dentry->d_inode, full_path, inode->i_sb); else @@ -200,162 +237,119 @@ cifs_open(struct inode *inode, struct fi /* Try to reaquire byte range locks that were released when session */ /* to server was lost */ -int relock_files(struct cifsFileInfo * cifsFile) +static int cifs_relock_file(struct cifsFileInfo * cifsFile) { int rc = 0; -/* list all locks open on this file */ +/* BB list all locks open on this file and relock */ + return rc; } static int cifs_reopen_file(struct inode *inode, struct file *file) { - int rc = -EACCES; - int xid, oplock; - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; - struct cifsFileInfo *pCifsFile; - struct cifsInodeInfo *pCifsInode; - char *full_path = NULL; - int desiredAccess = 0x20197; - int disposition = FILE_OPEN; - __u16 netfid; - FILE_ALL_INFO * buf = NULL; - - xid = GetXid(); - - cifs_sb = CIFS_SB(inode->i_sb); - pTcon = cifs_sb->tcon; - - full_path = build_path_from_dentry(file->f_dentry); - - cFYI(1, (" inode = 0x%p file flags are 0x%x for %s", inode, file->f_flags,full_path)); - if ((file->f_flags & O_ACCMODE) == O_RDONLY) - desiredAccess = GENERIC_READ; - else if ((file->f_flags & O_ACCMODE) == O_WRONLY) - desiredAccess = GENERIC_WRITE; - else if ((file->f_flags & O_ACCMODE) == O_RDWR) - desiredAccess = GENERIC_ALL; - if (oplockEnabled) - oplock = REQ_OPLOCK; - else - oplock = FALSE; - - /* BB pass O_SYNC flag through on file attributes .. 
BB */ - - /* Also refresh inode by passing in file_info buf returned by SMBOpen - and calling get_inode_info with returned buf (at least - helps non-Unix server case */ - buf = kmalloc(sizeof(FILE_ALL_INFO),GFP_KERNEL); - if(buf==0) { - if (full_path) - kfree(full_path); - FreeXid(xid); - return -ENOMEM; - } - rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess, - CREATE_NOT_DIR, &netfid, &oplock, buf, cifs_sb->local_nls); - if (rc) { - cFYI(1, ("cifs_open returned 0x%x ", rc)); - cFYI(1, ("oplock: %d ", oplock)); - } else { - if (file->private_data) { - pCifsFile = (struct cifsFileInfo *) file->private_data; + int rc = -EACCES; + int xid, oplock; + struct cifs_sb_info *cifs_sb; + struct cifsTconInfo *pTcon; + struct cifsFileInfo *pCifsFile; + struct cifsInodeInfo *pCifsInode; + char *full_path = NULL; + int desiredAccess = 0x20197; + int disposition = FILE_OPEN; + __u16 netfid; + FILE_ALL_INFO * buf = NULL; - pCifsFile->netfid = netfid; - pCifsFile->invalidHandle = FALSE; - pCifsInode = CIFS_I(file->f_dentry->d_inode); - if(pCifsInode) { - if (pTcon->ses->capabilities & CAP_UNIX) - rc = cifs_get_inode_info_unix(&file->f_dentry->d_inode, - full_path, inode->i_sb); - else - rc = cifs_get_inode_info(&file->f_dentry->d_inode, - full_path, buf, inode->i_sb); - - if(oplock == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = TRUE; - pCifsInode->clientCanCacheRead = TRUE; - cFYI(1,("Exclusive Oplock granted on inode %p",file->f_dentry->d_inode)); - } else if(oplock == OPLOCK_READ) { - pCifsInode->clientCanCacheRead = TRUE; - pCifsInode->clientCanCacheAll = FALSE; - } else { - pCifsInode->clientCanCacheRead = FALSE; - pCifsInode->clientCanCacheAll = FALSE; - } - } - } else - rc = -EBADF; - } + if(inode == NULL) + return -EBADF; + if (file->private_data) { + pCifsFile = (struct cifsFileInfo *) file->private_data; + } else + return -EBADF; - if (buf) - kfree(buf); - if (full_path) - kfree(full_path); - FreeXid(xid); - return rc; -} + xid = GetXid(); + down(&pCifsFile->fh_sem); + if(pCifsFile->invalidHandle == FALSE) { + up(&pCifsFile->fh_sem); + FreeXid(xid); + return 0; + } -/* Try to reopen files that were closed when session to server was lost */ -int reopen_files(struct cifsTconInfo * pTcon, struct nls_table * nlsinfo) -{ - int rc = 0; - struct cifsFileInfo *open_file = NULL; - struct file * file = NULL; - struct list_head invalid_file_list; - struct list_head * tmp; - struct list_head * tmp1; - INIT_LIST_HEAD(&invalid_file_list); + cifs_sb = CIFS_SB(inode->i_sb); + pTcon = cifs_sb->tcon; -/* list all files open on tree connection and mark them invalid */ - write_lock(&GlobalSMBSeslock); - list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { - open_file = list_entry(tmp,struct cifsFileInfo, tlist); - if(open_file) { - open_file->invalidHandle = TRUE; - list_move(&open_file->tlist,&invalid_file_list); - } + full_path = build_path_from_dentry(file->f_dentry); + + cFYI(1, (" inode = 0x%p file flags are 0x%x for %s", inode, file->f_flags,full_path)); + if ((file->f_flags & O_ACCMODE) == O_RDONLY) + desiredAccess = GENERIC_READ; + else if ((file->f_flags & O_ACCMODE) == O_WRONLY) + desiredAccess = GENERIC_WRITE; + else if ((file->f_flags & O_ACCMODE) == O_RDWR) { + /* GENERIC_ALL is too much permission to request */ + /* can cause unnecessary access denied on create */ + /* desiredAccess = GENERIC_ALL; */ + desiredAccess = GENERIC_READ | GENERIC_WRITE; } - /* reopen files */ - list_for_each_safe(tmp,tmp1, &invalid_file_list) { - /* BB need to fix above to check list end and skip 
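The rewritten cifs_reopen_file() above pivots on the new per-handle semaphore: whoever takes fh_sem first performs the reopen, and later arrivals find invalidHandle already cleared and return immediately, so a burst of -EAGAIN callers produces one reopen rather than many. Modeled with a pthread mutex and a stub server round trip:

#include <pthread.h>
#include <stdio.h>

struct cifs_file {
        pthread_mutex_t fh_sem;         /* init_MUTEX(&fh_sem) in the patch */
        int invalid_handle;
        int netfid;
};

/* Stub for the CIFSSMBOpen() round trip; hands out a fresh handle. */
static int server_reopen(struct cifs_file *f)
{
        f->netfid = 42;
        return 0;
}

static int revalidate_handle(struct cifs_file *f)
{
        int rc;

        pthread_mutex_lock(&f->fh_sem);
        if (!f->invalid_handle) {               /* lost the race: done */
                pthread_mutex_unlock(&f->fh_sem);
                return 0;
        }
        rc = server_reopen(f);
        if (rc == 0)
                f->invalid_handle = 0;
        pthread_mutex_unlock(&f->fh_sem);
        return rc;
}

int main(void)
{
        struct cifs_file f = { PTHREAD_MUTEX_INITIALIZER, 1, 0 };

        printf("rc=%d netfid=%d\n", revalidate_handle(&f), f.netfid);
        return 0;
}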
entries we do not need to reopen */ - open_file = list_entry(tmp,struct cifsFileInfo, tlist); - if(open_file == NULL) { - break; - } else { - if((open_file->invalidHandle == FALSE) && - (open_file->closePend == FALSE)) { - list_move(&open_file->tlist,&pTcon->openFileList); - continue; - } - file = open_file->pfile; - if(file->f_dentry == 0) { - cFYI(1,("Null dentry for file %p",file)); + if (oplockEnabled) + oplock = REQ_OPLOCK; + else + oplock = FALSE; + + /* BB pass O_SYNC flag through on file attributes .. BB */ + + /* Also refresh inode by passing in file_info buf returned by SMBOpen + and calling get_inode_info with returned buf (at least + helps non-Unix server case */ + buf = kmalloc(sizeof(FILE_ALL_INFO),GFP_KERNEL); + if(buf==0) { + up(&pCifsFile->fh_sem); + if (full_path) + kfree(full_path); + FreeXid(xid); + return -ENOMEM; + } + rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess, + CREATE_NOT_DIR, &netfid, &oplock, buf, cifs_sb->local_nls); + if (rc) { + up(&pCifsFile->fh_sem); + cFYI(1, ("cifs_open returned 0x%x ", rc)); + cFYI(1, ("oplock: %d ", oplock)); + } else { + pCifsFile->netfid = netfid; + pCifsFile->invalidHandle = FALSE; + up(&pCifsFile->fh_sem); + pCifsInode = CIFS_I(inode); + if(pCifsInode) { + if (pTcon->ses->capabilities & CAP_UNIX) + rc = cifs_get_inode_info_unix(&inode, + full_path, inode->i_sb); + else + rc = cifs_get_inode_info(&inode, + full_path, buf, inode->i_sb); + + if(oplock == OPLOCK_EXCLUSIVE) { + pCifsInode->clientCanCacheAll = TRUE; + pCifsInode->clientCanCacheRead = TRUE; + cFYI(1,("Exclusive Oplock granted on inode %p",file->f_dentry->d_inode)); + } else if(oplock == OPLOCK_READ) { + pCifsInode->clientCanCacheRead = TRUE; + pCifsInode->clientCanCacheAll = FALSE; } else { - write_unlock(&GlobalSMBSeslock); - rc = cifs_reopen_file(file->f_dentry->d_inode,file); - write_lock(&GlobalSMBSeslock); - if(file->private_data == NULL) { - tmp = invalid_file_list.next; - tmp1 = tmp->next; - continue; - } - - list_move(&open_file->tlist,&pTcon->openFileList); - if(rc) { - cFYI(1,("reconnecting file %s failed with %d", - file->f_dentry->d_name.name,rc)); - } else { - cFYI(1,("reconnection of %s succeeded", - file->f_dentry->d_name.name)); - } + pCifsInode->clientCanCacheRead = FALSE; + pCifsInode->clientCanCacheAll = FALSE; } + cifs_relock_file(pCifsFile); } } - write_unlock(&GlobalSMBSeslock); + + if (buf) + kfree(buf); + if (full_path) + kfree(full_path); + FreeXid(xid); return rc; } @@ -437,6 +431,7 @@ cifs_lock(struct file *file, int cmd, st __u32 numLock = 0; __u32 numUnlock = 0; __u64 length; + int wait_flag = FALSE; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; length = 1 + pfLock->fl_end - pfLock->fl_start; @@ -454,14 +449,16 @@ cifs_lock(struct file *file, int cmd, st cFYI(1, ("Posix ")); if (pfLock->fl_flags & FL_FLOCK) cFYI(1, ("Flock ")); - if (pfLock->fl_flags & FL_SLEEP) + if (pfLock->fl_flags & FL_SLEEP) { cFYI(1, ("Blocking lock ")); + wait_flag = TRUE; + } if (pfLock->fl_flags & FL_ACCESS) - cFYI(1, ("Process suspended by mandatory locking ")); + cFYI(1, ("Process suspended by mandatory locking - not implemented yet ")); if (pfLock->fl_flags & FL_LEASE) - cFYI(1, ("Lease on file ")); - if (pfLock->fl_flags & 0xFFD0) - cFYI(1, ("Unknown lock flags ")); + cFYI(1, ("Lease on file - not implemented yet")); + if (pfLock->fl_flags & (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) + cFYI(1, ("Unknown lock flags 0x%x",pfLock->fl_flags)); if (pfLock->fl_type == F_WRLCK) { cFYI(1, ("F_WRLCK ")); @@ -509,7 +506,7 
@@ cifs_lock(struct file *file, int cmd, st pfLock->fl_type = F_UNLCK; if (rc != 0) cERROR(1, - ("Error unlocking previously locked range %d during test of lock ", + ("Error unlocking previously locked range %d during test of lock ", rc)); rc = 0; @@ -526,7 +523,7 @@ cifs_lock(struct file *file, int cmd, st ((struct cifsFileInfo *) file->private_data)-> netfid, length, pfLock->fl_start, numUnlock, numLock, lockType, - 0 /* wait flag */ ); + wait_flag); FreeXid(xid); return rc; } @@ -541,6 +538,7 @@ cifs_write(struct file * file, const cha struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; int xid, long_op; + struct cifsFileInfo * open_file; xid = GetXid(); @@ -555,20 +553,30 @@ cifs_write(struct file * file, const cha FreeXid(xid); return -EBADF; } + open_file = (struct cifsFileInfo *) file->private_data; + if (*poffset > file->f_dentry->d_inode->i_size) - long_op = 2; /* writes past end of file can take a long time */ + long_op = 2; /* writes past end of file can take a long time */ else long_op = 1; for (total_written = 0; write_size > total_written; total_written += bytes_written) { - rc = CIFSSMBWrite(xid, pTcon, - ((struct cifsFileInfo *) file-> - private_data)->netfid, + rc = -EAGAIN; + while(rc == -EAGAIN) { + if ((open_file->invalidHandle) && (!open_file->closePend)) { + rc = cifs_reopen_file(file->f_dentry->d_inode,file); + if(rc != 0) + break; + } + + rc = CIFSSMBWrite(xid, pTcon, + open_file->netfid, write_size - total_written, *poffset, &bytes_written, write_data + total_written, long_op); + } if (rc || (bytes_written == 0)) { if (total_written) break; @@ -580,10 +588,11 @@ cifs_write(struct file * file, const cha *poffset += bytes_written; long_op = FALSE; /* subsequent writes fast - 15 seconds is plenty */ } - file->f_dentry->d_inode->i_ctime = file->f_dentry->d_inode->i_mtime = CURRENT_TIME; + file->f_dentry->d_inode->i_ctime = file->f_dentry->d_inode->i_mtime = + CURRENT_TIME; if (bytes_written > 0) { if (*poffset > file->f_dentry->d_inode->i_size) - file->f_dentry->d_inode->i_size = *poffset; + i_size_write(file->f_dentry->d_inode, *poffset); } mark_inode_dirty_sync(file->f_dentry->d_inode); FreeXid(xid); @@ -605,24 +614,18 @@ cifs_partialpagewrite(struct page *page, struct cifsFileInfo *open_file = NULL; struct list_head *tmp; struct list_head *tmp1; - int xid; - - xid = GetXid(); cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; /* figure out which file struct to use if (file->private_data == NULL) { - FreeXid(xid); return -EBADF; } */ if (!mapping) { - FreeXid(xid); return -EFAULT; } else if(!mapping->host) { - FreeXid(xid); return -EFAULT; } @@ -632,14 +635,12 @@ cifs_partialpagewrite(struct page *page, if((to > PAGE_CACHE_SIZE) || (from > to)) { kunmap(page); - FreeXid(xid); return -EIO; } /* racing with truncate? 
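The cifs_write hunk above introduces the reconnect-retry idiom that the rest of this patch applies to every SMB I/O path: when the session dies the transport returns -EAGAIN, and a handle invalidated by the reconnect is reopened by pathname before the request is reissued. A condensed sketch of the pattern, using only the fields and calls visible in the hunks (error handling trimmed):

    rc = -EAGAIN;
    while (rc == -EAGAIN) {
            /* handle went stale when the tcp session dropped -
               get a fresh netfid before retrying the SMB */
            if (open_file->invalidHandle && !open_file->closePend) {
                    rc = cifs_reopen_file(file->f_dentry->d_inode, file);
                    if (rc != 0)
                            break;  /* reconnect failed - give up */
            }
            rc = CIFSSMBWrite(xid, pTcon, open_file->netfid,
                              write_size - total_written, *poffset,
                              &bytes_written,
                              write_data + total_written, long_op);
    }

The same loop shape reappears below in cifs_read and cifs_readpages, and (commented out for now) in cifs_commit_write.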
*/ if(offset > mapping->host->i_size) { kunmap(page); - FreeXid(xid); return 0; /* don't care */ } @@ -683,7 +684,6 @@ cifs_partialpagewrite(struct page *page, } kunmap(page); - FreeXid(xid); return rc; } @@ -727,23 +727,38 @@ cifs_commit_write(struct file *file, str int rc = 0; struct inode *inode = page->mapping->host; loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - struct cifsFileInfo *open_file; - struct cifs_sb_info *cifs_sb; + /* struct cifsFileInfo *open_file; + struct cifs_sb_info *cifs_sb; */ xid = GetXid(); - + cFYI(1,("commit write for page %p up to position %lld for %d",page,position,to)); if (position > inode->i_size){ - inode->i_size = position; - if (file->private_data == NULL) { + i_size_write(inode, position); + /*if (file->private_data == NULL) { rc = -EBADF; } else { - cifs_sb = CIFS_SB(inode->i_sb); open_file = (struct cifsFileInfo *)file->private_data; - rc = CIFSSMBSetFileSize(xid, cifs_sb->tcon, position, - open_file->netfid,open_file->pid,FALSE); + cifs_sb = CIFS_SB(inode->i_sb); + rc = -EAGAIN; + while(rc == -EAGAIN) { + if((open_file->invalidHandle) && + (!open_file->closePend)) { + rc = cifs_reopen_file(file->f_dentry->d_inode,file); + if(rc != 0) + break; + } + if(!open_file->closePend) { + rc = CIFSSMBSetFileSize(xid, cifs_sb->tcon, + position, open_file->netfid, + open_file->pid,FALSE); + } else { + rc = -EBADF; + break; + } + } cFYI(1,(" SetEOF (commit write) rc = %d",rc)); - } - } + }*/ + } set_page_dirty(page); FreeXid(xid); @@ -769,7 +784,7 @@ cifs_fsync(struct file *file, struct den return rc; } -static int +/* static int cifs_sync_page(struct page *page) { struct address_space *mapping; @@ -784,17 +799,17 @@ cifs_sync_page(struct page *page) return 0; inode = mapping->host; if (!inode) - return 0; + return 0;*/ /* fill in rpages then result = cifs_pagein_inode(inode, index, rpages); *//* BB finish */ - cFYI(1, ("rpages is %d for sync page of Index %ld ", rpages, index)); +/* cFYI(1, ("rpages is %d for sync page of Index %ld ", rpages, index)); if (rc < 0) return rc; return 0; -} +} */ /* * As file closes, flush all cached write data for this inode checking @@ -837,6 +852,7 @@ cifs_read(struct file * file, char *read struct cifsTconInfo *pTcon; int xid; char * current_offset; + struct cifsFileInfo * open_file; xid = GetXid(); cifs_sb = CIFS_SB(file->f_dentry->d_sb); @@ -846,15 +862,24 @@ cifs_read(struct file * file, char *read FreeXid(xid); return -EBADF; } + open_file = (struct cifsFileInfo *)file->private_data; for (total_read = 0,current_offset=read_data; read_size > total_read; total_read += bytes_read,current_offset+=bytes_read) { current_read_size = min_t(const int,read_size - total_read,cifs_sb->rsize); - rc = CIFSSMBRead(xid, pTcon, - ((struct cifsFileInfo *) file-> - private_data)->netfid, + rc = -EAGAIN; + while(rc == -EAGAIN) { + if ((open_file->invalidHandle) && (!open_file->closePend)) { + rc = cifs_reopen_file(file->f_dentry->d_inode,file); + if(rc != 0) + break; + } + + rc = CIFSSMBRead(xid, pTcon, + open_file->netfid, current_read_size, *poffset, &bytes_read, &current_offset); + } if (rc || (bytes_read == 0)) { if (total_read) { break; @@ -862,8 +887,9 @@ cifs_read(struct file * file, char *read FreeXid(xid); return rc; } - } else + } else { *poffset += bytes_read; + } } FreeXid(xid); @@ -899,7 +925,6 @@ static void cifs_copy_cache_pages(struct break; page = list_entry(pages->prev, struct page, lru); - list_del(&page->lru); if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { @@ -908,24 +933,24 @@ static void
cifs_copy_cache_pages(struct continue; } - page_cache_get(page); target = kmap_atomic(page,KM_USER0); if(PAGE_CACHE_SIZE > bytes_read) { memcpy(target,data,bytes_read); + /* zero the tail end of this partial page */ + memset(target+bytes_read,0,PAGE_CACHE_SIZE-bytes_read); bytes_read = 0; } else { memcpy(target,data,PAGE_CACHE_SIZE); bytes_read -= PAGE_CACHE_SIZE; } + kunmap_atomic(target,KM_USER0); - if (!pagevec_add(plru_pvec, page)) - __pagevec_lru_add(plru_pvec); flush_dcache_page(page); SetPageUptodate(page); - kunmap_atomic(target,KM_USER0); unlock_page(page); - page_cache_release(page); + if (!pagevec_add(plru_pvec, page)) + __pagevec_lru_add(plru_pvec); data += PAGE_CACHE_SIZE; } return; @@ -947,46 +972,78 @@ cifs_readpages(struct file *file, struct char * smb_read_data = 0; struct smb_com_read_rsp * pSMBr; struct pagevec lru_pvec; + struct cifsFileInfo * open_file; xid = GetXid(); if (file->private_data == NULL) { FreeXid(xid); return -EBADF; } - + open_file = (struct cifsFileInfo *)file->private_data; cifs_sb = CIFS_SB(file->f_dentry->d_sb); pTcon = cifs_sb->tcon; pagevec_init(&lru_pvec, 0); for(i = 0;i<num_pages;) { + unsigned int contig_pages; + struct page * tmp_page; + unsigned long expected_index; + if(list_empty(page_list)) { break; } page = list_entry(page_list->prev, struct page, lru); offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + /* count adjacent pages that we will read into */ + contig_pages = 0; + expected_index = list_entry(page_list->prev,struct page,lru)->index; + list_for_each_entry_reverse(tmp_page,page_list,lru) { + if(tmp_page->index == expected_index) { + contig_pages++; + expected_index++; + } else { + break; + } + } + if(contig_pages + i > num_pages) { + contig_pages = num_pages - i; + } + /* for reads over a certain size could initiate async read ahead */ - cFYI(0,("Read %d pages into cache at offset %ld ", - num_pages-i, (unsigned long) offset)); - - read_size = (num_pages - i) * PAGE_CACHE_SIZE; + read_size = contig_pages * PAGE_CACHE_SIZE; /* Read size needs to be in multiples of one page */ read_size = min_t(const unsigned int,read_size,cifs_sb->rsize & PAGE_CACHE_MASK); - rc = CIFSSMBRead(xid, pTcon, - ((struct cifsFileInfo *) file-> - private_data)->netfid, - read_size, offset, - &bytes_read, &smb_read_data); + rc = -EAGAIN; + while(rc == -EAGAIN) { + if ((open_file->invalidHandle) && (!open_file->closePend)) { + rc = cifs_reopen_file(file->f_dentry->d_inode,file); + if(rc != 0) + break; + } + + rc = CIFSSMBRead(xid, pTcon, + open_file->netfid, + read_size, offset, + &bytes_read, &smb_read_data); + /* BB need to check return code here */ + if(rc== -EAGAIN) { + if(smb_read_data) { + cifs_buf_release(smb_read_data); + smb_read_data = 0; + } + } + } if ((rc < 0) || (smb_read_data == NULL)) { cFYI(1,("Read error in readpages: %d",rc)); /* clean up remaining pages off list */ - while (!list_empty(page_list) && (i < num_pages)) { - page = list_entry(page_list->prev, - struct page, lru); + page = list_entry(page_list->prev, struct page, lru); list_del(&page->lru); + page_cache_release(page); } break; } else if (bytes_read > 0) { @@ -994,24 +1051,37 @@ cifs_readpages(struct file *file, struct cifs_copy_cache_pages(mapping, page_list, bytes_read, smb_read_data + 4 /* RFC1000 hdr */ + le16_to_cpu(pSMBr->DataOffset), &lru_pvec); + i += bytes_read >> PAGE_CACHE_SHIFT; - if((bytes_read & PAGE_CACHE_MASK) != bytes_read) { + + if((int)(bytes_read & PAGE_CACHE_MASK) != bytes_read) { cFYI(1,("Partial page %d of %d read to cache",i++,num_pages)); - break; + + i++; /* account for partial page */ + + /* server copy of file can have smaller size than client */ + /* BB do we need to verify this common case ?
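The reshuffle in cifs_copy_cache_pages above is ordering-sensitive: the tail of a short read must be zeroed while the page is still kmapped, SetPageUptodate must precede unlock_page so a waiter never sees an unlocked page with stale contents, and the page goes onto the pagevec only after it is unlocked, with the pagevec now owning the reference that page_cache_release used to drop. The resulting per-page sequence, as a sketch:

    target = kmap_atomic(page, KM_USER0);
    if (PAGE_CACHE_SIZE > bytes_read) {
            memcpy(target, data, bytes_read);
            /* short read: zero the tail so no uninitialized
               data leaks into the page cache */
            memset(target + bytes_read, 0, PAGE_CACHE_SIZE - bytes_read);
            bytes_read = 0;
    } else {
            memcpy(target, data, PAGE_CACHE_SIZE);
            bytes_read -= PAGE_CACHE_SIZE;
    }
    kunmap_atomic(target, KM_USER0);
    flush_dcache_page(page);
    SetPageUptodate(page);                  /* must happen before unlock_page */
    unlock_page(page);
    if (!pagevec_add(plru_pvec, page))      /* pagevec holds the ref now */
            __pagevec_lru_add(plru_pvec);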
this case is ok - + if we are at server EOF we will hit it on next read */ + + /* while(!list_empty(page_list) && (i < num_pages)) { + page = list_entry(page_list->prev,struct page, list); + list_del(&page->list); + page_cache_release(page); + } + break; */ } } else { - cFYI(1,("No bytes read cleaning remaining pages off readahead list")); + cFYI(1,("No bytes read (%d) at offset %lld . Cleaning remaining pages from readahead list",bytes_read,offset)); /* BB turn off caching and do new lookup on file size at server? */ while (!list_empty(page_list) && (i < num_pages)) { - page = list_entry(page_list->prev, - struct page, lru); + page = list_entry(page_list->prev, struct page, lru); list_del(&page->lru); + page_cache_release(page); /* BB removeme - replace with zero of page? */ } - break; } if(smb_read_data) { - buf_release(smb_read_data); + cifs_buf_release(smb_read_data); smb_read_data = 0; } bytes_read = 0; @@ -1019,6 +1089,12 @@ cifs_readpages(struct file *file, struct pagevec_lru_add(&lru_pvec); +/* need to free smb_read_data buf before exit */ + if(smb_read_data) { + cifs_buf_release(smb_read_data); + smb_read_data = 0; + } + FreeXid(xid); return rc; } @@ -1122,7 +1198,7 @@ fill_in_inode(struct inode *tmp_inode, }/* could add code here - to validate if device or weird share type? */ /* can not fill in nlink here as in qpathinfo version and Unx search */ - tmp_inode->i_size = pfindData->EndOfFile; + i_size_write(tmp_inode,pfindData->EndOfFile); tmp_inode->i_blocks = (tmp_inode->i_blksize - 1 + pfindData->AllocationSize) >> tmp_inode->i_blkbits; if (pfindData->AllocationSize < pfindData->EndOfFile) @@ -1196,7 +1272,7 @@ unix_fill_in_inode(struct inode *tmp_ino pfindData->NumOfBytes = le64_to_cpu(pfindData->NumOfBytes); pfindData->EndOfFile = le64_to_cpu(pfindData->EndOfFile); - tmp_inode->i_size = pfindData->EndOfFile; + i_size_write(tmp_inode,pfindData->EndOfFile); tmp_inode->i_blocks = (tmp_inode->i_blksize - 1 + pfindData->NumOfBytes) >> tmp_inode->i_blkbits; @@ -1220,7 +1296,7 @@ unix_fill_in_inode(struct inode *tmp_ino } } -void +static void construct_dentry(struct qstr *qstring, struct file *file, struct inode **ptmp_inode, struct dentry **pnew_dentry) { @@ -1244,6 +1320,11 @@ construct_dentry(struct qstr *qstring, s } } else { tmp_dentry = d_alloc(file->f_dentry, qstring); + if(tmp_dentry == NULL) { + cERROR(1,("Failed allocating dentry")); + return; + } + *ptmp_inode = new_inode(file->f_dentry->d_sb); tmp_dentry->d_op = &cifs_dentry_ops; cFYI(0, (" instantiate dentry 0x%p with inode 0x%p ", @@ -1256,13 +1337,44 @@ construct_dentry(struct qstr *qstring, s *pnew_dentry = tmp_dentry; } -void +static void reset_resume_key(struct file * dir_file, + unsigned char * filename, + unsigned int len,int Unicode,struct nls_table * nls_tab) { + struct cifsFileInfo *cifsFile; + + cifsFile = (struct cifsFileInfo *)dir_file->private_data; + if(cifsFile == NULL) + return; + if(cifsFile->search_resume_name) { + kfree(cifsFile->search_resume_name); + } + + if(Unicode) + len *= 2; + cifsFile->resume_name_length = len; + + cifsFile->search_resume_name = + kmalloc(cifsFile->resume_name_length, GFP_KERNEL); + + if(Unicode) + cifs_strtoUCS((wchar_t *) cifsFile->search_resume_name, + filename, len, nls_tab); + else + memcpy(cifsFile->search_resume_name, filename, + cifsFile->resume_name_length); + cFYI(1,("Reset resume key to: %s with len %d",filename,len)); + return; +} + + + +static int cifs_filldir(struct qstr *pqstring, FILE_DIRECTORY_INFO * pfindData, struct file *file, filldir_t filldir, void 
*direntry) { struct inode *tmp_inode; struct dentry *tmp_dentry; - int object_type; + int object_type,rc; pqstring->name = pfindData->FileName; pqstring->len = pfindData->FileNameLength; @@ -1270,19 +1382,25 @@ cifs_filldir(struct qstr *pqstring, FILE construct_dentry(pqstring, file, &tmp_inode, &tmp_dentry); fill_in_inode(tmp_inode, pfindData, &object_type); - filldir(direntry, pfindData->FileName, pqstring->len, file->f_pos, + rc = filldir(direntry, pfindData->FileName, pqstring->len, file->f_pos, tmp_inode->i_ino, object_type); + if(rc) { + /* due to readdir error we need to recalculate resume + key so next readdir will restart on right entry */ + cFYI(1,("Error %d on filldir of %s",rc ,pfindData->FileName)); + } dput(tmp_dentry); + return rc; } -void +static int cifs_filldir_unix(struct qstr *pqstring, FILE_UNIX_INFO * pUnixFindData, struct file *file, filldir_t filldir, void *direntry) { struct inode *tmp_inode; struct dentry *tmp_dentry; - int object_type; + int object_type, rc; pqstring->name = pUnixFindData->FileName; pqstring->len = strnlen(pUnixFindData->FileName, MAX_PATHCONF); @@ -1290,9 +1408,15 @@ cifs_filldir_unix(struct qstr *pqstring, construct_dentry(pqstring, file, &tmp_inode, &tmp_dentry); unix_fill_in_inode(tmp_inode, pUnixFindData, &object_type); - filldir(direntry, pUnixFindData->FileName, pqstring->len, + rc = filldir(direntry, pUnixFindData->FileName, pqstring->len, file->f_pos, tmp_inode->i_ino, object_type); + if(rc) { + /* due to readdir error we need to recalculate resume + key so next readdir will restart on right entry */ + cFYI(1,("Error %d on filldir of %s",rc ,pUnixFindData->FileName)); + } dput(tmp_dentry); + return rc; } int @@ -1378,8 +1502,7 @@ cifs_readdir(struct file *file, void *di searchHandle = findParms.SearchHandle; if(file->private_data == NULL) file->private_data = - kmalloc(sizeof(struct cifsFileInfo), - GFP_KERNEL); + kmalloc(sizeof(struct cifsFileInfo),GFP_KERNEL); if (file->private_data) { memset(file->private_data, 0, sizeof (struct cifsFileInfo)); @@ -1387,6 +1510,7 @@ cifs_readdir(struct file *file, void *di (struct cifsFileInfo *) file->private_data; cifsFile->netfid = searchHandle; cifsFile->invalidHandle = FALSE; + init_MUTEX(&cifsFile->fh_sem); } else { rc = -ENOMEM; break; @@ -1471,10 +1595,18 @@ cifs_readdir(struct file *file, void *di FileName[0] != '.') || (pfindData-> FileName[1] != '.'))) { - cifs_filldir(&qstring, + if(cifs_filldir(&qstring, pfindData, file, filldir, - direntry); + direntry)) { + /* do not end search if + kernel not ready to take + remaining entries yet */ + reset_resume_key(file, pfindData->FileName,qstring.len, + Unicode, cifs_sb->local_nls); + findParms.EndofSearch = 0; + break; + } file->f_pos++; } } else { /* UnixSearch */ @@ -1501,11 +1633,19 @@ cifs_readdir(struct file *file, void *di FileName[0] != '.') || (pfindDataUnix-> FileName[1] != '.'))) { - cifs_filldir_unix(&qstring, + if(cifs_filldir_unix(&qstring, pfindDataUnix, file, filldir, - direntry); + direntry)) { + /* do not end search if + kernel not ready to take + remaining entries yet */ + findParms.EndofSearch = 0; + reset_resume_key(file, pfindDataUnix->FileName, + qstring.len,Unicode,cifs_sb->local_nls); + break; + } file->f_pos++; } } @@ -1573,6 +1713,11 @@ cifs_readdir(struct file *file, void *di rc = -ENOMEM; break; } + /* Free the memory allocated by previous findfirst + or findnext call - we can not reuse the memory since + the resume name may not be same string length */ + if(cifsFile->search_resume_name) + 
kfree(cifsFile->search_resume_name); cifsFile->search_resume_name = kmalloc(cifsFile->resume_name_length, GFP_KERNEL); cFYI(1,("Last file: %s with name %d bytes long", @@ -1603,6 +1748,11 @@ cifs_readdir(struct file *file, void *di rc = -ENOMEM; break; } + /* Free the memory allocated by previous findfirst + or findnext call - we can not reuse the memory since + the resume name may not be same string length */ + if(cifsFile->search_resume_name) + kfree(cifsFile->search_resume_name); cifsFile->search_resume_name = kmalloc(cifsFile->resume_name_length, GFP_KERNEL); cFYI(1,("fnext last file: %s with name %d bytes long", @@ -1634,11 +1784,19 @@ cifs_readdir(struct file *file, void *di || (pfindData->FileName[0] != '.') || (pfindData->FileName[1] != '.'))) { - cifs_filldir + if(cifs_filldir (&qstring, pfindData, file, filldir, - direntry); + direntry)) { + /* do not end search if + kernel not ready to take + remaining entries yet */ + findNextParms.EndofSearch = 0; + reset_resume_key(file, pfindData->FileName,qstring.len, + Unicode,cifs_sb->local_nls); + break; + } file->f_pos++; } } else { /* UnixSearch */ @@ -1668,11 +1826,19 @@ cifs_readdir(struct file *file, void *di || (pfindDataUnix-> FileName[1] != '.'))) { - cifs_filldir_unix + if(cifs_filldir_unix (&qstring, pfindDataUnix, file, filldir, - direntry); + direntry)) { + /* do not end search if + kernel not ready to take + remaining entries yet */ + findNextParms.EndofSearch = 0; + reset_resume_key(file, pfindDataUnix->FileName,qstring.len, + Unicode,cifs_sb->local_nls); + break; + } file->f_pos++; } } @@ -1696,34 +1862,31 @@ cifs_readdir(struct file *file, void *di return rc; } +int cifs_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + cFYI(1,("prepare write for page %p from %d to %d",page,from,to)); + if (!PageUptodate(page)) { + if (to - from != PAGE_CACHE_SIZE) { + void *kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, from); + memset(kaddr + to, 0, PAGE_CACHE_SIZE - to); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + SetPageUptodate(page); + } + return 0; +} -struct address_space_operations cifs_addr_ops = { - .readpage = cifs_readpage, - .readpages = cifs_readpages, - .writepage = cifs_writepage, - .prepare_write = simple_prepare_write, - .commit_write = cifs_commit_write, - .sync_page = cifs_sync_page, - /*.direct_IO = */ -}; - -struct address_space_operations cifs_addr_ops_writethrough = { - .readpage = cifs_readpage, - .readpages = cifs_readpages, - .writepage = cifs_writepage, - .prepare_write = simple_prepare_write, - .commit_write = cifs_commit_write, - .sync_page = cifs_sync_page, - /*.direct_IO = */ -}; -struct address_space_operations cifs_addr_ops_nocache = { +struct address_space_operations cifs_addr_ops = { .readpage = cifs_readpage, .readpages = cifs_readpages, .writepage = cifs_writepage, - .prepare_write = simple_prepare_write, + .prepare_write = simple_prepare_write, /* BB fixme BB */ +/* .prepare_write = cifs_prepare_write, */ /* BB removeme BB */ .commit_write = cifs_commit_write, - .sync_page = cifs_sync_page, + /* .sync_page = cifs_sync_page, */ /*.direct_IO = */ }; - --- diff/fs/cifs/inode.c 2003-09-17 12:28:11.000000000 +0100 +++ source/fs/cifs/inode.c 2004-04-21 10:46:51.680725448 +0100 @@ -125,7 +125,7 @@ cifs_get_inode_info_unix(struct inode ** inode->i_nlink = le64_to_cpu(findData.Nlinks); findData.NumOfBytes = le64_to_cpu(findData.NumOfBytes); findData.EndOfFile = le64_to_cpu(findData.EndOfFile); - inode->i_size = findData.EndOfFile; + 
i_size_write(inode,findData.EndOfFile); /* blksize needs to be multiple of two. So safer to default to blksize and blkbits set in superblock so 2**blkbits and blksize will match */ /* inode->i_blksize = @@ -204,10 +204,10 @@ cifs_get_inode_info(struct inode **pinod strnlen(search_path, MAX_PATHCONF) + 1, GFP_KERNEL); if (tmp_path == NULL) { - if(buf) - kfree(buf); - FreeXid(xid); - return -ENOMEM; + if(buf) + kfree(buf); + FreeXid(xid); + return -ENOMEM; } strncpy(tmp_path, pTcon->treeName, MAX_TREE_SIZE); @@ -218,10 +218,10 @@ cifs_get_inode_info(struct inode **pinod kfree(tmp_path); /* BB fix up inode etc. */ } else if (rc) { - if(buf) - kfree(buf); - FreeXid(xid); - return rc; + if(buf) + kfree(buf); + FreeXid(xid); + return rc; } } else { struct cifsInodeInfo *cifsInfo; @@ -272,10 +272,10 @@ cifs_get_inode_info(struct inode **pinod inode->i_mode &= ~(S_IWUGO); /* BB add code here - validate if device or weird share or device type? */ } - inode->i_size = le64_to_cpu(pfindData->EndOfFile); + i_size_write(inode,le64_to_cpu(pfindData->EndOfFile)); pfindData->AllocationSize = le64_to_cpu(pfindData->AllocationSize); inode->i_blocks = - (inode->i_blksize - 1 + pfindData->AllocationSize) >> inode->i_blkbits; + (inode->i_blksize - 1 + pfindData->AllocationSize) >> inode->i_blkbits; inode->i_nlink = le32_to_cpu(pfindData->NumberOfLinks); @@ -380,8 +380,8 @@ cifs_unlink(struct inode *inode, struct __u16 netfid; rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, DELETE, - CREATE_NOT_DIR | CREATE_DELETE_ON_CLOSE, - &netfid, &oplock, NULL, cifs_sb->local_nls); + CREATE_NOT_DIR | CREATE_DELETE_ON_CLOSE, + &netfid, &oplock, NULL, cifs_sb->local_nls); if(rc==0) { CIFSSMBRenameOpenFile(xid,pTcon,netfid,NULL,cifs_sb->local_nls); CIFSSMBClose(xid, pTcon, netfid); @@ -426,6 +426,7 @@ cifs_mkdir(struct inode *inode, struct d rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls); if (rc) { cFYI(1, ("cifs_mkdir returned 0x%x ", rc)); + d_drop(direntry); } else { inode->i_nlink++; if (pTcon->ses->capabilities & CAP_UNIX) @@ -479,7 +480,7 @@ cifs_rmdir(struct inode *inode, struct d if (!rc) { inode->i_nlink--; - direntry->d_inode->i_size = 0; + i_size_write(direntry->d_inode,0); direntry->d_inode->i_nlink = 0; } @@ -530,17 +531,17 @@ cifs_rename(struct inode *source_inode, } if((rc == -EIO)||(rc == -EEXIST)) { - int oplock = FALSE; - __u16 netfid; + int oplock = FALSE; + __u16 netfid; - rc = CIFSSMBOpen(xid, pTcon, fromName, FILE_OPEN, GENERIC_READ, - CREATE_NOT_DIR, - &netfid, &oplock, NULL, cifs_sb_source->local_nls); - if(rc==0) { - CIFSSMBRenameOpenFile(xid,pTcon,netfid, - toName, cifs_sb_source->local_nls); - CIFSSMBClose(xid, pTcon, netfid); - } + rc = CIFSSMBOpen(xid, pTcon, fromName, FILE_OPEN, GENERIC_READ, + CREATE_NOT_DIR, + &netfid, &oplock, NULL, cifs_sb_source->local_nls); + if(rc==0) { + CIFSSMBRenameOpenFile(xid,pTcon,netfid, + toName, cifs_sb_source->local_nls); + CIFSSMBClose(xid, pTcon, netfid); + } } if (fromName) kfree(fromName); @@ -559,6 +560,21 @@ cifs_revalidate(struct dentry *direntry) char *full_path; struct cifs_sb_info *cifs_sb; struct cifsInodeInfo *cifsInode; + loff_t local_size; + struct timespec local_mtime; + int invalidate_inode = FALSE; + + if(direntry->d_inode == NULL) + return -ENOENT; + + cifsInode = CIFS_I(direntry->d_inode); + + if(cifsInode == NULL) + return -ENOENT; + + /* no sense revalidating inode info on file that no one can write */ + if(CIFS_I(direntry->d_inode)->clientCanCacheRead) + return rc; xid = GetXid(); @@ -566,16 +582,14 @@ cifs_revalidate(struct 
dentry *direntry) full_path = build_path_from_dentry(direntry); cFYI(1, - ("Revalidate full path: %s for inode 0x%p with count %d dentry: 0x%p d_time %ld at time %ld ", + ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld", full_path, direntry->d_inode, direntry->d_inode->i_count.counter, direntry, direntry->d_time, jiffies)); - - cifsInode = CIFS_I(direntry->d_inode); - /* BB add check - do not need to revalidate oplocked files */ - - if (time_before(jiffies, cifsInode->time + HZ) && lookupCacheEnabled) { + if (cifsInode->time == 0){ + /* was set to zero previously to force revalidate */ + } else if (time_before(jiffies, cifsInode->time + HZ) && lookupCacheEnabled) { if((S_ISREG(direntry->d_inode->i_mode) == 0) || (direntry->d_inode->i_nlink == 1)) { if (full_path) @@ -586,6 +600,10 @@ cifs_revalidate(struct dentry *direntry) cFYI(1,("Have to revalidate file due to hardlinks")); } } + + /* save mtime and size */ + local_mtime = direntry->d_inode->i_mtime; + local_size = direntry->d_inode->i_size; if (cifs_sb->tcon->ses->capabilities & CAP_UNIX) { rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path, @@ -606,8 +624,43 @@ cifs_revalidate(struct dentry *direntry) } /* should we remap certain errors, access denied?, to zero */ - /* BB if not oplocked, invalidate inode pages if mtime has changed */ + /* if not oplocked, we invalidate inode pages if mtime + or file size had changed on server */ + + if(timespec_equal(&local_mtime,&direntry->d_inode->i_mtime) && + (local_size == direntry->d_inode->i_size)) { + cFYI(1,("cifs_revalidate - inode unchanged")); + } else { + /* file may have changed on server */ + if(cifsInode->clientCanCacheRead) { + /* no need to invalidate inode pages since we were + the only ones who could have modified the file and + the server copy is staler than ours */ + } else { + invalidate_inode = TRUE; + } + } + + /* need to write out dirty pages here */ + down(&direntry->d_inode->i_sem); + if(direntry->d_inode->i_mapping) { + /* do we need to lock inode until after invalidate completes below? 
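The rewritten cifs_revalidate in these hunks boils down to three steps: snapshot the cached mtime and size, refetch attributes from the server, and invalidate local pages only when the server copy actually changed and no read oplock lets us trust our cache (the flush and invalidate step continues just below). A condensed sketch using the variables the hunks declare:

    local_mtime = direntry->d_inode->i_mtime;   /* snapshot before refetch */
    local_size = direntry->d_inode->i_size;
    /* ... cifs_get_inode_info[_unix] refreshes the inode ... */
    if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) &&
        (local_size == direntry->d_inode->i_size)) {
            /* unchanged on server - cached pages remain valid */
    } else if (cifsInode->clientCanCacheRead == FALSE)
            invalidate_inode = TRUE;    /* changed behind our back */

    down(&direntry->d_inode->i_sem);
    filemap_fdatawrite(direntry->d_inode->i_mapping);   /* push dirty data */
    if (invalidate_inode) {
            filemap_fdatawait(direntry->d_inode->i_mapping);
            if (list_empty(&cifsInode->openFileList))
                    invalidate_remote_inode(direntry->d_inode);
    }
    up(&direntry->d_inode->i_sem);

Note also the early return added at the top of the function: while we hold a read oplock the file cannot have changed on the server, so revalidation is skipped entirely.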
*/ + filemap_fdatawrite(direntry->d_inode->i_mapping); + } + if(invalidate_inode) { + filemap_fdatawait(direntry->d_inode->i_mapping); + /* may eventually have to do this for open files too */ + if(list_empty(&(cifsInode->openFileList))) { + /* Has changed on server - flush read ahead pages */ + cFYI(1,("Invalidating read ahead data on closed file")); + invalidate_remote_inode(direntry->d_inode); + } + } + + + up(&direntry->d_inode->i_sem); + if (full_path) kfree(full_path); FreeXid(xid); @@ -623,78 +676,25 @@ int cifs_getattr(struct vfsmount *mnt, s return err; } -void -cifs_truncate_file(struct inode *inode) -{ /* BB remove - may not need this function after all BB */ - int xid; +static int cifs_truncate_page(struct address_space *mapping, loff_t from) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + struct page *page; + char *kaddr; int rc = 0; - struct cifsFileInfo *open_file = NULL; - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; - struct cifsInodeInfo *cifsInode; - struct dentry *dirent; - char *full_path = NULL; - - xid = GetXid(); - cifs_sb = CIFS_SB(inode->i_sb); - pTcon = cifs_sb->tcon; - - if (list_empty(&inode->i_dentry)) { - cERROR(1, - ("Can not get pathname from empty dentry in inode 0x%p ", - inode)); - FreeXid(xid); - return; - } - dirent = list_entry(inode->i_dentry.next, struct dentry, d_alias); - if (dirent) { - full_path = build_path_from_dentry(dirent); - rc = CIFSSMBSetEOF(xid, pTcon, full_path, inode->i_size,FALSE, - cifs_sb->local_nls); - cFYI(1,(" SetEOF (truncate) rc = %d",rc)); - if(rc == -ETXTBSY) { - cifsInode = CIFS_I(inode); - if(!list_empty(&(cifsInode->openFileList))) { - open_file = list_entry(cifsInode->openFileList.next, - struct cifsFileInfo, flist); - /* We could check if file is open for writing first */ - rc = CIFSSMBSetFileSize(xid, pTcon, inode->i_size, - open_file->netfid,open_file->pid,FALSE); - } else { - cFYI(1,(" No open files to get file handle from")); - } - } - if (!rc) - CIFSSMBSetEOF(xid,pTcon,full_path,inode->i_size,TRUE,cifs_sb->local_nls); - /* allocation size setting seems optional so ignore return code */ - } - if (full_path) - kfree(full_path); - FreeXid(xid); - return; -} - -static int cifs_trunc_page(struct address_space *mapping, loff_t from) -{ - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - struct page *page; - char *kaddr; - int rc = 0; - - page = grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; - - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - set_page_dirty(page); - unlock_page(page); - page_cache_release(page); - return rc; + page = grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + unlock_page(page); + page_cache_release(page); + return rc; } int @@ -705,6 +705,7 @@ cifs_setattr(struct dentry *direntry, st struct cifsTconInfo *pTcon; char *full_path = NULL; int rc = -EACCES; + int found = FALSE; struct cifsFileInfo *open_file = NULL; FILE_BASIC_INFO time_buf; int set_time = FALSE; @@ -712,6 +713,7 @@ cifs_setattr(struct dentry *direntry, st __u64 uid = 0xFFFFFFFFFFFFFFFFULL; __u64 gid = 0xFFFFFFFFFFFFFFFFULL; struct cifsInodeInfo *cifsInode; + struct list_head * tmp; xid = GetXid(); @@ -726,32 +728,63 @@ cifs_setattr(struct 
dentry *direntry, st /* BB check if we need to refresh inode from server now ? BB */ - cFYI(1, (" Changing attributes 0x%x", attrs->ia_valid)); + /* need to flush data before changing file size on server */ + filemap_fdatawrite(direntry->d_inode->i_mapping); + filemap_fdatawait(direntry->d_inode->i_mapping); if (attrs->ia_valid & ATTR_SIZE) { - rc = CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size,FALSE, - cifs_sb->local_nls); - cFYI(1,(" SetEOF (setattrs) rc = %d",rc)); + read_lock(&GlobalSMBSeslock); + /* To avoid spurious oplock breaks from server, in the case + of inodes that we already have open, avoid doing path + based setting of file size if we can do it by handle. + This keeps our caching token (oplock) and avoids + timeouts when the local oplock break takes longer to flush + writebehind data than the SMB timeout for the SetPathInfo + request would allow */ + list_for_each(tmp, &cifsInode->openFileList) { + open_file = list_entry(tmp,struct cifsFileInfo, flist); + /* We check if file is open for writing first */ + if((open_file->pfile) && + ((open_file->pfile->f_flags & O_RDWR) || + (open_file->pfile->f_flags & O_WRONLY))) { + if(open_file->invalidHandle == FALSE) { + /* we found a valid, writeable network file + handle to use to try to set the file size */ + __u16 nfid = open_file->netfid; + __u32 npid = open_file->pid; + read_unlock(&GlobalSMBSeslock); + found = TRUE; + rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, + nfid,npid,FALSE); + cFYI(1,("SetFileSize by handle (setattrs) rc = %d",rc)); + /* Do not need reopen and retry on EAGAIN since we will + retry by pathname below */ - if(rc == -ETXTBSY) { - if(!list_empty(&(cifsInode->openFileList))) { - open_file = list_entry(cifsInode->openFileList.next, - struct cifsFileInfo, flist); - /* We could check if file is open for writing first */ - rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, - open_file->netfid,open_file->pid,FALSE); - } else { - cFYI(1,(" No open files to get file handle from")); + break; /* now that we found one valid file handle no + sense continuing to loop trying others */ + } } } - /* For Allocation Size - do not need to call the following - it did not hurt if it fails but why bother */ - /* CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, TRUE, cifs_sb->local_nls);*/ + if(found == FALSE) { + read_unlock(&GlobalSMBSeslock); + } + + + if(rc != 0) { + /* Set file size by pathname rather than by handle either + because no valid, writeable file handle for it was found or + because there was an error setting it by handle */ + rc = CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size,FALSE, + cifs_sb->local_nls); + cFYI(1,(" SetEOF by path (setattrs) rc = %d",rc)); + } + + /* Server is ok setting allocation size implicitly - no need to call: */ + /*CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, TRUE, cifs_sb->local_nls);*/ + if (rc == 0) { rc = vmtruncate(direntry->d_inode, attrs->ia_size); - cifs_trunc_page(direntry->d_inode->i_mapping, direntry->d_inode->i_size); - -/* cFYI(1,("truncate_page to 0x%lx \n",direntry->d_inode->i_size)); */ + cifs_truncate_page(direntry->d_inode->i_mapping, direntry->d_inode->i_size); } } if (attrs->ia_valid & ATTR_UID) { @@ -816,6 +849,8 @@ cifs_setattr(struct dentry *direntry, st /* BB what if setting one attribute fails (such as size) but time setting works */ time_buf.CreationTime = 0; /* do not change */ + /* In the future we should experiment - try setting timestamps + via Handle (SetFileInfo) instead of by path */ rc = CIFSSMBSetTimes(xid, pTcon, full_path, 
&time_buf, cifs_sb->local_nls); } @@ -831,8 +866,7 @@ cifs_setattr(struct dentry *direntry, st void cifs_delete_inode(struct inode *inode) { - /* Note: called without the big kernel filelock - remember spinlocks! */ cFYI(1, ("In cifs_delete_inode, inode = 0x%p ", inode)); - /* may have to add back in when safe distributed caching of - directories via e.g. FindNotify added */ + /* may have to add back in if and when safe distributed caching of + directories added e.g. via FindNotify */ } --- diff/fs/cifs/link.c 2003-09-17 12:28:11.000000000 +0100 +++ source/fs/cifs/link.c 2004-04-21 10:46:51.681725296 +0100 @@ -96,6 +96,8 @@ cifs_follow_link(struct dentry *direntry pTcon = cifs_sb->tcon; target_path = kmalloc(PATH_MAX, GFP_KERNEL); if(target_path == NULL) { + if (full_path) + kfree(full_path); FreeXid(xid); return -ENOMEM; } @@ -212,6 +214,8 @@ cifs_readlink(struct dentry *direntry, c len = buflen; tmpbuffer = kmalloc(len,GFP_KERNEL); if(tmpbuffer == NULL) { + if (full_path) + kfree(full_path); FreeXid(xid); return -ENOMEM; } @@ -251,10 +255,11 @@ cifs_readlink(struct dentry *direntry, c cFYI(1,("num referral: %d",num_referrals)); if(referrals) { cFYI(1,("referral string: %s ",referrals)); - strncpy(tmpbuffer, referrals, len-1); + strncpy(tmpbuffer, referrals, len-1); } } - + if(referrals) + kfree(referrals); kfree(tmp_path); if(referrals) { kfree(referrals); --- diff/fs/cifs/md4.c 2002-10-16 04:28:31.000000000 +0100 +++ source/fs/cifs/md4.c 2004-04-21 10:46:51.681725296 +0100 @@ -3,7 +3,7 @@ Version 1.9. a implementation of MD4 designed for use in the SMB authentication protocol Copyright (C) Andrew Tridgell 1997-1998. - Modified by Steve French (sfrench@us.ibm.com) 2002 + Modified by Steve French (sfrench@us.ibm.com) 2002-2003 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,13 +21,7 @@ */ #include #include - -/* NOTE: This code makes no attempt to be fast! - - It assumes that a int is at least 32 bits long -*/ - -static __u32 A, B, C, D; +/* NOTE: This code makes no attempt to be fast! 
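The md4.c hunks that follow are a reentrancy fix rather than a functional change: the digest state A..D moves from file-scope globals into mdfour()'s stack frame and is threaded through mdfour64() by pointer (the ROUND1/2/3 macros dereference accordingly), so two mounts hashing passwords concurrently can no longer corrupt each other's state. The shape of the change, as a before/after sketch (declarations only, not a compilable unit):

    /* before: shared mutable state, unsafe with two sessions
       authenticating at once */
    static __u32 A, B, C, D;
    static void mdfour64(__u32 *M);

    /* after: each call owns its own state */
    static void mdfour64(__u32 *M, __u32 *A, __u32 *B, __u32 *C, __u32 *D);

    void
    mdfour(unsigned char *out, unsigned char *in, int n)
    {
            __u32 A = 0x67452301, B = 0xefcdab89;
            __u32 C = 0x98badcfe, D = 0x10325476;
            /* ... calls mdfour64(M, &A, &B, &C, &D) per 64-byte block ... */
    }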
*/ static __u32 F(__u32 X, __u32 Y, __u32 Z) @@ -54,25 +48,26 @@ lshift(__u32 x, int s) return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s)); } -#define ROUND1(a,b,c,d,k,s) a = lshift(a + F(b,c,d) + X[k], s) -#define ROUND2(a,b,c,d,k,s) a = lshift(a + G(b,c,d) + X[k] + (__u32)0x5A827999,s) -#define ROUND3(a,b,c,d,k,s) a = lshift(a + H(b,c,d) + X[k] + (__u32)0x6ED9EBA1,s) +#define ROUND1(a,b,c,d,k,s) (*a) = lshift((*a) + F(*b,*c,*d) + X[k], s) +#define ROUND2(a,b,c,d,k,s) (*a) = lshift((*a) + G(*b,*c,*d) + X[k] + (__u32)0x5A827999,s) +#define ROUND3(a,b,c,d,k,s) (*a) = lshift((*a) + H(*b,*c,*d) + X[k] + (__u32)0x6ED9EBA1,s) /* this applies md4 to 64 byte chunks */ static void -mdfour64(__u32 * M) +mdfour64(__u32 * M, __u32 * A, __u32 *B, __u32 * C, __u32 *D) { int j; __u32 AA, BB, CC, DD; __u32 X[16]; + for (j = 0; j < 16; j++) X[j] = M[j]; - AA = A; - BB = B; - CC = C; - DD = D; + AA = *A; + BB = *B; + CC = *C; + DD = *D; ROUND1(A, B, C, D, 0, 3); ROUND1(D, A, B, C, 1, 7); @@ -125,15 +120,15 @@ mdfour64(__u32 * M) ROUND3(C, D, A, B, 7, 11); ROUND3(B, C, D, A, 15, 15); - A += AA; - B += BB; - C += CC; - D += DD; - - A &= 0xFFFFFFFF; - B &= 0xFFFFFFFF; - C &= 0xFFFFFFFF; - D &= 0xFFFFFFFF; + *A += AA; + *B += BB; + *C += CC; + *D += DD; + + *A &= 0xFFFFFFFF; + *B &= 0xFFFFFFFF; + *C &= 0xFFFFFFFF; + *D &= 0xFFFFFFFF; for (j = 0; j < 16; j++) X[j] = 0; @@ -166,15 +161,14 @@ mdfour(unsigned char *out, unsigned char __u32 M[16]; __u32 b = n * 8; int i; - - A = 0x67452301; - B = 0xefcdab89; - C = 0x98badcfe; - D = 0x10325476; + __u32 A = 0x67452301; + __u32 B = 0xefcdab89; + __u32 C = 0x98badcfe; + __u32 D = 0x10325476; while (n > 64) { copy64(M, in); - mdfour64(M); + mdfour64(M,&A,&B, &C, &D); in += 64; n -= 64; } @@ -187,13 +181,13 @@ mdfour(unsigned char *out, unsigned char if (n <= 55) { copy4(buf + 56, b); copy64(M, buf); - mdfour64(M); + mdfour64(M, &A, &B, &C, &D); } else { copy4(buf + 120, b); copy64(M, buf); - mdfour64(M); + mdfour64(M, &A, &B, &C, &D); copy64(M, buf + 64); - mdfour64(M); + mdfour64(M, &A, &B, &C, &D); } for (i = 0; i < 128; i++) --- diff/fs/cifs/misc.c 2003-10-27 09:20:44.000000000 +0000 +++ source/fs/cifs/misc.c 2004-04-21 10:46:51.682725144 +0100 @@ -25,6 +25,8 @@ #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" +#include "smberr.h" +#include "nterr.h" extern kmem_cache_t *cifs_req_cachep; extern struct task_struct * oplockThread; @@ -99,6 +101,8 @@ sesInfoFree(struct cifsSesInfo *buf_to_f kfree(buf_to_free->serverDomain); if (buf_to_free->serverNOS) kfree(buf_to_free->serverNOS); + if (buf_to_free->password) + kfree(buf_to_free->password); kfree(buf_to_free); } @@ -139,20 +143,10 @@ tconInfoFree(struct cifsTconInfo *buf_to kfree(buf_to_free); } -void * -kcalloc(size_t size, int type) -{ - void *addr; - addr = kmalloc(size, type); - if (addr) - memset(addr, 0, size); - return addr; -} - struct smb_hdr * -buf_get(void) +cifs_buf_get(void) { - struct smb_hdr *ret_buf; + struct smb_hdr *ret_buf = 0; /* We could use negotiated size instead of max_msgsize - but it may be more efficient to always alloc same size @@ -171,11 +165,11 @@ buf_get(void) } void -buf_release(void *buf_to_free) +cifs_buf_release(void *buf_to_free) { if (buf_to_free == NULL) { - cFYI(1, ("Null buffer passed to buf_release")); + cFYI(1, ("Null buffer passed to cifs_buf_release")); return; } kmem_cache_free(cifs_req_cachep, buf_to_free); @@ -267,7 +261,7 @@ header_assemble(struct smb_hdr *buffer, buffer->Uid = ses->Suid; break; } else { - /* BB eventually call setup_session here */ + 
/* BB eventually call cifs_setup_session here */ cFYI(1,("local UID found but smb sess with this server does not exist")); } } @@ -324,8 +318,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, ("Entering checkSMB with Length: %x, smb_buf_length: %x ", length, ntohl(smb->smb_buf_length))); if (((unsigned int)length < 2 + sizeof (struct smb_hdr)) - || (4 + ntohl(smb->smb_buf_length) > - CIFS_MAX_MSGSIZE + MAX_CIFS_HDR_SIZE)) { + || (ntohl(smb->smb_buf_length) > + CIFS_MAX_MSGSIZE + MAX_CIFS_HDR_SIZE - 4)) { if ((unsigned int)length < 2 + sizeof (struct smb_hdr)) { cERROR(1, ("Length less than 2 + sizeof smb_hdr ")); if (((unsigned int)length >= sizeof (struct smb_hdr) - 1) @@ -333,8 +327,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, return 0; /* some error cases do not return wct and bcc */ } - if (4 + ntohl(smb->smb_buf_length) > - CIFS_MAX_MSGSIZE + MAX_CIFS_HDR_SIZE) + if (ntohl(smb->smb_buf_length) > + CIFS_MAX_MSGSIZE + MAX_CIFS_HDR_SIZE - 4) cERROR(1, ("smb_buf_length greater than CIFS_MAX_MSGSIZE ... ")); cERROR(1, @@ -369,8 +363,22 @@ is_valid_oplock_break(struct smb_hdr *bu cFYI(1,("Checking for oplock break")); if(pSMB->hdr.Command != SMB_COM_LOCKING_ANDX) return FALSE; - if(pSMB->hdr.Flags & SMBFLG_RESPONSE) - return FALSE; /* server sends us "request" here */ + if(pSMB->hdr.Flags & SMBFLG_RESPONSE) { + /* no sense logging error on invalid handle on oplock + break - harmless race between close request and oplock + break response is expected from time to time writing out + large dirty files cached on the client */ + if ((NT_STATUS_INVALID_HANDLE) == + le32_to_cpu(pSMB->hdr.Status.CifsError)) { + cFYI(1,("invalid handle on oplock break")); + return TRUE; + } else if (ERRbadfid == + le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { + return TRUE; + } else { + return FALSE; /* on valid oplock brk we get "request" */ + } + } if(pSMB->hdr.WordCount != 8) return FALSE; @@ -387,8 +395,6 @@ is_valid_oplock_break(struct smb_hdr *bu netfile = list_entry(tmp1,struct cifsFileInfo,tlist); if(pSMB->Fid == netfile->netfid) { struct cifsInodeInfo *pCifsInode; - /* BB Add following logic to mark inode for write through - inode->i_data.a_ops = &cifs_addr_ops_writethrough; */ read_unlock(&GlobalSMBSeslock); cFYI(1,("Matching file id, processing oplock break")); pCifsInode = --- diff/fs/cifs/netmisc.c 2003-06-30 10:07:34.000000000 +0100 +++ source/fs/cifs/netmisc.c 2004-04-21 10:46:51.683724992 +0100 @@ -125,10 +125,10 @@ const struct smb_to_posix_error mapping_ /* Convert string containing dotted ip address to binary form */ /* returns 0 if invalid address */ -/* BB add address family, change rc to status flag and return *//* also see inet_pton */ -/* To identify v4 vs. 
v6 - 1) check for colon (v6 only) 2) then call inet_pton to parse for bad address */ +/* BB add address family, change rc to status flag and return union or for ipv6 */ +/* will need parent to call something like inet_pton to convert ipv6 address BB */ int -inet_addr(char *cp) +cifs_inet_pton(int address_family, char *cp,void *dst) { struct in_addr address; int value; @@ -140,6 +140,9 @@ inet_addr(char *cp) static const int addr_class_max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff }; + if(address_family != AF_INET) + return -EAFNOSUPPORT; + for (i = 0; i < 4; i++) { bytes[i] = 0; } @@ -166,6 +169,9 @@ inet_addr(char *cp) return 0; *end++ = value; temp = *++cp; + } else if (temp == ':') { + cFYI(1,("IPv6 addresses not supported for CIFS mounts yet")); + return -1; } else break; } @@ -182,8 +188,8 @@ inet_addr(char *cp) return 0; address.s_addr = *((int *) bytes) | htonl(value); - return address.s_addr; - + *((int *)dst) = address.s_addr; + return 1; /* success */ } /***************************************************************************** --- diff/fs/cifs/smbencrypt.c 2003-10-27 09:20:44.000000000 +0000 +++ source/fs/cifs/smbencrypt.c 2004-04-21 10:46:51.684724840 +0100 @@ -23,8 +23,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -extern int DEBUGLEVEL; - #include #include #include @@ -45,9 +43,7 @@ extern int DEBUGLEVEL; /* following came from the other byteorder.h to avoid include conflicts */ #define CVAL(buf,pos) (((unsigned char *)(buf))[pos]) #define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) -#define SIVALX(buf,pos,val) (SSVALX(buf,pos,val&0xFFFF),SSVALX(buf,pos+2,val>>16)) #define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) -#define SIVAL(buf,pos,val) SIVALX((buf),(pos),((__u32)(val))) /*The following definitions come from lib/md4.c */ @@ -96,12 +92,6 @@ SMBencrypt(unsigned char *passwd, unsign SMBOWFencrypt(p21, c8, p24); -#ifdef DEBUG_PASSWORD - DEBUG(100, ("SMBencrypt: lm#, challenge, response\n")); - dump_data(100, (char *) p21, 16); - dump_data(100, (char *) c8, 8); - dump_data(100, (char *) p24, 24); -#endif memset(p14,0,15); memset(p21,0,21); } @@ -151,12 +141,17 @@ E_md4hash(const unsigned char *passwd, u __u16 wpwd[129]; /* Password cannot be longer than 128 characters */ - len = strlen((char *) passwd); - if (len > 128) - len = 128; - /* Password must be converted to NT unicode */ - _my_mbstowcs(wpwd, passwd, len); - wpwd[len] = 0; /* Ensure string is null terminated */ + if(passwd) { + len = strlen((char *) passwd); + if (len > 128) { + len = 128; + } + /* Password must be converted to NT unicode */ + _my_mbstowcs(wpwd, passwd, len); + } else + len = 0; + + wpwd[len] = 0; /* Ensure string is null terminated */ /* Calculate length in bytes */ len = _my_wcslen(wpwd) * sizeof (__u16); @@ -179,12 +174,6 @@ nt_lm_owf_gen(char *pwd, unsigned char n memset(nt_p16, '\0', 16); E_md4hash(passwd, nt_p16); -#ifdef DEBUG_PASSWORD - DEBUG(100, ("nt_lm_owf_gen: pwd, nt#\n")); - dump_data(120, passwd, strlen(passwd)); - dump_data(100, (char *) nt_p16, 16); -#endif - /* Mangle the passwords into Lanman format */ passwd[14] = '\0'; /* strupper(passwd); */ @@ -194,11 +183,6 @@ nt_lm_owf_gen(char *pwd, unsigned char n memset(p16, '\0', 16); E_P16((unsigned char *) passwd, (unsigned char *) p16); -#ifdef DEBUG_PASSWORD - DEBUG(100, ("nt_lm_owf_gen: pwd, lm#\n")); - dump_data(120, passwd, strlen(passwd)); - dump_data(100, (char *) p16, 16); -#endif /* clear out local copy of user's password (just being paranoid). 
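Earlier in this hunk, netmisc.c's inet_addr() becomes cifs_inet_pton() with inet_pton()-style semantics: the parsed address is written through dst, 1 is returned on success, 0 for a malformed dotted-quad, and a negative value for an unsupported family (strings containing a colon also fail, with an explicit "IPv6 not supported yet" message). A caller would now look roughly like this (the variable names are illustrative, not from the patch):

    struct in_addr addr;
    int rc;

    /* ipaddr points at the dotted-quad string from the mount options */
    rc = cifs_inet_pton(AF_INET, ipaddr, &addr.s_addr);
    if (rc <= 0)    /* 0 = unparseable, negative = wrong family */
            return rc ? rc : -EINVAL;
    /* addr.s_addr now holds the address in network byte order */

This retires the old in-kernel inet_addr() convention of returning the address itself, where a parsed 0.0.0.0 was indistinguishable from the 0 error value.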
*/ memset(passwd, '\0', sizeof (passwd)); } @@ -235,13 +219,6 @@ ntv2_owf_gen(const unsigned char owf[16] hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx); hmac_md5_final(kr_buf, &ctx); -#ifdef DEBUG_PASSWORD - DEBUG(100, ("ntv2_owf_gen: user, domain, owfkey, kr\n")); - dump_data(100, user_u, user_l * 2); - dump_data(100, dom_u, domain_l * 2); - dump_data(100, owf, 16); - dump_data(100, kr_buf, 16); -#endif kfree(user_u); } @@ -270,12 +247,6 @@ NTLMSSPOWFencrypt(unsigned char passwd[8 memset(p21 + 8, 0xbd, 8); E_P24(p21, ntlmchalresp, p24); -#ifdef DEBUG_PASSWORD - DEBUG(100, ("NTLMSSPOWFencrypt: p21, c8, p24\n")); - dump_data(100, (char *) p21, 21); - dump_data(100, (char *) ntlmchalresp, 8); - dump_data(100, (char *) p24, 24); -#endif } /* Does the NT MD4 hash then des encryption. */ @@ -289,13 +260,6 @@ SMBNTencrypt(unsigned char *passwd, unsi E_md4hash(passwd, p21); SMBOWFencrypt(p21, c8, p24); - -#ifdef DEBUG_PASSWORD - DEBUG(100, ("SMBNTencrypt: nt#, challenge, response\n")); - dump_data(100, (char *) p21, 16); - dump_data(100, (char *) c8, 8); - dump_data(100, (char *) p24, 24); -#endif } /* Does the md5 encryption from the NT hash for NTLMv2. */ @@ -310,37 +274,6 @@ SMBOWFencrypt_ntv2(const unsigned char k hmac_md5_update(srv_chal->data, srv_chal->length, &ctx); hmac_md5_update(cli_chal->data, cli_chal->length, &ctx); hmac_md5_final(resp_buf, &ctx); - -#ifdef DEBUG_PASSWORD - DEBUG(100, ("SMBOWFencrypt_ntv2: srv_chal, cli_chal, resp_buf\n")); - dump_data(100, srv_chal->data, srv_chal->length); - dump_data(100, cli_chal->data, cli_chal->length); - dump_data(100, resp_buf, 16); -#endif -} - -static struct data_blob LMv2_generate_response(const unsigned char ntlm_v2_hash[16], - const struct data_blob * server_chal) -{ - unsigned char lmv2_response[16]; - struct data_blob lmv2_client_data/* = data_blob(NULL, 8)*/; /* BB Fix BB */ - struct data_blob final_response /* = data_blob(NULL, 24)*/; /* BB Fix BB */ - - /* LMv2 */ - /* client-supplied random data */ - get_random_bytes(lmv2_client_data.data, lmv2_client_data.length); - /* Given that data, and the challenge from the server, generate a response */ - SMBOWFencrypt_ntv2(ntlm_v2_hash, server_chal, &lmv2_client_data, lmv2_response); - memcpy(final_response.data, lmv2_response, sizeof(lmv2_response)); - - /* after the first 16 bytes is the random data we generated above, - so the server can verify us with it */ - memcpy(final_response.data+sizeof(lmv2_response), - lmv2_client_data.data, lmv2_client_data.length); - -/* data_blob_free(&lmv2_client_data); */ /* BB fix BB */ - - return final_response; } void @@ -352,11 +285,6 @@ SMBsesskeygen_ntv2(const unsigned char k hmac_md5_init_limK_to_64(kr, 16, &ctx); hmac_md5_update(nt_resp, 16, &ctx); hmac_md5_final((unsigned char *) sess_key, &ctx); - -#ifdef DEBUG_PASSWORD - DEBUG(100, ("SMBsesskeygen_ntv2:\n")); - dump_data(100, sess_key, 16); -#endif } void @@ -364,66 +292,4 @@ SMBsesskeygen_ntv1(const unsigned char k const unsigned char *nt_resp, __u8 sess_key[16]) { mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16); - -#ifdef DEBUG_PASSWORD - DEBUG(100, ("SMBsesskeygen_ntv1:\n")); - dump_data(100, sess_key, 16); -#endif -} - -/*********************************************************** - encode a password buffer. The caller gets to figure out - what to put in it. 
-************************************************************/ -int -encode_pw_buffer(char buffer[516], char *new_pw, int new_pw_length) -{ - get_random_bytes(buffer, sizeof (buffer)); - - memcpy(&buffer[512 - new_pw_length], new_pw, new_pw_length); - - /* - * The length of the new password is in the last 4 bytes of - * the data buffer. - */ - SIVAL(buffer, 512, new_pw_length); - - return TRUE; -} - -int SMBNTLMv2encrypt(const char *user, const char *domain, const char *password, - const struct data_blob *server_chal, - const struct data_blob *names_blob, - struct data_blob *lm_response, struct data_blob *nt_response, - struct data_blob *nt_session_key,struct nls_table * nls_codepage) -{ - unsigned char nt_hash[16]; - unsigned char ntlm_v2_hash[16]; - E_md4hash(password, nt_hash); - - /* We don't use the NT# directly. Instead we use it mashed up with - the username and domain. - This prevents username swapping during the auth exchange - */ - ntv2_owf_gen(nt_hash, user, domain, ntlm_v2_hash,nls_codepage); - - if (nt_response) { -/* *nt_response = NTLMv2_generate_response(ntlm_v2_hash, server_chal, - names_blob); */ /* BB fix BB */ - if (nt_session_key) { -/* *nt_session_key = data_blob(NULL, 16); */ /* BB fix BB */ - - /* The NTLMv2 calculations also provide a session key, for signing etc later */ - /* use only the first 16 bytes of nt_response for session key */ - SMBsesskeygen_ntv2(ntlm_v2_hash, nt_response->data, nt_session_key->data); - } - } - - /* LMv2 */ - - if (lm_response) { - *lm_response = LMv2_generate_response(ntlm_v2_hash, server_chal); - } - - return TRUE; } --- diff/fs/cifs/transport.c 2003-10-27 09:20:44.000000000 +0000 +++ source/fs/cifs/transport.c 2004-04-21 10:46:51.685724688 +0100 @@ -37,7 +37,6 @@ struct mid_q_entry * AllocMidQEntry(struct smb_hdr *smb_buffer, struct cifsSesInfo *ses) { struct mid_q_entry *temp; - int timeout = 10 * HZ; if (ses == NULL) { cERROR(1, ("Null session passed in to AllocMidQEntry ")); @@ -63,25 +62,11 @@ AllocMidQEntry(struct smb_hdr *smb_buffe temp->tsk = current; } - while ((ses->server->tcpStatus != CifsGood) && (timeout > 0)){ - /* Give the tcp thread up to 10 seconds to reconnect */ - /* Should we wake up tcp thread first? 
BB */ - timeout = wait_event_interruptible_timeout(ses->server->response_q, - (ses->server->tcpStatus == CifsGood), timeout); - } - - if (ses->server->tcpStatus == CifsGood) { - spin_lock(&GlobalMid_Lock); - list_add_tail(&temp->qhead, &ses->server->pending_mid_q); - atomic_inc(&midCount); - temp->midState = MID_REQUEST_ALLOCATED; - spin_unlock(&GlobalMid_Lock); - } else { - cERROR(1,("Need to reconnect after session died to server")); - if (temp) - kmem_cache_free(cifs_mid_cachep, temp); - return NULL; - } + spin_lock(&GlobalMid_Lock); + list_add_tail(&temp->qhead, &ses->server->pending_mid_q); + atomic_inc(&midCount); + temp->midState = MID_REQUEST_ALLOCATED; + spin_unlock(&GlobalMid_Lock); return temp; } @@ -93,7 +78,7 @@ DeleteMidQEntry(struct mid_q_entry *midE list_del(&midEntry->qhead); atomic_dec(&midCount); spin_unlock(&GlobalMid_Lock); - buf_release(midEntry->resp_buf); + cifs_buf_release(midEntry->resp_buf); kmem_cache_free(cifs_mid_cachep, midEntry); } @@ -190,10 +175,35 @@ SendReceive(const unsigned int xid, stru long timeout; struct mid_q_entry *midQ; + if ((ses == NULL) || (ses->server == NULL)) { + cERROR(1,("Null tcp session or smb session: %p",ses)); + return -EIO; + } + + if (ses->server->tcpStatus == CifsExiting) { + return -ENOENT; + } else if (ses->server->tcpStatus == CifsNeedReconnect) { + cFYI(1,("tcp session dead - return to caller to retry")); + return -EAGAIN; + } else if (ses->status != CifsGood) { + /* check if SMB session is bad because we are setting it up */ + if((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && + (in_buf->Command != SMB_COM_NEGOTIATE)) { + return -EAGAIN; + } /* else ok - we are setting up session */ + } + /* make sure that we sign in the same order that we send on this socket + and avoid races inside tcp sendmsg code that could cause corruption + of smb data */ + down(&ses->server->tcpSem); midQ = AllocMidQEntry(in_buf, ses); - if (midQ == NULL) + if (midQ == NULL) { + up(&ses->server->tcpSem); return -EIO; + } + if (in_buf->smb_buf_length > CIFS_MAX_MSGSIZE + MAX_CIFS_HDR_SIZE - 4) { + up(&ses->server->tcpSem); cERROR(1, ("Illegal length, greater than maximum frame, %d ", in_buf->smb_buf_length)); @@ -201,23 +211,25 @@ SendReceive(const unsigned int xid, stru return -EIO; } - if (in_buf->smb_buf_length > 12) - in_buf->Flags2 = cpu_to_le16(in_buf->Flags2); + if (in_buf->smb_buf_length > 12) + in_buf->Flags2 = cpu_to_le16(in_buf->Flags2); - rc = cifs_sign_smb(in_buf, ses, &midQ->sequence_number); + rc = cifs_sign_smb(in_buf, ses, &midQ->sequence_number); midQ->midState = MID_REQUEST_SUBMITTED; rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, - (struct sockaddr *) &(ses->server->sockAddr)); - + (struct sockaddr *) &(ses->server->addr.sockAddr)); + up(&ses->server->tcpSem); if (long_op == -1) goto cifs_no_response_exit; - if (long_op > 1) /* writes past end of file can take looooong time */ + else if (long_op == 2) /* writes past end of file can take looooong time */ timeout = 300 * HZ; else if (long_op == 1) timeout = 45 * HZ; /* should be greater than servers oplock break timeout (about 43 seconds) */ - else + else if (long_op > 2) { + timeout = MAX_SCHEDULE_TIMEOUT; + } else timeout = 15 * HZ; /* wait for 15 seconds or until woken up due to response arriving or due to last connection to this server being unmounted */ @@ -227,26 +239,39 @@ SendReceive(const unsigned int xid, stru midState & MID_RESPONSE_RECEIVED, timeout); if (signal_pending(current)) { - cERROR(1, ("CIFS: caught signal")); + cFYI(1, ("CIFS: caught 
signal")); DeleteMidQEntry(midQ); return -EINTR; - } else { - if (midQ->resp_buf) + } else { /* BB spinlock protect this against races with demux thread */ + spin_lock(&GlobalMid_Lock); + if (midQ->resp_buf) { + spin_unlock(&GlobalMid_Lock); receive_len = be32_to_cpu(midQ->resp_buf->smb_buf_length); - else { + } else { cFYI(1,("No response buffer")); + if(midQ->midState == MID_REQUEST_SUBMITTED) { + if(ses->server->tcpStatus == CifsExiting) + rc = -EHOSTDOWN; + else { + ses->server->tcpStatus = CifsNeedReconnect; + midQ->midState = MID_RETRY_NEEDED; + } + } + + if(midQ->midState == MID_RETRY_NEEDED) { + rc = -EAGAIN; + cFYI(1,("marking request for retry")); + } else { + rc = -EIO; + } + spin_unlock(&GlobalMid_Lock); DeleteMidQEntry(midQ); - ses->server->tcpStatus = CifsNeedReconnect; - return -EIO; + return rc; } } - if (timeout == 0) { - cFYI(1, - ("Timeout on receive. Assume response SMB is invalid.")); - rc = -ETIMEDOUT; - } else if (receive_len > CIFS_MAX_MSGSIZE + MAX_CIFS_HDR_SIZE) { + if (receive_len > CIFS_MAX_MSGSIZE + MAX_CIFS_HDR_SIZE) { cERROR(1, ("Frame too large received. Length: %d Xid: %d", receive_len, xid)); --- diff/fs/compat.c 2004-04-05 12:57:08.000000000 +0100 +++ source/fs/compat.c 2004-04-21 10:46:51.687724384 +0100 @@ -34,9 +34,17 @@ #include #include #include +#include +#include +#include +#include +#include +#include + #include /* siocdevprivate_ioctl */ #include +#include /* * Not all architectures have sys_utime, so implement this in terms @@ -794,3 +802,839 @@ asmlinkage int compat_sys_mount(char __u return retval; } +static ssize_t compat_do_readv_writev(int type, struct file *file, + const struct compat_iovec __user *uvector, + unsigned long nr_segs, loff_t *pos) +{ + typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); + typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *); + + compat_ssize_t tot_len; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov=iovstack, *vector; + ssize_t ret; + int seg; + io_fn_t fn; + iov_fn_t fnv; + struct inode *inode; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... + */ + ret = 0; + if (nr_segs == 0) + goto out; + + /* + * First get the "struct iovec" from user memory and + * verify all the pointers + */ + ret = -EINVAL; + if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0)) + goto out; + if (!file->f_op) + goto out; + if (nr_segs > UIO_FASTIOV) { + ret = -ENOMEM; + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (!iov) + goto out; + } + ret = -EFAULT; + if (verify_area(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + goto out; + + /* + * Single unix specification: + * We should -EINVAL if an element length is not >= 0 and fitting an + * ssize_t. The total length is fitting an ssize_t + * + * Be careful here because iov_len is a size_t not an ssize_t + */ + tot_len = 0; + vector = iov; + ret = -EINVAL; + for (seg = 0 ; seg < nr_segs; seg++) { + compat_ssize_t tmp = tot_len; + compat_ssize_t len; + compat_uptr_t buf; + + if (__get_user(len, &uvector->iov_len) || + __get_user(buf, &uvector->iov_base)) { + ret = -EFAULT; + goto out; + } + if (len < 0) /* size_t not fitting an compat_ssize_t .. 
*/ + goto out; + tot_len += len; + if (tot_len < tmp) /* maths overflow on the compat_ssize_t */ + goto out; + vector->iov_base = compat_ptr(buf); + vector->iov_len = (compat_size_t) len; + uvector++; + vector++; + } + if (tot_len == 0) { + ret = 0; + goto out; + } + + inode = file->f_dentry->d_inode; + /* VERIFY_WRITE actually means a read, as we write to user space */ + ret = locks_verify_area((type == READ + ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE), + inode, file, *pos, tot_len); + if (ret) + goto out; + + fnv = NULL; + if (type == READ) { + fn = file->f_op->read; + fnv = file->f_op->readv; + } else { + fn = (io_fn_t)file->f_op->write; + fnv = file->f_op->writev; + } + if (fnv) { + ret = fnv(file, iov, nr_segs, pos); + goto out; + } + + /* Do it by hand, with file-ops */ + ret = 0; + vector = iov; + while (nr_segs > 0) { + void __user * base; + size_t len; + ssize_t nr; + + base = vector->iov_base; + len = vector->iov_len; + vector++; + nr_segs--; + + nr = fn(file, base, len, pos); + + if (nr < 0) { + if (!ret) ret = nr; + break; + } + ret += nr; + if (nr != len) + break; + } +out: + if (iov != iovstack) + kfree(iov); + if ((ret + (type == READ)) > 0) + dnotify_parent(file->f_dentry, + (type == READ) ? DN_ACCESS : DN_MODIFY); + return ret; +} + +asmlinkage ssize_t +compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) +{ + struct file *file; + ssize_t ret = -EBADF; + + file = fget(fd); + if (!file) + return -EBADF; + + if (!(file->f_mode & FMODE_READ)) + goto out; + + ret = -EINVAL; + if (!file->f_op || (!file->f_op->readv && !file->f_op->read)) + goto out; + + ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); + +out: + fput(file); + return ret; +} + +asmlinkage ssize_t +compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) +{ + struct file *file; + ssize_t ret = -EBADF; + + file = fget(fd); + if (!file) + return -EBADF; + if (!(file->f_mode & FMODE_WRITE)) + goto out; + + ret = -EINVAL; + if (!file->f_op || (!file->f_op->writev && !file->f_op->write)) + goto out; + + ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); + +out: + fput(file); + return ret; +} + +/* + * compat_count() counts the number of arguments/envelopes. It is basically + * a copy of count() from fs/exec.c, except that it works with 32 bit argv + * and envp pointers. + */ +static int compat_count(compat_uptr_t *argv, int max) +{ + int i = 0; + + if (argv != NULL) { + for (;;) { + compat_uptr_t p; + + if (get_user(p, argv)) + return -EFAULT; + if (!p) + break; + argv++; + if(++i > max) + return -E2BIG; + } + } + return i; +} + +/* + * compat_copy_strings() is basically a copy of copy_strings() from fs/exec.c + * except that it works with 32 bit argv and envp pointers. + */ +static int compat_copy_strings(int argc, compat_uptr_t __user *argv, + struct linux_binprm *bprm) +{ + struct page *kmapped_page = NULL; + char *kaddr = NULL; + int ret; + + while (argc-- > 0) { + compat_uptr_t str; + int len; + unsigned long pos; + + if (get_user(str, argv+argc) || + !(len = strnlen_user(compat_ptr(str), bprm->p))) { + ret = -EFAULT; + goto out; + } + + if (bprm->p < len) { + ret = -E2BIG; + goto out; + } + + bprm->p -= len; + /* XXX: add architecture specific overflow check here. 
*/ + pos = bprm->p; + + while (len > 0) { + int i, new, err; + int offset, bytes_to_copy; + struct page *page; + + offset = pos % PAGE_SIZE; + i = pos/PAGE_SIZE; + page = bprm->page[i]; + new = 0; + if (!page) { + page = alloc_page(GFP_HIGHUSER); + bprm->page[i] = page; + if (!page) { + ret = -ENOMEM; + goto out; + } + new = 1; + } + + if (page != kmapped_page) { + if (kmapped_page) + kunmap(kmapped_page); + kmapped_page = page; + kaddr = kmap(kmapped_page); + } + if (new && offset) + memset(kaddr, 0, offset); + bytes_to_copy = PAGE_SIZE - offset; + if (bytes_to_copy > len) { + bytes_to_copy = len; + if (new) + memset(kaddr+offset+len, 0, + PAGE_SIZE-offset-len); + } + err = copy_from_user(kaddr+offset, compat_ptr(str), + bytes_to_copy); + if (err) { + ret = -EFAULT; + goto out; + } + + pos += bytes_to_copy; + str += bytes_to_copy; + len -= bytes_to_copy; + } + } + ret = 0; +out: + if (kmapped_page) + kunmap(kmapped_page); + return ret; +} + +#ifdef CONFIG_MMU + +#define free_arg_pages(bprm) do { } while (0) + +#else + +static inline void free_arg_pages(struct linux_binprm *bprm) +{ + int i; + + for (i = 0; i < MAX_ARG_PAGES; i++) { + if (bprm->page[i]) + __free_page(bprm->page[i]); + bprm->page[i] = NULL; + } +} + +#endif /* CONFIG_MMU */ + +/* + * compat_do_execve() is mostly a copy of do_execve(), with the exception + * that it processes 32 bit argv and envp pointers. + */ +int compat_do_execve(char * filename, + compat_uptr_t __user *argv, + compat_uptr_t __user *envp, + struct pt_regs * regs) +{ + struct linux_binprm bprm; + struct file *file; + int retval; + int i; + + sched_balance_exec(); + + file = open_exec(filename); + + retval = PTR_ERR(file); + if (IS_ERR(file)) + return retval; + + bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); + memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0])); + + bprm.file = file; + bprm.filename = filename; + bprm.interp = filename; + bprm.sh_bang = 0; + bprm.loader = 0; + bprm.exec = 0; + bprm.security = NULL; + bprm.mm = mm_alloc(); + retval = -ENOMEM; + if (!bprm.mm) + goto out_file; + + retval = init_new_context(current, bprm.mm); + if (retval < 0) + goto out_mm; + + bprm.argc = compat_count(argv, bprm.p / sizeof(compat_uptr_t)); + if ((retval = bprm.argc) < 0) + goto out_mm; + + bprm.envc = compat_count(envp, bprm.p / sizeof(compat_uptr_t)); + if ((retval = bprm.envc) < 0) + goto out_mm; + + retval = security_bprm_alloc(&bprm); + if (retval) + goto out; + + retval = prepare_binprm(&bprm); + if (retval < 0) + goto out; + + retval = copy_strings_kernel(1, &bprm.filename, &bprm); + if (retval < 0) + goto out; + + bprm.exec = bprm.p; + retval = compat_copy_strings(bprm.envc, envp, &bprm); + if (retval < 0) + goto out; + + retval = compat_copy_strings(bprm.argc, argv, &bprm); + if (retval < 0) + goto out; + + retval = search_binary_handler(&bprm,regs); + if (retval >= 0) { + free_arg_pages(&bprm); + + /* execve success */ + security_bprm_free(&bprm); + return retval; + } + +out: + /* Something went wrong, return the inode and free the argument pages*/ + for (i = 0 ; i < MAX_ARG_PAGES ; i++) { + struct page * page = bprm.page[i]; + if (page) + __free_page(page); + } + + if (bprm.security) + security_bprm_free(&bprm); + +out_mm: + if (bprm.mm) + mmdrop(bprm.mm); + +out_file: + if (bprm.file) { + allow_write_access(bprm.file); + fput(bprm.file); + } + + return retval; +} + +#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) + +#define ROUND_UP(x,y) (((x)+(y)-1)/(y)) + +/* + * Ooo, nasty. 
We need here to frob 32-bit unsigned longs to + * 64-bit unsigned longs. + */ +static inline +int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + nr = ROUND_UP(nr, __COMPAT_NFDBITS); + if (ufdset) { + unsigned long odd; + + if (verify_area(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) + return -EFAULT; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + __get_user(l, ufdset); + __get_user(h, ufdset+1); + ufdset += 2; + *fdset++ = h << 32 | l; + nr -= 2; + } + if (odd) + __get_user(*fdset, ufdset); + } else { + /* Tricky, must clear full unsigned long in the + * kernel fdset at the end, this makes sure that + * actually happens. + */ + memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); + } + return 0; +} + +static inline +void compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + unsigned long odd; + nr = ROUND_UP(nr, __COMPAT_NFDBITS); + + if (!ufdset) + return; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + l = *fdset++; + h = l >> 32; + __put_user(l, ufdset); + __put_user(h, ufdset+1); + ufdset += 2; + nr -= 2; + } + if (odd) + __put_user(*fdset, ufdset); +} + + +/* + * This is a virtual copy of sys_select from fs/select.c and probably + * should be compared to it from time to time + */ +static void *select_bits_alloc(int size) +{ + return kmalloc(6 * size, GFP_KERNEL); +} + +static void select_bits_free(void *bits, int size) +{ + kfree(bits); +} + +/* + * We can actually return ERESTARTSYS instead of EINTR, but I'd + * like to be certain this leads to no problems. So I return + * EINTR just for safety. + * + * Update: ERESTARTSYS breaks at least the xview clock binary, so + * I'm trying ERESTARTNOHAND which restart only when you want to. + */ +#define MAX_SELECT_SECONDS \ + ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) + +asmlinkage long +compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, + compat_ulong_t __user *exp, struct compat_timeval __user *tvp) +{ + fd_set_bits fds; + char *bits; + long timeout; + int ret, size, max_fdset; + + timeout = MAX_SCHEDULE_TIMEOUT; + if (tvp) { + time_t sec, usec; + + if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp))) + || (ret = __get_user(sec, &tvp->tv_sec)) + || (ret = __get_user(usec, &tvp->tv_usec))) + goto out_nofds; + + ret = -EINVAL; + if (sec < 0 || usec < 0) + goto out_nofds; + + if ((unsigned long) sec < MAX_SELECT_SECONDS) { + timeout = ROUND_UP(usec, 1000000/HZ); + timeout += sec * (unsigned long) HZ; + } + } + + ret = -EINVAL; + if (n < 0) + goto out_nofds; + + /* max_fdset can increase, so grab it once to avoid race */ + max_fdset = current->files->max_fdset; + if (n > max_fdset) + n = max_fdset; + + /* + * We need 6 bitmaps (in/out/ex for both incoming and outgoing), + * since we used fdset we need to allocate memory in units of + * long-words. 
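+ * As a worked example (added for illustration, not in the original
+ * comment): with n = 100 descriptors, FDS_BYTES(100) rounds 100 bits
+ * up to whole longs and gives 16 bytes per set on both 32-bit and
+ * 64-bit kernels, so the single select_bits_alloc() call below asks
+ * for 6 * 16 = 96 bytes covering all six sets.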
+ */ + ret = -ENOMEM; + size = FDS_BYTES(n); + bits = select_bits_alloc(size); + if (!bits) + goto out_nofds; + fds.in = (unsigned long *) bits; + fds.out = (unsigned long *) (bits + size); + fds.ex = (unsigned long *) (bits + 2*size); + fds.res_in = (unsigned long *) (bits + 3*size); + fds.res_out = (unsigned long *) (bits + 4*size); + fds.res_ex = (unsigned long *) (bits + 5*size); + + if ((ret = compat_get_fd_set(n, inp, fds.in)) || + (ret = compat_get_fd_set(n, outp, fds.out)) || + (ret = compat_get_fd_set(n, exp, fds.ex))) + goto out; + zero_fd_set(n, fds.res_in); + zero_fd_set(n, fds.res_out); + zero_fd_set(n, fds.res_ex); + + ret = do_select(n, &fds, &timeout); + + if (tvp && !(current->personality & STICKY_TIMEOUTS)) { + time_t sec = 0, usec = 0; + if (timeout) { + sec = timeout / HZ; + usec = timeout % HZ; + usec *= (1000000/HZ); + } + if (put_user(sec, &tvp->tv_sec) || + put_user(usec, &tvp->tv_usec)) + ret = -EFAULT; + } + + if (ret < 0) + goto out; + if (!ret) { + ret = -ERESTARTNOHAND; + if (signal_pending(current)) + goto out; + ret = 0; + } + + compat_set_fd_set(n, inp, fds.res_in); + compat_set_fd_set(n, outp, fds.res_out); + compat_set_fd_set(n, exp, fds.res_ex); + +out: + select_bits_free(bits, size); +out_nofds: + return ret; +} + +#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) +/* Stuff for NFS server syscalls... */ +struct compat_nfsctl_svc { + u16 svc32_port; + s32 svc32_nthreads; +}; + +struct compat_nfsctl_client { + s8 cl32_ident[NFSCLNT_IDMAX+1]; + s32 cl32_naddr; + struct in_addr cl32_addrlist[NFSCLNT_ADDRMAX]; + s32 cl32_fhkeytype; + s32 cl32_fhkeylen; + u8 cl32_fhkey[NFSCLNT_KEYMAX]; +}; + +struct compat_nfsctl_export { + char ex32_client[NFSCLNT_IDMAX+1]; + char ex32_path[NFS_MAXPATHLEN+1]; + compat_dev_t ex32_dev; + compat_ino_t ex32_ino; + compat_int_t ex32_flags; + compat_uid_t ex32_anon_uid; + compat_gid_t ex32_anon_gid; +}; + +struct compat_nfsctl_fdparm { + struct sockaddr gd32_addr; + s8 gd32_path[NFS_MAXPATHLEN+1]; + compat_int_t gd32_version; +}; + +struct compat_nfsctl_fsparm { + struct sockaddr gd32_addr; + s8 gd32_path[NFS_MAXPATHLEN+1]; + compat_int_t gd32_maxlen; +}; + +struct compat_nfsctl_arg { + compat_int_t ca32_version; /* safeguard */ + union { + struct compat_nfsctl_svc u32_svc; + struct compat_nfsctl_client u32_client; + struct compat_nfsctl_export u32_export; + struct compat_nfsctl_fdparm u32_getfd; + struct compat_nfsctl_fsparm u32_getfs; + } u; +#define ca32_svc u.u32_svc +#define ca32_client u.u32_client +#define ca32_export u.u32_export +#define ca32_getfd u.u32_getfd +#define ca32_getfs u.u32_getfs +}; + +union compat_nfsctl_res { + __u8 cr32_getfh[NFS_FHSIZE]; + struct knfsd_fh cr32_getfs; +}; + +static int compat_nfs_svc_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg *arg) +{ + int err; + + err = access_ok(VERIFY_READ, &arg->ca32_svc, sizeof(arg->ca32_svc)); + err |= get_user(karg->ca_version, &arg->ca32_version); + err |= __get_user(karg->ca_svc.svc_port, &arg->ca32_svc.svc32_port); + err |= __get_user(karg->ca_svc.svc_nthreads, &arg->ca32_svc.svc32_nthreads); + return (err) ? 
-EFAULT : 0; +} + +static int compat_nfs_clnt_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg *arg) +{ + int err; + + err = access_ok(VERIFY_READ, &arg->ca32_client, sizeof(arg->ca32_client)); + err |= get_user(karg->ca_version, &arg->ca32_version); + err |= __copy_from_user(&karg->ca_client.cl_ident[0], + &arg->ca32_client.cl32_ident[0], + NFSCLNT_IDMAX); + err |= __get_user(karg->ca_client.cl_naddr, &arg->ca32_client.cl32_naddr); + err |= __copy_from_user(&karg->ca_client.cl_addrlist[0], + &arg->ca32_client.cl32_addrlist[0], + (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)); + err |= __get_user(karg->ca_client.cl_fhkeytype, + &arg->ca32_client.cl32_fhkeytype); + err |= __get_user(karg->ca_client.cl_fhkeylen, + &arg->ca32_client.cl32_fhkeylen); + err |= __copy_from_user(&karg->ca_client.cl_fhkey[0], + &arg->ca32_client.cl32_fhkey[0], + NFSCLNT_KEYMAX); + + return (err) ? -EFAULT : 0; +} + +static int compat_nfs_exp_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg *arg) +{ + int err; + + err = access_ok(VERIFY_READ, &arg->ca32_export, sizeof(arg->ca32_export)); + err |= get_user(karg->ca_version, &arg->ca32_version); + err |= __copy_from_user(&karg->ca_export.ex_client[0], + &arg->ca32_export.ex32_client[0], + NFSCLNT_IDMAX); + err |= __copy_from_user(&karg->ca_export.ex_path[0], + &arg->ca32_export.ex32_path[0], + NFS_MAXPATHLEN); + err |= __get_user(karg->ca_export.ex_dev, + &arg->ca32_export.ex32_dev); + err |= __get_user(karg->ca_export.ex_ino, + &arg->ca32_export.ex32_ino); + err |= __get_user(karg->ca_export.ex_flags, + &arg->ca32_export.ex32_flags); + err |= __get_user(karg->ca_export.ex_anon_uid, + &arg->ca32_export.ex32_anon_uid); + err |= __get_user(karg->ca_export.ex_anon_gid, + &arg->ca32_export.ex32_anon_gid); + SET_UID(karg->ca_export.ex_anon_uid, karg->ca_export.ex_anon_uid); + SET_GID(karg->ca_export.ex_anon_gid, karg->ca_export.ex_anon_gid); + + return (err) ? -EFAULT : 0; +} + +static int compat_nfs_getfd_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg *arg) +{ + int err; + + err = access_ok(VERIFY_READ, &arg->ca32_getfd, sizeof(arg->ca32_getfd)); + err |= get_user(karg->ca_version, &arg->ca32_version); + err |= __copy_from_user(&karg->ca_getfd.gd_addr, + &arg->ca32_getfd.gd32_addr, + (sizeof(struct sockaddr))); + err |= __copy_from_user(&karg->ca_getfd.gd_path, + &arg->ca32_getfd.gd32_path, + (NFS_MAXPATHLEN+1)); + err |= __get_user(karg->ca_getfd.gd_version, + &arg->ca32_getfd.gd32_version); + + return (err) ? -EFAULT : 0; +} + +static int compat_nfs_getfs_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg *arg) +{ + int err; + + err = access_ok(VERIFY_READ, &arg->ca32_getfs, sizeof(arg->ca32_getfs)); + err |= get_user(karg->ca_version, &arg->ca32_version); + err |= __copy_from_user(&karg->ca_getfs.gd_addr, + &arg->ca32_getfs.gd32_addr, + (sizeof(struct sockaddr))); + err |= __copy_from_user(&karg->ca_getfs.gd_path, + &arg->ca32_getfs.gd32_path, + (NFS_MAXPATHLEN+1)); + err |= __get_user(karg->ca_getfs.gd_maxlen, + &arg->ca32_getfs.gd32_maxlen); + + return (err) ? -EFAULT : 0; +} + +/* This really doesn't need translations, we are only passing + * back a union which contains opaque nfs file handle data. + */ +static int compat_nfs_getfh_res_trans(union nfsctl_res *kres, union compat_nfsctl_res *res) +{ + int err; + + err = copy_to_user(res, kres, sizeof(*res)); + + return (err) ? 
-EFAULT : 0; +} + +asmlinkage long compat_sys_nfsservctl(int cmd, struct compat_nfsctl_arg *arg, + union compat_nfsctl_res *res) +{ + struct nfsctl_arg *karg; + union nfsctl_res *kres; + mm_segment_t oldfs; + int err; + + karg = kmalloc(sizeof(*karg), GFP_USER); + kres = kmalloc(sizeof(*kres), GFP_USER); + if(!karg || !kres) { + err = -ENOMEM; + goto done; + } + + switch(cmd) { + case NFSCTL_SVC: + err = compat_nfs_svc_trans(karg, arg); + break; + + case NFSCTL_ADDCLIENT: + err = compat_nfs_clnt_trans(karg, arg); + break; + + case NFSCTL_DELCLIENT: + err = compat_nfs_clnt_trans(karg, arg); + break; + + case NFSCTL_EXPORT: + case NFSCTL_UNEXPORT: + err = compat_nfs_exp_trans(karg, arg); + break; + + case NFSCTL_GETFD: + err = compat_nfs_getfd_trans(karg, arg); + break; + + case NFSCTL_GETFS: + err = compat_nfs_getfs_trans(karg, arg); + break; + + default: + err = -EINVAL; + goto done; + } + + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = sys_nfsservctl(cmd, karg, kres); + set_fs(oldfs); + + if (err) + goto done; + + if((cmd == NFSCTL_GETFD) || + (cmd == NFSCTL_GETFS)) + err = compat_nfs_getfh_res_trans(kres, res); + +done: + kfree(karg); + kfree(kres); + return err; +} +#else /* !NFSD */ +long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2) +{ + return sys_ni_syscall(); +} +#endif --- diff/fs/compat_ioctl.c 2004-04-21 10:45:35.172356488 +0100 +++ source/fs/compat_ioctl.c 2004-04-21 10:46:51.688724232 +0100 @@ -70,8 +70,10 @@ #include /* siocdevprivate_ioctl */ #include -#include #include +#include + +#include #include /* Ugly hack. */ --- diff/fs/dcache.c 2004-04-21 10:45:35.173356336 +0100 +++ source/fs/dcache.c 2004-04-21 10:46:51.689724080 +0100 @@ -1557,13 +1557,9 @@ static void __init dcache_init(unsigned * flag could be removed here, to hint to the allocator that * it should not try to get multiple page regions. 
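 * (Aside, for clarity: the rewritten calls below also pass the
 * SLAB_PANIC flag, which makes kmem_cache_create() itself panic on
 * allocation failure, so the old open-coded "Cannot create ... cache"
 * panic branches become redundant and are deleted.)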
*/ - dentry_cache = kmem_cache_create("dentry_cache", - sizeof(struct dentry), - 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, - NULL, NULL); - if (!dentry_cache) - panic("Cannot create dentry cache"); + dentry_cache = kmem_cache_create("dentry_cache", sizeof(struct dentry), + 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_PANIC, + NULL, NULL); set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory); @@ -1628,17 +1624,11 @@ void __init vfs_caches_init(unsigned lon reserve = (mempages - nr_free_pages()) * 3/2; mempages -= reserve; - names_cachep = kmem_cache_create("names_cache", - PATH_MAX, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!names_cachep) - panic("Cannot create names SLAB cache"); - - filp_cachep = kmem_cache_create("filp", - sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN, filp_ctor, filp_dtor); - if(!filp_cachep) - panic("Cannot create filp SLAB cache"); + names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + + filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); dcache_init(mempages); inode_init(mempages); --- diff/fs/dnotify.c 2004-04-21 10:45:35.181355120 +0100 +++ source/fs/dnotify.c 2004-04-21 10:46:51.690723928 +0100 @@ -173,9 +173,7 @@ EXPORT_SYMBOL_GPL(dnotify_parent); static int __init dnotify_init(void) { dn_cache = kmem_cache_create("dnotify_cache", - sizeof(struct dnotify_struct), 0, 0, NULL, NULL); - if (!dn_cache) - panic("cannot create dnotify slab cache"); + sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL, NULL); return 0; } --- diff/fs/dquot.c 2004-04-21 10:45:35.183354816 +0100 +++ source/fs/dquot.c 2004-04-21 10:46:51.690723928 +0100 @@ -1281,9 +1281,6 @@ int vfs_quota_off(struct super_block *sb int cnt; struct quota_info *dqopt = sb_dqopt(sb); - if (!sb) - goto out; - /* We need to serialize quota_off() for device */ down(&dqopt->dqonoff_sem); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1703,12 +1700,9 @@ static int __init dquot_init(void) dquot_cachep = kmem_cache_create("dquot", sizeof(struct dquot), sizeof(unsigned long) * 4, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, NULL, NULL); - if (!dquot_cachep) - panic("Cannot create dquot SLAB cache"); - + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_PANIC, + NULL, NULL); set_shrinker(DEFAULT_SEEKS, shrink_dqcache_memory); - return 0; } module_init(dquot_init); --- diff/fs/eventpoll.c 2004-04-21 10:45:35.184354664 +0100 +++ source/fs/eventpoll.c 2004-04-21 10:46:51.691723776 +0100 @@ -1695,22 +1695,14 @@ static int __init eventpoll_init(void) ep_poll_safewake_init(&psw); /* Allocates slab cache used to allocate "struct epitem" items */ - error = -ENOMEM; - epi_cache = kmem_cache_create("eventpoll_epi", - sizeof(struct epitem), - 0, - SLAB_HWCACHE_ALIGN | EPI_SLAB_DEBUG, NULL, NULL); - if (!epi_cache) - goto eexit_1; + epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), + 0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC, + NULL, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ - error = -ENOMEM; pwq_cache = kmem_cache_create("eventpoll_pwq", - sizeof(struct eppoll_entry), - 0, - EPI_SLAB_DEBUG, NULL, NULL); - if (!pwq_cache) - goto eexit_2; + sizeof(struct eppoll_entry), 0, + EPI_SLAB_DEBUG|SLAB_PANIC, NULL, NULL); /* * Register the virtual file system that will be the source of inodes @@ -1718,27 +1710,20 @@ static int __init eventpoll_init(void) */ error = register_filesystem(&eventpoll_fs_type); if (error) - goto eexit_3; + goto epanic; /* Mount the above 
commented virtual file system */ eventpoll_mnt = kern_mount(&eventpoll_fs_type); error = PTR_ERR(eventpoll_mnt); if (IS_ERR(eventpoll_mnt)) - goto eexit_4; - - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n", current)); + goto epanic; + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n", + current)); return 0; -eexit_4: - unregister_filesystem(&eventpoll_fs_type); -eexit_3: - kmem_cache_destroy(pwq_cache); -eexit_2: - kmem_cache_destroy(epi_cache); -eexit_1: - - return error; +epanic: + panic("eventpoll_init() failed\n"); } @@ -1755,4 +1740,3 @@ module_init(eventpoll_init); module_exit(eventpoll_exit); MODULE_LICENSE("GPL"); - --- diff/fs/exec.c 2004-04-21 10:45:35.185354512 +0100 +++ source/fs/exec.c 2004-04-21 10:46:51.692723624 +0100 @@ -48,7 +48,6 @@ #include #include -#include #include #ifdef CONFIG_KMOD @@ -293,53 +292,42 @@ EXPORT_SYMBOL(copy_strings_kernel); * This routine is used to map in a page into an address space: needed by * execve() for the initial stack and environment pages. * - * tsk->mmap_sem is held for writing. + * tsk->mm->mmap_sem is held for writing. */ void put_dirty_page(struct task_struct *tsk, struct page *page, unsigned long address, pgprot_t prot) { + struct mm_struct *mm = tsk->mm; pgd_t * pgd; pmd_t * pmd; pte_t * pte; - struct pte_chain *pte_chain; - if (page_count(page) != 1) - printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", - page, address); - - pgd = pgd_offset(tsk->mm, address); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto out_sig; - spin_lock(&tsk->mm->page_table_lock); - pmd = pmd_alloc(tsk->mm, pgd, address); + pgd = pgd_offset(mm, address); + spin_lock(&mm->page_table_lock); + pmd = pmd_alloc(mm, pgd, address); if (!pmd) goto out; - pte = pte_alloc_map(tsk->mm, pmd, address); + pte = pte_alloc_map(mm, pmd, address); if (!pte) goto out; if (!pte_none(*pte)) { pte_unmap(pte); goto out; } + mm->rss++; lru_cache_add_active(page); flush_dcache_page(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); - pte_chain = page_add_rmap(page, pte, pte_chain); + page_add_anon_rmap(page, mm, address); pte_unmap(pte); - tsk->mm->rss++; - spin_unlock(&tsk->mm->page_table_lock); + spin_unlock(&mm->page_table_lock); /* no need for flush_tlb */ - pte_chain_free(pte_chain); return; out: - spin_unlock(&tsk->mm->page_table_lock); -out_sig: + spin_unlock(&mm->page_table_lock); __free_page(page); force_sig(SIGKILL, tsk); - pte_chain_free(pte_chain); - return; } int setup_arg_pages(struct linux_binprm *bprm, int executable_stack) @@ -438,6 +426,7 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_ops = NULL; mpnt->vm_pgoff = 0; mpnt->vm_file = NULL; + mpol_set_vma_default(mpnt); INIT_LIST_HEAD(&mpnt->shared); mpnt->vm_private_data = (void *) 0; insert_vm_struct(mm, mpnt); @@ -848,7 +837,8 @@ int flush_old_exec(struct linux_binprm * flush_thread(); if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || - permission(bprm->file->f_dentry->d_inode,MAY_READ, NULL)) + permission(bprm->file->f_dentry->d_inode,MAY_READ, NULL) || + (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) current->mm->dumpable = 0; /* An exec changes our domain. We are no longer part of the thread @@ -870,15 +860,6 @@ out: EXPORT_SYMBOL(flush_old_exec); -/* - * We mustn't allow tracing of suid binaries, unless - * the tracer has the capability to trace anything.. 
- */ -static inline int must_not_trace_exec(struct task_struct * p) -{ - return (p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP); -} - /* * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes @@ -945,27 +926,7 @@ EXPORT_SYMBOL(prepare_binprm); void compute_creds(struct linux_binprm *bprm) { - task_lock(current); - if (bprm->e_uid != current->uid || bprm->e_gid != current->gid) { - current->mm->dumpable = 0; - - if (must_not_trace_exec(current) - || atomic_read(¤t->fs->count) > 1 - || atomic_read(¤t->files->count) > 1 - || atomic_read(¤t->sighand->count) > 1) { - if(!capable(CAP_SETUID)) { - bprm->e_uid = current->uid; - bprm->e_gid = current->gid; - } - } - } - - current->suid = current->euid = current->fsuid = bprm->e_uid; - current->sgid = current->egid = current->fsgid = bprm->e_gid; - - task_unlock(current); - - security_bprm_compute_creds(bprm); + security_bprm_apply_creds(bprm); } EXPORT_SYMBOL(compute_creds); @@ -1120,6 +1081,7 @@ int do_execve(char * filename, bprm.file = file; bprm.filename = filename; bprm.interp = filename; + bprm.interp_flags = 0; bprm.sh_bang = 0; bprm.loader = 0; bprm.exec = 0; --- diff/fs/ext3/balloc.c 2004-03-11 10:20:28.000000000 +0000 +++ source/fs/ext3/balloc.c 2004-04-21 10:46:51.694723320 +0100 @@ -96,9 +96,87 @@ read_block_bitmap(struct super_block *sb error_out: return bh; } +/* + * The reservation window structure operations + * -------------------------------------------- + * Operations include: + * dump, find, add, remove, is_empty, find_next_reservable_window, etc. + * + * We use sorted double linked list for the per-filesystem reservation + * window list. (like in vm_region). + * + * Initially, we keep those small operations in the abstract functions, + * so later if we need a better searching tree than double linked-list, + * we could easily switch to that without changing too much + * code. 
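+ * An illustrative picture (not from the original comment): if two
+ * inodes hold windows [100,131] and [200,231], the per-filesystem
+ * list stays sorted as
+ *
+ * head <-> [100,131] <-> [200,231] <-> head
+ *
+ * so a third file asking for a 32-block window near block 150 can be
+ * offered [150,181] by walking the list alone; the block bitmap is
+ * only consulted afterwards to confirm the window contains a free
+ * block.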
+ */ +static inline void rsv_window_dump(struct reserve_window *head, char *fn) +{ + struct reserve_window *rsv; + + printk("Block Allocation Reservation Windows Map (%s):\n", fn); + list_for_each_entry(rsv, &head->rsv_list, rsv_list) { + printk("reservation window 0x%p start: %d, end: %d\n", + rsv, rsv->rsv_start, rsv->rsv_end); + } +} + +static int +goal_in_my_reservation(struct reserve_window *rsv, int goal, + unsigned int group, struct super_block * sb) +{ + unsigned long group_first_block, group_last_block; + + group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + + group * EXT3_BLOCKS_PER_GROUP(sb); + group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1; + + if ((rsv->rsv_start > group_last_block) || + (rsv->rsv_end < group_first_block)) + return 0; + if ((goal >= 0) && ((goal + group_first_block < rsv->rsv_start) + || (goal + group_first_block > rsv->rsv_end))) + return 0; + return 1; +} + +static inline void rsv_window_add(struct reserve_window *rsv, + struct reserve_window *prev) +{ + /* insert the new reservation window after the head */ + list_add(&rsv->rsv_list, &prev->rsv_list); +} + +static inline void rsv_window_remove(struct reserve_window *rsv) +{ + rsv->rsv_start = 0; + rsv->rsv_end = 0; + rsv->rsv_alloc_hit = 0; + list_del(&rsv->rsv_list); + INIT_LIST_HEAD(&rsv->rsv_list); +} + +static inline int rsv_is_empty(struct reserve_window *rsv) +{ + /* a valid reservation end block could not be 0 */ + return (rsv->rsv_end == 0); +} + +void ext3_discard_reservation(struct inode *inode) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + struct reserve_window *rsv = &ei->i_rsv_window; + spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock; + + if (!rsv_is_empty(rsv)) { + spin_lock(rsv_lock); + rsv_window_remove(rsv); + spin_unlock(rsv_lock); + } +} /* Free given blocks, update quota and i_blocks field */ -void ext3_free_blocks (handle_t *handle, struct inode * inode, +void ext3_free_blocks(handle_t *handle, struct inode *inode, unsigned long block, unsigned long count) { struct buffer_head *bitmap_bh = NULL; @@ -296,7 +374,7 @@ error_return: * data-writes at some point, and disable it for metadata allocations or * sync-data inodes. */ -static inline int ext3_test_allocatable(int nr, struct buffer_head *bh) +static int ext3_test_allocatable(int nr, struct buffer_head *bh) { int ret; struct journal_head *jh = bh2jh(bh); @@ -313,6 +391,33 @@ static inline int ext3_test_allocatable( return ret; } +static int +bitmap_search_next_usable_block(int start, struct buffer_head *bh, + int maxblocks) +{ + int next; + struct journal_head *jh = bh2jh(bh); + + /* + * The bitmap search --- search forward alternately through the actual + * bitmap and the last-committed copy until we find a bit free in + * both + */ + while (start < maxblocks) { + next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start); + if (next >= maxblocks) + return -1; + if (ext3_test_allocatable(next, bh)) + return next; + jbd_lock_bh_state(bh); + if (jh->b_committed_data) + start = ext3_find_next_zero_bit(jh->b_committed_data, + maxblocks, next); + jbd_unlock_bh_state(bh); + } + return -1; +} + /* * Find an allocatable block in a bitmap. We honour both the bitmap and * its last-committed copy (if that exists), and perform the "most @@ -325,7 +430,6 @@ find_next_usable_block(int start, struct { int here, next; char *p, *r; - struct journal_head *jh = bh2jh(bh); if (start > 0) { /* @@ -337,6 +441,8 @@ find_next_usable_block(int start, struct * next 64-bit boundary is simple.. 
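 * (Worked example, added for illustration: a goal of 70 gives
 * end_goal = (70 + 63) & ~63 = 128, so this fast path scans only
 * bits 70..127 before falling back to the wider search below.)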
*/ int end_goal = (start + 63) & ~63; + if (end_goal > maxblocks) + end_goal = maxblocks; here = ext3_find_next_zero_bit(bh->b_data, end_goal, start); if (here < end_goal && ext3_test_allocatable(here, bh)) return here; @@ -351,7 +457,7 @@ find_next_usable_block(int start, struct r = memscan(p, 0, (maxblocks - here + 7) >> 3); next = (r - ((char *)bh->b_data)) << 3; - if (next < maxblocks && ext3_test_allocatable(next, bh)) + if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh)) return next; /* @@ -359,19 +465,8 @@ find_next_usable_block(int start, struct * bitmap and the last-committed copy until we find a bit free in * both */ - while (here < maxblocks) { - next = ext3_find_next_zero_bit(bh->b_data, maxblocks, here); - if (next >= maxblocks) - return -1; - if (ext3_test_allocatable(next, bh)) - return next; - jbd_lock_bh_state(bh); - if (jh->b_committed_data) - here = ext3_find_next_zero_bit(jh->b_committed_data, - maxblocks, next); - jbd_unlock_bh_state(bh); - } - return -1; + here = bitmap_search_next_usable_block(here, bh, maxblocks); + return here; } /* @@ -407,62 +502,454 @@ claim_block(spinlock_t *lock, int block, */ static int ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group, - struct buffer_head *bitmap_bh, int goal, int *errp) + struct buffer_head *bitmap_bh, int goal, struct reserve_window *my_rsv) { - int i; - int fatal; - int credits = 0; - - *errp = 0; + int group_first_block, start, end; - /* - * Make sure we use undo access for the bitmap, because it is critical - * that we do the frozen_data COW on bitmap buffers in all cases even - * if the buffer is in BJ_Forget state in the committing transaction. - */ - BUFFER_TRACE(bitmap_bh, "get undo access for new block"); - fatal = ext3_journal_get_undo_access(handle, bitmap_bh, &credits); - if (fatal) { - *errp = fatal; - goto fail; + /* we do allocation within the reservation window if we have a window */ + if (my_rsv) { + group_first_block = + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + + group * EXT3_BLOCKS_PER_GROUP(sb); + if (my_rsv->rsv_start >= group_first_block) + start = my_rsv->rsv_start - group_first_block; + else + /* reservation window cross group boundary */ + start = 0; + end = my_rsv->rsv_end - group_first_block + 1; + if (end > EXT3_BLOCKS_PER_GROUP(sb)) + /* reservation window crosses group boundary */ + end = EXT3_BLOCKS_PER_GROUP(sb); + if ((start <= goal) && (goal < end)) + start = goal; + else + goal = -1; + } else { + if (goal > 0) + start = goal; + else + start = 0; + end = EXT3_BLOCKS_PER_GROUP(sb); } + BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb)); + repeat: if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) { - goal = find_next_usable_block(goal, bitmap_bh, - EXT3_BLOCKS_PER_GROUP(sb)); + goal = find_next_usable_block(start, bitmap_bh, end); if (goal < 0) goto fail_access; + if (!my_rsv) { + int i; - for (i = 0; i < 7 && goal > 0 && - ext3_test_allocatable(goal - 1, bitmap_bh); - i++, goal--); + for (i = 0; i < 7 && goal > start && + ext3_test_allocatable(goal - 1, + bitmap_bh); + i++, goal--) + ; + } } + start = goal; if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) { /* * The block was allocated by another thread, or it was * allocated and then freed by another thread */ + start++; goal++; - if (goal >= EXT3_BLOCKS_PER_GROUP(sb)) + if (start >= end) goto fail_access; goto repeat; } + if (my_rsv) + my_rsv->rsv_alloc_hit++; + return goal; +fail_access: + return -1; +} + +/** + * find_next_reservable_window(): + * find a 
reservable space within the given range.
+ * It does not allocate the reservation window for now;
+ * alloc_new_reservation() will do the work later.
+ *
+ * @search_head: the head of the searching list;
+ * this is not necessarily the list head of the whole filesystem.
+ *
+ * We have both head and start_block to assist the search
+ * for the reservable space. The list starts from head,
+ * but we will shift to the place where start_block is,
+ * then search from there for a reservable space.
+ *
+ * @fs_rsv_head: per-filesystem reservation list head.
+ *
+ * @size: the target new reservation window size
+ * @group_first_block: the first block we consider to start
+ * the real search from
+ *
+ * @last_block:
+ * the maximum block number that our goal reservable space
+ * could start from. This is normally the last block in this
+ * group. The search ends when the start of the next possible
+ * reservable space is beyond this boundary.
+ * This handles the case of a reservation window request that
+ * crosses the group boundary.
+ *
+ * Basically we search the given range (start_block, last_block),
+ * rather than the whole reservation double linked list,
+ * to find a free region that is of the requested size and has
+ * not been reserved.
+ *
+ * On success, it returns the reservation window to append to;
+ * on failure, it returns NULL.
+ */
+static inline
+struct reserve_window *find_next_reservable_window(
+ struct reserve_window *search_head,
+ struct reserve_window *fs_rsv_head,
+ unsigned long size, int *start_block,
+ int last_block)
+{
+ struct reserve_window *rsv;
+ int cur;
+
+ /* TODO: make the start of the reservation window byte aligned */
+ /*cur = *start_block & 8;*/
+ cur = *start_block;
+ rsv = list_entry(search_head->rsv_list.next,
+ struct reserve_window, rsv_list);
+ while (rsv != fs_rsv_head) {
+ if (cur + size <= rsv->rsv_start) {
+ /*
+ * Found a reservable space big enough. We could
+ * have a reservation across the group boundary here
+ */
+ break;
+ }
+ if (cur <= rsv->rsv_end)
+ cur = rsv->rsv_end + 1;
+
+ /* TODO?
+ * in the case we could not find a reservable space
+ * of the expected size, during the re-search, we could
+ * remember what's the largest reservable space we could have
+ * and return that one.
+ *
+ * For now it will fail if we could not find the reservable
+ * space with the expected size (or more)...
+ */
+ rsv = list_entry(rsv->rsv_list.next,
+ struct reserve_window, rsv_list);
+ if (cur > last_block)
+ return NULL; /* fail */
+ }
+ /*
+ * We come here either:
+ * when we reach the end of the whole list, and there is empty
+ * reservable space after the last entry in the list, in which
+ * case we append to the end of the list;
+ *
+ * or when we found one reservable space in the middle of the
+ * list, in which case we return the reservation window that we
+ * could append to. Success.
+ */
+ *start_block = cur;
+ return list_entry(rsv->rsv_list.prev, struct reserve_window, rsv_list);
+}
+
+/**
+ * alloc_new_reservation()--allocate a new reservation window
+ * if there is an existing reservation, discard it first
+ * then allocate the new one from there;
+ * otherwise allocate the new reservation from the given
+ * start block, or the beginning of the group, if a goal
+ * is not given.
+ *
+ * To make a new reservation, we search part of the filesystem
+ * reservation list (the part inside this group).
+ *
+ * If we have an old reservation, the search goal is the end of
+ * the last reservation. If we do not have an old reservation, then we
+ * start from a given goal, or the first block of the group, if
+ * the goal is not given.
+ *
+ * We first find a reservable space after the goal, then from
+ * there, we check the bitmap for the first free block after
+ * it. If there is no free block until the end of the group, then the
+ * whole group is full and we fail. Otherwise, we check whether the free
+ * block is inside the expected reservable space; if so, we
+ * succeed.
+ * If the first free block is outside the reservable space, then
+ * starting from that first free block, we search for the next
+ * reservable space, and go on.
+ *
+ * On success, a new reservation is found and inserted into the list.
+ * It contains at least one free block, and it does not overlap with
+ * other reservation windows.
+ *
+ * On failure, no reservation window could be found in this group.
+ *
+ * @rsv: the reservation
+ *
+ * @goal: The goal. It is where the search for a
+ * free reservable space should start from.
+ * If we have an old reservation, start_block is the end of
+ * the old reservation. Otherwise,
+ * if we have a goal (goal > 0), we start from there; with
+ * no goal (goal = -1), we start from the first block
+ * of the group.
+ *
+ * @sb: the super block
+ * @group: the group we are trying to allocate in
+ * @bitmap_bh: the block group block bitmap
+ */
+static int alloc_new_reservation(struct reserve_window *my_rsv,
+ int goal, struct super_block *sb,
+ unsigned int group, struct buffer_head *bitmap_bh)
+{
+ struct reserve_window *search_head;
+ int group_first_block, group_end_block, start_block;
+ int first_free_block;
+ int reservable_space_start;
+ struct reserve_window *prev_rsv;
+ struct reserve_window *fs_rsv_head = &EXT3_SB(sb)->s_rsv_window_head;
+ unsigned long size;
+
+ group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
+ group * EXT3_BLOCKS_PER_GROUP(sb);
+ group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
+
+ if (goal < 0)
+ start_block = group_first_block;
+ else
+ start_block = goal + group_first_block;
+
+ size = atomic_read(&my_rsv->rsv_goal_size);
+ /* if we have an old reservation, discard it first */
+ if (!rsv_is_empty(my_rsv)) {
+ /*
+ * If the old reservation crosses the group boundary,
+ * we will come here when we just failed to allocate from
+ * the first part of the window. We still have another part
+ * that belongs to the next group. In this case, there is no
+ * point in discarding our window and trying to allocate a
+ * new one in this group (which will fail). We should
+ * keep the reservation window and simply move on.
+ *
+ * Maybe we could shift the start block of the reservation
+ * window to the first block of the next group.
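+ * (Illustrative numbers, added for clarity: if a group's last block
+ * is 32767, a window [32760,32791] straddles the boundary; when
+ * allocation from the 32760..32767 half fails, the window is kept
+ * as-is so the 32768..32791 half can still be used from the next
+ * group.)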
+ */
+
+ if ((my_rsv->rsv_start <= group_end_block) &&
+ (my_rsv->rsv_end > group_end_block))
+ return -1;
+
+ /* remember where we are before we discard the old one */
+ if (my_rsv->rsv_end + 1 > start_block)
+ start_block = my_rsv->rsv_end + 1;
+ search_head = list_entry(my_rsv->rsv_list.prev,
+ struct reserve_window, rsv_list);
+ if ((my_rsv->rsv_alloc_hit > (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
+ /*
+ * If our previous allocation hit ratio is greater than
+ * half, we double the size of the reservation window the
+ * next time; otherwise we keep it the same.
+ */
+ size = size * 2;
+ atomic_set(&my_rsv->rsv_goal_size, size);
+ }
+ rsv_window_remove(my_rsv);
+ }
+ else {
+ /*
+ * We don't have a reservation, so
+ * we set our goal (start_block) and
+ * the list head for the search.
+ */
+ search_head = fs_rsv_head;
+ }
- /*
+ * find_next_reservable_window() simply finds a reservable window
+ * inside the given range (start_block, group_end_block).
+ *
+ * To make sure the reservation window has a free bit inside it, we
+ * need to check the bitmap after we find a reservable window.
+ */
+retry:
+ prev_rsv = find_next_reservable_window(search_head, fs_rsv_head, size,
+ &start_block, group_end_block);
+ if (prev_rsv == NULL)
+ goto failed;
+ reservable_space_start = start_block;
+ /*
+ * On success, find_next_reservable_window() returns the
+ * reservation window where there is a reservable space after it.
+ * Before we reserve this reservable space, we need
+ * to make sure there is at least one free block inside this region.
+ *
+ * Search the block bitmap and the copy of the last-committed bitmap
+ * alternately for the first free bit, until we find an allocatable
+ * block. The search starts from the start block of the reservable
+ * space we just found.
+ */
+ first_free_block = bitmap_search_next_usable_block(
+ reservable_space_start - group_first_block,
+ bitmap_bh, group_end_block - group_first_block + 1);
+
+ if (first_free_block < 0) {
+ /*
+ * No free block left in the bitmap, so there is no point
+ * in reserving the space. Return failure.
+ */
+ goto failed;
+ }
+ start_block = first_free_block + group_first_block;
+ /*
+ * Check whether the first free block is within the
+ * free space we just found.
+ */
+ if ((start_block >= reservable_space_start) &&
+ (start_block < reservable_space_start + size))
+ goto found_rsv_window;
+ /*
+ * If the first free bit we found is outside the reservable space,
+ * this means there is no free block inside the reservable space;
+ * we should continue searching for the next reservable space,
+ * starting from where the free block is.
+ * We also shift the list head to where we stopped last time.
+ */
+ search_head = prev_rsv;
+ goto retry;
+
+found_rsv_window:
+ /*
+ * Great! The reservable space contains some free blocks.
+ * Insert it into the list.
+ */
+ rsv_window_add(my_rsv, prev_rsv);
+ my_rsv->rsv_start = reservable_space_start;
+ my_rsv->rsv_end = my_rsv->rsv_start + size - 1;
+ return 0; /* succeed */
+failed:
+ return -1; /* failed */
+}
+
+/*
+ * This is the main function used to allocate a new block and its
+ * reservation window.
+ *
+ * Each time a new block allocation is needed, we first try to allocate from
+ * the inode's own reservation. If it does not have a reservation window,
+ * then instead of looking for a free bit in the bitmap first and then
+ * looking up the reservation list to see whether that bit falls inside
+ * somebody else's reservation window, we try to allocate a reservation
+ * window for the inode, starting from the goal, first. Then we do the block
+ * allocation within the reservation window.
+ *
+ * This avoids searching the reservation list again and again when
+ * somebody is looking for a free block (without a reservation) and there are
+ * lots of free blocks, but they are all being reserved.
+ *
+ * We use a sorted double linked list for the per-filesystem reservation
+ * list. The insert, remove and find-a-free-space (non-reserved) operations
+ * on the sorted double linked list should be fast.
+ *
+ */
+static int
+ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
+ unsigned int group, struct buffer_head *bitmap_bh,
+ int goal, struct reserve_window * my_rsv,
+ int *errp)
+{
+ spinlock_t *rsv_lock;
+ unsigned long group_first_block;
+ int ret = 0;
+ int fatal;
+ int credits = 0;
+
+ *errp = 0;
+
+ /*
+ * Make sure we use undo access for the bitmap, because it is critical
+ * that we do the frozen_data COW on bitmap buffers in all cases even
+ * if the buffer is in BJ_Forget state in the committing transaction.
+ */
+ BUFFER_TRACE(bitmap_bh, "get undo access for new block");
+ fatal = ext3_journal_get_undo_access(handle, bitmap_bh, &credits);
 if (fatal) {
 *errp = fatal;
- goto fail;
+ return -1;
+ }
+
+ /*
+ * We don't deal with reservations when
+ * the filesystem is mounted without reservations,
+ * or the file is not a regular file,
+ * or the last attempt to allocate a block with reservations
+ * turned on failed.
+ */
+ if (my_rsv == NULL ) {
+ ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL);
+ goto out;
+ }
+ rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
+ /*
+ * goal is a group-relative block number (if there is a goal):
+ * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb)
+ * group_first_block is a filesystem-wide block number: the
+ * block number of the first block in this group.
+ */
+ group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
+ group * EXT3_BLOCKS_PER_GROUP(sb);
+
+ /*
+ * Basically we will allocate a new block from the inode's reservation
+ * window.
+ *
+ * We need to allocate a new reservation window, if:
+ * a) the inode does not have a reservation window; or
+ * b) the last attempt to allocate a block from the existing
+ * reservation failed; or
+ * c) we come here with a goal that is outside the existing
+ * reservation window.
+ *
+ * We do not need to allocate a new reservation window if we come here
+ * at the beginning with a goal and the goal is inside the window, or
+ * we don't have a goal but already have a reservation window;
+ * then we can go to allocate from the reservation window directly.
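+ * In short, each pass of the loop below behaves as follows
+ * (summary added for illustration):
+ * window empty, last attempt failed, or goal outside window
+ * -> call alloc_new_reservation() first;
+ * otherwise
+ * -> call ext3_try_to_allocate() within the window directly.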
+ */
+ while (1) {
+ if (rsv_is_empty(my_rsv) || (ret < 0) ||
+ !goal_in_my_reservation(my_rsv, goal, group, sb)) {
+ spin_lock(rsv_lock);
+ ret = alloc_new_reservation(my_rsv, goal, sb,
+ group, bitmap_bh);
+ spin_unlock(rsv_lock);
+ if (ret < 0)
+ break; /* failed */
+
+ if (!goal_in_my_reservation(my_rsv, goal, group, sb))
+ goal = -1;
+ }
+ ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
+ my_rsv);
+ if (ret >= 0)
+ break; /* succeed */
+ }
+out:
+ if (ret >= 0) {
+ BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
+ "bitmap block");
+ fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
+ if (fatal) {
+ *errp = fatal;
+ return -1;
+ }
+ return ret;
 }
- return goal;
-fail_access:
 BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
 ext3_journal_release_buffer(handle, bitmap_bh, credits);
-fail:
- return -1;
+ return ret;
 }

 /*
@@ -473,16 +960,16 @@ fail:
 * bitmap, and then for any free bit if that fails.
 * This function also updates quota and i_blocks field.
 */
-int
-ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
- u32 *prealloc_count, u32 *prealloc_block, int *errp)
-{
- struct buffer_head *bitmap_bh = NULL; /* bh */
- struct buffer_head *gdp_bh; /* bh2 */
- int group_no; /* i */
- int ret_block; /* j */
- int bgi; /* blockgroup iteration index */
- int target_block; /* tmp */
+int ext3_new_block(handle_t *handle, struct inode *inode,
+ unsigned long goal, int *errp)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *gdp_bh;
+ int group_no;
+ int goal_group;
+ int ret_block;
+ int bgi; /* blockgroup iteration index */
+ int target_block;
 int fatal = 0, err;
 int performed_allocation = 0;
 int free_blocks, root_blocks;
@@ -490,6 +977,7 @@ ext3_new_block(handle_t *handle, struct
 struct ext3_group_desc *gdp;
 struct ext3_super_block *es;
 struct ext3_sb_info *sbi;
+ struct reserve_window *my_rsv = NULL;
 #ifdef EXT3FS_DEBUG
 static int goal_hits, goal_attempts;
 #endif
@@ -511,7 +999,10 @@ ext3_new_block(handle_t *handle, struct
 sbi = EXT3_SB(sb);
 es = EXT3_SB(sb)->s_es;
 ext3_debug("goal=%lu.\n", goal);
-
+#ifdef EXT3_RESERVATION
+ if (test_opt(sb, RESERVATION) && S_ISREG(inode->i_mode))
+ my_rsv = &EXT3_I(inode)->i_rsv_window;
+#endif
 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
 root_blocks = le32_to_cpu(es->s_r_blocks_count);
 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
@@ -533,6 +1024,8 @@ ext3_new_block(handle_t *handle, struct
 if (!gdp)
 goto io_error;

+ goal_group = group_no;
+retry:
 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
 if (free_blocks > 0) {
 ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
@@ -540,8 +1033,8 @@ ext3_new_block(handle_t *handle, struct
 bitmap_bh = read_block_bitmap(sb, group_no);
 if (!bitmap_bh)
 goto io_error;
- ret_block = ext3_try_to_allocate(sb, handle, group_no,
- bitmap_bh, ret_block, &fatal);
+ ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
+ bitmap_bh, ret_block, my_rsv, &fatal);
 if (fatal)
 goto out;
 if (ret_block >= 0)
@@ -569,14 +1062,33 @@ ext3_new_block(handle_t *handle, struct
 bitmap_bh = read_block_bitmap(sb, group_no);
 if (!bitmap_bh)
 goto io_error;
- ret_block = ext3_try_to_allocate(sb, handle, group_no,
- bitmap_bh, -1, &fatal);
+ ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
+ bitmap_bh, -1, my_rsv, &fatal);
 if (fatal)
 goto out;
 if (ret_block >= 0)
 goto allocated;
 }
-
+#ifdef EXT3_RESERVATION
+ /*
+ * We may end up with a bogus earlier ENOSPC error because the
+ * filesystem is "full" of reservations, but
+ * there may indeed be free blocks available on disk.
+ * In this case, we just forget about the reservations and
+ * do the block allocation as if there were no reservations.
+ */
+ if (my_rsv) {
+#ifdef EXT3_RESERVATION_DEBUG
+ printk("filesystem is fully reserved. Actual free blocks: %d. "
+ "Try to do allocation without reservation, goal_group "
+ "is %d\n",
+ free_blocks, goal_group);
+#endif
+ my_rsv = NULL;
+ group_no = goal_group;
+ goto retry;
+ }
+#endif
 /* No space left on the device */
 *errp = -ENOSPC;
 goto out;
--- diff/fs/ext3/file.c 2003-10-09 09:47:34.000000000 +0100
+++ source/fs/ext3/file.c 2004-04-21 10:46:51.694723320 +0100
@@ -34,7 +34,7 @@ static int ext3_release_file (struct ino
 de * inode, struct file * filp)
 {
 if (filp->f_mode & FMODE_WRITE)
- ext3_discard_prealloc (inode);
+ ext3_discard_reservation(inode);
 if (is_dx(inode) && filp->private_data)
 ext3_htree_free_dir_info(filp->private_data);
--- diff/fs/ext3/ialloc.c 2004-04-05 12:57:08.000000000 +0100
+++ source/fs/ext3/ialloc.c 2004-04-21 10:46:51.695723168 +0100
@@ -581,10 +581,11 @@ got:
 ei->i_file_acl = 0;
 ei->i_dir_acl = 0;
 ei->i_dtime = 0;
-#ifdef EXT3_PREALLOCATE
- ei->i_prealloc_block = 0;
- ei->i_prealloc_count = 0;
-#endif
+ ei->i_rsv_window.rsv_start = 0;
+ ei->i_rsv_window.rsv_end = 0;
+ atomic_set(&ei->i_rsv_window.rsv_goal_size, EXT3_DEFAULT_RESERVE_BLOCKS);
+ ei->i_rsv_window.rsv_alloc_hit = 0;
+ INIT_LIST_HEAD(&ei->i_rsv_window.rsv_list);
 ei->i_block_group = group;
 ext3_set_inode_flags(inode);
--- diff/fs/ext3/inode.c 2004-04-21 10:45:35.189353904 +0100
+++ source/fs/ext3/inode.c 2004-04-21 10:46:51.697722864 +0100
@@ -186,7 +186,7 @@ static int ext3_journal_test_restart(han
 void ext3_put_inode(struct inode *inode)
 {
 if (!is_bad_inode(inode))
- ext3_discard_prealloc(inode);
+ ext3_discard_reservation(inode);
 }

 /*
@@ -244,62 +244,12 @@ no_delete:
 clear_inode(inode); /* We must guarantee clearing of inode... */
 }

-void ext3_discard_prealloc (struct inode * inode)
-{
-#ifdef EXT3_PREALLOCATE
- struct ext3_inode_info *ei = EXT3_I(inode);
- /* Writer: ->i_prealloc* */
- if (ei->i_prealloc_count) {
- unsigned short total = ei->i_prealloc_count;
- unsigned long block = ei->i_prealloc_block;
- ei->i_prealloc_count = 0;
- ei->i_prealloc_block = 0;
- /* Writer: end */
- ext3_free_blocks (inode, block, total);
- }
-#endif
-}
-
 static int ext3_alloc_block (handle_t *handle,
 struct inode * inode, unsigned long goal, int *err)
 {
 unsigned long result;

-#ifdef EXT3_PREALLOCATE
-#ifdef EXT3FS_DEBUG
- static unsigned long alloc_hits, alloc_attempts;
-#endif
- struct ext3_inode_info *ei = EXT3_I(inode);
- /* Writer: ->i_prealloc* */
- if (ei->i_prealloc_count &&
- (goal == ei->i_prealloc_block ||
- goal + 1 == ei->i_prealloc_block))
- {
- result = ei->i_prealloc_block++;
- ei->i_prealloc_count--;
- /* Writer: end */
- ext3_debug ("preallocation hit (%lu/%lu).\n",
- ++alloc_hits, ++alloc_attempts);
- } else {
- ext3_discard_prealloc (inode);
- ext3_debug ("preallocation miss (%lu/%lu).\n",
- alloc_hits, ++alloc_attempts);
- if (S_ISREG(inode->i_mode))
- result = ext3_new_block (inode, goal,
- &ei->i_prealloc_count,
- &ei->i_prealloc_block, err);
- else
- result = ext3_new_block (inode, goal, 0, 0, err);
- /*
- * AKPM: this is somewhat sticky. I'm not surprised it was
- * disabled in 2.2's ext3. Need to integrate b_committed_data
- * guarding with preallocation, if indeed preallocation is
- * effective.
- */ - } -#else - result = ext3_new_block (handle, inode, goal, 0, 0, err); -#endif + result = ext3_new_block (handle, inode, goal, err); return result; } @@ -966,38 +916,6 @@ struct buffer_head *ext3_bread(handle_t bh = ext3_getblk (handle, inode, block, create, err); if (!bh) return bh; -#ifdef EXT3_PREALLOCATE - /* - * If the inode has grown, and this is a directory, then use a few - * more of the preallocated blocks to keep directory fragmentation - * down. The preallocated blocks are guaranteed to be contiguous. - */ - if (create && - S_ISDIR(inode->i_mode) && - inode->i_blocks > prev_blocks && - EXT3_HAS_COMPAT_FEATURE(inode->i_sb, - EXT3_FEATURE_COMPAT_DIR_PREALLOC)) { - int i; - struct buffer_head *tmp_bh; - - for (i = 1; - EXT3_I(inode)->i_prealloc_count && - i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks; - i++) { - /* - * ext3_getblk will zero out the contents of the - * directory for us - */ - tmp_bh = ext3_getblk(handle, inode, - block+i, create, err); - if (!tmp_bh) { - brelse (bh); - return 0; - } - brelse (tmp_bh); - } - } -#endif if (buffer_uptodate(bh)) return bh; ll_rw_block (READ, 1, &bh); @@ -1393,7 +1311,7 @@ static int ext3_ordered_writepage(struct return ret; out_fail: - __set_page_dirty_nobuffers(page); + redirty_page_for_writepage(wbc, page); unlock_page(page); return ret; } @@ -1422,7 +1340,7 @@ static int ext3_writeback_writepage(stru return ret; out_fail: - __set_page_dirty_nobuffers(page); + redirty_page_for_writepage(wbc, page); unlock_page(page); return ret; } @@ -1478,7 +1396,7 @@ out: return ret; no_write: - __set_page_dirty_nobuffers(page); + redirty_page_for_writepage(wbc, page); out_unlock: unlock_page(page); goto out; @@ -2136,7 +2054,7 @@ void ext3_truncate(struct inode * inode) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - ext3_discard_prealloc(inode); + ext3_discard_reservation(inode); /* * We have to lock the EOF page here, because lock_page() nests @@ -2529,11 +2447,11 @@ void ext3_read_inode(struct inode * inod } ei->i_disksize = inode->i_size; inode->i_generation = le32_to_cpu(raw_inode->i_generation); -#ifdef EXT3_PREALLOCATE - ei->i_prealloc_count = 0; -#endif ei->i_block_group = iloc.block_group; - + ei->i_rsv_window.rsv_start = 0; + ei->i_rsv_window.rsv_end= 0; + atomic_set(&ei->i_rsv_window.rsv_goal_size, EXT3_DEFAULT_RESERVE_BLOCKS); + INIT_LIST_HEAD(&ei->i_rsv_window.rsv_list); /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! 
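The fs/ext3/ioctl.c hunk just below makes the per-inode reservation window
size tunable from userspace. As a rough usage sketch (assuming the
EXT3_IOC_GETRSVSZ/EXT3_IOC_SETRSVSZ ioctl numbers from the patched
linux/ext3_fs.h and the int-sized argument the handler uses; illustrative
only, not part of the patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>

	/* EXT3_IOC_GETRSVSZ / EXT3_IOC_SETRSVSZ are assumed to come from
	 * the patched <linux/ext3_fs.h>. */

	int main(void)
	{
		int fd = open("datafile", O_RDWR); /* hypothetical file on ext3 */
		int rsv;

		if (fd < 0 || ioctl(fd, EXT3_IOC_GETRSVSZ, &rsv) < 0)
			return 1;
		printf("current reservation window goal: %d blocks\n", rsv);

		rsv = 64; /* the kernel clamps this to EXT3_MAX_RESERVE_BLOCKS */
		return ioctl(fd, EXT3_IOC_SETRSVSZ, &rsv) ? 1 : 0;
	}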
--- diff/fs/ext3/ioctl.c 2003-10-09 09:47:17.000000000 +0100 +++ source/fs/ext3/ioctl.c 2004-04-21 10:46:51.697722864 +0100 @@ -20,6 +20,7 @@ int ext3_ioctl (struct inode * inode, st { struct ext3_inode_info *ei = EXT3_I(inode); unsigned int flags; + unsigned short rsv_window_size; ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); @@ -151,6 +152,25 @@ flags_err: return ret; } #endif +#ifdef EXT3_RESERVATION + case EXT3_IOC_GETRSVSZ: + rsv_window_size = atomic_read(&ei->i_rsv_window.rsv_goal_size); + return put_user(rsv_window_size, (int *)arg); + case EXT3_IOC_SETRSVSZ: + if (IS_RDONLY(inode)) + return -EROFS; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EACCES; + + if (get_user(rsv_window_size, (int *)arg)) + return -EFAULT; + + if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS) + rsv_window_size = EXT3_MAX_RESERVE_BLOCKS; + atomic_set(&ei->i_rsv_window.rsv_goal_size, rsv_window_size); + return 0; +#endif default: return -ENOTTY; } --- diff/fs/ext3/super.c 2004-04-21 10:45:35.192353448 +0100 +++ source/fs/ext3/super.c 2004-04-21 10:46:51.698722712 +0100 @@ -572,7 +572,8 @@ enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, - Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_noload, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, @@ -604,6 +605,8 @@ static match_table_t tokens = { {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, {Opt_noload, "noload"}, {Opt_commit, "commit=%u"}, {Opt_journal_update, "journal=update"}, @@ -757,6 +760,19 @@ static int parse_options (char * options printk("EXT3 (no)acl options not supported\n"); break; #endif +#ifdef EXT3_RESERVATION + case Opt_reservation: + set_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_noreservation: + clear_opt(sbi->s_mount_opt, RESERVATION); + break; +#else + case Opt_reservation: + case Opt_noreservation: + printk("EXT3 block reservation options not supported\n"); + break; +#endif case Opt_journal_update: /* @@@ FIXME */ /* Eventually we will want to be able to create @@ -1277,6 +1293,8 @@ static int ext3_fill_super (struct super sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + set_opt(sbi->s_mount_opt, RESERVATION); + if (!parse_options ((char *) data, sb, &journal_inum, 0)) goto failed_mount; @@ -1451,6 +1469,14 @@ static int ext3_fill_super (struct super sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); + /* per fileystem reservation list head & lock */ + spin_lock_init(&sbi->s_rsv_window_lock); + INIT_LIST_HEAD(&sbi->s_rsv_window_head.rsv_list); + sbi->s_rsv_window_head.rsv_start = 0; + sbi->s_rsv_window_head.rsv_end = 0; + sbi->s_rsv_window_head.rsv_alloc_hit = 0; + atomic_set(&sbi->s_rsv_window_head.rsv_goal_size, 0); + /* * set up enough so that it can read an inode */ --- diff/fs/ext3/xattr.c 2004-02-18 08:54:12.000000000 +0000 +++ source/fs/ext3/xattr.c 2004-04-21 10:46:51.699722560 +0100 @@ -787,7 +787,7 @@ ext3_xattr_set_handle2(handle_t *handle, EXT3_I(inode)->i_block_group 
* EXT3_BLOCKS_PER_GROUP(sb); int block = ext3_new_block(handle, - inode, goal, 0, 0, &error); + inode, goal, &error); if (error) goto cleanup; ea_idebug(inode, "creating block %d", block); --- diff/fs/fcntl.c 2004-04-21 10:45:35.193353296 +0100 +++ source/fs/fcntl.c 2004-04-21 10:46:51.700722408 +0100 @@ -627,15 +627,12 @@ void kill_fasync(struct fasync_struct ** read_unlock(&fasync_lock); } } - EXPORT_SYMBOL(kill_fasync); static int __init fasync_init(void) { fasync_cache = kmem_cache_create("fasync_cache", - sizeof(struct fasync_struct), 0, 0, NULL, NULL); - if (!fasync_cache) - panic("cannot create fasync slab cache"); + sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL, NULL); return 0; } --- diff/fs/hugetlbfs/inode.c 2004-04-21 10:45:35.265342352 +0100 +++ source/fs/hugetlbfs/inode.c 2004-04-21 10:46:51.700722408 +0100 @@ -194,6 +194,7 @@ static void hugetlbfs_delete_inode(struc hlist_del_init(&inode->i_hash); list_del_init(&inode->i_list); + list_del_init(&inode->i_sb_list); inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); @@ -236,6 +237,7 @@ static void hugetlbfs_forget_inode(struc hlist_del_init(&inode->i_hash); out_truncate: list_del_init(&inode->i_list); + list_del_init(&inode->i_sb_list); inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); @@ -319,12 +321,12 @@ static int hugetlb_vmtruncate(struct ino pgoff = offset >> HPAGE_SHIFT; inode->i_size = offset; - down(&mapping->i_shared_sem); + spin_lock(&mapping->i_shared_lock); if (!list_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); if (!list_empty(&mapping->i_mmap_shared)) hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff); - up(&mapping->i_shared_sem); + spin_unlock(&mapping->i_shared_lock); truncate_hugepages(mapping, offset); return 0; } @@ -375,6 +377,7 @@ static struct inode *hugetlbfs_get_inode inode = new_inode(sb); if (inode) { + struct hugetlbfs_inode_info *info; inode->i_mode = mode; inode->i_uid = uid; inode->i_gid = gid; @@ -383,6 +386,8 @@ static struct inode *hugetlbfs_get_inode inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + info = HUGETLBFS_I(inode); + mpol_shared_policy_init(&info->policy); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -510,6 +515,33 @@ static void hugetlbfs_put_super(struct s } } +static kmem_cache_t *hugetlbfs_inode_cachep; + +static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) +{ + struct hugetlbfs_inode_info *p; + + p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL); + if (!p) + return NULL; + return &p->vfs_inode; +} + +static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) +{ + struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(&ei->vfs_inode); +} + +static void hugetlbfs_destroy_inode(struct inode *inode) +{ + mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); + kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); +} + static struct address_space_operations hugetlbfs_aops = { .readpage = hugetlbfs_readpage, .prepare_write = hugetlbfs_prepare_write, @@ -541,6 +573,8 @@ static struct inode_operations hugetlbfs }; static struct super_operations hugetlbfs_ops = { + .alloc_inode = hugetlbfs_alloc_inode, + .destroy_inode = hugetlbfs_destroy_inode, .statfs = hugetlbfs_statfs, 
.drop_inode = hugetlbfs_drop_inode, .put_super = hugetlbfs_put_super, @@ -755,9 +789,16 @@ static int __init init_hugetlbfs_fs(void int error; struct vfsmount *vfsmount; + hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", + sizeof(struct hugetlbfs_inode_info), + 0, SLAB_RECLAIM_ACCOUNT, + init_once, NULL); + if (hugetlbfs_inode_cachep == NULL) + return -ENOMEM; + error = register_filesystem(&hugetlbfs_fs_type); if (error) - return error; + goto out; vfsmount = kern_mount(&hugetlbfs_fs_type); @@ -767,11 +808,16 @@ static int __init init_hugetlbfs_fs(void } error = PTR_ERR(vfsmount); + + out: + if (error) + kmem_cache_destroy(hugetlbfs_inode_cachep); return error; } static void __exit exit_hugetlbfs_fs(void) { + kmem_cache_destroy(hugetlbfs_inode_cachep); unregister_filesystem(&hugetlbfs_fs_type); } --- diff/fs/inode.c 2004-04-21 10:45:35.267342048 +0100 +++ source/fs/inode.c 2004-04-21 10:46:51.701722256 +0100 @@ -184,7 +184,7 @@ void inode_init_once(struct inode *inode init_rwsem(&inode->i_alloc_sem); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); spin_lock_init(&inode->i_data.tree_lock); - init_MUTEX(&inode->i_data.i_shared_sem); + spin_lock_init(&inode->i_data.i_shared_lock); atomic_set(&inode->i_data.truncate_count, 0); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); @@ -282,7 +282,7 @@ static void dispose_list(struct list_hea /* * Invalidate all inodes for a device. */ -static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) +static int invalidate_list(struct list_head *head, struct list_head *dispose) { struct list_head *next; int busy = 0, count = 0; @@ -295,12 +295,11 @@ static int invalidate_list(struct list_h next = next->next; if (tmp == head) break; - inode = list_entry(tmp, struct inode, i_list); - if (inode->i_sb != sb) - continue; + inode = list_entry(tmp, struct inode, i_sb_list); invalidate_inode_buffers(inode); if (!atomic_read(&inode->i_count)) { hlist_del_init(&inode->i_hash); + list_del(&inode->i_sb_list); list_move(&inode->i_list, dispose); inode->i_state |= I_FREEING; count++; @@ -336,10 +335,7 @@ int invalidate_inodes(struct super_block down(&iprune_sem); spin_lock(&inode_lock); - busy = invalidate_list(&inode_in_use, sb, &throw_away); - busy |= invalidate_list(&inode_unused, sb, &throw_away); - busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); - busy |= invalidate_list(&sb->s_io, sb, &throw_away); + busy = invalidate_list(&sb->s_inodes, &throw_away); spin_unlock(&inode_lock); dispose_list(&throw_away); @@ -439,6 +435,7 @@ static void prune_icache(int nr_to_scan) continue; } hlist_del_init(&inode->i_hash); + list_del_init(&inode->i_sb_list); list_move(&inode->i_list, &freeable); inode->i_state |= I_FREEING; nr_pruned++; @@ -549,6 +546,7 @@ struct inode *new_inode(struct super_blo spin_lock(&inode_lock); inodes_stat.nr_inodes++; list_add(&inode->i_list, &inode_in_use); + list_add(&inode->i_sb_list, &sb->s_inodes); inode->i_ino = ++last_ino; inode->i_state = 0; spin_unlock(&inode_lock); @@ -597,6 +595,7 @@ static struct inode * get_new_inode(stru inodes_stat.nr_inodes++; list_add(&inode->i_list, &inode_in_use); + list_add(&inode->i_sb_list, &sb->s_inodes); hlist_add_head(&inode->i_hash, head); inode->i_state = I_LOCK|I_NEW; spin_unlock(&inode_lock); @@ -645,6 +644,7 @@ static struct inode * get_new_inode_fast inode->i_ino = ino; inodes_stat.nr_inodes++; list_add(&inode->i_list, &inode_in_use); + list_add(&inode->i_sb_list, &sb->s_inodes); 
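+		/*
+		 * Editor's gloss, added for this write-up: a live inode is
+		 * now on two lists -- i_list keeps it on the global
+		 * in-use/unused lists as before, while the new i_sb_list
+		 * links it on its superblock's s_inodes list, which the
+		 * invalidate and quota walks below iterate.
+		 */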
hlist_add_head(&inode->i_hash, head); inode->i_state = I_LOCK|I_NEW; spin_unlock(&inode_lock); @@ -980,6 +980,7 @@ void generic_delete_inode(struct inode * struct super_operations *op = inode->i_sb->s_op; list_del_init(&inode->i_list); + list_del_init(&inode->i_sb_list); inode->i_state|=I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); @@ -1025,6 +1026,7 @@ static void generic_forget_inode(struct hlist_del_init(&inode->i_hash); } list_del_init(&inode->i_list); + list_del_init(&inode->i_sb_list); inode->i_state|=I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); @@ -1213,36 +1215,23 @@ EXPORT_SYMBOL(inode_needs_sync); /* Function back in dquot.c */ int remove_inode_dquot_ref(struct inode *, int, struct list_head *); -void remove_dquot_ref(struct super_block *sb, int type, struct list_head *tofree_head) +void remove_dquot_ref(struct super_block *sb, int type, + struct list_head *tofree_head) { struct inode *inode; - struct list_head *act_head; if (!sb->dq_op) return; /* nothing to do */ spin_lock(&inode_lock); /* This lock is for inodes code */ - /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... */ - - list_for_each(act_head, &inode_in_use) { - inode = list_entry(act_head, struct inode, i_list); - if (inode->i_sb == sb && IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, tofree_head); - } - list_for_each(act_head, &inode_unused) { - inode = list_entry(act_head, struct inode, i_list); - if (inode->i_sb == sb && IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, tofree_head); - } - list_for_each(act_head, &sb->s_dirty) { - inode = list_entry(act_head, struct inode, i_list); - if (IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, tofree_head); - } - list_for_each(act_head, &sb->s_io) { - inode = list_entry(act_head, struct inode, i_list); + /* + * We don't have to lock against quota code - test IS_QUOTAINIT is + * just for speedup... 
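+	 *
+	 * (Editor's note, illustrative only: this rewrite is the pattern
+	 * the new per-super s_inodes list enables throughout the patch.
+	 * Any whole-filesystem inode walk now reduces to
+	 *
+	 *	spin_lock(&inode_lock);
+	 *	list_for_each_entry(inode, &sb->s_inodes, i_sb_list)
+	 *		do_something(inode);
+	 *	spin_unlock(&inode_lock);
+	 *
+	 * where do_something() stands for a hypothetical per-inode
+	 * callback, instead of filtering the global in-use, unused,
+	 * dirty and io lists by inode->i_sb as the removed loops
+	 * above did.)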
+ */ + + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) if (IS_QUOTAINIT(inode)) remove_inode_dquot_ref(inode, type, tofree_head); - } + spin_unlock(&inode_lock); } @@ -1383,11 +1372,8 @@ void __init inode_init(unsigned long mem /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), - 0, SLAB_HWCACHE_ALIGN, init_once, - NULL); - if (!inode_cachep) - panic("cannot create inode slab cache"); - + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, init_once, + NULL); set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); } @@ -1408,5 +1394,4 @@ void init_special_inode(struct inode *in printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", mode); } - EXPORT_SYMBOL(init_special_inode); --- diff/fs/jbd/journal.c 2004-04-21 10:45:35.272341288 +0100 +++ source/fs/jbd/journal.c 2004-04-21 10:46:51.702722104 +0100 @@ -599,6 +599,7 @@ struct journal_head * journal_get_descri return NULL; bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + memset(bh->b_data, 0, journal->j_blocksize); bh->b_state |= (1 << BH_Dirty); BUFFER_TRACE(bh, "return this buffer"); return journal_add_journal_head(bh); @@ -1673,9 +1674,17 @@ repeat: if (buffer_jbd(bh)) { jh = bh2jh(bh); } else { - J_ASSERT_BH(bh, - (atomic_read(&bh->b_count) > 0) || - (bh->b_page && bh->b_page->mapping)); + if (!(atomic_read(&bh->b_count) > 0 || + (bh->b_page && bh->b_page->mapping))) { + printk(KERN_EMERG "%s: bh->b_count=%d\n", + __FUNCTION__, atomic_read(&bh->b_count)); + printk(KERN_EMERG "%s: bh->b_page=%p\n", + __FUNCTION__, bh->b_page); + if (bh->b_page) + printk(KERN_EMERG "%s: " + "bh->b_page->mapping=%p\n", + __FUNCTION__, bh->b_page->mapping); + } if (!new_jh) { jbd_unlock_bh_journal_head(bh); --- diff/fs/jbd/transaction.c 2004-04-21 10:45:35.734271064 +0100 +++ source/fs/jbd/transaction.c 2004-04-21 10:46:51.703721952 +0100 @@ -931,7 +931,6 @@ out: int journal_dirty_data(handle_t *handle, struct buffer_head *bh) { journal_t *journal = handle->h_transaction->t_journal; - int need_brelse = 0; struct journal_head *jh; if (is_handle_aborted(handle)) @@ -1016,24 +1015,6 @@ int journal_dirty_data(handle_t *handle, goto no_journal; } - /* - * This buffer may be undergoing writeout in commit. We - * can't return from here and let the caller dirty it - * again because that can cause the write-out loop in - * commit to never terminate. 
- */ - if (buffer_dirty(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - need_brelse = 1; - sync_dirty_buffer(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - /* The buffer may become locked again at any - time if it is redirtied */ - } - /* journal_clean_data_list() may have got there first */ if (jh->b_transaction != NULL) { JBUFFER_TRACE(jh, "unfile from commit"); @@ -1063,10 +1044,6 @@ int journal_dirty_data(handle_t *handle, no_journal: spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - if (need_brelse) { - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - } JBUFFER_TRACE(jh, "exit"); journal_put_journal_head(jh); return 0; --- diff/fs/locks.c 2004-04-21 10:45:35.741270000 +0100 +++ source/fs/locks.c 2004-04-21 10:46:51.705721648 +0100 @@ -1994,15 +1994,13 @@ void steal_locks(fl_owner_t from) } unlock_kernel(); } - EXPORT_SYMBOL(steal_locks); static int __init filelock_init(void) { filelock_cache = kmem_cache_create("file_lock_cache", - sizeof(struct file_lock), 0, 0, init_once, NULL); - if (!filelock_cache) - panic("cannot create file lock slab cache"); + sizeof(struct file_lock), 0, SLAB_PANIC, + init_once, NULL); return 0; } --- diff/fs/namespace.c 2004-04-21 10:45:35.744269544 +0100 +++ source/fs/namespace.c 2004-04-21 10:46:51.706721496 +0100 @@ -36,6 +36,9 @@ static inline int sysfs_init(void) /* spinlock for vfsmount related operations, inplace of dcache_lock */ spinlock_t vfsmount_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; + +EXPORT_SYMBOL(vfsmount_lock); + static struct list_head *mount_hashtable; static int hash_mask, hash_bits; static kmem_cache_t *mnt_cache; @@ -1142,9 +1145,7 @@ void __init mnt_init(unsigned long mempa int i; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!mnt_cache) - panic("Cannot create vfsmount cache"); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); order = 0; mount_hashtable = (struct list_head *) --- diff/fs/nfs/direct.c 2004-04-21 10:45:35.745269392 +0100 +++ source/fs/nfs/direct.c 2004-04-21 10:46:51.707721344 +0100 @@ -32,6 +32,7 @@ * 18 Dec 2001 Initial implementation for 2.4 --cel * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy * 08 Jun 2003 Port to 2.5 APIs --cel + * 31 Mar 2004 Handle direct I/O without VFS support --cel * */ @@ -409,12 +410,6 @@ nfs_direct_write(struct inode *inode, st * file_offset: offset in file to begin the operation * nr_segs: size of iovec array * - * Usually a file system implements direct I/O by calling out to - * blockdev_direct_IO. The NFS client doesn't have a backing block - * device, so we do everything by hand instead. - * - * The inode's i_sem is no longer held by the VFS layer before it calls - * this function to do a write. 
 */ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, @@ -431,10 +426,6 @@ nfs_direct_IO(int rw, struct kiocb *iocb if (!is_sync_kiocb(iocb)) goto out; - result = nfs_revalidate_inode(NFS_SERVER(inode), inode); - if (result < 0) - goto out; - switch (rw) { case READ: dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n", @@ -455,6 +446,163 @@ nfs_direct_IO(int rw, struct kiocb *iocb } out: - dprintk("NFS: direct_IO result=%d\n", result); + dprintk("NFS: direct_IO result=%zd\n", result); return result; } + +/** + * nfs_file_direct_read - file direct read operation for NFS files + * @iocb: target I/O control block + * @buf: user's buffer into which to read data + * @count: number of bytes to read + * @pos: byte offset in file where reading starts + * + * We use this function for direct reads instead of calling + * generic_file_aio_read() in order to avoid gfar's check to see if + * the request starts before the end of the file. For that check + * to work, we must generate a GETATTR before each direct read, and + * even then there is a window between the GETATTR and the subsequent + * READ where the file size could change. So our preference is simply + * to do all reads the application wants, and the server will take + * care of managing the end of file boundary. + * + * This function also eliminates unnecessarily updating the file's + * atime locally, as the NFS server sets the file's atime, and this + * client must read the updated atime from the server back into its + * cache. + */ +ssize_t +nfs_file_direct_read(struct kiocb *iocb, char *buf, size_t count, loff_t pos) +{ + ssize_t retval = -EINVAL; + loff_t *ppos = &iocb->ki_pos; + struct file *file = iocb->ki_filp; + struct dentry *dentry = file->f_dentry; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct iovec iov = { .iov_base = buf, .iov_len = count }; + + dprintk("nfs: direct read(%s/%s, %lu@%lu)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) count, (unsigned long) pos); + + if (!is_sync_kiocb(iocb)) + goto out; + if (count < 0) + goto out; + retval = -EFAULT; + if (!access_ok(VERIFY_WRITE, iov.iov_base, iov.iov_len)) + goto out; + retval = 0; + if (!count) + goto out; + + /* XXX: why do this for reads? */ + if (mapping->nrpages) { + retval = filemap_fdatawrite(mapping); + if (retval == 0) + retval = filemap_fdatawait(mapping); + if (retval) + goto out; + } + + retval = nfs_direct_read(inode, file, &iov, pos, 1); + if (retval > 0) + *ppos = pos + retval; + +out: + return retval; +} + +/** + * nfs_file_direct_write - file direct write operation for NFS files + * @iocb: target I/O control block + * @buf: user's buffer from which to write data + * @count: number of bytes to write + * @pos: byte offset in file where writing starts + * + * We use this function for direct writes instead of calling + * generic_file_aio_write() in order to avoid taking the inode + * semaphore and updating the i_size. The NFS server will set + * the new i_size and this client must read the updated size + * back into its cache. We let the server do generic write + * parameter checking and report problems. + * + * We also avoid an unnecessary invocation of generic_osync_inode(), + * as it is fairly meaningless to sync the metadata of an NFS file. + * + * And we eliminate local atime updates, see direct read above. + * + * Note that O_APPEND is not supported for NFS direct writes, as there + * is no atomic O_APPEND write facility in the NFS protocol.
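+ *
+ * (Editor's sketch, not part of the original patch: applications
+ * reach this path simply by opening the file with O_DIRECT.  A
+ * minimal, hypothetical userspace caller -- the path and 4096-byte
+ * alignment are illustrative only:
+ *
+ *	int fd = open("/mnt/nfs/data", O_RDWR | O_DIRECT);
+ *	char *buf;
+ *	posix_memalign((void **)&buf, 4096, 4096);
+ *	memset(buf, 0, 4096);
+ *	pwrite(fd, buf, 4096, 0);	-- dispatched via nfs_file_write()
+ *
+ * Unlike block-device O_DIRECT, this client does not strictly require
+ * the alignment, but an aligned buffer keeps the example portable.)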
+ */ +ssize_t +nfs_file_direct_write(struct kiocb *iocb, const char *buf, size_t count, loff_t pos) +{ + ssize_t retval = -EINVAL; + loff_t *ppos = &iocb->ki_pos; + struct file *file = iocb->ki_filp; + struct dentry *dentry = file->f_dentry; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + + dfprintk(VFS, "nfs: direct write(%s/%s(%ld), %lu@%lu)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + inode->i_ino, (unsigned long) count, (unsigned long) pos); + + if (!is_sync_kiocb(iocb)) + goto out; + if (count < 0) + goto out; + if (pos < 0) + goto out; + retval = -EFAULT; + if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len)) + goto out; + if (file->f_error) { + retval = file->f_error; + file->f_error = 0; + goto out; + } + retval = -EFBIG; + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (count > limit - (unsigned long) pos) + count = limit - (unsigned long) pos; + } + retval = 0; + if (!count) + goto out; + + /* + * The server's file system may not be POSIX-compliant, so we want + * to ensure that the suid bit is gone. + */ + down(&inode->i_sem); + retval = remove_suid(dentry); + up(&inode->i_sem); + if (retval) + goto out; + + if (mapping->nrpages) { + retval = filemap_fdatawrite(mapping); + if (retval == 0) + retval = filemap_fdatawait(mapping); + if (retval) + goto out; + } + + retval = nfs_direct_write(inode, file, &iov, pos, 1); + if (mapping->nrpages) + invalidate_inode_pages2(mapping); + if (retval > 0) + *ppos = pos + retval; + +out: + return retval; +} --- diff/fs/nfs/file.c 2004-04-21 10:45:35.745269392 +0100 +++ source/fs/nfs/file.c 2004-04-21 10:46:51.707721344 +0100 @@ -154,6 +154,11 @@ nfs_file_read(struct kiocb *iocb, char * struct inode * inode = dentry->d_inode; ssize_t result; +#ifdef CONFIG_NFS_DIRECTIO + if (iocb->ki_filp->f_flags & O_DIRECT) + return nfs_file_direct_read(iocb, buf, count, pos); +#endif + dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned long) pos); @@ -268,6 +273,11 @@ nfs_file_write(struct kiocb *iocb, const struct inode * inode = dentry->d_inode; ssize_t result; +#ifdef CONFIG_NFS_DIRECTIO + if (iocb->ki_filp->f_flags & O_DIRECT) + return nfs_file_direct_write(iocb, buf, count, pos); +#endif + dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, inode->i_ino, (unsigned long) count, (unsigned long) pos); @@ -275,6 +285,7 @@ nfs_file_write(struct kiocb *iocb, const result = -EBUSY; if (IS_SWAPFILE(inode)) goto out_swapfile; + result = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (result) goto out; --- diff/fs/nfs/nfsroot.c 2004-04-21 10:45:35.758267416 +0100 +++ source/fs/nfs/nfsroot.c 2004-04-21 10:46:51.708721192 +0100 @@ -117,11 +117,16 @@ static int mount_port __initdata = 0; / ***************************************************************************/ enum { + /* Options that take integer arguments */ Opt_port, Opt_rsize, Opt_wsize, Opt_timeo, Opt_retrans, Opt_acregmin, - Opt_acregmax, Opt_acdirmin, Opt_acdirmax, Opt_soft, Opt_hard, Opt_intr, + Opt_acregmax, Opt_acdirmin, Opt_acdirmax, + /* Options that take no arguments */ + Opt_soft, Opt_hard, Opt_intr, Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, 
Opt_udp, Opt_tcp, - Opt_broken_suid, Opt_err, + Opt_broken_suid, + /* Error token */ + Opt_err }; static match_table_t __initdata tokens = { @@ -146,9 +151,13 @@ static match_table_t __initdata tokens = {Opt_noac, "noac"}, {Opt_lock, "lock"}, {Opt_nolock, "nolock"}, + {Opt_v2, "nfsvers=2"}, {Opt_v2, "v2"}, + {Opt_v3, "nfsvers=3"}, {Opt_v3, "v3"}, + {Opt_udp, "proto=udp"}, {Opt_udp, "udp"}, + {Opt_tcp, "proto=tcp"}, {Opt_tcp, "tcp"}, {Opt_broken_suid, "broken_suid"}, {Opt_err, NULL} @@ -169,18 +178,19 @@ static int __init root_nfs_parse(char *n if (!name) return 1; - if (name[0] && strcmp(name, "default")){ - strlcpy(buf, name, NFS_MAXPATHLEN); - return 1; - } + /* Set the NFS remote path */ + p = strsep(&name, ","); + if (p[0] != '\0' && strcmp(p, "default") != 0) + strlcpy(buf, p, NFS_MAXPATHLEN); + while ((p = strsep (&name, ",")) != NULL) { int token; if (!*p) continue; token = match_token(p, tokens, args); - /* %u tokens only */ - if (match_int(&args[0], &option)) + /* %u tokens only. Beware if you add new tokens! */ + if (token < Opt_soft && match_int(&args[0], &option)) return 0; switch (token) { case Opt_port: @@ -265,6 +275,7 @@ static int __init root_nfs_parse(char *n return 0; } } + return 1; } @@ -283,9 +294,6 @@ static int __init root_nfs_name(char *na nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ nfs_data.rsize = NFS_DEF_FILE_IO_BUFFER_SIZE; nfs_data.wsize = NFS_DEF_FILE_IO_BUFFER_SIZE; - nfs_data.bsize = 0; - nfs_data.timeo = 7; - nfs_data.retrans = 3; nfs_data.acregmin = 3; nfs_data.acregmax = 60; nfs_data.acdirmin = 30; --- diff/fs/nfs/read.c 2004-04-21 10:45:35.760267112 +0100 +++ source/fs/nfs/read.c 2004-04-21 10:46:51.708721192 +0100 @@ -43,7 +43,7 @@ static mempool_t *nfs_rdata_mempool; #define MIN_POOL_READ (32) -static __inline__ struct nfs_read_data *nfs_readdata_alloc(void) +static struct nfs_read_data *nfs_readdata_alloc(void) { struct nfs_read_data *p; p = (struct nfs_read_data *)mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); @@ -99,21 +99,19 @@ nfs_readpage_sync(struct file *file, str unsigned int rsize = NFS_SERVER(inode)->rsize; unsigned int count = PAGE_CACHE_SIZE; int result; - struct nfs_read_data rdata = { - .flags = (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0), - .cred = NULL, - .inode = inode, - .args = { - .fh = NFS_FH(inode), - .lockowner = current->files, - .pages = &page, - .pgbase = 0UL, - .count = rsize, - }, - .res = { - .fattr = &rdata.fattr, - } - }; + struct nfs_read_data *rdata; + + rdata = nfs_readdata_alloc(); + if (!rdata) + return -ENOMEM; + + rdata->flags = (IS_SWAPFILE(inode) ? 
NFS_RPC_SWAPFLAGS : 0); + rdata->inode = inode; + rdata->args.fh = NFS_FH(inode); + rdata->args.pages = &page; + rdata->args.count = rsize; + rdata->args.lockowner = current->files; + rdata->res.fattr = &rdata->fattr; dprintk("NFS: nfs_readpage_sync(%p)\n", page); @@ -123,19 +121,19 @@ nfs_readpage_sync(struct file *file, str */ do { if (count < rsize) - rdata.args.count = count; - rdata.res.count = rdata.args.count; - rdata.args.offset = page_offset(page) + rdata.args.pgbase; + rdata->args.count = count; + rdata->res.count = rdata->args.count; + rdata->args.offset = page_offset(page) + rdata->args.pgbase; dprintk("NFS: nfs_proc_read(%s, (%s/%Ld), %Lu, %u)\n", NFS_SERVER(inode)->hostname, inode->i_sb->s_id, (long long)NFS_FILEID(inode), - (unsigned long long)rdata.args.pgbase, - rdata.args.count); + (unsigned long long)rdata->args.pgbase, + rdata->args.count); lock_kernel(); - result = NFS_PROTO(inode)->read(&rdata, file); + result = NFS_PROTO(inode)->read(rdata, file); unlock_kernel(); /* @@ -148,17 +146,17 @@ nfs_readpage_sync(struct file *file, str goto io_error; } count -= result; - rdata.args.pgbase += result; + rdata->args.pgbase += result; /* Note: result == 0 should only happen if we're caching * a write that extends the file and punches a hole. */ - if (rdata.res.eof != 0 || result == 0) + if (rdata->res.eof != 0 || result == 0) break; } while (count); NFS_FLAGS(inode) |= NFS_INO_INVALID_ATIME; if (count) - memclear_highpage_flush(page, rdata.args.pgbase, count); + memclear_highpage_flush(page, rdata->args.pgbase, count); SetPageUptodate(page); if (PageError(page)) ClearPageError(page); @@ -166,6 +164,7 @@ nfs_readpage_sync(struct file *file, str io_error: unlock_page(page); + nfs_readdata_free(rdata); return result; } --- diff/fs/ntfs/aops.c 2004-04-21 10:45:35.767266048 +0100 +++ source/fs/ntfs/aops.c 2004-04-21 10:46:51.709721040 +0100 @@ -458,7 +458,7 @@ err_out: * * Based on ntfs_read_block() and __block_write_full_page(). */ -static int ntfs_write_block(struct page *page) +static int ntfs_write_block(struct writeback_control *wbc, struct page *page) { VCN vcn; LCN lcn; @@ -499,10 +499,7 @@ static int ntfs_write_block(struct page * Put the page back on mapping->dirty_pages, but leave its * buffer's dirty state as-is. */ - // FIXME: Once Andrew's -EAGAIN patch goes in, remove the - // __set_page_dirty_nobuffers(page) and return -EAGAIN instead - // of zero. - __set_page_dirty_nobuffers(page); + redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; } @@ -733,10 +730,7 @@ lock_retry_remap: * Put the page back on mapping->dirty_pages, but * leave its buffer's dirty state as-is. */ - // FIXME: Once Andrew's -EAGAIN patch goes in, remove - // the __set_page_dirty_nobuffers(page) and set err to - // -EAGAIN instead of zero. - __set_page_dirty_nobuffers(page); + redirty_page_for_writepage(wbc, page); err = 0; } else SetPageError(page); @@ -869,7 +863,7 @@ static int ntfs_writepage(struct page *p } /* Normal data stream. */ - return ntfs_write_block(page); + return ntfs_write_block(wbc, page); } /* @@ -986,10 +980,7 @@ err_out: * Put the page back on mapping->dirty_pages, but leave its * buffer's dirty state as-is. */ - // FIXME: Once Andrew's -EAGAIN patch goes in, remove the - // __set_page_dirty_nobuffers(page) and set err to -EAGAIN - // instead of zero. 
- __set_page_dirty_nobuffers(page); + redirty_page_for_writepage(wbc, page); err = 0; } else { ntfs_error(vi->i_sb, "Resident attribute write failed with " --- diff/fs/proc/kcore.c 2003-09-30 15:46:19.000000000 +0100 +++ source/fs/proc/kcore.c 2004-04-21 10:46:51.710720888 +0100 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -84,8 +85,6 @@ kclist_del(void *addr) return 0; } -extern char saved_command_line[]; - static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) { size_t try, size; --- diff/fs/proc/proc_misc.c 2004-04-05 12:57:08.000000000 +0100 +++ source/fs/proc/proc_misc.c 2004-04-21 10:46:51.710720888 +0100 @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -286,6 +285,10 @@ static struct file_operations proc_vmsta .release = seq_release, }; +#ifdef CONFIG_SCHEDSTATS +extern struct file_operations proc_schedstat_operations; +#endif + #ifdef CONFIG_PROC_HARDWARE static int hardware_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -522,7 +525,6 @@ static int filesystems_read_proc(char *p static int cmdline_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - extern char saved_command_line[]; int len; len = sprintf(page, "%s\n", saved_command_line); @@ -651,6 +653,36 @@ static void create_seq_entry(char *name, entry->proc_fops = f; } +#ifdef CONFIG_LOCKMETER +extern ssize_t get_lockmeter_info(char *, size_t, loff_t *); +extern ssize_t put_lockmeter_info(const char *, size_t); +extern int get_lockmeter_info_size(void); + +/* + * This function accesses lock metering information. + */ +static ssize_t read_lockmeter(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + return get_lockmeter_info(buf, count, ppos); +} + +/* + * Writing to /proc/lockmeter resets the counters + */ +static ssize_t write_lockmeter(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return put_lockmeter_info(buf, count); +} + +static struct file_operations proc_lockmeter_operations = { + NULL, /* lseek */ + read: read_lockmeter, + write: write_lockmeter, +}; +#endif /* CONFIG_LOCKMETER */ + void __init proc_misc_init(void) { struct proc_dir_entry *entry; @@ -698,6 +730,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); #endif +#ifdef CONFIG_SCHEDSTATS + create_seq_entry("schedstat", 0, &proc_schedstat_operations); +#endif #ifdef CONFIG_PROC_KCORE proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { @@ -718,6 +753,13 @@ void __init proc_misc_init(void) if (entry) entry->proc_fops = &proc_sysrq_trigger_operations; #endif +#ifdef CONFIG_LOCKMETER + entry = create_proc_entry("lockmeter", S_IWUSR | S_IRUGO, NULL); + if (entry) { + entry->proc_fops = &proc_lockmeter_operations; + entry->size = get_lockmeter_info_size(); + } +#endif #ifdef CONFIG_PPC32 { extern struct file_operations ppc_htab_operations; --- diff/fs/reiserfs/inode.c 2004-04-21 10:45:35.782263768 +0100 +++ source/fs/reiserfs/inode.c 2004-04-21 10:46:51.712720584 +0100 @@ -2112,7 +2112,7 @@ static int reiserfs_write_full_page(stru lock_buffer(bh); } else { if (test_set_buffer_locked(bh)) { - __set_page_dirty_nobuffers(page); + redirty_page_for_writepage(wbc, page); continue; } } --- diff/fs/reiserfs/super.c 2004-04-21 10:45:35.792262248 +0100 +++ source/fs/reiserfs/super.c 2004-04-21 10:46:51.712720584 +0100 @@ -86,7 +86,7 @@ static void reiserfs_write_super_lockfs reiserfs_prepare_for_journal(s, 
SB_BUFFER_WITH_SB(s), 1); journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); reiserfs_block_writes(&th) ; - journal_end(&th, s, 1) ; + journal_end_sync(&th, s, 1) ; } s->s_dirt = 0; reiserfs_write_unlock(s); --- diff/fs/select.c 2003-10-09 09:47:34.000000000 +0100 +++ source/fs/select.c 2004-04-21 10:46:51.713720432 +0100 @@ -291,8 +291,6 @@ static void select_bits_free(void *bits, * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ -#define MAX_SELECT_SECONDS \ - ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp) @@ -315,9 +313,11 @@ sys_select(int n, fd_set __user *inp, fd if (sec < 0 || usec < 0) goto out_nofds; - if ((unsigned long) sec < MAX_SELECT_SECONDS) { + if ((unsigned long) sec < (MAX_SCHEDULE_TIMEOUT-1) / HZ - 1) { timeout = ROUND_UP(usec, 1000000/HZ); timeout += sec * (unsigned long) HZ; + } else { + timeout = MAX_SCHEDULE_TIMEOUT-1; } } @@ -469,11 +469,17 @@ asmlinkage long sys_poll(struct pollfd _ return -EINVAL; if (timeout) { - /* Careful about overflow in the intermediate values */ - if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ) - timeout = (unsigned long)(timeout*HZ+999)/1000+1; - else /* Negative or overflow */ + if (timeout < 0) { timeout = MAX_SCHEDULE_TIMEOUT; + } else { + /* Careful about overflow in the intermediate values */ + long seconds = timeout/1000; + timeout = ((timeout - 1000*seconds)*HZ + 999)/1000 + 1; + if (seconds <= (MAX_SCHEDULE_TIMEOUT-2) / HZ - 1) + timeout += seconds*HZ; + else + timeout = MAX_SCHEDULE_TIMEOUT-1; + } } poll_initwait(&table); --- diff/fs/super.c 2004-04-21 10:45:35.795261792 +0100 +++ source/fs/super.c 2004-04-21 10:46:51.714720280 +0100 @@ -68,6 +68,7 @@ static struct super_block *alloc_super(v INIT_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); + INIT_LIST_HEAD(&s->s_inodes); init_rwsem(&s->s_umount); sema_init(&s->s_lock, 1); down_write(&s->s_umount); @@ -77,6 +78,7 @@ static struct super_block *alloc_super(v sema_init(&s->s_dquot.dqio_sem, 1); sema_init(&s->s_dquot.dqonoff_sem, 1); init_rwsem(&s->s_dquot.dqptr_sem); + init_waitqueue_head(&s->s_wait_unfrozen); s->s_maxbytes = MAX_NON_LFS; s->dq_op = sb_dquot_ops; s->s_qcop = sb_quotactl_ops; @@ -623,7 +625,14 @@ struct super_block *get_sb_bdev(struct f if (IS_ERR(bdev)) return (struct super_block *)bdev; + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + down(&bdev->bd_mount_sem); s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); + up(&bdev->bd_mount_sem); if (IS_ERR(s)) goto out; --- diff/fs/sysfs/bin.c 2003-09-17 12:28:12.000000000 +0100 +++ source/fs/sysfs/bin.c 2004-04-21 10:46:51.714720280 +0100 @@ -94,7 +94,7 @@ static ssize_t write(struct file * file, static int open(struct inode * inode, struct file * file) { - struct kobject * kobj = kobject_get(file->f_dentry->d_parent->d_fsdata); + struct kobject *kobj = sysfs_get_kobject(file->f_dentry->d_parent); struct bin_attribute * attr = file->f_dentry->d_fsdata; int error = -EINVAL; --- diff/fs/sysfs/file.c 2003-09-30 15:46:19.000000000 +0100 +++ source/fs/sysfs/file.c 2004-04-21 10:46:51.714720280 +0100 @@ -238,7 +238,7 @@ sysfs_write_file(struct file *file, cons static int check_perm(struct inode * inode, struct file * file) { - struct kobject * kobj = 
kobject_get(file->f_dentry->d_parent->d_fsdata); + struct kobject *kobj = sysfs_get_kobject(file->f_dentry->d_parent); struct attribute * attr = file->f_dentry->d_fsdata; struct sysfs_buffer * buffer; struct sysfs_ops * ops = NULL; --- diff/fs/sysfs/sysfs.h 2003-09-30 15:46:19.000000000 +0100 +++ source/fs/sysfs/sysfs.h 2004-04-21 10:46:51.715720128 +0100 @@ -11,3 +11,16 @@ extern void sysfs_hash_and_remove(struct extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); extern void sysfs_remove_subdir(struct dentry *); + + +static inline struct kobject *sysfs_get_kobject(struct dentry *dentry) +{ + struct kobject * kobj = NULL; + + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) + kobj = kobject_get(dentry->d_fsdata); + spin_unlock(&dentry->d_lock); + + return kobj; +} --- diff/fs/xfs/linux/xfs_buf.c 2004-04-21 10:45:35.802260728 +0100 +++ source/fs/xfs/linux/xfs_buf.c 2004-04-21 10:46:51.716719976 +0100 @@ -58,6 +58,8 @@ #include #include #include +#include +#include #include "xfs_linux.h" @@ -1574,7 +1576,7 @@ pagebuf_delwri_queue( } list_add_tail(&pb->pb_list, &pbd_delwrite_queue); - pb->pb_flushtime = jiffies + xfs_age_buffer; + pb->pb_creation_time = jiffies; spin_unlock(&pbd_delwrite_lock); if (unlock) @@ -1647,7 +1649,7 @@ pagebuf_daemon( if ((pb->pb_flags & PBF_DELWRI) && !pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) { if (!force_flush && - time_before(jiffies, pb->pb_flushtime)) { + time_before(jiffies, pb->pb_creation_time + xfs_age_buffer)) { pagebuf_unlock(pb); break; } --- diff/fs/xfs/linux/xfs_buf.h 2004-03-11 10:20:28.000000000 +0000 +++ source/fs/xfs/linux/xfs_buf.h 2004-04-21 10:46:51.716719976 +0100 @@ -139,7 +139,7 @@ typedef int (*page_buf_bdstrat_t)(struct typedef struct page_buf_s { struct semaphore pb_sema; /* semaphore for lockables */ - unsigned long pb_flushtime; /* time to flush pagebuf */ + unsigned long pb_creation_time; /* time pagebuf was created */ atomic_t pb_pin_count; /* pin count */ wait_queue_head_t pb_waiters; /* unpin waiters */ struct list_head pb_list; --- diff/fs/xfs/linux/xfs_globals.c 2004-03-11 10:20:28.000000000 +0000 +++ source/fs/xfs/linux/xfs_globals.c 2004-04-21 10:46:51.716719976 +0100 @@ -57,12 +57,15 @@ xfs_param_t xfs_params = { .panic_mask = { 0, 0, 127 }, .error_level = { 0, 3, 11 }, .sync_interval = { HZ, 30*HZ, 60*HZ }, + .lm_sync_interval + = { HZ, 600*HZ, INT_MAX }, .stats_clear = { 0, 0, 1 }, .inherit_sync = { 0, 1, 1 }, .inherit_nodump = { 0, 1, 1 }, .inherit_noatim = { 0, 1, 1 }, .flush_interval = { HZ/2, HZ, 30*HZ }, .age_buffer = { 1*HZ, 15*HZ, 300*HZ }, + .lm_age_buffer = { 1*HZ, 600*HZ, INT_MAX }, }; /* --- diff/fs/xfs/linux/xfs_ioctl.c 2004-04-05 12:57:08.000000000 +0100 +++ source/fs/xfs/linux/xfs_ioctl.c 2004-04-21 10:46:51.717719824 +0100 @@ -825,13 +825,14 @@ xfs_ioctl( case XFS_IOC_FREEZE: if (!capable(CAP_SYS_ADMIN)) return -EPERM; - xfs_fs_freeze(mp); + + freeze_bdev(inode->i_sb->s_bdev); return 0; case XFS_IOC_THAW: if (!capable(CAP_SYS_ADMIN)) return -EPERM; - xfs_fs_thaw(mp); + thaw_bdev(inode->i_sb->s_bdev, inode->i_sb); return 0; case XFS_IOC_GOINGDOWN: { --- diff/fs/xfs/linux/xfs_linux.h 2004-03-11 10:20:28.000000000 +0000 +++ source/fs/xfs/linux/xfs_linux.h 2004-04-21 10:46:51.717719824 +0100 @@ -134,13 +134,13 @@ static inline void set_buffer_unwritten_ #define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val #define xfs_error_level xfs_params.error_level.val -#define xfs_syncd_interval xfs_params.sync_interval.val +#define 
xfs_syncd_interval (unlikely(laptop_mode) ? xfs_params.lm_sync_interval.val : xfs_params.sync_interval.val) #define xfs_stats_clear xfs_params.stats_clear.val #define xfs_inherit_sync xfs_params.inherit_sync.val #define xfs_inherit_nodump xfs_params.inherit_nodump.val #define xfs_inherit_noatime xfs_params.inherit_noatim.val #define xfs_flush_interval xfs_params.flush_interval.val -#define xfs_age_buffer xfs_params.age_buffer.val +#define xfs_age_buffer (unlikely(laptop_mode) ? xfs_params.lm_age_buffer.val : xfs_params.age_buffer.val) #define current_cpu() smp_processor_id() #define current_pid() (current->pid) --- diff/fs/xfs/linux/xfs_lrw.c 2004-03-11 10:20:28.000000000 +0000 +++ source/fs/xfs/linux/xfs_lrw.c 2004-04-21 10:46:51.718719672 +0100 @@ -682,8 +682,6 @@ xfs_write( io = &xip->i_iocore; mp = io->io_mount; - xfs_check_frozen(mp, bdp, XFS_FREEZE_WRITE); - if (XFS_FORCED_SHUTDOWN(mp)) { return -EIO; } --- diff/fs/xfs/linux/xfs_super.c 2004-04-21 10:45:35.803260576 +0100 +++ source/fs/xfs/linux/xfs_super.c 2004-04-21 10:46:51.719719520 +0100 @@ -72,6 +72,7 @@ #include #include #include +#include STATIC struct quotactl_ops linvfs_qops; STATIC struct super_operations linvfs_sops; @@ -470,6 +471,10 @@ syncd(void *arg) if (vfsp->vfs_flag & VFS_RDONLY) continue; VFS_SYNC(vfsp, SYNCD_FLAGS, NULL, error); + + vfsp->vfs_sync_seq++; + wmb(); + wake_up(&vfsp->vfs_wait_single_sync_task); } vfsp->vfs_sync_task = NULL; @@ -553,6 +558,24 @@ linvfs_sync_super( VFS_SYNC(vfsp, flags, NULL, error); sb->s_dirt = 0; + if (unlikely(laptop_mode)) + { + int prev_sync_seq = vfsp->vfs_sync_seq; + /* + * The disk must be active because we're syncing. + * We schedule syncd now (now that the disk is + * active) instead of later (when it might not be). + */ + wake_up_process(vfsp->vfs_sync_task); + /* + * We have to wait for the sync iteration to complete. + * If we don't, the disk activity caused by the sync + * will come after the sync is completed, and that + * triggers another sync from laptop mode. 
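+		 *
+		 * (Editor's aside, hedged: the laptop-mode intervals this
+		 * pairs with are runtime-tunable through the sysctls added
+		 * in xfs_sysctl.c below, e.g.
+		 *
+		 *	echo 600000 > /proc/sys/fs/xfs/lm_sync_interval
+		 *
+		 * where the value is in jiffies -- 600 seconds at HZ=1000;
+		 * the /proc path assumes the standard fs.xfs sysctl root.)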
+ */ + wait_event(vfsp->vfs_wait_single_sync_task, vfsp->vfs_sync_seq != prev_sync_seq); + } + return -error; } @@ -589,28 +612,7 @@ STATIC void linvfs_freeze_fs( struct super_block *sb) { - vfs_t *vfsp = LINVFS_GET_VFS(sb); - vnode_t *vp; - int error; - - if (sb->s_flags & MS_RDONLY) - return; - VFS_ROOT(vfsp, &vp, error); - VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, 0, XFS_IOC_FREEZE, 0, error); - VN_RELE(vp); -} - -STATIC void -linvfs_unfreeze_fs( - struct super_block *sb) -{ - vfs_t *vfsp = LINVFS_GET_VFS(sb); - vnode_t *vp; - int error; - - VFS_ROOT(vfsp, &vp, error); - VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, 0, XFS_IOC_THAW, 0, error); - VN_RELE(vp); + VFS_FREEZE(LINVFS_GET_VFS(sb)); } STATIC struct dentry * @@ -850,7 +852,6 @@ STATIC struct super_operations linvfs_so .write_super = linvfs_write_super, .sync_fs = linvfs_sync_super, .write_super_lockfs = linvfs_freeze_fs, - .unlockfs = linvfs_unfreeze_fs, .statfs = linvfs_statfs, .remount_fs = linvfs_remount, .show_options = linvfs_show_options, --- diff/fs/xfs/linux/xfs_sysctl.c 2004-03-11 10:20:28.000000000 +0000 +++ source/fs/xfs/linux/xfs_sysctl.c 2004-04-21 10:46:51.719719520 +0100 @@ -103,6 +103,11 @@ STATIC ctl_table xfs_table[] = { &sysctl_intvec, NULL, &xfs_params.sync_interval.min, &xfs_params.sync_interval.max}, + {XFS_LM_SYNC_INTERVAL, "lm_sync_interval", &xfs_params.lm_sync_interval.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.lm_sync_interval.min, &xfs_params.lm_sync_interval.max}, + {XFS_INHERIT_SYNC, "inherit_sync", &xfs_params.inherit_sync.val, sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, @@ -128,6 +133,11 @@ STATIC ctl_table xfs_table[] = { &sysctl_intvec, NULL, &xfs_params.age_buffer.min, &xfs_params.age_buffer.max}, + {XFS_LM_AGE_BUFFER, "lm_age_buffer", &xfs_params.lm_age_buffer.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.lm_age_buffer.min, &xfs_params.lm_age_buffer.max}, + /* please keep this the last entry */ #ifdef CONFIG_PROC_FS {XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear.val, --- diff/fs/xfs/linux/xfs_sysctl.h 2004-03-11 10:20:28.000000000 +0000 +++ source/fs/xfs/linux/xfs_sysctl.h 2004-04-21 10:46:51.719719520 +0100 @@ -54,6 +54,7 @@ typedef struct xfs_param { xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */ xfs_sysctl_val_t error_level; /* Degree of reporting for problems */ xfs_sysctl_val_t sync_interval; /* time between sync calls */ + xfs_sysctl_val_t lm_sync_interval; /* same, in laptop mode */ xfs_sysctl_val_t stats_clear; /* Reset all XFS statistics to zero. */ xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */ xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */ @@ -62,6 +63,9 @@ typedef struct xfs_param { * delwri flush daemon. */ xfs_sysctl_val_t age_buffer; /* time for buffer to age before * we flush it. */ + xfs_sysctl_val_t lm_age_buffer; /* time for buffer to age before + * we flush it when laptop mode is + * active. 
*/ } xfs_param_t; /* @@ -86,12 +90,14 @@ enum { XFS_PANIC_MASK = 6, XFS_ERRLEVEL = 7, XFS_SYNC_INTERVAL = 8, + XFS_LM_SYNC_INTERVAL = 9, XFS_STATS_CLEAR = 12, XFS_INHERIT_SYNC = 13, XFS_INHERIT_NODUMP = 14, XFS_INHERIT_NOATIME = 15, XFS_FLUSH_INTERVAL = 16, XFS_AGE_BUFFER = 17, + XFS_LM_AGE_BUFFER = 3, }; extern xfs_param_t xfs_params; --- diff/fs/xfs/linux/xfs_vfs.c 2004-03-11 10:20:28.000000000 +0000 +++ source/fs/xfs/linux/xfs_vfs.c 2004-04-21 10:46:51.720719368 +0100 @@ -230,6 +230,18 @@ vfs_force_shutdown( ((*bhvtovfsops(next)->vfs_force_shutdown)(next, fl, file, line)); } +void +vfs_freeze( + struct bhv_desc *bdp) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_freeze) + next = BHV_NEXT(next); + ((*bhvtovfsops(next)->vfs_freeze)(next)); +} + vfs_t * vfs_allocate( void ) { @@ -238,6 +250,7 @@ vfs_allocate( void ) vfsp = kmem_zalloc(sizeof(vfs_t), KM_SLEEP); bhv_head_init(VFS_BHVHEAD(vfsp), "vfs"); init_waitqueue_head(&vfsp->vfs_wait_sync_task); + init_waitqueue_head(&vfsp->vfs_wait_single_sync_task); return vfsp; } --- diff/fs/xfs/linux/xfs_vfs.h 2004-02-09 10:36:12.000000000 +0000 +++ source/fs/xfs/linux/xfs_vfs.h 2004-04-21 10:46:51.720719368 +0100 @@ -52,6 +52,8 @@ typedef struct vfs { bhv_head_t vfs_bh; /* head of vfs behavior chain */ struct super_block *vfs_super; /* Linux superblock structure */ struct task_struct *vfs_sync_task; + int vfs_sync_seq; /* syncd iteration sequence number */ + wait_queue_head_t vfs_wait_single_sync_task; wait_queue_head_t vfs_wait_sync_task; } vfs_t; @@ -112,6 +114,7 @@ typedef int (*vfs_quotactl_t)(bhv_desc_t typedef void (*vfs_init_vnode_t)(bhv_desc_t *, struct vnode *, bhv_desc_t *, int); typedef void (*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int); +typedef void (*vfs_freeze_t)(bhv_desc_t *); typedef struct vfsops { bhv_position_t vf_position; /* behavior chain position */ @@ -128,6 +131,7 @@ typedef struct vfsops { vfs_quotactl_t vfs_quotactl; /* disk quota */ vfs_init_vnode_t vfs_init_vnode; /* initialize a new vnode */ vfs_force_shutdown_t vfs_force_shutdown; /* crash and burn */ + vfs_freeze_t vfs_freeze; /* freeze fs for snapshot */ } vfsops_t; /* @@ -147,6 +151,7 @@ typedef struct vfsops { #define VFS_QUOTACTL(v, c,id,p, rv) ((rv) = vfs_quotactl(VHEAD(v), c,id,p)) #define VFS_INIT_VNODE(v, vp,b,ul) ( vfs_init_vnode(VHEAD(v), vp,b,ul) ) #define VFS_FORCE_SHUTDOWN(v, fl,f,l) ( vfs_force_shutdown(VHEAD(v), fl,f,l) ) +#define VFS_FREEZE(v) ( vfs_freeze(VHEAD(v)) ) /* * PVFS's. Operates on behavior descriptor pointers. 
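Editor's note, not part of the patch: the hunks around here replace XFS's private freeze machinery (xfs_start_freeze/xfs_finish_freeze) with the generic block-device freeze API. A hedged sketch of how a snapshot driver would be expected to drive that API, assuming the freeze_bdev() signature used by these hunks (it returns the frozen super_block, or NULL if nothing is mounted); take_snapshot() is a hypothetical driver hook named only for illustration:

	#include <linux/fs.h>

	/* Hypothetical hook standing in for the actual snapshot work. */
	extern int take_snapshot(struct block_device *bdev);

	static int snapshot_with_freeze(struct block_device *bdev)
	{
		/* Block new writers and flush the fs, if one is mounted. */
		struct super_block *sb = freeze_bdev(bdev);
		int err = take_snapshot(bdev);

		if (sb)
			thaw_bdev(bdev, sb);	/* let writers back in */
		return err;
	}

This mirrors the pattern xfs_fsops.c adopts below for XFS_FSOP_GOING_FLAGS_DEFAULT.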
@@ -164,6 +169,7 @@ typedef struct vfsops { #define PVFS_QUOTACTL(b, c,id,p, rv) ((rv) = vfs_quotactl(b, c,id,p)) #define PVFS_INIT_VNODE(b, vp,b2,ul) ( vfs_init_vnode(b, vp,b2,ul) ) #define PVFS_FORCE_SHUTDOWN(b, fl,f,l) ( vfs_force_shutdown(b, fl,f,l) ) +#define PVFS_FREEZE(b) ( vfs_freeze(b) ) extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *); extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int); @@ -178,6 +184,7 @@ extern int vfs_dmapiops(bhv_desc_t *, ca extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t); extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int); extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int); +extern void vfs_freeze(bhv_desc_t *); typedef struct bhv_vfsops { struct vfsops bhv_common; --- diff/fs/xfs/xfs_fsops.c 2004-03-11 10:20:28.000000000 +0000 +++ source/fs/xfs/xfs_fsops.c 2004-04-21 10:46:51.721719216 +0100 @@ -582,63 +582,25 @@ xfs_fs_log_dummy(xfs_mount_t *mp) } int -xfs_fs_freeze( - xfs_mount_t *mp) -{ - vfs_t *vfsp; - /*REFERENCED*/ - int error; - - vfsp = XFS_MTOVFS(mp); - - /* Stop new writers */ - xfs_start_freeze(mp, XFS_FREEZE_WRITE); - - /* Flush the refcache */ - xfs_refcache_purge_mp(mp); - - /* Flush delalloc and delwri data */ - VFS_SYNC(vfsp, SYNC_DELWRI|SYNC_WAIT, NULL, error); - - /* Pause transaction subsystem */ - xfs_start_freeze(mp, XFS_FREEZE_TRANS); - - /* Flush any remaining inodes into buffers */ - VFS_SYNC(vfsp, SYNC_ATTR|SYNC_WAIT, NULL, error); - - /* Push all buffers out to disk */ - xfs_binval(mp->m_ddev_targp); - if (mp->m_rtdev_targp) { - xfs_binval(mp->m_rtdev_targp); - } - - /* Push the superblock and write an unmount record */ - xfs_log_unmount_write(mp); - xfs_unmountfs_writesb(mp); - - return 0; -} - -int -xfs_fs_thaw( - xfs_mount_t *mp) -{ - xfs_finish_freeze(mp); - return 0; -} - -int xfs_fs_goingdown( xfs_mount_t *mp, __uint32_t inflags) { - switch (inflags) - { - case XFS_FSOP_GOING_FLAGS_DEFAULT: - xfs_fs_freeze(mp); - xfs_force_shutdown(mp, XFS_FORCE_UMOUNT); - xfs_fs_thaw(mp); + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (inflags) { + case XFS_FSOP_GOING_FLAGS_DEFAULT: { + struct vfs *vfsp = XFS_MTOVFS(mp); + struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev); + + if (sb) { + xfs_force_shutdown(mp, XFS_FORCE_UMOUNT); + thaw_bdev(sb->s_bdev, sb); + } + break; + } case XFS_FSOP_GOING_FLAGS_LOGFLUSH: xfs_force_shutdown(mp, XFS_FORCE_UMOUNT); break; --- diff/fs/xfs/xfs_fsops.h 2004-03-11 10:20:28.000000000 +0000 +++ source/fs/xfs/xfs_fsops.h 2004-04-21 10:46:51.721719216 +0100 @@ -60,14 +60,6 @@ xfs_reserve_blocks( xfs_fsop_resblks_t *outval); int -xfs_fs_freeze( - xfs_mount_t *mp); - -int -xfs_fs_thaw( - xfs_mount_t *mp); - -int xfs_fs_goingdown( xfs_mount_t *mp, __uint32_t inflags); --- diff/fs/xfs/xfs_log.c 2004-04-05 12:57:08.000000000 +0100 +++ source/fs/xfs/xfs_log.c 2004-04-21 10:46:51.723718912 +0100 @@ -820,7 +820,7 @@ xfs_log_need_covered(xfs_mount_t *mp) xlog_t *log = mp->m_log; vfs_t *vfsp = XFS_MTOVFS(mp); - if (mp->m_frozen || XFS_FORCED_SHUTDOWN(mp) || + if (vfsp->vfs_super->s_frozen || XFS_FORCED_SHUTDOWN(mp) || (vfsp->vfs_flag & VFS_RDONLY)) return 0; --- diff/fs/xfs/xfs_mount.c 2004-04-05 12:57:08.000000000 +0100 +++ source/fs/xfs/xfs_mount.c 2004-04-21 10:46:51.724718760 +0100 @@ -140,9 +140,6 @@ xfs_mount_init(void) */ xfs_trans_ail_init(mp); - /* Init freeze sync structures */ - spinlock_init(&mp->m_freeze_lock, "xfs_freeze"); - init_sv(&mp->m_wait_unfreeze, SV_DEFAULT, "xfs_freeze", 0); 
atomic_set(&mp->m_active_trans, 0); return mp; @@ -192,8 +189,6 @@ xfs_mount_free( VFS_REMOVEBHV(vfsp, &mp->m_bhv); } - spinlock_destroy(&mp->m_freeze_lock); - sv_destroy(&mp->m_wait_unfreeze); kmem_free(mp, sizeof(xfs_mount_t)); } @@ -1586,59 +1581,3 @@ xfs_mount_log_sbunit( xfs_mod_sb(tp, fields); xfs_trans_commit(tp, 0, NULL); } - -/* Functions to lock access out of the filesystem for forced - * shutdown or snapshot. - */ - -void -xfs_start_freeze( - xfs_mount_t *mp, - int level) -{ - unsigned long s = mutex_spinlock(&mp->m_freeze_lock); - - mp->m_frozen = level; - mutex_spinunlock(&mp->m_freeze_lock, s); - - if (level == XFS_FREEZE_TRANS) { - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); - } -} - -void -xfs_finish_freeze( - xfs_mount_t *mp) -{ - unsigned long s = mutex_spinlock(&mp->m_freeze_lock); - - if (mp->m_frozen) { - mp->m_frozen = 0; - sv_broadcast(&mp->m_wait_unfreeze); - } - - mutex_spinunlock(&mp->m_freeze_lock, s); -} - -void -xfs_check_frozen( - xfs_mount_t *mp, - bhv_desc_t *bdp, - int level) -{ - unsigned long s; - - if (mp->m_frozen) { - s = mutex_spinlock(&mp->m_freeze_lock); - - if (mp->m_frozen < level) { - mutex_spinunlock(&mp->m_freeze_lock, s); - } else { - sv_wait(&mp->m_wait_unfreeze, 0, &mp->m_freeze_lock, s); - } - } - - if (level == XFS_FREEZE_TRANS) - atomic_inc(&mp->m_active_trans); -} --- diff/fs/xfs/xfs_mount.h 2004-03-11 10:20:28.000000000 +0000 +++ source/fs/xfs/xfs_mount.h 2004-04-21 10:46:51.724718760 +0100 @@ -379,10 +379,6 @@ typedef struct xfs_mount { struct xfs_dmops m_dm_ops; /* vector of DMI ops */ struct xfs_qmops m_qm_ops; /* vector of XQM ops */ struct xfs_ioops m_io_ops; /* vector of I/O ops */ - lock_t m_freeze_lock; /* Lock for m_frozen */ - uint m_frozen; /* FS frozen for shutdown or - * snapshot */ - sv_t m_wait_unfreeze;/* waiting to unfreeze */ atomic_t m_active_trans; /* number trans frozen */ } xfs_mount_t; @@ -558,16 +554,6 @@ extern void xfs_initialize_perag(xfs_mou extern void xfs_xlatesb(void *, struct xfs_sb *, int, xfs_arch_t, __int64_t); -/* - * Flags for freeze operations. 
- */ -#define XFS_FREEZE_WRITE 1 -#define XFS_FREEZE_TRANS 2 - -extern void xfs_start_freeze(xfs_mount_t *, int); -extern void xfs_finish_freeze(xfs_mount_t *); -extern void xfs_check_frozen(xfs_mount_t *, bhv_desc_t *, int); - extern struct vfsops xfs_vfsops; extern struct vnodeops xfs_vnodeops; --- diff/fs/xfs/xfs_trans.c 2004-04-05 12:57:08.000000000 +0100 +++ source/fs/xfs/xfs_trans.c 2004-04-21 10:46:51.725718608 +0100 @@ -131,7 +131,9 @@ xfs_trans_alloc( xfs_mount_t *mp, uint type) { - xfs_check_frozen(mp, NULL, XFS_FREEZE_TRANS); + vfs_check_frozen(XFS_MTOVFS(mp)->vfs_super, SB_FREEZE_TRANS); + atomic_inc(&mp->m_active_trans); + return (_xfs_trans_alloc(mp, type)); } --- diff/fs/xfs/xfs_vfsops.c 2004-04-05 12:57:08.000000000 +0100 +++ source/fs/xfs/xfs_vfsops.c 2004-04-21 10:46:51.726718456 +0100 @@ -1858,6 +1858,20 @@ xfs_showargs( return 0; } +STATIC void +xfs_freeze( + bhv_desc_t *bdp) +{ + xfs_mount_t *mp = XFS_BHVTOM(bdp); + + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* Push the superblock and write an unmount record */ + xfs_log_unmount_write(mp); + xfs_unmountfs_writesb(mp); +} + vfsops_t xfs_vfsops = { BHV_IDENTITY_INIT(VFS_BHV_XFS,VFS_POSITION_XFS), @@ -1874,4 +1888,5 @@ vfsops_t xfs_vfsops = { .vfs_quotactl = (vfs_quotactl_t)fs_nosys, .vfs_init_vnode = xfs_initialize_vnode, .vfs_force_shutdown = xfs_do_force_shutdown, + .vfs_freeze = xfs_freeze, }; --- diff/include/asm-alpha/pgtable.h 2003-10-09 09:47:34.000000000 +0100 +++ source/include/asm-alpha/pgtable.h 2004-04-21 10:46:51.727718304 +0100 @@ -349,6 +349,4 @@ extern void paging_init(void); /* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */ #define HAVE_ARCH_UNMAPPED_AREA -typedef pte_t *pte_addr_t; - #endif /* _ALPHA_PGTABLE_H */ --- diff/include/asm-alpha/spinlock.h 2003-10-09 09:47:17.000000000 +0100 +++ source/include/asm-alpha/spinlock.h 2004-04-21 10:46:51.732717544 +0100 @@ -6,6 +6,10 @@ #include #include +#ifdef CONFIG_LOCKMETER +#undef DEBUG_SPINLOCK +#undef DEBUG_RWLOCK +#endif /* * Simple spin lock operations. There are two variants, one clears IRQ's @@ -95,9 +99,18 @@ static inline int _raw_spin_trylock(spin typedef struct { volatile int write_lock:1, read_counter:31; +#ifdef CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* need this storage for CPU and lock INDEX ............. 
*/ + unsigned magic; +#endif } /*__attribute__((aligned(32)))*/ rwlock_t; +#ifdef CONFIG_LOCKMETER +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0 } +#else #define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 } +#endif #define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0) #define rwlock_is_locked(x) (*(volatile int *)(x) != 0) @@ -169,4 +182,41 @@ static inline void _raw_read_unlock(rwlo : "m" (*lock) : "memory"); } +#ifdef CONFIG_LOCKMETER +static inline int _raw_write_trylock(rwlock_t *lock) +{ + long temp,result; + + __asm__ __volatile__( + " ldl_l %1,%0\n" + " mov $31,%2\n" + " bne %1,1f\n" + " or $31,1,%2\n" + " stl_c %2,%0\n" + "1: mb\n" + : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result) + : "m" (*(volatile int *)lock) + ); + + return (result); +} + +static inline int _raw_read_trylock(rwlock_t *lock) +{ + unsigned long temp,result; + + __asm__ __volatile__( + " ldl_l %1,%0\n" + " mov $31,%2\n" + " blbs %1,1f\n" + " subl %1,2,%2\n" + " stl_c %2,%0\n" + "1: mb\n" + : "=m" (*(volatile int *)lock), "=&r" (temp), "=&r" (result) + : "m" (*(volatile int *)lock) + ); + return (result); +} +#endif /* CONFIG_LOCKMETER */ + #endif /* _ALPHA_SPINLOCK_H */ --- diff/include/asm-alpha/system.h 2003-02-26 16:01:09.000000000 +0000 +++ source/include/asm-alpha/system.h 2004-04-21 10:46:51.732717544 +0100 @@ -43,7 +43,6 @@ */ #define PARAM ZERO_PGE #define COMMAND_LINE ((char*)(PARAM + 0x0000)) -#define COMMAND_LINE_SIZE 256 #define INITRD_START (*(unsigned long *) (PARAM+0x100)) #define INITRD_SIZE (*(unsigned long *) (PARAM+0x108)) --- diff/include/asm-arm/bitops.h 2004-02-09 10:36:12.000000000 +0000 +++ source/include/asm-arm/bitops.h 2004-04-21 10:46:51.734717240 +0100 @@ -212,6 +212,8 @@ extern int _test_and_clear_bit_le(int nr extern int _test_and_change_bit_le(int nr, volatile unsigned long * p); extern int _find_first_zero_bit_le(void * p, unsigned size); extern int _find_next_zero_bit_le(void * p, int size, int offset); +extern int _find_first_bit_le(const unsigned long *p, unsigned size); +extern int _find_next_bit_le(const unsigned long *p, int size, int offset); /* * Big endian assembly bitops. nr = 0 -> byte 3 bit 0. @@ -224,7 +226,8 @@ extern int _test_and_clear_bit_be(int nr extern int _test_and_change_bit_be(int nr, volatile unsigned long * p); extern int _find_first_zero_bit_be(void * p, unsigned size); extern int _find_next_zero_bit_be(void * p, int size, int offset); - +extern int _find_first_bit_be(const unsigned long *p, unsigned size); +extern int _find_next_bit_be(unsigned long *p, int size, int offset); /* * The __* form of bitops are non-atomic and may be reordered. 
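Editor's illustration, not part of the patch: the find_first_bit()/find_next_bit() macros the ARM hunks fill in follow the standard kernel bitmap-walk idiom. A hedged sketch of a caller; walk_set_bits() and its printout are hypothetical:

	#include <linux/bitops.h>
	#include <linux/kernel.h>

	static void walk_set_bits(const unsigned long *mask, int size)
	{
		int bit;

		/* Visit each set bit in mask[0..size) in ascending order. */
		for (bit = find_first_bit(mask, size);
		     bit < size;
		     bit = find_next_bit(mask, size, bit + 1))
			printk(KERN_DEBUG "bit %d is set\n", bit);
	}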
@@ -255,6 +258,8 @@ extern int _find_next_zero_bit_be(void * #define test_bit(nr,p) __test_bit(nr,p) #define find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz) #define find_next_zero_bit(p,sz,off) _find_next_zero_bit_le(p,sz,off) +#define find_first_bit(p,sz) _find_first_bit_le(p,sz) +#define find_next_bit(p,sz,off) _find_next_bit_le(p,sz,off) #define WORD_BITOFF_TO_LE(x) ((x)) @@ -272,6 +277,8 @@ extern int _find_next_zero_bit_be(void * #define test_bit(nr,p) __test_bit(nr,p) #define find_first_zero_bit(p,sz) _find_first_zero_bit_be(p,sz) #define find_next_zero_bit(p,sz,off) _find_next_zero_bit_be(p,sz,off) +#define find_first_bit(p,sz) _find_first_bit_be(p,sz) +#define find_next_bit(p,sz,off) _find_next_bit_be(p,sz,off) #define WORD_BITOFF_TO_LE(x) ((x) ^ 0x18) --- diff/include/asm-arm/kmap_types.h 2004-01-19 10:22:59.000000000 +0000 +++ source/include/asm-arm/kmap_types.h 2004-04-21 10:46:51.734717240 +0100 @@ -14,7 +14,6 @@ enum km_type { KM_BIO_DST_IRQ, KM_PTE0, KM_PTE1, - KM_PTE2, KM_IRQ0, KM_IRQ1, KM_SOFTIRQ0, --- diff/include/asm-arm/pgtable.h 2004-04-21 10:45:35.844254344 +0100 +++ source/include/asm-arm/pgtable.h 2004-04-21 10:46:51.734717240 +0100 @@ -402,8 +402,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD #define io_remap_page_range(vma,from,phys,size,prot) \ remap_page_range(vma,from,phys,size,prot) -typedef pte_t *pte_addr_t; - #define pgtable_cache_init() do { } while (0) #endif /* !__ASSEMBLY__ */ --- diff/include/asm-arm26/pgtable.h 2003-10-09 09:47:34.000000000 +0100 +++ source/include/asm-arm26/pgtable.h 2004-04-21 10:46:51.733717392 +0100 @@ -290,8 +290,6 @@ static inline pte_t mk_pte_phys(unsigned #define io_remap_page_range(vma,from,phys,size,prot) \ remap_page_range(vma,from,phys,size,prot) -typedef pte_t *pte_addr_t; - #endif /* !__ASSEMBLY__ */ #endif /* _ASMARM_PGTABLE_H */ --- diff/include/asm-arm26/tlb.h 2003-06-30 10:07:24.000000000 +0100 +++ source/include/asm-arm26/tlb.h 2004-04-21 10:46:51.733717392 +0100 @@ -1,6 +1,7 @@ #ifndef __ASMARM_TLB_H #define __ASMARM_TLB_H +#include #include /* --- diff/include/asm-cris/pgtable.h 2003-07-11 09:39:50.000000000 +0100 +++ source/include/asm-cris/pgtable.h 2004-04-21 10:46:51.735717088 +0100 @@ -337,6 +337,4 @@ extern inline void update_mmu_cache(stru #define pte_to_pgoff(x) (pte_val(x) >> 6) #define pgoff_to_pte(x) __pte(((x) << 6) | _PAGE_FILE) -typedef pte_t *pte_addr_t; - #endif /* _CRIS_PGTABLE_H */ --- diff/include/asm-cris/setup.h 2003-07-11 09:39:50.000000000 +0100 +++ source/include/asm-cris/setup.h 2004-04-21 10:46:51.735717088 +0100 @@ -1,3 +1,6 @@ #ifndef _CRIS_SETUP_H #define _CRIS_SETUP_H + +#define COMMAND_LINE_SIZE 256 + #endif --- diff/include/asm-generic/tlb.h 2004-03-11 10:20:28.000000000 +0000 +++ source/include/asm-generic/tlb.h 2004-04-21 10:46:51.736716936 +0100 @@ -15,6 +15,7 @@ #include #include +#include #include /* @@ -146,4 +147,6 @@ static inline void tlb_remove_page(struc __pmd_free_tlb(tlb, pmdp); \ } while (0) +#define tlb_migrate_prepare(mm) do { } while(0) + #endif /* _ASM_GENERIC__TLB_H */ --- diff/include/asm-generic/vmlinux.lds.h 2004-04-21 10:45:35.850253432 +0100 +++ source/include/asm-generic/vmlinux.lds.h 2004-04-21 10:46:51.736716936 +0100 @@ -53,6 +53,6 @@ } #define SCHED_TEXT \ - __scheduling_functions_start_here = .; \ + __sched_text_start = .; \ *(.sched.text) \ - __scheduling_functions_end_here = .; + __sched_text_end = .; --- diff/include/asm-h8300/pgtable.h 2003-08-20 14:16:33.000000000 +0100 +++ source/include/asm-h8300/pgtable.h 2004-04-21 10:46:51.737716784 
+0100 @@ -7,8 +7,6 @@ #include #include -typedef pte_t *pte_addr_t; - #define pgd_present(pgd) (1) /* pages are always present on NO_MM */ #define pgd_none(pgd) (0) #define pgd_bad(pgd) (0) --- diff/include/asm-h8300/setup.h 2003-05-21 11:50:10.000000000 +0100 +++ source/include/asm-h8300/setup.h 2004-04-21 10:46:51.737716784 +0100 @@ -1 +1,6 @@ -/* Nothing do */ +#ifndef __H8300_SETUP_H +#define __H8300_SETUP_H + +#define COMMAND_LINE_SIZE 512 + +#endif --- diff/include/asm-i386/bugs.h 2003-10-09 09:47:17.000000000 +0100 +++ source/include/asm-i386/bugs.h 2004-04-21 10:46:51.737716784 +0100 @@ -1,11 +1,11 @@ /* * include/asm-i386/bugs.h * - * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 1994 Linus Torvalds * * Cyrix stuff, June 1998 by: * - Rafael R. Reilova (moved everything from head.S), - * + * * - Channing Corn (tests & fixes), * - Andrew D. Balsa (code cleanup). * @@ -25,7 +25,20 @@ #include #include #include - +#ifdef CONFIG_KGDB +/* + * Provide the command line "gdb" initial break + */ +int __init kgdb_initial_break(char * str) +{ + if (*str == '\0'){ + breakpoint(); + return 1; + } + return 0; +} +__setup("gdb",kgdb_initial_break); +#endif static int __init no_halt(char *s) { boot_cpu_data.hlt_works_ok = 0; @@ -140,7 +153,7 @@ static void __init check_popad(void) : "ecx", "edi" ); /* If this fails, it means that any user program may lock the CPU hard. Too bad. */ if (res != 12345678) printk( "Buggy.\n" ); - else printk( "OK.\n" ); + else printk( "OK.\n" ); #endif } --- diff/include/asm-i386/kmap_types.h 2003-10-09 09:47:17.000000000 +0100 +++ source/include/asm-i386/kmap_types.h 2004-04-21 10:46:51.738716632 +0100 @@ -19,12 +19,11 @@ D(5) KM_BIO_SRC_IRQ, D(6) KM_BIO_DST_IRQ, D(7) KM_PTE0, D(8) KM_PTE1, -D(9) KM_PTE2, -D(10) KM_IRQ0, -D(11) KM_IRQ1, -D(12) KM_SOFTIRQ0, -D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(9) KM_IRQ0, +D(10) KM_IRQ1, +D(11) KM_SOFTIRQ0, +D(12) KM_SOFTIRQ1, +D(13) KM_TYPE_NR }; #undef D --- diff/include/asm-i386/mach-es7000/mach_apic.h 2004-02-09 10:36:12.000000000 +0000 +++ source/include/asm-i386/mach-es7000/mach_apic.h 2004-04-21 10:46:51.739716480 +0100 @@ -39,7 +39,7 @@ static inline cpumask_t target_cpus(void #endif #define APIC_BROADCAST_ID (0xff) -#define NO_IOAPIC_CHECK (0) +#define NO_IOAPIC_CHECK (1) static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) { @@ -169,7 +169,11 @@ static inline unsigned int cpu_mask_to_a num_bits_set = cpus_weight_const(cpumask); /* Return id to all */ if (num_bits_set == NR_CPUS) +#if defined CONFIG_ES7000_CLUSTERED_APIC return 0xFF; +#else + return cpu_to_logical_apicid(0); +#endif /* * The cpus in the mask must all be on the apic cluster. If they are not * on the same apicid cluster return default value of TARGET_CPUS. @@ -182,7 +186,11 @@ static inline unsigned int cpu_mask_to_a if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ printk ("%s: Not a valid mask!\n",__FUNCTION__); +#if defined CONFIG_ES7000_CLUSTERED_APIC return 0xFF; +#else + return cpu_to_logical_apicid(0); +#endif } apicid = new_apicid; cpus_found++; --- diff/include/asm-i386/param.h 2004-03-11 10:20:28.000000000 +0000 +++ source/include/asm-i386/param.h 2004-04-21 10:46:51.739716480 +0100 @@ -4,7 +4,9 @@ #ifdef __KERNEL__ # define HZ 1000 /* Internal kernel timer frequency */ # define USER_HZ 100 /* ..
some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +# define JIFFIES_TO_MSEC(x) (x) +# define MSEC_TO_JIFFIES(x) (x) #endif #ifndef HZ @@ -18,5 +20,6 @@ #endif #define MAXHOSTNAMELEN 64 /* max length of hostname */ +#define COMMAND_LINE_SIZE 256 #endif --- diff/include/asm-i386/pgtable.h 2004-04-21 10:45:35.859252064 +0100 +++ source/include/asm-i386/pgtable.h 2004-04-21 10:46:51.740716328 +0100 @@ -314,18 +314,6 @@ static inline pte_t pte_modify(pte_t pte #define pte_unmap_nested(pte) do { } while (0) #endif -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G) -typedef u32 pte_addr_t; -#endif - -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G) -typedef u64 pte_addr_t; -#endif - -#if !defined(CONFIG_HIGHPTE) -typedef pte_t *pte_addr_t; -#endif - /* * The i386 doesn't have any external MMU info: the kernel page * tables contain all the necessary information. --- diff/include/asm-i386/processor.h 2004-04-21 10:45:35.859252064 +0100 +++ source/include/asm-i386/processor.h 2004-04-21 10:46:51.740716328 +0100 @@ -648,4 +648,9 @@ extern inline void prefetchw(const void extern void select_idle_routine(const struct cpuinfo_x86 *c); +#ifdef CONFIG_SCHED_SMT +#define ARCH_HAS_SCHED_DOMAIN +#define ARCH_HAS_SCHED_WAKE_IDLE +#endif + #endif /* __ASM_I386_PROCESSOR_H */ --- diff/include/asm-i386/rwlock.h 2003-10-09 09:47:17.000000000 +0100 +++ source/include/asm-i386/rwlock.h 2004-04-21 10:46:51.741716176 +0100 @@ -20,28 +20,52 @@ #define RW_LOCK_BIAS 0x01000000 #define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $1,(%0)\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_read_lock_const(rw, helper) \ - asm volatile(LOCK "subl $1,%0\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "jns 1f\n\t" \ + "call " helper "\n\t" \ + "1:\t" \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "jns 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\t" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_read_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $1,(%0)\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_read_lock_const(rw, helper) \ + asm volatile(LOCK "subl $1,%0\n\t" \ + "js 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ + #define __build_read_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ @@ -50,28 +74,51 @@ __build_read_lock_ptr(rw, helper); \ } while (0) -#define __build_write_lock_ptr(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " 
helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_write_lock_const(rw, helper) \ - asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushl %%eax\n\t" \ - "leal %0,%%eax\n\t" \ - "call " helper "\n\t" \ - "popl %%eax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*(volatile int *)rw) : : "memory") +#ifdef CONFIG_SPINLINE + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jz 1f\n\t" \ + "call " helper "\n\t" \ + "1:\n" \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jz 1f\n\t" \ + "pushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "1:\n" \ + :"=m" (*(volatile int *)rw) : : "memory") + +#else /* !CONFIG_SPINLINE */ + + #define __build_write_lock_ptr(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tcall " helper "\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + ::"a" (rw) : "memory") + + #define __build_write_lock_const(rw, helper) \ + asm volatile(LOCK "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ + "jnz 2f\n" \ + "1:\n" \ + LOCK_SECTION_START("") \ + "2:\tpushl %%eax\n\t" \ + "leal %0,%%eax\n\t" \ + "call " helper "\n\t" \ + "popl %%eax\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END \ + :"=m" (*(volatile int *)rw) : : "memory") + +#endif /* CONFIG_SPINLINE */ #define __build_write_lock(rw, helper) do { \ if (__builtin_constant_p(rw)) \ --- diff/include/asm-i386/smp.h 2004-03-11 10:20:28.000000000 +0000 +++ source/include/asm-i386/smp.h 2004-04-21 10:46:51.741716176 +0100 @@ -34,7 +34,7 @@ extern void smp_alloc_memory(void); extern int pic_mode; extern int smp_num_siblings; -extern int cpu_sibling_map[]; +extern cpumask_t cpu_sibling_map[]; extern void smp_flush_tlb(void); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); --- diff/include/asm-i386/spinlock.h 2004-03-11 10:20:28.000000000 +0000 +++ source/include/asm-i386/spinlock.h 2004-04-21 10:46:51.743715872 +0100 @@ -43,18 +43,35 @@ typedef struct { #define spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0) #define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x)) -#define spin_lock_string \ - "\n1:\t" \ - "lock ; decb %0\n\t" \ - "js 2f\n" \ - LOCK_SECTION_START("") \ - "2:\t" \ - "rep;nop\n\t" \ - "cmpb $0,%0\n\t" \ - "jle 2b\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END +#ifdef CONFIG_SPINLINE + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + "jmp 3f\n" \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + "3:\t" + +#else /* !CONFIG_SPINLINE */ + + #define spin_lock_string \ + "\n1:\t" \ + "lock ; decb %0\n\t" \ + "js 2f\n" \ + LOCK_SECTION_START("") \ + "2:\t" \ + "rep;nop\n\t" \ + "cmpb $0,%0\n\t" \ + "jle 2b\n\t" \ + "jmp 1b\n" \ + LOCK_SECTION_END + +#endif /* CONFIG_SPINLINE */ /* * This works. Despite all the confusion. 
* (except on PPro SMP or if we are using OOSTORE) @@ -138,6 +155,11 @@ here: */ typedef struct { volatile unsigned int lock; +#ifdef CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned magic; #endif @@ -145,11 +167,19 @@ typedef struct { #define RWLOCK_MAGIC 0xdeaf1eed +#ifdef CONFIG_LOCKMETER +#ifdef CONFIG_DEBUG_SPINLOCK +#define RWLOCK_MAGIC_INIT , 0, RWLOCK_MAGIC +#else +#define RWLOCK_MAGIC_INIT , 0 +#endif +#else /* !CONFIG_LOCKMETER */ #ifdef CONFIG_DEBUG_SPINLOCK #define RWLOCK_MAGIC_INIT , RWLOCK_MAGIC #else #define RWLOCK_MAGIC_INIT /* */ #endif +#endif /* !CONFIG_LOCKMETER */ #define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT } @@ -196,4 +226,60 @@ static inline int _raw_write_trylock(rwl return 0; } +#ifdef CONFIG_LOCKMETER +static inline int _raw_read_trylock(rwlock_t *lock) +{ +/* FIXME -- replace with assembler */ + atomic_t *count = (atomic_t *)lock; + atomic_dec(count); + if (count->counter > 0) + return 1; + atomic_inc(count); + return 0; +} +#endif + +#if defined(CONFIG_LOCKMETER) && defined(CONFIG_HAVE_DEC_LOCK) +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock(spinlock_t *lock); + +/* + * Matches what is in arch/i386/lib/dec_and_lock.c, except this one is + * "static inline" so that the spin_lock(), if actually invoked, is charged + * against the real caller, not against the catch-all atomic_dec_and_lock + */ +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + int counter; + int newcount; + +repeat: + counter = atomic_read(atomic); + newcount = counter-1; + + if (!newcount) + goto slow_path; + + asm volatile("lock; cmpxchgl %1,%2" + :"=a" (newcount) + :"r" (newcount), "m" (atomic->counter), "0" (counter)); + + /* If the above failed, "eax" will have changed */ + if (newcount != counter) + goto repeat; + return 0; + +slow_path: + preempt_disable(); + _metered_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _metered_spin_unlock(lock); + preempt_enable(); + return 0; +} + +#define ATOMIC_DEC_AND_LOCK +#endif + #endif /* __ASM_SPINLOCK_H */ --- diff/include/asm-ia64/pgtable.h 2004-04-21 10:45:35.864251304 +0100 +++ source/include/asm-ia64/pgtable.h 2004-04-21 10:46:51.744715720 +0100 @@ -102,7 +102,7 @@ * can map. */ #define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3)) -#define PMD_SIZE (__IA64_UL(1) << PMD_SHIFT) +#define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) #define PTRS_PER_PMD (__IA64_UL(1) << (PAGE_SHIFT-3)) @@ -469,8 +469,6 @@ extern void hugetlb_free_pgtables(struct struct vm_area_struct * prev, unsigned long start, unsigned long end); #endif -typedef pte_t *pte_addr_t; - /* * IA-64 doesn't have any external MMU info: the page tables contain all the necessary * information. 
However, we use this routine to take care of any (delayed) i-cache --- diff/include/asm-ia64/spinlock.h 2004-04-05 12:57:08.000000000 +0100 +++ source/include/asm-ia64/spinlock.h 2004-04-21 10:46:51.745715568 +0100 @@ -110,8 +110,18 @@ do { \ typedef struct { volatile int read_counter : 31; volatile int write_lock : 1; +#ifdef CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif } rwlock_t; + +#ifdef CONFIG_LOCKMETER +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0 } +#else #define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 } +#endif #define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0) #define rwlock_is_locked(x) (*(volatile int *) (x) != 0) @@ -127,6 +137,48 @@ do { \ } \ } while (0) +#ifdef CONFIG_LOCKMETER +/* + * HACK: This works, but we still have a timing window that affects performance: + * we see that no one owns the Write lock, then someone * else grabs for Write + * lock before we do a read_lock(). + * This means that on rare occasions our read_lock() will stall and spin-wait + * until we acquire for Read, instead of simply returning a trylock failure. + */ +static inline int _raw_read_trylock(rwlock_t *rw) +{ + if (rw->write_lock) { + return 0; + } else { + _raw_read_lock(rw); + return 1; + } +} + +static inline int _raw_write_trylock(rwlock_t *rw) +{ + if (!(rw->write_lock)) { + /* isn't currently write-locked... that looks promising... */ + if (test_and_set_bit(31, rw) == 0) { + /* now it is write-locked by me... */ + if (rw->read_counter) { + /* really read-locked, so release write-lock and fail */ + clear_bit(31, rw); + } else { + /* we've got the write-lock, no read-lockers... success! */ + barrier(); + return 1; + } + + } + } + + /* falls through ... fails to write-lock */ + barrier(); + return 0; +} +#endif + #define _raw_read_unlock(rw) \ do { \ rwlock_t *__read_lock_ptr = (rw); \ @@ -190,4 +242,25 @@ do { \ clear_bit(31, (x)); \ }) +#ifdef CONFIG_LOCKMETER +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock(spinlock_t *lock); + +/* + * Use a less efficient, and inline, atomic_dec_and_lock() if lockmetering + * so we can see the callerPC of whoever is actually doing the spin_lock(). + * Otherwise, all we see is the generic rollup of all locks done by + * atomic_dec_and_lock().
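[The contract this metered variant preserves: atomic_dec_and_lock() decrements the counter and returns 1 with the lock held if and only if the count reached zero, mirroring the lib/dec_and_lock.c fast path/slow path split. A standalone C11 sketch of that contract, with hypothetical sketch_* names standing in for spinlock_t and the metered lock calls.]

#include <stdatomic.h>

typedef struct { atomic_int locked; } sketch_lock_t;

static void sketch_lock(sketch_lock_t *l)
{
	while (atomic_exchange_explicit(&l->locked, 1, memory_order_acquire))
		;
}

static void sketch_unlock(sketch_lock_t *l)
{
	atomic_store_explicit(&l->locked, 0, memory_order_release);
}

/* Decrement the counter; return 1 with the lock held iff the count
 * reached zero, else return 0 with the lock not held. */
static int sketch_atomic_dec_and_lock(atomic_int *counter, sketch_lock_t *lock)
{
	int old = atomic_load(counter);

	/* fast path: while the count cannot reach zero, a lock-free
	 * compare-and-swap decrement suffices */
	while (old > 1)
		if (atomic_compare_exchange_weak(counter, &old, old - 1))
			return 0;

	/* slow path: this decrement may hit zero, so serialize the
	 * final release under the lock */
	sketch_lock(lock);
	if (atomic_fetch_sub(counter, 1) == 1)
		return 1;	/* counter is now zero; caller holds lock */
	sketch_unlock(lock);
	return 0;
}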
+ */ +static inline int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + _metered_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + _metered_spin_unlock(lock); + return 0; +} +#define ATOMIC_DEC_AND_LOCK +#endif + #endif /* _ASM_IA64_SPINLOCK_H */ --- diff/include/asm-ia64/tlb.h 2004-03-11 10:20:28.000000000 +0000 +++ source/include/asm-ia64/tlb.h 2004-04-21 10:46:51.745715568 +0100 @@ -211,6 +211,8 @@ __tlb_remove_tlb_entry (struct mmu_gathe tlb->end_addr = address + PAGE_SIZE; } +#define tlb_migrate_prepare(mm) flush_tlb_mm(mm) + #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) --- diff/include/asm-ia64/unistd.h 2004-04-05 12:57:08.000000000 +0100 +++ source/include/asm-ia64/unistd.h 2004-04-21 10:46:51.745715568 +0100 @@ -248,9 +248,9 @@ #define __NR_clock_nanosleep 1256 #define __NR_fstatfs64 1257 #define __NR_statfs64 1258 -#define __NR_reserved1 1259 /* reserved for NUMA interface */ -#define __NR_reserved2 1260 /* reserved for NUMA interface */ -#define __NR_reserved3 1261 /* reserved for NUMA interface */ +#define __NR_mbind 1259 +#define __NR_get_mempolicy 1260 +#define __NR_set_mempolicy 1261 #ifdef __KERNEL__ --- diff/include/asm-m68k/pgtable.h 2004-02-09 10:36:12.000000000 +0000 +++ source/include/asm-m68k/pgtable.h 2004-04-21 10:46:51.747715264 +0100 @@ -168,8 +168,6 @@ static inline void update_mmu_cache(stru ? (__pgprot((pgprot_val(prot) & _CACHEMASK040) | _PAGE_NOCACHE_S)) \ : (prot))) -typedef pte_t *pte_addr_t; - #endif /* !__ASSEMBLY__ */ /* --- diff/include/asm-m68k/setup.h 2002-10-16 04:28:26.000000000 +0100 +++ source/include/asm-m68k/setup.h 2004-04-21 10:46:51.747715264 +0100 @@ -357,6 +357,7 @@ extern int m68k_is040or060; #define NUM_MEMINFO 4 #define CL_SIZE 256 +#define COMMAND_LINE_SIZE CL_SIZE #ifndef __ASSEMBLY__ extern int m68k_num_memory; /* # of memory blocks found (and used) */ --- diff/include/asm-m68knommu/pgtable.h 2003-06-09 14:18:20.000000000 +0100 +++ source/include/asm-m68knommu/pgtable.h 2004-04-21 10:46:51.746715416 +0100 @@ -11,8 +11,6 @@ #include #include -typedef pte_t *pte_addr_t; - /* * Trivial page table functions. */ --- diff/include/asm-m68knommu/setup.h 2002-11-11 11:09:43.000000000 +0000 +++ source/include/asm-m68knommu/setup.h 2004-04-21 10:46:51.746715416 +0100 @@ -1 +1,5 @@ #include + +/* We have a bigger command line buffer. 
*/ +#undef COMMAND_LINE_SIZE +#define COMMAND_LINE_SIZE 512 --- diff/include/asm-mips/bootinfo.h 2004-04-21 10:45:35.870250392 +0100 +++ source/include/asm-mips/bootinfo.h 2004-04-21 10:46:51.748715112 +0100 @@ -12,6 +12,7 @@ #define _ASM_BOOTINFO_H #include +#include /* * The MACH_GROUP_ IDs are the equivalent to PCI vendor IDs; the remaining @@ -209,7 +210,7 @@ #define MACH_GROUP_TITAN 22 /* PMC-Sierra Titan */ #define MACH_TITAN_YOSEMITE 1 /* PMC-Sierra Yosemite */ -#define CL_SIZE (256) +#define CL_SIZE COMMAND_LINE_SIZE const char *get_system_type(void); --- diff/include/asm-mips/kmap_types.h 2003-07-08 09:55:19.000000000 +0100 +++ source/include/asm-mips/kmap_types.h 2004-04-21 10:46:51.748715112 +0100 @@ -19,12 +19,11 @@ D(5) KM_BIO_SRC_IRQ, D(6) KM_BIO_DST_IRQ, D(7) KM_PTE0, D(8) KM_PTE1, -D(9) KM_PTE2, -D(10) KM_IRQ0, -D(11) KM_IRQ1, -D(12) KM_SOFTIRQ0, -D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(9) KM_IRQ0, +D(10) KM_IRQ1, +D(11) KM_SOFTIRQ0, +D(12) KM_SOFTIRQ1, +D(13) KM_TYPE_NR }; #undef D --- diff/include/asm-mips/pgtable-32.h 2004-03-11 10:20:28.000000000 +0000 +++ source/include/asm-mips/pgtable-32.h 2004-04-21 10:46:51.749714960 +0100 @@ -216,10 +216,4 @@ static inline pmd_t *pmd_offset(pgd_t *d #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#ifdef CONFIG_64BIT_PHYS_ADDR -typedef u64 pte_addr_t; -#else -typedef pte_t *pte_addr_t; -#endif - #endif /* _ASM_PGTABLE_32_H */ --- diff/include/asm-mips/pgtable-64.h 2004-04-21 10:45:35.887247808 +0100 +++ source/include/asm-mips/pgtable-64.h 2004-04-21 10:46:51.749714960 +0100 @@ -214,8 +214,6 @@ static inline pte_t mk_swap_pte(unsigned #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -typedef pte_t *pte_addr_t; - /* * Used for the b0rked handling of kernel pagetables on the 64-bit kernel. */ --- diff/include/asm-mips/spinlock.h 2004-04-21 10:45:35.892247048 +0100 +++ source/include/asm-mips/spinlock.h 2004-04-21 10:46:51.750714808 +0100 @@ -91,9 +91,18 @@ static inline unsigned int _raw_spin_try typedef struct { volatile unsigned int lock; +#ifdef CONFIG_LOCKMETER + /* required for LOCKMETER since all bits in lock are used */ + /* and we need this storage for CPU and lock INDEX */ + unsigned lockmeter_magic; +#endif } rwlock_t; +#ifdef CONFIG_LOCKMETER +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0 } +#else #define RW_LOCK_UNLOCKED (rwlock_t) { 0 } +#endif #define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0) --- diff/include/asm-parisc/pgtable.h 2004-02-09 10:36:12.000000000 +0000 +++ source/include/asm-parisc/pgtable.h 2004-04-21 10:46:51.750714808 +0100 @@ -450,8 +450,6 @@ static inline void ptep_mkdirty(pte_t *p #define pte_same(A,B) (pte_val(A) == pte_val(B)) -typedef pte_t *pte_addr_t; - #endif /* !__ASSEMBLY__ */ #define io_remap_page_range remap_page_range --- diff/include/asm-parisc/setup.h 2002-10-16 04:28:25.000000000 +0100 +++ source/include/asm-parisc/setup.h 2004-04-21 10:46:51.751714656 +0100 @@ -1,10 +1,6 @@ -/* - * Just a place holder. 
We don't want to have to test x86 before - * we include stuff - */ +#ifndef _PARISC_SETUP_H +#define _PARISC_SETUP_H -#ifndef _i386_SETUP_H -#define _i386_SETUP_H +#define COMMAND_LINE_SIZE 1024 - -#endif /* _i386_SETUP_H */ +#endif /* _PARISC_SETUP_H */ --- diff/include/asm-ppc/machdep.h 2004-04-05 12:57:08.000000000 +0100 +++ source/include/asm-ppc/machdep.h 2004-04-21 10:46:51.753714352 +0100 @@ -106,7 +106,6 @@ struct machdep_calls { }; extern struct machdep_calls ppc_md; -#define COMMAND_LINE_SIZE 512 extern char cmd_line[COMMAND_LINE_SIZE]; extern void setup_pci_ptrs(void); --- diff/include/asm-ppc/pgtable.h 2004-02-18 08:54:12.000000000 +0000 +++ source/include/asm-ppc/pgtable.h 2004-04-21 10:46:51.754714200 +0100 @@ -670,8 +670,6 @@ extern void kernel_set_cachemode (unsign */ #define pgtable_cache_init() do { } while (0) -typedef pte_t *pte_addr_t; - #endif /* !__ASSEMBLY__ */ #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG --- diff/include/asm-ppc/setup.h 2002-10-16 04:29:02.000000000 +0100 +++ source/include/asm-ppc/setup.h 2004-04-21 10:46:51.754714200 +0100 @@ -6,6 +6,9 @@ #define m68k_memory memory #include +/* We have a bigger command line buffer. */ +#undef COMMAND_LINE_SIZE +#define COMMAND_LINE_SIZE 512 #endif /* _PPC_SETUP_H */ #endif /* __KERNEL__ */ --- diff/include/asm-ppc64/machdep.h 2004-04-21 10:45:35.902245528 +0100 +++ source/include/asm-ppc64/machdep.h 2004-04-21 10:46:51.751714656 +0100 @@ -11,6 +11,7 @@ #include #include +#include #include struct pt_regs; @@ -112,9 +113,7 @@ struct machdep_calls { }; extern struct machdep_calls ppc_md; -#define COMMAND_LINE_SIZE 512 extern char cmd_line[COMMAND_LINE_SIZE]; -extern char saved_command_line[COMMAND_LINE_SIZE]; /* Functions to produce codes on the leds. * The SRC code should be unique for the message category and should --- diff/include/asm-ppc64/pgalloc.h 2004-02-09 10:36:12.000000000 +0000 +++ source/include/asm-ppc64/pgalloc.h 2004-04-21 10:46:51.751714656 +0100 @@ -48,28 +48,43 @@ pmd_free(pmd_t *pmd) pmd_populate_kernel(mm, pmd, page_address(pte_page)) static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + pte_t *pte; + pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + if (pte) { + struct page *ptepage = virt_to_page(pte); + ptepage->mapping = (void *) mm; + ptepage->index = address & PMD_MASK; + } + return pte; } static inline struct page * pte_alloc_one(struct mm_struct *mm, unsigned long address) { - pte_t *pte = pte_alloc_one_kernel(mm, address); - - if (pte) - return virt_to_page(pte); - + pte_t *pte; + pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + if (pte) { + struct page *ptepage = virt_to_page(pte); + ptepage->mapping = (void *) mm; + ptepage->index = address & PMD_MASK; + return ptepage; + } return NULL; } static inline void pte_free_kernel(pte_t *pte) { + virt_to_page(pte)->mapping = NULL; kmem_cache_free(zero_cache, pte); } -#define pte_free(pte_page) pte_free_kernel(page_address(pte_page)) +static inline void pte_free(struct page *ptepage) +{ + ptepage->mapping = NULL; + kmem_cache_free(zero_cache, page_address(ptepage)); +} struct pte_freelist_batch { @@ -86,33 +101,7 @@ extern void pte_free_submit(struct pte_f DECLARE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); -static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage) -{ - /* This is safe as we are holding page_table_lock 
*/ - cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); - struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); - - if (atomic_read(&tlb->mm->mm_users) < 2 || - cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) { - pte_free(ptepage); - return; - } - - if (*batchp == NULL) { - *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); - if (*batchp == NULL) { - pte_free_now(ptepage); - return; - } - (*batchp)->index = 0; - } - (*batchp)->pages[(*batchp)->index++] = ptepage; - if ((*batchp)->index == PTE_FREELIST_SIZE) { - pte_free_submit(*batchp); - *batchp = NULL; - } -} - +void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage); #define __pmd_free_tlb(tlb, pmd) __pte_free_tlb(tlb, virt_to_page(pmd)) #define check_pgt_cache() do { } while (0) --- diff/include/asm-ppc64/pgtable.h 2004-04-21 10:45:35.905245072 +0100 +++ source/include/asm-ppc64/pgtable.h 2004-04-21 10:46:51.752714504 +0100 @@ -471,8 +471,6 @@ extern struct vm_struct * im_get_area(un int region_type); unsigned long im_free(void *addr); -typedef pte_t *pte_addr_t; - long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long va, unsigned long prpn, int secondary, unsigned long hpteflags, --- diff/include/asm-ppc64/processor.h 2004-04-21 10:45:35.907244768 +0100 +++ source/include/asm-ppc64/processor.h 2004-04-21 10:46:51.752714504 +0100 @@ -623,6 +623,11 @@ static inline void prefetchw(const void #define spin_lock_prefetch(x) prefetchw(x) +#ifdef CONFIG_SCHED_SMT +#define ARCH_HAS_SCHED_DOMAIN +#define ARCH_HAS_SCHED_WAKE_BALANCE +#endif + #endif /* ASSEMBLY */ #endif /* __ASM_PPC64_PROCESSOR_H */ --- diff/include/asm-ppc64/setup.h 2002-10-16 04:27:14.000000000 +0100 +++ source/include/asm-ppc64/setup.h 2004-04-21 10:46:51.753714352 +0100 @@ -1,6 +1,6 @@ #ifndef _PPC_SETUP_H #define _PPC_SETUP_H -/* This is a place holder include */ +#define COMMAND_LINE_SIZE 512 #endif /* _PPC_SETUP_H */ --- diff/include/asm-s390/pgtable.h 2004-04-05 12:57:08.000000000 +0100 +++ source/include/asm-s390/pgtable.h 2004-04-21 10:46:51.755714048 +0100 @@ -760,8 +760,6 @@ extern inline pte_t mk_swap_pte(unsigned #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -typedef pte_t *pte_addr_t; - #ifndef __s390x__ # define PTE_FILE_MAX_BITS 26 #else /* __s390x__ */ --- diff/include/asm-sh/pgtable.h 2004-04-05 12:57:08.000000000 +0100 +++ source/include/asm-sh/pgtable.h 2004-04-21 10:46:51.755714048 +0100 @@ -274,8 +274,6 @@ extern void update_mmu_cache(struct vm_a #define pte_same(A,B) (pte_val(A) == pte_val(B)) -typedef pte_t *pte_addr_t; - #endif /* !__ASSEMBLY__ */ #define kern_addr_valid(addr) (1) --- diff/include/asm-sparc/kmap_types.h 2004-01-19 10:22:59.000000000 +0000 +++ source/include/asm-sparc/kmap_types.h 2004-04-21 10:46:51.758713592 +0100 @@ -11,7 +11,6 @@ enum km_type { KM_BIO_DST_IRQ, KM_PTE0, KM_PTE1, - KM_PTE2, KM_IRQ0, KM_IRQ1, KM_SOFTIRQ0, --- diff/include/asm-sparc/pgtable.h 2004-04-05 12:57:08.000000000 +0100 +++ source/include/asm-sparc/pgtable.h 2004-04-21 10:46:51.758713592 +0100 @@ -491,8 +491,6 @@ extern int io_remap_page_range(struct vm #include -typedef pte_t *pte_addr_t; - #endif /* !(__ASSEMBLY__) */ /* We provide our own get_unmapped_area to cope with VA holes for userland */ --- diff/include/asm-sparc/setup.h 2004-01-19 10:22:59.000000000 +0000 +++ source/include/asm-sparc/setup.h 2004-04-21 10:46:51.759713440 +0100 @@ -5,5 +5,6 @@ #ifndef _SPARC_SETUP_H #define _SPARC_SETUP_H +#define 
COMMAND_LINE_SIZE 256 #endif /* _SPARC_SETUP_H */ --- diff/include/asm-sparc64/pgtable.h 2004-01-19 10:22:59.000000000 +0000 +++ source/include/asm-sparc64/pgtable.h 2004-04-21 10:46:51.757713744 +0100 @@ -384,8 +384,6 @@ extern unsigned long get_fb_unmapped_are extern void check_pgt_cache(void); -typedef pte_t *pte_addr_t; - #endif /* !(__ASSEMBLY__) */ #endif /* !(_SPARC64_PGTABLE_H) */ --- diff/include/asm-sparc64/setup.h 2004-01-19 10:22:59.000000000 +0000 +++ source/include/asm-sparc64/setup.h 2004-04-21 10:46:51.757713744 +0100 @@ -5,5 +5,6 @@ #ifndef _SPARC64_SETUP_H #define _SPARC64_SETUP_H +#define COMMAND_LINE_SIZE 256 #endif /* _SPARC64_SETUP_H */ --- diff/include/asm-sparc64/spinlock.h 2004-03-11 10:20:28.000000000 +0000 +++ source/include/asm-sparc64/spinlock.h 2004-04-21 10:46:51.758713592 +0100 @@ -31,15 +31,23 @@ #ifndef CONFIG_DEBUG_SPINLOCK -typedef unsigned char spinlock_t; -#define SPIN_LOCK_UNLOCKED 0 +typedef struct { + unsigned char lock; + unsigned int index; +} spinlock_t; -#define spin_lock_init(lock) (*((unsigned char *)(lock)) = 0) -#define spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0) +#ifdef CONFIG_LOCKMETER +#define SPIN_LOCK_UNLOCKED (spinlock_t) {0, 0} +#else +#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 } +#endif -#define spin_unlock_wait(lock) \ +#define spin_lock_init(__lock) do { *(__lock) = SPIN_LOCK_UNLOCKED; } while(0) +#define spin_is_locked(__lock) (*((volatile unsigned char *)(&((__lock)->lock))) != 0) + +#define spin_unlock_wait(__lock) \ do { membar("#LoadLoad"); \ -} while(*((volatile unsigned char *)lock)) +} while(*((volatile unsigned char *)(&(((spinlock_t *)__lock)->lock)))) static __inline__ void _raw_spin_lock(spinlock_t *lock) { @@ -110,17 +118,31 @@ extern int _spin_trylock (spinlock_t *lo #ifndef CONFIG_DEBUG_SPINLOCK -typedef unsigned int rwlock_t; -#define RW_LOCK_UNLOCKED 0 -#define rwlock_init(lp) do { *(lp) = RW_LOCK_UNLOCKED; } while(0) -#define rwlock_is_locked(x) (*(x) != RW_LOCK_UNLOCKED) +#ifdef CONFIG_LOCKMETER +typedef struct { + unsigned int lock; + unsigned int index; + unsigned int cpu; +} rwlock_t; +#define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0xff } +#else +typedef struct { + unsigned int lock; +} rwlock_t; +#define RW_LOCK_UNLOCKED (rwlock_t) { 0 } +#endif + +#define rwlock_init(lp) do { *(lp) = RW_LOCK_UNLOCKED; } while(0) +#define rwlock_is_locked(x) ((x)->lock != 0) +extern int __read_trylock(rwlock_t *); extern void __read_lock(rwlock_t *); extern void __read_unlock(rwlock_t *); extern void __write_lock(rwlock_t *); extern void __write_unlock(rwlock_t *); extern int __write_trylock(rwlock_t *); +#define _raw_read_trylock(p) __read_trylock(p) #define _raw_read_lock(p) __read_lock(p) #define _raw_read_unlock(p) __read_unlock(p) #define _raw_write_lock(p) __write_lock(p) --- diff/include/asm-um/pgtable.h 2003-10-09 09:47:34.000000000 +0100 +++ source/include/asm-um/pgtable.h 2004-04-21 10:46:51.759713440 +0100 @@ -384,18 +384,6 @@ static inline pmd_t * pmd_offset(pgd_t * #define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0) #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G) -typedef u32 pte_addr_t; -#endif - -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G) -typedef u64 pte_addr_t; -#endif - -#if !defined(CONFIG_HIGHPTE) -typedef pte_t *pte_addr_t; -#endif - #define update_mmu_cache(vma,address,pte) do ; while (0) /* Encode and de-code a swap entry */ --- diff/include/asm-v850/pgtable.h 2002-11-11 11:09:43.000000000 +0000 +++ 
source/include/asm-v850/pgtable.h 2004-04-21 10:46:51.760713288 +0100 @@ -5,8 +5,6 @@ #include -typedef pte_t *pte_addr_t; - #define pgd_present(pgd) (1) /* pages are always present on NO_MM */ #define pgd_none(pgd) (0) #define pgd_bad(pgd) (0) --- diff/include/asm-x86_64/bootsetup.h 2004-04-21 10:45:35.919242944 +0100 +++ source/include/asm-x86_64/bootsetup.h 2004-04-21 10:46:51.761713136 +0100 @@ -30,7 +30,6 @@ extern char x86_boot_params[2048]; #define EDD_NR (*(unsigned char *) (PARAM+EDDNR)) #define EDD_BUF ((struct edd_info *) (PARAM+EDDBUF)) #define COMMAND_LINE saved_command_line -#define COMMAND_LINE_SIZE 256 #define RAMDISK_IMAGE_START_MASK 0x07FF #define RAMDISK_PROMPT_FLAG 0x8000 --- diff/include/asm-x86_64/pgtable.h 2004-03-11 10:20:28.000000000 +0000 +++ source/include/asm-x86_64/pgtable.h 2004-04-21 10:46:51.762712984 +0100 @@ -390,8 +390,6 @@ extern inline pte_t pte_modify(pte_t pte #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -typedef pte_t *pte_addr_t; - #endif /* !__ASSEMBLY__ */ extern int kern_addr_valid(unsigned long addr); --- diff/include/asm-x86_64/setup.h 2002-10-16 04:29:06.000000000 +0100 +++ source/include/asm-x86_64/setup.h 2004-04-21 10:46:51.762712984 +0100 @@ -1,10 +1,6 @@ -/* - * Just a place holder. We don't want to have to test x86 before - * we include stuff - */ - #ifndef _x8664_SETUP_H #define _x8664_SETUP_H +#define COMMAND_LINE_SIZE 256 #endif --- diff/include/asm-x86_64/unistd.h 2004-04-21 10:45:35.924242184 +0100 +++ source/include/asm-x86_64/unistd.h 2004-04-21 10:46:51.763712832 +0100 @@ -534,7 +534,7 @@ __SYSCALL(__NR_utimes, sys_utimes) __SYSCALL(__NR_vserver, sys_ni_syscall) #define __NR_vserver 236 __SYSCALL(__NR_vserver, sys_ni_syscall) -#define __NR_mbind 237 +#define __NR_mbind 237 __SYSCALL(__NR_mbind, sys_ni_syscall) #define __NR_set_mempolicy 238 __SYSCALL(__NR_set_mempolicy, sys_ni_syscall) @@ -546,7 +546,7 @@ __SYSCALL(__NR_mq_open, sys_mq_open) __SYSCALL(__NR_mq_unlink, sys_mq_unlink) #define __NR_mq_timedsend 242 __SYSCALL(__NR_mq_timedsend, sys_mq_timedsend) -#define __NR_mq_timedreceive 243 +#define __NR_mq_timedreceive 243 __SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive) #define __NR_mq_notify 244 __SYSCALL(__NR_mq_notify, sys_mq_notify) --- diff/include/linux/auto_fs4.h 2002-10-16 04:27:11.000000000 +0100 +++ source/include/linux/auto_fs4.h 2004-04-21 10:46:51.763712832 +0100 @@ -23,6 +23,12 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 4 +#define AUTOFS_PROTO_SUBVERSION 5 + +/* Mask for expire behaviour */ +#define AUTOFS_EXP_IMMEDIATE 1 +#define AUTOFS_EXP_LEAVES 2 + /* New message type */ #define autofs_ptype_expire_multi 2 /* Expire entry (umount request) */ @@ -41,7 +47,11 @@ union autofs_packet_union { struct autofs_packet_expire_multi expire_multi; }; -#define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93,0x66,int) +#define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93,0x66,int) +#define AUTOFS_IOC_PROTOSUBVER _IOR(0x93,0x67,int) +#define AUTOFS_IOC_ASKREGHOST _IOR(0x93,0x68,int) +#define AUTOFS_IOC_TOGGLEREGHOST _IOR(0x93,0x69,int) +#define AUTOFS_IOC_ASKUMOUNT _IOR(0x93,0x70,int) #endif /* _LINUX_AUTO_FS4_H */ --- diff/include/linux/binfmts.h 2004-04-21 10:45:35.926241880 +0100 +++ source/include/linux/binfmts.h 2004-04-21 10:46:51.764712680 +0100 @@ -35,9 +35,13 @@ struct linux_binprm{ char * interp; /* Name of the binary really executed. 
Most of the time same as filename, but could be different for binfmt_{misc,script} */ + unsigned long interp_flags; unsigned long loader, exec; }; +#define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 +#define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT) + /* * This structure defines the functions that are used to load the binary formats that * linux accepts. --- diff/include/linux/bitmap.h 2004-04-21 10:45:35.927241728 +0100 +++ source/include/linux/bitmap.h 2004-04-21 10:46:51.764712680 +0100 @@ -29,7 +29,8 @@ static inline void bitmap_fill(unsigned static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, int bits) { - memcpy(dst, src, BITS_TO_LONGS(bits)*sizeof(unsigned long)); + int len = BITS_TO_LONGS(bits)*sizeof(unsigned long); + memcpy(dst, src, len); } void bitmap_shift_right(unsigned long *dst, --- diff/include/linux/buffer_head.h 2004-04-21 10:45:35.928241576 +0100 +++ source/include/linux/buffer_head.h 2004-04-21 10:46:51.764712680 +0100 @@ -157,6 +157,8 @@ void __wait_on_buffer(struct buffer_head wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); void wake_up_buffer(struct buffer_head *bh); int fsync_bdev(struct block_device *); +struct super_block *freeze_bdev(struct block_device *); +void thaw_bdev(struct block_device *, struct super_block *); int fsync_super(struct super_block *); int fsync_no_super(struct block_device *); struct buffer_head *__find_get_block(struct block_device *, sector_t, int); --- diff/include/linux/compat.h 2004-04-21 10:45:35.928241576 +0100 +++ source/include/linux/compat.h 2004-04-21 10:46:51.765712528 +0100 @@ -117,5 +117,18 @@ long compat_sys_shmat(int first, int sec long compat_sys_shmctl(int first, int second, void __user *uptr); long compat_sys_semtimedop(int semid, struct sembuf __user *tsems, unsigned nsems, const struct compat_timespec __user *timeout); + +asmlinkage ssize_t compat_sys_readv(unsigned long fd, + const struct compat_iovec __user *vec, unsigned long vlen); +asmlinkage ssize_t compat_sys_writev(unsigned long fd, + const struct compat_iovec __user *vec, unsigned long vlen); + +int compat_do_execve(char * filename, compat_uptr_t __user *argv, + compat_uptr_t __user *envp, struct pt_regs * regs); + +asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timeval __user *tvp); + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ --- diff/include/linux/compat_ioctl.h 2004-04-21 10:45:35.929241424 +0100 +++ source/include/linux/compat_ioctl.h 2004-04-21 10:46:51.765712528 +0100 @@ -597,7 +597,7 @@ COMPATIBLE_IOCTL(RNDGETPOOL) COMPATIBLE_IOCTL(RNDADDENTROPY) COMPATIBLE_IOCTL(RNDZAPENTCNT) COMPATIBLE_IOCTL(RNDCLEARPOOL) -/* Bluetooth ioctls */ +/* Bluetooth */ COMPATIBLE_IOCTL(HCIDEVUP) COMPATIBLE_IOCTL(HCIDEVDOWN) COMPATIBLE_IOCTL(HCIDEVRESET) @@ -631,6 +631,20 @@ COMPATIBLE_IOCTL(CMTPCONNADD) COMPATIBLE_IOCTL(CMTPCONNDEL) COMPATIBLE_IOCTL(CMTPGETCONNLIST) COMPATIBLE_IOCTL(CMTPGETCONNINFO) +/* CAPI */ +COMPATIBLE_IOCTL(CAPI_REGISTER) +COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER) +COMPATIBLE_IOCTL(CAPI_GET_VERSION) +COMPATIBLE_IOCTL(CAPI_GET_SERIAL) +COMPATIBLE_IOCTL(CAPI_GET_PROFILE) +COMPATIBLE_IOCTL(CAPI_MANUFACTURER_CMD) +COMPATIBLE_IOCTL(CAPI_GET_ERRCODE) +COMPATIBLE_IOCTL(CAPI_INSTALLED) +COMPATIBLE_IOCTL(CAPI_GET_FLAGS) +COMPATIBLE_IOCTL(CAPI_SET_FLAGS) +COMPATIBLE_IOCTL(CAPI_CLR_FLAGS) +COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT) +COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT) /* Misc. 
*/ COMPATIBLE_IOCTL(0x41545900) /* ATYIO_CLKR */ COMPATIBLE_IOCTL(0x41545901) /* ATYIO_CLKW */ --- diff/include/linux/compiler-gcc.h 2003-10-09 09:47:17.000000000 +0100 +++ source/include/linux/compiler-gcc.h 2004-04-21 10:46:51.766712376 +0100 @@ -13,5 +13,5 @@ shouldn't recognize the original var, and make assumptions about it */ #define RELOC_HIDE(ptr, off) \ ({ unsigned long __ptr; \ - __asm__ ("" : "=g"(__ptr) : "0"(ptr)); \ + __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ (typeof(ptr)) (__ptr + (off)); }) --- diff/include/linux/config.h 2003-10-09 09:47:17.000000000 +0100 +++ source/include/linux/config.h 2004-04-21 10:46:51.766712376 +0100 @@ -2,5 +2,8 @@ #define _LINUX_CONFIG_H #include +#ifdef CONFIG_X86 +#include +#endif #endif --- diff/include/linux/delay.h 2003-08-20 14:16:34.000000000 +0100 +++ source/include/linux/delay.h 2004-04-21 10:46:51.766712376 +0100 @@ -10,7 +10,7 @@ extern unsigned long loops_per_jiffy; #include - +#include /* * Using udelay() for intervals greater than a few milliseconds can * risk overflow for high loops_per_jiffy (high bogomips) machines. The @@ -25,14 +25,13 @@ extern unsigned long loops_per_jiffy; #define MAX_UDELAY_MS 5 #endif -#ifdef notdef -#define mdelay(n) (\ - {unsigned long __ms=(n); while (__ms--) udelay(1000);}) -#else -#define mdelay(n) (\ - (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \ - ({unsigned long __ms=(n); while (__ms--) udelay(1000);})) -#endif +#define mdelay(n) ( \ + { \ + static int warned=0; \ + unsigned long __ms=(n); \ + WARN_ON(in_irq() && !(warned++)); \ + while (__ms--) udelay(1000); \ + }) #ifndef ndelay #define ndelay(x) udelay(((x)+999)/1000) --- diff/include/linux/ext3_fs.h 2003-08-20 14:16:34.000000000 +0100 +++ source/include/linux/ext3_fs.h 2004-04-21 10:46:51.769711920 +0100 @@ -33,11 +33,11 @@ struct statfs; #undef EXT3FS_DEBUG /* - * Define EXT3_PREALLOCATE to preallocate data blocks for expanding files + * Define EXT3_RESERVATION to reserve data blocks for expanding files */ -#undef EXT3_PREALLOCATE /* @@@ Fix this! 
*/ -#define EXT3_DEFAULT_PREALLOC_BLOCKS 8 - +#define EXT3_RESERVATION +#define EXT3_DEFAULT_RESERVE_BLOCKS 8 +#define EXT3_MAX_RESERVE_BLOCKS 1024 /* * Always enable hashed directories */ @@ -208,6 +208,10 @@ struct ext3_group_desc #ifdef CONFIG_JBD_DEBUG #define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) #endif +#ifdef EXT3_RESERVATION +#define EXT3_IOC_GETRSVSZ _IOR('r', 1, long) +#define EXT3_IOC_SETRSVSZ _IOW('r', 2, long) +#endif /* * Structure of an inode on the disk @@ -306,24 +310,25 @@ struct ext3_inode { /* * Mount flags */ -#define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */ -#define EXT3_MOUNT_OLDALLOC 0x0002 /* Don't use the new Orlov allocator */ -#define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */ -#define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */ -#define EXT3_MOUNT_ERRORS_CONT 0x0010 /* Continue on errors */ -#define EXT3_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ -#define EXT3_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ -#define EXT3_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */ -#define EXT3_MOUNT_NOLOAD 0x0100 /* Don't use existing journal*/ -#define EXT3_MOUNT_ABORT 0x0200 /* Fatal error detected */ -#define EXT3_MOUNT_DATA_FLAGS 0x0C00 /* Mode for data writes: */ - #define EXT3_MOUNT_JOURNAL_DATA 0x0400 /* Write data to journal */ - #define EXT3_MOUNT_ORDERED_DATA 0x0800 /* Flush data before commit */ - #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ -#define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ -#define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ -#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ -#define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ +#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */ +#define EXT3_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ +#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */ +#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H @@ -680,8 +685,7 @@ struct dir_private_info { /* balloc.c */ extern int ext3_bg_has_super(struct super_block *sb, int group); extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); -extern int ext3_new_block (handle_t *, struct inode *, unsigned long, - __u32 *, __u32 *, int *); +extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); extern void ext3_free_blocks (handle_t 
*, struct inode *, unsigned long, unsigned long); extern unsigned long ext3_count_free_blocks (struct super_block *); @@ -728,6 +732,7 @@ extern void ext3_put_inode (struct inode extern void ext3_delete_inode (struct inode *); extern int ext3_sync_inode (handle_t *, struct inode *); extern void ext3_discard_prealloc (struct inode *); +extern void ext3_discard_reservation (struct inode *); extern void ext3_dirty_inode(struct inode *); extern int ext3_change_inode_journal_flag(struct inode *, int); extern void ext3_truncate (struct inode *); --- diff/include/linux/ext3_fs_i.h 2003-10-09 09:47:34.000000000 +0100 +++ source/include/linux/ext3_fs_i.h 2004-04-21 10:46:51.769711920 +0100 @@ -18,8 +18,16 @@ #include +struct reserve_window { + struct list_head rsv_list; + __u32 rsv_start; + __u32 rsv_end; + atomic_t rsv_goal_size; + __u32 rsv_alloc_hit; +}; + /* - * second extended file system inode data in memory + * third extended file system inode data in memory */ struct ext3_inode_info { __u32 i_data[15]; @@ -57,10 +65,9 @@ struct ext3_inode_info { * allocation when we detect linearly ascending requests. */ __u32 i_next_alloc_goal; -#ifdef EXT3_PREALLOCATE - __u32 i_prealloc_block; - __u32 i_prealloc_count; -#endif + /* block reservation window */ + struct reserve_window i_rsv_window; + __u32 i_dir_start_lookup; #ifdef CONFIG_EXT3_FS_XATTR /* --- diff/include/linux/ext3_fs_sb.h 2004-04-21 10:45:35.934240664 +0100 +++ source/include/linux/ext3_fs_sb.h 2004-04-21 10:46:51.769711920 +0100 @@ -59,6 +59,10 @@ struct ext3_sb_info { struct percpu_counter s_dirs_counter; struct blockgroup_lock s_blockgroup_lock; + /* head of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct reserve_window s_rsv_window_head; + /* Journaling */ struct inode * s_journal_inode; struct journal_s * s_journal; --- diff/include/linux/fs.h 2004-04-21 10:45:35.936240360 +0100 +++ source/include/linux/fs.h 2004-04-21 10:46:51.770711768 +0100 @@ -331,7 +331,7 @@ struct address_space { struct address_space_operations *a_ops; /* methods */ struct list_head i_mmap; /* list of private mappings */ struct list_head i_mmap_shared; /* list of shared mappings */ - struct semaphore i_shared_sem; /* protect both above lists */ + spinlock_t i_shared_lock; /* protect both above lists */ atomic_t truncate_count; /* Cover race condition with truncate */ unsigned long flags; /* error bits/gfp mask */ struct backing_dev_info *backing_dev_info; /* device readahead, etc */ @@ -345,6 +345,7 @@ struct block_device { struct inode * bd_inode; /* will die */ int bd_openers; struct semaphore bd_sem; /* open/close mutex */ + struct semaphore bd_mount_sem; /* mount mutex */ struct list_head bd_inodes; void * bd_holder; int bd_holders; @@ -407,6 +408,7 @@ static inline int mapping_writably_mappe struct inode { struct hlist_node i_hash; struct list_head i_list; + struct list_head i_sb_list; struct list_head i_dentry; unsigned long i_ino; atomic_t i_count; @@ -740,6 +742,7 @@ struct super_block { atomic_t s_active; void *s_security; + struct list_head s_inodes; /* all inodes */ struct list_head s_dirty; /* dirty inodes */ struct list_head s_io; /* parked for writeback */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ @@ -749,6 +752,9 @@ struct super_block { struct list_head s_instances; struct quota_info s_dquot; /* Diskquota specific options */ + int s_frozen; + wait_queue_head_t s_wait_unfrozen; + char s_id[32]; /* Informational name */ void *s_fs_info; /* Filesystem private info */ @@ -761,6 +767,18 @@ 
struct super_block { }; /* + * Snapshotting support. + */ +enum { + SB_UNFROZEN = 0, + SB_FREEZE_WRITE = 1, + SB_FREEZE_TRANS = 2, +}; + +#define vfs_check_frozen(sb, level) \ + wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) + +/* * Superblock locking. */ static inline void lock_super(struct super_block * sb) --- diff/include/linux/gfp.h 2004-04-21 10:45:35.936240360 +0100 +++ source/include/linux/gfp.h 2004-04-21 10:46:51.771711616 +0100 @@ -4,6 +4,8 @@ #include #include #include +#include + /* * GFP bitmasks.. */ @@ -69,19 +71,43 @@ * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets * optimized to &contig_page_data at compile-time. */ -extern struct page * FASTCALL(__alloc_pages(unsigned int, unsigned int, struct zonelist *)); -static inline struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order) +extern struct page * +FASTCALL(__alloc_pages(unsigned int, unsigned int, struct zonelist *)); + +static inline struct page *alloc_pages_node(int nid, unsigned int gfp_mask, + unsigned int order) +{ + if (unlikely(order >= MAX_ORDER)) + return NULL; + + return __alloc_pages(gfp_mask, order, + NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); +} + +extern struct page *alloc_pages_current(unsigned gfp_mask, unsigned order); +struct vm_area_struct; + +#ifdef CONFIG_NUMA +static inline struct page * +alloc_pages(unsigned int gfp_mask, unsigned int order) { if (unlikely(order >= MAX_ORDER)) return NULL; - return __alloc_pages(gfp_mask, order, NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); + return alloc_pages_current(gfp_mask, order); } +extern struct page *__alloc_page_vma(unsigned gfp_mask, + struct vm_area_struct *vma, unsigned long off); +extern struct page *alloc_page_vma(unsigned gfp_mask, + struct vm_area_struct *vma, unsigned long addr); +#else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_page(gfp_mask) \ - alloc_pages_node(numa_node_id(), gfp_mask, 0) +#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) +#define __alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) +#endif +#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); --- diff/include/linux/hugetlb.h 2004-04-21 10:45:35.937240208 +0100 +++ source/include/linux/hugetlb.h 2004-04-21 10:46:51.771711616 +0100 @@ -3,6 +3,8 @@ #ifdef CONFIG_HUGETLB_PAGE +#include + struct ctl_table; static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) @@ -105,6 +107,17 @@ struct hugetlbfs_sb_info { spinlock_t stat_lock; }; + +struct hugetlbfs_inode_info { + struct shared_policy policy; + struct inode vfs_inode; +}; + +static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) +{ + return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); +} + static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) { return sb->s_fs_info; --- diff/include/linux/ide.h 2004-04-21 10:45:35.938240056 +0100 +++ source/include/linux/ide.h 2004-04-21 10:46:51.772711464 +0100 @@ -1005,6 +1005,7 @@ typedef struct hwif_s { unsigned dma; void (*led_act)(void *data, int rw); + unsigned int (*max_rqsize)(ide_drive_t *); } ide_hwif_t; /* --- diff/include/linux/init.h 2004-04-21 10:45:35.939239904 +0100 +++ source/include/linux/init.h 2004-04-21 10:46:51.772711464 +0100 @@ -3,6 +3,7 @@ #include 
#include +#include /* These macros are used to mark some functions or * initialized data (doesn't apply to uninitialized data) @@ -46,8 +47,6 @@ #define __exitdata __attribute__ ((__section__(".exit.data"))) #define __exit_call __attribute_used__ __attribute__ ((__section__ (".exitcall.exit"))) -#define __sched __attribute__((__section__(".sched.text"))) - #ifdef MODULE #define __exit __attribute__ ((__section__(".exit.text"))) #else @@ -68,6 +67,9 @@ typedef void (*exitcall_t)(void); extern initcall_t __con_initcall_start, __con_initcall_end; extern initcall_t __security_initcall_start, __security_initcall_end; + +/* Defined in init/main.c */ +extern char saved_command_line[COMMAND_LINE_SIZE]; #endif #ifndef MODULE @@ -109,25 +111,33 @@ extern initcall_t __security_initcall_st struct obs_kernel_param { const char *str; int (*setup_func)(char *); + int early; }; -/* OBSOLETE: see moduleparam.h for the right way. */ -#define __setup_param(str, unique_id, fn) \ +/* Only for really core code. See moduleparam.h for the normal way. */ +#define __setup_param(str, unique_id, fn, early) \ static char __setup_str_##unique_id[] __initdata = str; \ static struct obs_kernel_param __setup_##unique_id \ __attribute_used__ \ __attribute__((__section__(".init.setup"))) \ - = { __setup_str_##unique_id, fn } + = { __setup_str_##unique_id, fn, early } #define __setup_null_param(str, unique_id) \ - __setup_param(str, unique_id, NULL) + __setup_param(str, unique_id, NULL, 0) #define __setup(str, fn) \ - __setup_param(str, fn, fn) + __setup_param(str, fn, fn, 0) #define __obsolete_setup(str) \ __setup_null_param(str, __LINE__) +/* NOTE: fn is as per module_param, not __setup! Emits warning if fn + * returns non-zero. */ +#define early_param(str, fn) \ + __setup_param(str, fn, fn, 1) + +/* Relies on saved_command_line being set */ +void __init parse_early_param(void); #endif /* __ASSEMBLY__ */ /** --- diff/include/linux/kmalloc_sizes.h 2003-05-21 11:50:00.000000000 +0100 +++ source/include/linux/kmalloc_sizes.h 2004-04-21 10:46:51.773711312 +0100 @@ -12,6 +12,9 @@ CACHE(256) CACHE(512) CACHE(1024) +#if (PAGE_SIZE != 4096) /* special cache for eth skbs - 5 fit into one 8 kB page */ + CACHE(1620) +#endif CACHE(2048) CACHE(4096) CACHE(8192) --- diff/include/linux/list.h 2004-04-21 10:45:35.944239144 +0100 +++ source/include/linux/list.h 2004-04-21 10:46:51.773711312 +0100 @@ -158,8 +158,11 @@ static inline void __list_del(struct lis * Note: list_empty on entry does not return true after this, the entry is * in an undefined state. */ +#include /* BUG_ON */ static inline void list_del(struct list_head *entry) { + BUG_ON(entry->prev->next != entry); + BUG_ON(entry->next->prev != entry); __list_del(entry->prev, entry->next); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; --- diff/include/linux/mm.h 2004-04-21 10:45:35.945238992 +0100 +++ source/include/linux/mm.h 2004-04-21 10:46:51.775711008 +0100 @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -47,6 +48,10 @@ extern int page_cluster; * * This structure is exactly 64 bytes on ia32. Please think very, very hard * before adding anything to it. + * [Now 4 bytes more on 32bit NUMA machines. Sorry. -AK. + * But if you want to recover the 4 bytes just remove vm_next. It is redundant + * with vm_rb. Will be a lot of editing work though. vm_rb.color is redundant + * too.] */ struct vm_area_struct { struct mm_struct * vm_mm; /* The address space we belong to. 
*/ @@ -77,6 +82,10 @@ struct vm_area_struct { units, *not* PAGE_CACHE_SIZE */ struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ + +#ifdef CONFIG_NUMA + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ +#endif }; /* @@ -148,10 +157,13 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type); int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); +#ifdef CONFIG_NUMA + int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); + struct mempolicy *(*get_policy)(struct vm_area_struct *vma, + unsigned long addr); +#endif }; -/* forward declaration; pte_chain is meant to be internal to rmap.c */ -struct pte_chain; struct mmu_gather; struct inode; @@ -173,27 +185,26 @@ typedef unsigned long page_flags_t; * * The first line is data used in page cache lookup, the second line * is used for linear searches (eg. clock algorithm scans). - * - * TODO: make this structure smaller, it could be as small as 32 bytes. */ struct page { - page_flags_t flags; /* atomic flags, some possibly - updated asynchronously */ + page_flags_t flags; /* Atomic flags, some possibly + * updated asynchronously */ atomic_t count; /* Usage count, see below. */ - struct address_space *mapping; /* The inode (or ...) we belong to. */ - pgoff_t index; /* Our offset within mapping. */ - struct list_head lru; /* Pageout list, eg. active_list; - protected by zone->lru_lock !! */ - union { - struct pte_chain *chain;/* Reverse pte mapping pointer. - * protected by PG_chainlock */ - pte_addr_t direct; - } pte; + unsigned int mapcount; /* Count of ptes mapped in mms, + * to show when page is mapped + * & limit reverse map searches, + * protected by PG_maplock. + */ unsigned long private; /* Mapping-private opaque data: * usually used for buffer_heads * if PagePrivate set; used for * swp_entry_t if PageSwapCache */ + struct address_space *mapping; /* The inode (or ...) we belong to. */ + pgoff_t index; /* Our offset within mapping. */ + struct list_head lru; /* Pageout list, eg. active_list + * protected by zone->lru_lock ! + */ /* * On machines where all RAM is mapped into kernel address space, * we can simply calculate the virtual address. On machines with @@ -404,13 +415,11 @@ static inline struct address_space *page } /* - * Return true if this page is mapped into pagetables. Subtle: test pte.direct - * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain - * is only 32-bit. + * Return true if this page is mapped into pagetables. 
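[For orientation: the struct page rework above replaces the per-page pte_chain with a bare mapcount, so page_mapped() just below becomes a simple non-zero test, and reverse-map scans can terminate early once every mapping has been visited. An illustrative, non-kernel C sketch of that bounded walk; all sketch_* names are hypothetical.]

struct sketch_page {
	unsigned int mapcount;		/* ptes currently mapping the page */
};

struct sketch_vma {
	struct sketch_vma *next;	/* i_mmap-style chain of candidate vmas */
};

/* Stand-in for the real work: inspect vma's page table and, if a pte
 * maps the page, clear it and return 1. Always "succeeds" here. */
static int sketch_unmap_one(struct sketch_page *page, struct sketch_vma *vma)
{
	(void)page;
	(void)vma;
	return 1;
}

/* mapcount both answers page_mapped() and bounds the reverse-map walk:
 * the scan stops as soon as every known mapping has been removed,
 * without any per-page chain of pte pointers. */
static void sketch_try_to_unmap(struct sketch_page *page,
				struct sketch_vma *head)
{
	struct sketch_vma *vma;

	for (vma = head; vma && page->mapcount; vma = vma->next)
		if (sketch_unmap_one(page, vma))
			page->mapcount--;
}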
*/ static inline int page_mapped(struct page *page) { - return page->pte.direct != 0; + return page->mapcount != 0; } /* @@ -435,6 +444,9 @@ extern void show_free_areas(void); struct page *shmem_nopage(struct vm_area_struct * vma, unsigned long address, int *type); +int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new); +struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, + unsigned long addr); struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags); void shmem_lock(struct file * file, int lock); int shmem_zero_setup(struct vm_area_struct *); @@ -447,6 +459,7 @@ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ + int atomic; /* May not schedule() */ }; void zap_page_range(struct vm_area_struct *vma, unsigned long address, @@ -486,6 +499,8 @@ int get_user_pages(struct task_struct *t int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); +int redirty_page_for_writepage(struct writeback_control *wbc, + struct page *page); int FASTCALL(set_page_dirty(struct page *page)); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); @@ -634,6 +649,11 @@ static inline struct vm_area_struct * fi return vma; } +static inline unsigned long vma_pages(struct vm_area_struct *vma) +{ + return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +} + extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); extern unsigned int nr_used_zone_pages(void); --- diff/include/linux/mmzone.h 2004-04-21 10:45:35.946238840 +0100 +++ source/include/linux/mmzone.h 2004-04-21 10:46:51.776710856 +0100 @@ -52,6 +52,14 @@ struct per_cpu_pages { struct per_cpu_pageset { struct per_cpu_pages pcp[2]; /* 0: hot. 
1: cold */ +#ifdef CONFIG_NUMA + unsigned long numa_hit; /* allocated in intended node */ + unsigned long numa_miss; /* allocated in non intended node */ + unsigned long numa_foreign; /* was intended here, hit elsewhere */ + unsigned long interleave_hit; /* interleaver preferred this zone */ + unsigned long local_node; /* allocation from local node */ + unsigned long other_node; /* allocation from other node */ +#endif } ____cacheline_aligned_in_smp; #define ZONE_DMA 0 --- diff/include/linux/nfs_fs.h 2004-04-21 10:45:35.953237776 +0100 +++ source/include/linux/nfs_fs.h 2004-04-21 10:46:51.776710856 +0100 @@ -306,6 +306,10 @@ nfs_file_cred(struct file *file) */ extern ssize_t nfs_direct_IO(int, struct kiocb *, const struct iovec *, loff_t, unsigned long); +extern ssize_t nfs_file_direct_read(struct kiocb *iocb, char *buf, + size_t count, loff_t pos); +extern ssize_t nfs_file_direct_write(struct kiocb *iocb, const char *buf, + size_t count, loff_t pos); /* * linux/fs/nfs/dir.c --- diff/include/linux/page-flags.h 2004-04-21 10:45:35.956237320 +0100 +++ source/include/linux/page-flags.h 2004-04-21 10:46:51.777710704 +0100 @@ -71,12 +71,12 @@ #define PG_nosave 14 /* Used for system suspend/resume */ #define PG_maplock 15 /* Lock bit for rmap to ptes */ -#define PG_direct 16 /* ->pte_chain points directly at pte */ +#define PG_swapcache 16 /* Swap page: swp_entry_t in private */ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ -#define PG_anon 20 /* Anonymous page: anon_vma in mapping*/ -#define PG_swapcache 21 /* Swap page: swp_entry_t in private */ + +#define PG_anon 20 /* Anonymous page: anonmm in mapping */ /* @@ -281,12 +281,6 @@ extern void get_full_page_state(struct p #define ClearPageNosave(page) clear_bit(PG_nosave, &(page)->flags) #define TestClearPageNosave(page) test_and_clear_bit(PG_nosave, &(page)->flags) -#define PageDirect(page) test_bit(PG_direct, &(page)->flags) -#define SetPageDirect(page) set_bit(PG_direct, &(page)->flags) -#define TestSetPageDirect(page) test_and_set_bit(PG_direct, &(page)->flags) -#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags) -#define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags) - #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) --- diff/include/linux/pci_ids.h 2004-04-21 10:45:35.959236864 +0100 +++ source/include/linux/pci_ids.h 2004-04-21 10:46:51.778710552 +0100 @@ -1638,8 +1638,8 @@ #define PCI_SUBDEVICE_ID_CHASE_PCIRAS8 0xF010 #define PCI_VENDOR_ID_AUREAL 0x12eb -#define PCI_DEVICE_ID_AUREAL_VORTEX 0x0001 -#define PCI_DEVICE_ID_AUREAL_VORTEX2 0x0002 +#define PCI_DEVICE_ID_AUREAL_VORTEX_1 0x0001 +#define PCI_DEVICE_ID_AUREAL_VORTEX_2 0x0002 #define PCI_DEVICE_ID_AUREAL_ADVANTAGE 0x0003 #define PCI_VENDOR_ID_ELECTRONICDESIGNGMBH 0x12f8 --- diff/include/linux/rmap.h 2004-04-21 10:45:35.966235800 +0100 +++ source/include/linux/rmap.h 2004-04-21 10:46:51.778710552 +0100 @@ -15,19 +15,78 @@ #ifdef CONFIG_MMU -struct pte_chain; -struct pte_chain *pte_chain_alloc(int gfp_flags); -void __pte_chain_free(struct pte_chain *pte_chain); +void fastcall page_add_anon_rmap(struct page *, + struct mm_struct *, unsigned long addr); +void fastcall page_add_file_rmap(struct page *); +void fastcall page_remove_rmap(struct page *); + +/** + 
* page_dup_rmap - duplicate pte mapping to a page + * @page: the page to add the mapping to + * + * For copy_page_range only: minimal extract from page_add_rmap, + * avoiding unnecessary tests (already checked) so it's quicker. + */ +static inline void page_dup_rmap(struct page *page) +{ + rmap_lock(page); + page->mapcount++; + rmap_unlock(page); +} + +int fastcall mremap_move_anon_rmap(struct page *page, unsigned long addr); + +/** + * mremap_moved_anon_rmap - does new address clash with that noted? + * @page: the page just brought back in from swap + * @addr: the user virtual address at which it is mapped + * + * Returns boolean, true if addr clashes with address already in page. + * + * For do_swap_page and unuse_pte: anonmm rmap cannot find the page if + * it's at different addresses in different mms, so caller must take a + * copy of the page to avoid that: not very clever, but too rare a case + * to merit cleverness. + */ +static inline int mremap_moved_anon_rmap(struct page *page, unsigned long addr) +{ + return page->index != (addr & PAGE_MASK); +} -static inline void pte_chain_free(struct pte_chain *pte_chain) +/** + * make_page_exclusive - try to make page exclusive to one mm + * @vma: the vm_area_struct covering this address + * @addr: the user virtual address of the page in question + * + * Assumes that the page at this address is anonymous (COWable), + * and that the caller holds mmap_sem for reading or for writing. + * + * For mremap's move_page_tables and for swapoff's unuse_process: + * not a general purpose routine, and in general may not succeed. + * But move_page_tables loops until it succeeds, and unuse_process + * holds the original page locked, which protects against races. + */ +static inline int make_page_exclusive(struct vm_area_struct *vma, + unsigned long addr) { - if (pte_chain) - __pte_chain_free(pte_chain); + switch (handle_mm_fault(vma->vm_mm, vma, addr, 1)) { + case VM_FAULT_MINOR: + case VM_FAULT_MAJOR: + return 0; + case VM_FAULT_OOM: + return -ENOMEM; + default: + return -EFAULT; + } } -struct pte_chain * fastcall - page_add_rmap(struct page *, pte_t *, struct pte_chain *); -void fastcall page_remove_rmap(struct page *, pte_t *); +/* + * Called from kernel/fork.c to manage anonymous memory + */ +void init_rmap(void); +int exec_rmap(struct mm_struct *); +int dup_rmap(struct mm_struct *, struct mm_struct *oldmm); +void exit_rmap(struct mm_struct *); /* * Called from mm/vmscan.c to handle paging out @@ -37,6 +96,11 @@ int fastcall try_to_unmap(struct page *) #else /* !CONFIG_MMU */ +#define init_rmap() do {} while (0) +#define exec_rmap(mm) (0) +#define dup_rmap(mm, oldmm) (0) +#define exit_rmap(mm) do {} while (0) + #define page_referenced(page) TestClearPageReferenced(page) #define try_to_unmap(page) SWAP_FAIL --- diff/include/linux/rwsem-spinlock.h 2003-05-21 11:49:46.000000000 +0100 +++ source/include/linux/rwsem-spinlock.h 2004-04-21 10:46:51.779710400 +0100 @@ -26,14 +26,14 @@ struct rwsem_waiter; * - if activity is 0 then there are no active readers or writers * - if activity is +ve then that is the number of active readers * - if activity is -1 then there is one active writer - * - if wait_list is not empty, then there are processes waiting for the semaphore + * - if wait_list is not empty, there are processes waiting for the semaphore */ struct rw_semaphore { - __s32 activity; - spinlock_t wait_lock; - struct list_head wait_list; + __s32 activity; + spinlock_t wait_lock; + struct list_head wait_list; #if RWSEM_DEBUG - int debug; + int debug; #endif };
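The rwsem.h hunk below is whitespace-only (spacing after the comma in the rwsemtrace() calls), but it runs through every entry point of the reader-writer semaphore API. For readers unfamiliar with that API, here is a minimal usage sketch; the semaphore and the counter it protects are invented for illustration, while DECLARE_RWSEM and the down_read/up_read/down_write/up_write calls are the real interface declared in this header:

	#include <linux/rwsem.h>

	static DECLARE_RWSEM(example_sem);	/* statically initialised rw_semaphore */
	static int example_count;		/* illustrative state it protects */

	int example_read(void)
	{
		int val;

		down_read(&example_sem);	/* any number of concurrent readers */
		val = example_count;
		up_read(&example_sem);
		return val;
	}

	void example_bump(void)
	{
		down_write(&example_sem);	/* exclusive: no readers, no writers */
		example_count++;
		up_write(&example_sem);
	}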
--- diff/include/linux/rwsem.h 2002-10-16 04:27:49.000000000 +0100 +++ source/include/linux/rwsem.h 2004-04-21 10:46:51.779710400 +0100 @@ -22,9 +22,9 @@ struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK -#include <linux/rwsem-spinlock.h> /* use a generic implementation */ +#include <linux/rwsem-spinlock.h> /* use a generic implementation */ #else -#include <asm/rwsem.h> /* use an arch-specific implementation */ +#include <asm/rwsem.h> /* use an arch-specific implementation */ #endif #ifndef rwsemtrace @@ -41,9 +41,9 @@ extern void FASTCALL(rwsemtrace(struct r static inline void down_read(struct rw_semaphore *sem) { might_sleep(); - rwsemtrace(sem,"Entering down_read"); + rwsemtrace(sem, "Entering down_read"); __down_read(sem); - rwsemtrace(sem,"Leaving down_read"); + rwsemtrace(sem, "Leaving down_read"); } /* @@ -52,9 +52,9 @@ static inline void down_read(struct rw_s static inline int down_read_trylock(struct rw_semaphore *sem) { int ret; - rwsemtrace(sem,"Entering down_read_trylock"); + rwsemtrace(sem, "Entering down_read_trylock"); ret = __down_read_trylock(sem); - rwsemtrace(sem,"Leaving down_read_trylock"); + rwsemtrace(sem, "Leaving down_read_trylock"); return ret; } @@ -64,9 +64,9 @@ static inline int down_read_trylock(stru static inline void down_write(struct rw_semaphore *sem) { might_sleep(); - rwsemtrace(sem,"Entering down_write"); + rwsemtrace(sem, "Entering down_write"); __down_write(sem); - rwsemtrace(sem,"Leaving down_write"); + rwsemtrace(sem, "Leaving down_write"); } /* @@ -75,9 +75,9 @@ static inline void down_write(struct rw_ static inline int down_write_trylock(struct rw_semaphore *sem) { int ret; - rwsemtrace(sem,"Entering down_write_trylock"); + rwsemtrace(sem, "Entering down_write_trylock"); ret = __down_write_trylock(sem); - rwsemtrace(sem,"Leaving down_write_trylock"); + rwsemtrace(sem, "Leaving down_write_trylock"); return ret; } @@ -86,9 +86,9 @@ static inline int down_write_trylock(str */ static inline void up_read(struct rw_semaphore *sem) { - rwsemtrace(sem,"Entering up_read"); + rwsemtrace(sem, "Entering up_read"); __up_read(sem); - rwsemtrace(sem,"Leaving up_read"); + rwsemtrace(sem, "Leaving up_read"); } /* @@ -96,9 +96,9 @@ static inline void up_read(struct rw_sem */ static inline void up_write(struct rw_semaphore *sem) { - rwsemtrace(sem,"Entering up_write"); + rwsemtrace(sem, "Entering up_write"); __up_write(sem); - rwsemtrace(sem,"Leaving up_write"); + rwsemtrace(sem, "Leaving up_write"); } /* @@ -106,9 +106,9 @@ static inline void up_write(struct rw_se */ static inline void downgrade_write(struct rw_semaphore *sem) { - rwsemtrace(sem,"Entering downgrade_write"); + rwsemtrace(sem, "Entering downgrade_write"); __downgrade_write(sem); - rwsemtrace(sem,"Leaving downgrade_write"); + rwsemtrace(sem, "Leaving downgrade_write"); } #endif /* __KERNEL__ */ --- diff/include/linux/sched.h 2004-04-21 10:45:35.967235648 +0100 +++ source/include/linux/sched.h 2004-04-21 10:46:51.780710248 +0100 @@ -29,6 +29,7 @@ #include #include #include +#include struct exec_domain; @@ -96,6 +97,14 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); +#ifdef CONFIG_SCHEDSTATS +#define schedstat_inc(s, field) ((s)->field++) +#define schedstat_add(s, field, amt) ((s)->field += amt) +#else +#define schedstat_inc(s, field) do { } while (0) +#define schedstat_add(s, field, amt) do { } while (0) +#endif + #include #include #include @@ -147,6 +156,7 @@ extern spinlock_t mmlist_lock; typedef struct task_struct task_t; extern void sched_init(void); +extern void sched_init_smp(void);
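A note on the schedstat_inc()/schedstat_add() helpers introduced in the sched.h hunk above: when CONFIG_SCHEDSTATS is off they expand to an empty do-while statement that never references its arguments, so call sites need no #ifdef even though the counter fields themselves only exist in schedstats builds. A minimal sketch of the pattern (note_wakeup() is invented for illustration; the macros and the rq->sched_wake counter are the ones added by this patch):

	#ifdef CONFIG_SCHEDSTATS
	#define schedstat_inc(s, field)	((s)->field++)
	#else
	#define schedstat_inc(s, field)	do { } while (0)
	#endif

	static inline void note_wakeup(runqueue_t *rq)
	{
		/*
		 * Compiles even when rq->sched_wake does not exist: the
		 * stub macro drops its arguments, so the field name is
		 * never expanded in !CONFIG_SCHEDSTATS builds.
		 */
		schedstat_inc(rq, sched_wake);
	}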
extern void init_idle(task_t *idle, int cpu); extern void show_state(void); @@ -170,9 +180,11 @@ extern void update_one_process(struct ta unsigned long system, int cpu); extern void scheduler_tick(int user_tick, int system); extern unsigned long cache_decay_ticks; -extern const unsigned long scheduling_functions_start_here; -extern const unsigned long scheduling_functions_end_here; +/* Attach to any functions which should be ignored in wchan output. */ +#define __sched __attribute__((__section__(".sched.text"))) +/* Is this address in the __sched functions? */ +extern int in_sched_functions(unsigned long addr); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); @@ -203,6 +215,7 @@ struct mm_struct { * together off init_mm.mmlist, and are protected * by mmlist_lock */ + struct anonmm *anonmm; /* For rmap to track anon mem */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; @@ -343,7 +356,6 @@ struct k_itimer { struct sigqueue *sigq; /* signal queue entry. */ }; - struct io_context; /* See blkdev.h */ void exit_io_context(void); @@ -503,6 +515,9 @@ struct task_struct { unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ + + struct mempolicy *mempolicy; + short il_next; /* could be shared with used_math */ }; static inline pid_t process_group(struct task_struct *tsk) @@ -541,6 +556,147 @@ do { if (atomic_dec_and_test(&(tsk)->usa #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #ifdef CONFIG_SMP +#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ + +#define SD_BALANCE_NEWIDLE 1 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 2 /* Balance on exec */ +#define SD_BALANCE_CLONE 4 /* Balance on clone */ +#define SD_WAKE_IDLE 8 /* Wake to idle CPU on task wakeup */ +#define SD_WAKE_AFFINE 16 /* Wake task to waking CPU */ +#define SD_WAKE_BALANCE 32 /* Perform balancing at task wakeup */ +#define SD_SHARE_CPUPOWER 64 /* Domain members share cpu power */ + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + cpumask_t cpumask; + + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. This should be read only (except for setup). Although + * it will need to be written to at cpu hot(un)plug time, perhaps the + * cpucontrol semaphore will provide enough exclusion? + */ + unsigned long cpu_power; +}; + +struct sched_domain { + /* These fields must be setup */ + struct sched_domain *parent; /* top domain must be null terminated */ + struct sched_group *groups; /* the balancing groups of the domain */ + cpumask_t span; /* span of all CPUs in this domain */ + unsigned long min_interval; /* Minimum balance interval ms */ + unsigned long max_interval; /* Maximum balance interval ms */ + unsigned int busy_factor; /* less balancing by factor if busy */ + unsigned int imbalance_pct; /* No balance until over watermark */ + unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ + unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ + int flags; /* See SD_* */ + + /* Runtime fields. */ + unsigned long last_balance; /* init to jiffies. units in jiffies */ + unsigned int balance_interval; /* initialise to 1. units in ms. 
*/ + unsigned int nr_balance_failed; /* initialise to 0 */ + +#ifdef CONFIG_SCHEDSTATS + unsigned long lb_cnt[3]; + unsigned long lb_balanced[3]; + unsigned long lb_failed[3]; + unsigned long lb_pulled[3]; + unsigned long lb_hot_pulled[3]; + unsigned long lb_imbalance[3]; + + /* Active load balancing */ + unsigned long alb_cnt; + unsigned long alb_failed; + unsigned long alb_pushed; + + /* Wakeups */ + unsigned long sched_wake_remote; + + /* Passive load balancing */ + unsigned long plb_pulled; + + /* Affine wakeups */ + unsigned long afw_pulled; + + /* SD_BALANCE_EXEC balances */ + unsigned long sbe_pushed; + + /* SD_BALANCE_CLONE balances */ + unsigned long sbc_pushed; +#endif +}; + +/* Common values for SMT siblings */ +#define SD_SIBLING_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 2, \ + .busy_factor = 8, \ + .imbalance_pct = 110, \ + .cache_hot_time = 0, \ + .cache_nice_tries = 0, \ + .per_cpu_gain = 15, \ + .flags = SD_BALANCE_NEWIDLE \ + | SD_BALANCE_EXEC \ + | SD_BALANCE_CLONE \ + | SD_WAKE_AFFINE \ + | SD_WAKE_IDLE \ + | SD_SHARE_CPUPOWER, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + +/* Common values for CPUs */ +#define SD_CPU_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_hot_time = (5*1000000/2), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_BALANCE_NEWIDLE \ + | SD_BALANCE_EXEC \ + | SD_BALANCE_CLONE \ + | SD_WAKE_AFFINE \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + +#ifdef CONFIG_NUMA +/* Common values for NUMA nodes */ +#define SD_NODE_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 8, \ + .max_interval = 256*fls(num_online_cpus()),\ + .busy_factor = 32, \ + .imbalance_pct = 125, \ + .cache_hot_time = (10*1000000), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_BALANCE_EXEC \ + | SD_BALANCE_CLONE \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} +#endif + +extern void cpu_attach_domain(struct sched_domain *sd, int cpu); + extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); #else static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) @@ -551,12 +707,10 @@ static inline int set_cpus_allowed(task_ extern unsigned long long sched_clock(void); -#ifdef CONFIG_NUMA +#ifdef CONFIG_SMP extern void sched_balance_exec(void); -extern void node_nr_running_init(void); #else #define sched_balance_exec() {} -#define node_nr_running_init() {} #endif /* Move tasks off this (offline) CPU onto another. 
*/ @@ -611,12 +765,17 @@ extern void do_timer(struct pt_regs *); extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); extern int FASTCALL(wake_up_process(struct task_struct * tsk)); +extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk)); #ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); + extern void FASTCALL(wake_up_forked_thread(struct task_struct * tsk)); #else static inline void kick_process(struct task_struct *tsk) { } + static inline void wake_up_forked_thread(struct task_struct * tsk) + { + wake_up_forked_process(tsk); + } #endif -extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk)); extern void FASTCALL(sched_fork(task_t * p)); extern void FASTCALL(sched_exit(task_t * p)); --- diff/include/linux/security.h 2004-04-05 12:57:08.000000000 +0100 +++ source/include/linux/security.h 2004-04-21 10:46:51.782709944 +0100 @@ -44,7 +44,7 @@ extern int cap_capget (struct task_struc extern int cap_capset_check (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern void cap_capset_set (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern int cap_bprm_set_security (struct linux_binprm *bprm); -extern void cap_bprm_compute_creds (struct linux_binprm *bprm); +extern void cap_bprm_apply_creds (struct linux_binprm *bprm); extern int cap_bprm_secureexec(struct linux_binprm *bprm); extern int cap_inode_setxattr(struct dentry *dentry, char *name, void *value, size_t size, int flags); extern int cap_inode_removexattr(struct dentry *dentry, char *name); @@ -102,7 +102,7 @@ struct swap_info_struct; * @bprm_free_security: * @bprm contains the linux_binprm structure to be modified. * Deallocate and clear the @bprm->security field. - * @bprm_compute_creds: + * @bprm_apply_creds: * Compute and set the security attributes of a process being transformed * by an execve operation based on the old attributes (current->security) * and the information saved in @bprm->security by the set_security hook. @@ -115,7 +115,7 @@ struct swap_info_struct; * @bprm contains the linux_binprm structure. * @bprm_set_security: * Save security information in the bprm->security field, typically based - * on information about the bprm->file, for later use by the compute_creds + * on information about the bprm->file, for later use by the apply_creds * hook. This hook may also optionally check permissions (e.g. for * transitions between security domains). * This hook may be called multiple times during a single execve, e.g. for @@ -924,7 +924,7 @@ struct swap_info_struct; * Check permission before allowing the @parent process to trace the * @child process. * Security modules may also want to perform a process tracing check - * during an execve in the set_security or compute_creds hooks of + * during an execve in the set_security or apply_creds hooks of * binprm_security_ops if the process is being traced and its security * attributes would be changed by the execve. * @parent contains the task_struct structure for parent process.
@@ -1026,7 +1026,7 @@ struct security_operations { int (*bprm_alloc_security) (struct linux_binprm * bprm); void (*bprm_free_security) (struct linux_binprm * bprm); - void (*bprm_compute_creds) (struct linux_binprm * bprm); + void (*bprm_apply_creds) (struct linux_binprm * bprm); int (*bprm_set_security) (struct linux_binprm * bprm); int (*bprm_check_security) (struct linux_binprm * bprm); int (*bprm_secureexec) (struct linux_binprm * bprm); @@ -1290,9 +1290,9 @@ static inline void security_bprm_free (s { security_ops->bprm_free_security (bprm); } -static inline void security_bprm_compute_creds (struct linux_binprm *bprm) +static inline void security_bprm_apply_creds (struct linux_binprm *bprm) { - security_ops->bprm_compute_creds (bprm); + security_ops->bprm_apply_creds (bprm); } static inline int security_bprm_set (struct linux_binprm *bprm) { @@ -1962,9 +1962,9 @@ static inline int security_bprm_alloc (s static inline void security_bprm_free (struct linux_binprm *bprm) { } -static inline void security_bprm_compute_creds (struct linux_binprm *bprm) +static inline void security_bprm_apply_creds (struct linux_binprm *bprm) { - cap_bprm_compute_creds (bprm); + cap_bprm_apply_creds (bprm); } static inline int security_bprm_set (struct linux_binprm *bprm) --- diff/include/linux/serial_core.h 2004-04-21 10:45:35.968235496 +0100 +++ source/include/linux/serial_core.h 2004-04-21 10:46:51.782709944 +0100 @@ -168,7 +168,9 @@ struct uart_port { unsigned char x_char; /* xon/xoff char */ unsigned char regshift; /* reg offset shift */ unsigned char iotype; /* io access style */ - +#ifdef CONFIG_KGDB + int kgdb; /* in use by kgdb */ +#endif #define UPIO_PORT (0) #define UPIO_HUB6 (1) #define UPIO_MEM (2) --- diff/include/linux/shmem_fs.h 2003-06-30 10:07:24.000000000 +0100 +++ source/include/linux/shmem_fs.h 2004-04-21 10:46:51.782709944 +0100 @@ -2,6 +2,7 @@ #define __SHMEM_FS_H #include <linux/swap.h> +#include <linux/mempolicy.h> /* inode in-kernel data */ @@ -15,6 +16,7 @@ struct shmem_inode_info { unsigned long alloced; /* data pages allocated to file */ unsigned long swapped; /* subtotal assigned to swap */ unsigned long flags; + struct shared_policy policy; struct list_head list; struct inode vfs_inode; }; --- diff/include/linux/slab.h 2004-04-21 10:45:35.969235344 +0100 +++ source/include/linux/slab.h 2004-04-21 10:46:51.783709792 +0100 @@ -44,6 +44,7 @@ typedef struct kmem_cache_s kmem_cache_t #define SLAB_STORE_USER 0x00010000UL /* store the last owner for bug hunting */ #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* track pages allocated to indicate what is reclaimable later*/ +#define SLAB_PANIC 0x00040000UL /* panic if kmem_cache_create() fails */ /* flags passed to a constructor func */ #define SLAB_CTOR_CONSTRUCTOR 0x001UL /* if not set, then deconstructor */ --- diff/include/linux/spinlock.h 2003-10-09 09:47:17.000000000 +0100 +++ source/include/linux/spinlock.h 2004-04-21 10:46:51.783709792 +0100 @@ -15,6 +15,12 @@ #include <asm/processor.h> /* for cpu relax */ #include <asm/system.h> +#ifdef CONFIG_KGDB +#include +#define SET_WHO(x, him) (x)->who = him; +#else +#define SET_WHO(x, him) +#endif /* * Must define these before including other files, inline functions need them @@ -55,6 +61,9 @@ typedef struct { const char *module; char *owner; int oline; +#ifdef CONFIG_KGDB + struct task_struct *who; +#endif } spinlock_t; #define SPIN_LOCK_UNLOCKED (spinlock_t) { SPINLOCK_MAGIC, 0, 10, __FILE__ , NULL, 0} @@ -66,6 +75,7 @@ typedef struct { (x)->module = __FILE__; \ (x)->owner = NULL; \ (x)->oline = 0; \ + SET_WHO(x, NULL) \ } while (0) #define CHECK_LOCK(x) \
@@ -88,6 +98,7 @@ typedef struct { (x)->lock = 1; \ (x)->owner = __FILE__; \ (x)->oline = __LINE__; \ + SET_WHO(x, current) \ } while (0) /* without debugging, spin_is_locked on UP always says @@ -118,6 +129,7 @@ typedef struct { (x)->lock = 1; \ (x)->owner = __FILE__; \ (x)->oline = __LINE__; \ + SET_WHO(x, current) \ 1; \ }) @@ -184,6 +196,17 @@ typedef struct { #endif /* !SMP */ +#ifdef CONFIG_LOCKMETER +extern void _metered_spin_lock (spinlock_t *lock); +extern void _metered_spin_unlock (spinlock_t *lock); +extern int _metered_spin_trylock(spinlock_t *lock); +extern void _metered_read_lock (rwlock_t *lock); +extern void _metered_read_unlock (rwlock_t *lock); +extern void _metered_write_lock (rwlock_t *lock); +extern void _metered_write_unlock (rwlock_t *lock); +extern int _metered_write_trylock(rwlock_t *lock); +#endif + /* * Define the various spin_lock and rw_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various @@ -389,6 +412,141 @@ do { \ _raw_spin_trylock(lock) ? 1 : \ ({preempt_enable(); local_bh_enable(); 0;});}) +#ifdef CONFIG_LOCKMETER +#undef spin_lock +#undef spin_trylock +#undef spin_unlock +#undef spin_lock_irqsave +#undef spin_lock_irq +#undef spin_lock_bh +#undef read_lock +#undef read_unlock +#undef write_lock +#undef write_unlock +#undef write_trylock +#undef spin_unlock_bh +#undef read_lock_irqsave +#undef read_lock_irq +#undef read_lock_bh +#undef read_unlock_bh +#undef write_lock_irqsave +#undef write_lock_irq +#undef write_lock_bh +#undef write_unlock_bh + +#define spin_lock(lock) \ +do { \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while(0) + +#define spin_trylock(lock) ({preempt_disable(); _metered_spin_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable(); \ +} while (0) + +#define spin_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_spin_lock(lock); \ +} while (0) + +#define spin_unlock_bh(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + + +#define read_lock(lock) ({preempt_disable(); _metered_read_lock(lock);}) +#define read_unlock(lock) ({_metered_read_unlock(lock); preempt_enable();}) +#define write_lock(lock) ({preempt_disable(); _metered_write_lock(lock);}) +#define write_unlock(lock) ({_metered_write_unlock(lock); preempt_enable();}) +#define write_trylock(lock) ({preempt_disable();_metered_write_trylock(lock) ? 
\ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock_no_resched(lock) \ +do { \ + _metered_spin_unlock(lock); \ + preempt_enable_no_resched(); \ +} while (0) + +#define read_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_read_lock(lock); \ +} while (0) + +#define read_unlock_bh(lock) \ +do { \ + _metered_read_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#define write_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _metered_write_lock(lock); \ +} while (0) + +#define write_unlock_bh(lock) \ +do { \ + _metered_write_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#endif /* CONFIG_LOCKMETER */ + /* "lock on reference count zero" */ #ifndef ATOMIC_DEC_AND_LOCK #include <asm/atomic.h> --- diff/include/linux/string.h 2004-02-18 08:54:13.000000000 +0000 +++ source/include/linux/string.h 2004-04-21 10:46:51.784709640 +0100 @@ -1,10 +1,11 @@ #ifndef _LINUX_STRING_H_ #define _LINUX_STRING_H_ -/* We don't want strings.h stuff being user by user stuff by accident */ +/* We don't want strings.h stuff being used by user stuff by accident */ #ifdef __KERNEL__ +#include <linux/compiler.h> /* for inline */ #include <linux/types.h> /* for size_t */ #include <linux/stddef.h> /* for NULL */ --- diff/include/linux/swap.h 2004-04-21 10:45:35.972234888 +0100 +++ source/include/linux/swap.h 2004-04-21 10:46:51.784709640 +0100 @@ -151,7 +151,7 @@ struct swap_list_t { extern void out_of_memory(void); /* linux/mm/memory.c */ -extern void swapin_readahead(swp_entry_t); +extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *); /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; @@ -200,7 +200,8 @@ extern int move_from_swap_cache(struct p extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); extern struct page * lookup_swap_cache(swp_entry_t); -extern struct page * read_swap_cache_async(swp_entry_t); +extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, + unsigned long addr); /* linux/mm/swapfile.c */ extern int total_swap_pages; @@ -242,7 +243,7 @@ extern spinlock_t swaplock; #define free_swap_and_cache(swp) /*NOTHING*/ #define swap_duplicate(swp) /*NOTHING*/ #define swap_free(swp) /*NOTHING*/ -#define read_swap_cache_async(swp) NULL +#define read_swap_cache_async(swp,vma,addr) NULL #define lookup_swap_cache(swp) NULL #define valid_swaphandles(swp, off) 0 #define can_share_swap_page(p) 0 --- diff/include/linux/workqueue.h 2004-03-11 10:20:29.000000000 +0000 +++ source/include/linux/workqueue.h 2004-04-21 10:46:51.785709488 +0100 @@ -49,7 +49,11 @@ struct work_struct { init_timer(&(_work)->timer); \ } while (0) -extern struct workqueue_struct *create_workqueue(const char *name); +extern struct workqueue_struct *__create_workqueue(const char *name, + int singlethread); +#define create_workqueue(name) __create_workqueue((name), 0) +#define create_singlethread_workqueue(name) 
__create_workqueue((name), 1) + extern void destroy_workqueue(struct workqueue_struct *wq); extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work)); --- diff/init/Kconfig 2004-04-21 10:45:35.992231848 +0100 +++ source/init/Kconfig 2004-04-21 10:46:51.785709488 +0100 @@ -43,7 +43,7 @@ config CLEAN_COMPILE config STANDALONE bool "Select only drivers that don't need compile-time external firmware" if EXPERIMENTAL - default y + default n help Select this option if you don't have magic firmware for drivers that need it. --- diff/init/main.c 2004-04-21 10:45:35.993231696 +0100 +++ source/init/main.c 2004-04-21 10:46:51.786709336 +0100 @@ -84,11 +84,11 @@ extern void signals_init(void); extern void buffer_init(void); extern void pidhash_init(void); extern void pidmap_init(void); -extern void pte_chain_init(void); extern void radix_tree_init(void); extern void free_initmem(void); extern void populate_rootfs(void); extern void driver_init(void); +extern void prepare_namespace(void); #ifdef CONFIG_TC extern void tc_init(void); @@ -107,6 +107,9 @@ extern void time_init(void); void (*late_time_init)(void); extern void softirq_init(void); +/* Untouched command line (eg. for /proc) saved by arch-specific code. */ +char saved_command_line[COMMAND_LINE_SIZE]; + static char *execute_command; /* Setup configured maximum number of CPUs to activate */ @@ -153,6 +156,9 @@ static int __init obsolete_checksetup(ch do { int n = strlen(p->str); if (!strncmp(line, p->str, n)) { + /* Already done in parse_early_param? */ + if (p->early) + return 1; if (!p->setup_func) { printk(KERN_WARNING "Parameter %s is obsolete, ignored\n", p->str); return 1; @@ -391,6 +397,38 @@ static void noinline rest_init(void) cpu_idle(); } +/* Check for early params. */ +static int __init do_early_param(char *param, char *val) +{ + struct obs_kernel_param *p; + extern struct obs_kernel_param __setup_start, __setup_end; + + for (p = &__setup_start; p < &__setup_end; p++) { + if (p->early && strcmp(param, p->str) == 0) { + if (p->setup_func(val) != 0) + printk(KERN_WARNING + "Malformed early option '%s'\n", param); + } + } + /* We accept everything at this stage. */ + return 0; +} + +/* Arch code calls this early on, or if not, just before other parsing. */ +void __init parse_early_param(void) +{ + static __initdata int done = 0; + static __initdata char tmp_cmdline[COMMAND_LINE_SIZE]; + + if (done) + return; + + /* All fall through to do_early_param. */ + strlcpy(tmp_cmdline, saved_command_line, COMMAND_LINE_SIZE); + parse_args("early options", tmp_cmdline, NULL, 0, do_early_param); + done = 1; +} + /* * Activate the first processor. */ @@ -398,7 +436,6 @@ static void noinline rest_init(void) asmlinkage void __init start_kernel(void) { char * command_line; - extern char saved_command_line[]; extern struct kernel_param __start___param[], __stop___param[]; /* * Interrupts are still disabled. Do necessary setups, then @@ -416,18 +453,26 @@ asmlinkage void __init start_kernel(void */ smp_prepare_boot_cpu(); + /* + * Set up the scheduler prior to starting any interrupts (such as the + * timer interrupt). Full topology setup happens at smp_init() + * time - but meanwhile we still have a functioning scheduler.
+ */ + sched_init(); + build_all_zonelists(); page_alloc_init(); + trap_init(); printk("Kernel command line: %s\n", saved_command_line); + parse_early_param(); parse_args("Booting kernel", command_line, __start___param, __stop___param - __start___param, &unknown_bootoption); sort_main_extable(); - trap_init(); rcu_init(); init_IRQ(); pidhash_init(); - sched_init(); + init_timers(); softirq_init(); time_init(); @@ -456,7 +501,6 @@ asmlinkage void __init start_kernel(void calibrate_delay(); pidmap_init(); pgtable_cache_init(); - pte_chain_init(); #ifdef CONFIG_X86 if (efi_enabled) efi_enter_virtual_mode(); @@ -471,7 +515,6 @@ asmlinkage void __init start_kernel(void signals_init(); /* rootfs populating might need page-writeback */ page_writeback_init(); - populate_rootfs(); #ifdef CONFIG_PROC_FS proc_root_init(); #endif @@ -567,7 +610,6 @@ static void do_pre_smp_initcalls(void) migration_init(); #endif - node_nr_running_init(); spawn_ksoftirqd(); } @@ -577,8 +619,6 @@ static void run_init_process(char *init_ execve(init_filename, argv_init, envp_init); } -extern void prepare_namespace(void); - static int init(void * unused) { lock_kernel(); @@ -598,16 +638,18 @@ static int init(void * unused) do_pre_smp_initcalls(); smp_init(); + sched_init_smp(); do_basic_setup(); - /* - * check if there is an early userspace init, if yes - * let it do all the work - */ - if (sys_access("/init", 0) == 0) - execute_command = "/init"; - else - prepare_namespace(); + populate_rootfs(); + /* + * check if there is an early userspace init. If yes, let it do all + * the work + */ + if (sys_access("/init", 0) == 0) + execute_command = "/init"; + else + prepare_namespace(); /* * Ok, we have completed the initial bootup, and @@ -641,3 +683,10 @@ static int init(void * unused) panic("No init found. Try passing init= option to kernel."); } + +static int early_param_test(char *rest) +{ + printk("early_param_test: %s\n", rest ?: "(null)"); + return rest ? 
0 : -EINVAL; +} +early_param("testsetup", early_param_test); --- diff/ipc/shm.c 2004-04-21 10:45:35.996231240 +0100 +++ source/ipc/shm.c 2004-04-21 10:46:51.786709336 +0100 @@ -163,6 +163,10 @@ static struct vm_operations_struct shm_v .open = shm_open, /* callback for a new vm-area open */ .close = shm_close, /* callback for when the vm-area is released */ .nopage = shmem_nopage, +#ifdef CONFIG_NUMA + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, +#endif }; static int newseg (key_t key, int shmflg, size_t size) @@ -792,10 +796,12 @@ asmlinkage long sys_shmdt(char __user *s */ if ((vma->vm_ops == &shm_vm_ops || is_vm_hugetlb_page(vma)) && (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { - + int ret; size = vma->vm_file->f_dentry->d_inode->i_size; - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + ret = do_munmap(mm, vma->vm_start, + vma->vm_end - vma->vm_start); + WARN_ON(ret); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -819,9 +825,13 @@ asmlinkage long sys_shmdt(char __user *s /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops || is_vm_hugetlb_page(vma)) && - (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) + (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { + int ret; - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + ret = do_munmap(mm, vma->vm_start, + vma->vm_end - vma->vm_start); + WARN_ON(ret); + } vma = next; } --- diff/kernel/Makefile 2004-04-21 10:45:35.997231088 +0100 +++ source/kernel/Makefile 2004-04-21 10:46:51.790708728 +0100 @@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o +obj-$(CONFIG_LOCKMETER) += lockmeter.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o --- diff/kernel/exit.c 2004-04-21 10:45:36.001230480 +0100 +++ source/kernel/exit.c 2004-04-21 10:46:51.787709184 +0100 @@ -790,6 +790,7 @@ asmlinkage NORET_TYPE void do_exit(long __exit_fs(tsk); exit_namespace(tsk); exit_thread(); + mpol_free(tsk->mempolicy); if (tsk->signal->leader) disassociate_ctty(1); --- diff/kernel/fork.c 2004-04-21 10:45:36.002230328 +0100 +++ source/kernel/fork.c 2004-04-21 10:46:51.788709032 +0100 @@ -33,6 +33,7 @@ #include #include #include +#include <linux/mempolicy.h> #include #include @@ -215,11 +216,8 @@ void __init fork_init(unsigned long memp #endif /* create a slab on which task_structs can be allocated */ task_struct_cachep = - kmem_cache_create("task_struct", - sizeof(struct task_struct),ARCH_MIN_TASKALIGN, - 0, NULL, NULL); - if (!task_struct_cachep) - panic("fork_init(): cannot create task_struct SLAB cache"); + kmem_cache_create("task_struct", sizeof(struct task_struct), + ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); #endif /* @@ -272,6 +270,7 @@ static inline int dup_mmap(struct mm_str struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge = 0; + struct mempolicy *pol; down_write(&oldmm->mmap_sem); flush_cache_mm(current->mm); @@ -313,6 +312,11 @@ static inline int dup_mmap(struct mm_str if (!tmp) goto fail_nomem; *tmp = *mpnt; + pol = mpol_copy(vma_policy(mpnt)); + retval = PTR_ERR(pol); + if (IS_ERR(pol)) + goto fail_nomem_policy; + vma_set_policy(tmp, pol); tmp->vm_flags &= ~VM_LOCKED; tmp->vm_mm = mm; tmp->vm_next = NULL; @@ -325,9 +329,9 @@ static inline int dup_mmap(struct mm_str atomic_dec(&inode->i_writecount); /* insert tmp into the share list, just after mpnt */ - 
down(&file->f_mapping->i_shared_sem); + spin_lock(&file->f_mapping->i_shared_lock); list_add(&tmp->shared, &mpnt->shared); - up(&file->f_mapping->i_shared_sem); + spin_unlock(&file->f_mapping->i_shared_lock); } /* @@ -359,6 +363,8 @@ out: flush_tlb_mm(current->mm); up_write(&oldmm->mmap_sem); return retval; +fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); fail_nomem: retval = -ENOMEM; fail: @@ -421,9 +427,14 @@ struct mm_struct * mm_alloc(void) mm = allocate_mm(); if (mm) { memset(mm, 0, sizeof(*mm)); - return mm_init(mm); + mm = mm_init(mm); + if (mm && exec_rmap(mm)) { + mm_free_pgd(mm); + free_mm(mm); + mm = NULL; + } } - return NULL; + return mm; } /* @@ -450,6 +461,7 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); exit_aio(mm); exit_mmap(mm); + exit_rmap(mm); mmdrop(mm); } } @@ -553,6 +565,12 @@ static int copy_mm(unsigned long clone_f if (!mm_init(mm)) goto fail_nomem; + if (dup_rmap(mm, oldmm)) { + mm_free_pgd(mm); + free_mm(mm); + goto fail_nomem; + } + if (init_new_context(tsk,mm)) goto fail_nocontext; @@ -953,10 +971,16 @@ struct task_struct *copy_process(unsigne p->security = NULL; p->io_context = NULL; p->audit_context = NULL; + p->mempolicy = mpol_copy(p->mempolicy); + if (IS_ERR(p->mempolicy)) { + retval = PTR_ERR(p->mempolicy); + p->mempolicy = NULL; + goto bad_fork_cleanup; + } retval = -ENOMEM; if ((retval = security_task_alloc(p))) - goto bad_fork_cleanup; + goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) goto bad_fork_cleanup_security; /* copy all the process information */ @@ -1102,6 +1126,8 @@ bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_security: security_task_free(p); +bad_fork_cleanup_policy: + mpol_free(p->mempolicy); bad_fork_cleanup: if (p->pid > 0) free_pidmap(p->pid); @@ -1180,9 +1206,23 @@ long do_fork(unsigned long clone_flags, set_tsk_thread_flag(p, TIF_SIGPENDING); } - if (!(clone_flags & CLONE_STOPPED)) - wake_up_forked_process(p); /* do this last */ - else + if (!(clone_flags & CLONE_STOPPED)) { + /* + * Do the wakeup last. On SMP we treat fork() and + * CLONE_VM separately, because fork() has already + * created cache footprint on this CPU (due to + * copying the pagetables), hence migration would + * probably be costly.
Threads on the other hand + * have less traction to the current CPU, and if + * there's an imbalance then the scheduler can + * migrate this fresh thread now, before it + * accumulates a larger cache footprint: + */ + if (clone_flags & CLONE_VM) + wake_up_forked_thread(p); + else + wake_up_forked_process(p); + } else p->state = TASK_STOPPED; ++total_forks; @@ -1227,37 +1267,21 @@ void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!sighand_cachep) - panic("Cannot create sighand SLAB cache"); - + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!signal_cachep) - panic("Cannot create signal SLAB cache"); - + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); files_cachep = kmem_cache_create("files_cache", - sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!files_cachep) - panic("Cannot create files SLAB cache"); - + sizeof(struct files_struct), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); fs_cachep = kmem_cache_create("fs_cache", - sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!fs_cachep) - panic("Cannot create fs_struct SLAB cache"); - + sizeof(struct fs_struct), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct), 0, - 0, NULL, NULL); - if(!vm_area_cachep) - panic("vma_init: Cannot alloc vm_area_struct SLAB cache"); - + SLAB_PANIC, NULL, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if(!mm_cachep) - panic("vma_init: Cannot alloc mm_struct SLAB cache"); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + init_rmap(); } --- diff/kernel/kmod.c 2004-04-21 10:45:36.002230328 +0100 +++ source/kernel/kmod.c 2004-04-21 10:46:51.788709032 +0100 @@ -35,10 +35,13 @@ #include #include #include +#include #include extern int max_threads; +static struct workqueue_struct *khelper_wq; + #ifdef CONFIG_KMOD /* @@ -109,6 +112,7 @@ int request_module(const char *fmt, ...) atomic_dec(&kmod_concurrent); return ret; } +EXPORT_SYMBOL(request_module); #endif /* CONFIG_KMOD */ #ifdef CONFIG_HOTPLUG @@ -197,9 +201,7 @@ static int wait_for_helper(void *data) return 0; } -/* - * This is run by keventd. - */ +/* This is run by khelper thread */ static void __call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; @@ -249,26 +251,22 @@ int call_usermodehelper(char *path, char }; DECLARE_WORK(work, __call_usermodehelper, &sub_info); - if (system_state != SYSTEM_RUNNING) + if (!khelper_wq) return -EBUSY; if (path[0] == '\0') - goto out; + return 0; - if (current_is_keventd()) { - /* We can't wait on keventd! 
*/ - __call_usermodehelper(&sub_info); - } else { - schedule_work(&work); - wait_for_completion(&done); - } -out: + queue_work(khelper_wq, &work); + wait_for_completion(&done); return sub_info.retval; } - EXPORT_SYMBOL(call_usermodehelper); -#ifdef CONFIG_KMOD -EXPORT_SYMBOL(request_module); -#endif - +static __init int usermodehelper_init(void) +{ + khelper_wq = create_singlethread_workqueue("khelper"); + BUG_ON(!khelper_wq); + return 0; +} +__initcall(usermodehelper_init); --- diff/kernel/module.c 2004-04-21 10:45:36.003230176 +0100 +++ source/kernel/module.c 2004-04-21 10:46:51.791708576 +0100 @@ -36,7 +36,6 @@ #include #include #include -#include #include #if 0 --- diff/kernel/pid.c 2004-04-21 10:45:36.004230024 +0100 +++ source/kernel/pid.c 2004-04-21 10:46:51.792708424 +0100 @@ -122,6 +122,8 @@ return_pid: } if (!offset || !atomic_read(&map->nr_free)) { + if (!offset) + map--; next_map: map = next_free_map(map, &max_steps); if (!map) @@ -268,6 +270,9 @@ void switch_exec_pids(task_t *leader, ta * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or * more. */ +#ifdef CONFIG_KGDB +int kgdb_pid_init_done; /* so we don't call prior to... */ +#endif void __init pidhash_init(void) { int i, j, pidhash_size; @@ -289,6 +294,9 @@ void __init pidhash_init(void) for (j = 0; j < pidhash_size; j++) INIT_LIST_HEAD(&pid_hash[i][j]); } +#ifdef CONFIG_KGDB + kgdb_pid_init_done++; +#endif } void __init pidmap_init(void) --- diff/kernel/sched.c 2004-04-21 10:45:36.011228960 +0100 +++ source/kernel/sched.c 2004-04-21 10:46:51.796707816 +0100 @@ -15,6 +15,7 @@ * and per-CPU runqueues. Cleanups and useful suggestions * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin */ #include @@ -25,6 +26,8 @@ #include #include #include +#include +#include #include #include #include @@ -39,6 +42,8 @@ #include #include #include +#include +#include #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) @@ -72,6 +77,13 @@ #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#ifndef JIFFIES_TO_MSEC +# define JIFFIES_TO_MSEC(x) ((x) * 1000 / HZ) +#endif +#ifndef MSEC_TO_JIFFIES +# define MSEC_TO_JIFFIES(x) ((x) * HZ / 1000) +#endif + /* * These are the 'tuning knobs' of the scheduler: * @@ -91,7 +103,6 @@ #define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) #define STARVATION_LIMIT (MAX_SLEEP_AVG) #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define NODE_THRESHOLD 125 #define CREDIT_LIMIT 100 /* @@ -173,11 +184,13 @@ ((MAX_TIMESLICE - MIN_TIMESLICE) * \ (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) -static inline unsigned int task_timeslice(task_t *p) +static unsigned int task_timeslice(task_t *p) { return BASE_TIMESLICE(p); } +#define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) + /* * These are the runqueue data structures: */ @@ -187,7 +200,7 @@ static inline unsigned int task_timeslic typedef struct runqueue runqueue_t; struct prio_array { - int nr_active; + unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; }; @@ -201,37 +214,63 @@ struct prio_array { */ struct runqueue { spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. 
+ */ + unsigned long nr_running; +#ifdef CONFIG_SMP + unsigned long cpu_load; +#endif unsigned long long nr_switches; - unsigned long nr_running, expired_timestamp, nr_uninterruptible, - timestamp_last_tick; + unsigned long expired_timestamp, nr_uninterruptible; + unsigned long long timestamp_last_tick; task_t *curr, *idle; + struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; - int best_expired_prio, prev_cpu_load[NR_CPUS]; -#ifdef CONFIG_NUMA - atomic_t *node_nr_running; - int prev_node_load[MAX_NUMNODES]; -#endif + int best_expired_prio; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + task_t *migration_thread; struct list_head migration_queue; - - atomic_t nr_iowait; +#endif +#ifdef CONFIG_SCHEDSTATS + /* sys_sched_yield stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule stats */ + unsigned long sched_cnt; + unsigned long sched_switch; + unsigned long sched_idle; + + /* wake stats */ + unsigned long sched_wake; + unsigned long sched_wake_local; +#endif }; static DEFINE_PER_CPU(struct runqueue, runqueues); +#define for_each_domain(cpu, domain) \ + for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) + #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -extern unsigned long __scheduling_functions_start_here; -extern unsigned long __scheduling_functions_end_here; -const unsigned long scheduling_functions_start_here = - (unsigned long)&__scheduling_functions_start_here; -const unsigned long scheduling_functions_end_here = - (unsigned long)&__scheduling_functions_end_here; - /* * Default context-switch locking: */ @@ -241,57 +280,12 @@ const unsigned long scheduling_functions # define task_running(rq, p) ((rq)->curr == (p)) #endif -#ifdef CONFIG_NUMA - -/* - * Keep track of running tasks. - */ - -static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp = - {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)}; - -static inline void nr_running_init(struct runqueue *rq) -{ - rq->node_nr_running = &node_nr_running[0]; -} - -static inline void nr_running_inc(runqueue_t *rq) -{ - atomic_inc(rq->node_nr_running); - rq->nr_running++; -} - -static inline void nr_running_dec(runqueue_t *rq) -{ - atomic_dec(rq->node_nr_running); - rq->nr_running--; -} - -__init void node_nr_running_init(void) -{ - int i; - - for (i = 0; i < NR_CPUS; i++) { - if (cpu_possible(i)) - cpu_rq(i)->node_nr_running = - &node_nr_running[cpu_to_node(i)]; - } -} - -#else /* !CONFIG_NUMA */ - -# define nr_running_init(rq) do { } while (0) -# define nr_running_inc(rq) do { (rq)->nr_running++; } while (0) -# define nr_running_dec(rq) do { (rq)->nr_running--; } while (0) - -#endif /* CONFIG_NUMA */ - /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. 
*/ -static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) { struct runqueue *rq; @@ -311,10 +305,95 @@ static inline void task_rq_unlock(runque spin_unlock_irqrestore(&rq->lock, *flags); } + +#ifdef CONFIG_SCHEDSTATS + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 7 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int i; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_cpu(i) { + /* Include offline CPUs */ + runqueue_t *rq = cpu_rq(i); +#ifdef CONFIG_SMP + struct sched_domain *sd; + int j = 0; +#endif + + seq_printf(seq, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu", + i, + rq->yld_both_empty, rq->yld_act_empty, + rq->yld_exp_empty, rq->yld_cnt, + rq->sched_switch, rq->sched_cnt, + rq->sched_idle, rq->sched_wake, rq->sched_wake_local); +#ifdef CONFIG_SMP + for_each_domain(i, sd) { + char str[NR_CPUS]; + int k; + cpumask_scnprintf(str, NR_CPUS, sd->span); + seq_printf(seq, " domain%d %s", j++, str); + + for (k = 0; k < 3; k++) { + seq_printf(seq, " %lu %lu %lu %lu %lu %lu", + sd->lb_cnt[k], sd->lb_balanced[k], + sd->lb_failed[k], sd->lb_pulled[k], + sd->lb_hot_pulled[k], sd->lb_imbalance[k]); + } + + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", + sd->alb_cnt, sd->alb_failed, + sd->alb_pushed, sd->sched_wake_remote, + sd->plb_pulled, sd->afw_pulled, + sd->sbe_pushed, sd->sbc_pushed); + } +#endif + + seq_printf(seq, "\n"); + } + + return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned size = 4096 * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + /* * rq_lock - lock a given runqueue and disable interrupts. 
*/ -static inline runqueue_t *this_rq_lock(void) +static runqueue_t *this_rq_lock(void) { runqueue_t *rq; @@ -333,7 +412,7 @@ static inline void rq_unlock(runqueue_t /* * Adding/removing a task to/from a priority array: */ -static inline void dequeue_task(struct task_struct *p, prio_array_t *array) +static void dequeue_task(struct task_struct *p, prio_array_t *array) { array->nr_active--; list_del(&p->run_list); @@ -341,7 +420,7 @@ static inline void dequeue_task(struct t __clear_bit(p->prio, array->bitmap); } -static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +static void enqueue_task(struct task_struct *p, prio_array_t *array) { list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); @@ -349,6 +428,21 @@ static inline void enqueue_task(struct t p->array = array; } +#ifdef CONFIG_SMP +/* + * Used by the migration code - we pull tasks from the head of the + * remote queue so we want these tasks to show up at the head of the + * local queue: + */ +static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) +{ + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} +#endif + /* * effective_prio - return the priority that is based on the static * priority but is modified by bonuses/penalties. @@ -386,7 +480,7 @@ static int effective_prio(task_t *p) static inline void __activate_task(task_t *p, runqueue_t *rq) { enqueue_task(p, rq->active); - nr_running_inc(rq); + rq->nr_running++; } static void recalc_task_prio(task_t *p, unsigned long long now) @@ -469,7 +563,7 @@ static void recalc_task_prio(task_t *p, * Update all the scheduling statistics stuff. (sleep average * calculation, priority modifiers, etc.) */ -static inline void activate_task(task_t *p, runqueue_t *rq) +static void activate_task(task_t *p, runqueue_t *rq) { unsigned long long now = sched_clock(); @@ -505,9 +599,9 @@ static inline void activate_task(task_t /* * deactivate_task - remove a task from the runqueue. */ -static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +static void deactivate_task(struct task_struct *p, runqueue_t *rq) { - nr_running_dec(rq); + rq->nr_running--; if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; dequeue_task(p, p->array); @@ -521,9 +615,9 @@ static inline void deactivate_task(struc * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -static inline void resched_task(task_t *p) -{ #ifdef CONFIG_SMP +static void resched_task(task_t *p) +{ int need_resched, nrpolling; preempt_disable(); @@ -535,10 +629,13 @@ static inline void resched_task(task_t * if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) smp_send_reschedule(task_cpu(p)); preempt_enable(); +} #else +static inline void resched_task(task_t *p) +{ set_tsk_need_resched(p); -#endif } +#endif /** * task_curr - is this task currently executing on a CPU? @@ -550,40 +647,46 @@ inline int task_curr(task_t *p) } #ifdef CONFIG_SMP +enum request_type { + REQ_MOVE_TASK, + REQ_SET_DOMAIN, +}; + typedef struct { struct list_head list; + enum request_type type; + + /* For REQ_MOVE_TASK */ task_t *task; + int dest_cpu; + + /* For REQ_SET_DOMAIN */ + struct sched_domain *sd; + struct completion done; } migration_req_t; /* - * The task's runqueue lock must be held, and the new mask must be valid. + * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. 
*/ -static int __set_cpus_allowed(task_t *p, cpumask_t new_mask, - migration_req_t *req) +static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) { runqueue_t *rq = task_rq(p); - p->cpus_allowed = new_mask; - /* - * Can the task run on the task's current CPU? If not then - * migrate the thread off to a proper CPU. - */ - if (cpu_isset(task_cpu(p), new_mask)) - return 0; - /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ if (!p->array && !task_running(rq, p)) { - set_task_cpu(p, any_online_cpu(p->cpus_allowed)); + set_task_cpu(p, dest_cpu); return 0; } init_completion(&req->done); + req->type = REQ_MOVE_TASK; req->task = p; + req->dest_cpu = dest_cpu; list_add(&req->list, &rq->migration_queue); return 1; } @@ -638,6 +741,71 @@ void kick_process(task_t *p) EXPORT_SYMBOL_GPL(kick_process); +/* + * Return a low guess at the load of a migration-source cpu. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static inline unsigned long source_load(int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + + return min(rq->cpu_load, load_now); +} + +/* + * Return a high guess at the load of a migration-target cpu + */ +static inline unsigned long target_load(int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + + return max(rq->cpu_load, load_now); +} + +#endif + +/* + * wake_idle() is useful especially on SMT architectures to wake a + * task onto an idle sibling if we would otherwise wake it onto a + * busy sibling. + * + * Returns the CPU we should wake onto. + */ +#if defined(ARCH_HAS_SCHED_WAKE_IDLE) +static int wake_idle(int cpu, task_t *p) +{ + cpumask_t tmp; + runqueue_t *rq = cpu_rq(cpu); + struct sched_domain *sd; + int i; + + if (idle_cpu(cpu)) + return cpu; + + sd = rq->sd; + if (!(sd->flags & SD_WAKE_IDLE)) + return cpu; + + cpus_and(tmp, sd->span, cpu_online_map); + for_each_cpu_mask(i, tmp) { + if (!cpu_isset(i, p->cpus_allowed)) + continue; + + if (idle_cpu(i)) + return i; + } + + return cpu; +} +#else +static inline int wake_idle(int cpu, task_t *p) +{ + return cpu; +} #endif /*** @@ -656,52 +824,151 @@ EXPORT_SYMBOL_GPL(kick_process); */ static int try_to_wake_up(task_t * p, unsigned int state, int sync) { + int cpu, this_cpu, success = 0; unsigned long flags; - int success = 0; long old_state; runqueue_t *rq; +#ifdef CONFIG_SMP + unsigned long load, this_load; + struct sched_domain *sd; + int new_cpu; +#endif -repeat_lock_task: rq = task_rq_lock(p, &flags); old_state = p->state; - if (old_state & state) { - if (!p->array) { + if (!(old_state & state)) + goto out; + + if (p->array) + goto out_running; + + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + + schedstat_inc(rq, sched_wake); +#ifndef CONFIG_SMP + schedstat_inc(rq, sched_wake_local); +#endif + +#ifdef CONFIG_SMP +#ifdef CONFIG_SCHEDSTATS + if (cpu == this_cpu) + schedstat_inc(rq, sched_wake_local); + else { + for_each_domain(this_cpu, sd) + if (cpu_isset(cpu, sd->span)) + break; + if (sd) + schedstat_inc(sd, sched_wake_remote); + } +#endif + + if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu))) + goto out_activate; + + new_cpu = cpu; + + if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + goto out_set_cpu; + + load = source_load(cpu); + this_load = target_load(this_cpu); + + /* Don't pull the task off an idle CPU to a busy one */ + if (load < 
SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) + goto out_set_cpu; + + new_cpu = this_cpu; /* Wake to this CPU if we can */ + + /* + * Scan domains for affine wakeup and passive balancing + * possibilities. + */ + for_each_domain(this_cpu, sd) { + unsigned int imbalance; + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; + + if ( ((sd->flags & SD_WAKE_AFFINE) && + !task_hot(p, rq->timestamp_last_tick, sd)) + || ((sd->flags & SD_WAKE_BALANCE) && + imbalance*this_load <= 100*load) ) { + /* - * Fast-migrate the task if it's not running or runnable - * currently. Do not violate hard affinity. + * Now sd has SD_WAKE_AFFINE and p is cache cold in sd + * or sd has SD_WAKE_BALANCE and there is an imbalance */ - if (unlikely(sync && !task_running(rq, p) && - (task_cpu(p) != smp_processor_id()) && - cpu_isset(smp_processor_id(), - p->cpus_allowed) && - !cpu_is_offline(smp_processor_id()))) { - set_task_cpu(p, smp_processor_id()); - task_rq_unlock(rq, &flags); - goto repeat_lock_task; - } - if (old_state == TASK_UNINTERRUPTIBLE) { - rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; - } - if (sync && (task_cpu(p) == smp_processor_id())) - __activate_task(p, rq); - else { - activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + if (cpu_isset(cpu, sd->span)) { +#ifdef CONFIG_SCHEDSTATS + if ((sd->flags & SD_WAKE_AFFINE) && + !task_hot(p, rq->timestamp_last_tick, sd)) + schedstat_inc(sd, afw_pulled); + else if ((sd->flags & SD_WAKE_BALANCE) && + imbalance*this_load <= 100*load) + schedstat_inc(sd, plb_pulled); +#endif + goto out_set_cpu; } - success = 1; } - p->state = TASK_RUNNING; } + + new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ +out_set_cpu: + new_cpu = wake_idle(new_cpu, p); + if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) { + set_task_cpu(p, new_cpu); + task_rq_unlock(rq, &flags); + /* might preempt at this point */ + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + if (p->array) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + } + +out_activate: +#endif /* CONFIG_SMP */ + if (old_state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->activated = -1; + } + + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ + if (sync && cpu == this_cpu) { + __activate_task(p, rq); + } else { + activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + success = 1; + +out_running: + p->state = TASK_RUNNING; +out: task_rq_unlock(rq, &flags); return success; } + int fastcall wake_up_process(task_t * p) { return try_to_wake_up(p, TASK_STOPPED | @@ -756,8 +1023,8 @@ void fastcall sched_fork(task_t *p) p->timestamp = sched_clock(); if (!current->time_slice) { /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. 
Taking the + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. */ current->time_slice = 1; @@ -805,7 +1072,7 @@ void fastcall wake_up_forked_process(tas list_add_tail(&p->run_list, &current->run_list); p->array = current->array; p->array->nr_active++; - nr_running_inc(rq); + rq->nr_running++; } task_rq_unlock(rq, &flags); } @@ -856,7 +1123,7 @@ void fastcall sched_exit(task_t * p) * with the lock held can cause deadlocks; see schedule() for * details.) */ -static inline void finish_task_switch(task_t *prev) +static void finish_task_switch(task_t *prev) { runqueue_t *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; @@ -873,7 +1140,7 @@ static inline void finish_task_switch(ta * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. - * Manfred Spraul + * Manfred Spraul */ prev_task_flags = prev->flags; finish_arch_switch(rq, prev); @@ -935,7 +1202,7 @@ unsigned long nr_running(void) { unsigned long i, sum = 0; - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) sum += cpu_rq(i)->nr_running; return sum; @@ -977,7 +1244,7 @@ unsigned long nr_iowait(void) * Note this does not disable interrupts like task_rq_lock, * you need to do so manually before calling. */ -static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) +static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) { if (rq1 == rq2) spin_lock(&rq1->lock); @@ -998,14 +1265,149 @@ static inline void double_rq_unlock(runque * Note this does not restore interrupts like task_rq_unlock, * you need to do so manually after calling. */ -static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) +static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) { spin_unlock(&rq1->lock); if (rq1 != rq2) spin_unlock(&rq2->lock); } -#ifdef CONFIG_NUMA +enum idle_type +{ + IDLE, + NOT_IDLE, + NEWLY_IDLE, +}; + +#ifdef CONFIG_SMP + +/* + * find_idlest_cpu - find the least busy runqueue. + */ +static int find_idlest_cpu(struct task_struct *p, int this_cpu, + struct sched_domain *sd) +{ + unsigned long load, min_load, this_load; + int i, min_cpu; + cpumask_t mask; + + min_cpu = UINT_MAX; + min_load = ULONG_MAX; + + cpus_and(mask, sd->span, cpu_online_map); + cpus_and(mask, mask, p->cpus_allowed); + + for_each_cpu_mask(i, mask) { + load = target_load(i); + + if (load < min_load) { + min_cpu = i; + min_load = load; + + /* break out early on an idle CPU: */ + if (!min_load) + break; + } + } + + /* add +1 to account for the new task */ + this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; + + /* + * With the addition of the new task to the current CPU, + * would there be an imbalance between this CPU and the + * idlest CPU? + * + * Use half of the balancing threshold - new-context is + * a good opportunity to balance. + */ + if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) + return min_cpu; + + return this_cpu; +} + +/* + * wake_up_forked_thread - wake up a freshly forked thread. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, and it also does + * runqueue balancing.
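The halved-threshold test in find_idlest_cpu() above is easy to check by hand. A standalone sketch of the decision; SCHED_LOAD_SCALE = 128 and imbalance_pct = 125 are assumed here purely for illustration:

/* Mirror of the find_idlest_cpu() decision above, local names only. */
#define LOAD_SCALE	128	/* assumed SCHED_LOAD_SCALE */

static int place_on_idlest(unsigned long min_load, unsigned long this_load,
			   unsigned int imbalance_pct)
{
	this_load += LOAD_SCALE;	/* account for the new task */

	/* half of imbalance_pct's margin: fork is a cheap time to move */
	return min_load * (100 + (imbalance_pct - 100) / 2) < this_load * 100;
}

With imbalance_pct = 125 the factor is 112, so an idlest CPU at load 128 wins against a local CPU that would reach 256 with the child: 128 * 112 = 14336 < 25600 = 256 * 100, and the new context is placed remotely.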
+ */ +void fastcall wake_up_forked_thread(task_t * p) +{ + unsigned long flags; + int this_cpu = get_cpu(), cpu; + struct sched_domain *tmp, *sd = NULL; + runqueue_t *this_rq = cpu_rq(this_cpu), *rq; + + /* + * Find the largest domain that this CPU is part of that + * is willing to balance on clone: + */ + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_BALANCE_CLONE) + sd = tmp; + if (sd) + cpu = find_idlest_cpu(p, this_cpu, sd); + else + cpu = this_cpu; + + local_irq_save(flags); +lock_again: + rq = cpu_rq(cpu); + double_rq_lock(this_rq, rq); + + BUG_ON(p->state != TASK_RUNNING); + + /* + * We did find_idlest_cpu() unlocked, so in theory + * the mask could have changed - just don't migrate + * in this case: + */ + if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) { + cpu = this_cpu; + double_rq_unlock(this_rq, rq); + goto lock_again; + } + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. + */ + current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + + p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + + p->interactive_credit = 0; + + p->prio = effective_prio(p); + set_task_cpu(p, cpu); + + if (cpu == this_cpu) { + if (unlikely(!current->array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + list_add_tail(&p->run_list, &current->run_list); + p->array = current->array; + p->array->nr_active++; + rq->nr_running++; + } + } else { + schedstat_inc(sd, sbc_pushed); + __activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + + double_rq_unlock(this_rq, rq); + local_irq_restore(flags); + put_cpu(); +} + /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only @@ -1014,32 +1416,31 @@ static inline void double_rq_unlock(runq */ static void sched_migrate_task(task_t *p, int dest_cpu) { - runqueue_t *rq; migration_req_t req; + runqueue_t *rq; unsigned long flags; - cpumask_t old_mask, new_mask = cpumask_of_cpu(dest_cpu); lock_cpu_hotplug(); rq = task_rq_lock(p, &flags); - old_mask = p->cpus_allowed; - if (!cpu_isset(dest_cpu, old_mask) || !cpu_online(dest_cpu)) + if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; /* force the process onto the specified CPU */ - if (__set_cpus_allowed(p, new_mask, &req)) { + if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); wait_for_completion(&req.done); - /* If we raced with sys_sched_setaffinity, don't - * restore mask. */ - rq = task_rq_lock(p, &flags); - if (likely(cpus_equal(p->cpus_allowed, new_mask))) { - /* Restore old mask: won't need migration - * thread, since current cpu is allowed. */ - BUG_ON(__set_cpus_allowed(p, old_mask, NULL)); - } + /* + * we want a new context here. This eliminates TLB + * flushes on the cpus where the process executed prior to + * the migration. + */ + tlb_migrate_prepare(current->mm); + unlock_cpu_hotplug(); + + return; } out: task_rq_unlock(rq, &flags); @@ -1047,203 +1448,51 @@ out: } /* - * Find the least loaded CPU. Slightly favor the current CPU by - * setting its runqueue length as the minimum to start. + * sched_balance_exec(): find the highest-level, exec-balance-capable + * domain and try to migrate the task to the least loaded CPU.
+ * + * execve() is a valuable balancing opportunity, because at this point + * the task has the smallest effective memory and cache footprint. */ -static int sched_best_cpu(struct task_struct *p) -{ - int i, minload, load, best_cpu, node = 0; - cpumask_t cpumask; - - best_cpu = task_cpu(p); - if (cpu_rq(best_cpu)->nr_running <= 2) - return best_cpu; - - minload = 10000000; - for_each_node_with_cpus(i) { - /* - * Node load is always divided by nr_cpus_node to normalise - * load values in case cpu count differs from node to node. - * We first multiply node_nr_running by 10 to get a little - * better resolution. - */ - load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i); - if (load < minload) { - minload = load; - node = i; - } - } - - minload = 10000000; - cpumask = node_to_cpumask(node); - for (i = 0; i < NR_CPUS; ++i) { - if (!cpu_isset(i, cpumask)) - continue; - if (cpu_rq(i)->nr_running < minload) { - best_cpu = i; - minload = cpu_rq(i)->nr_running; - } - } - return best_cpu; -} - void sched_balance_exec(void) { - int new_cpu; + struct sched_domain *tmp, *sd = NULL; + int new_cpu, this_cpu = get_cpu(); - if (numnodes > 1) { - new_cpu = sched_best_cpu(current); - if (new_cpu != smp_processor_id()) - sched_migrate_task(current, new_cpu); - } -} + /* Prefer the current CPU if there's only this task running */ + if (this_rq()->nr_running <= 1) + goto out; -/* - * Find the busiest node. All previous node loads contribute with a - * geometrically deccaying weight to the load measure: - * load_{t} = load_{t-1}/2 + nr_node_running_{t} - * This way sudden load peaks are flattened out a bit. - * Node load is divided by nr_cpus_node() in order to compare nodes - * of different cpu count but also [first] multiplied by 10 to - * provide better resolution. - */ -static int find_busiest_node(int this_node) -{ - int i, node = -1, load, this_load, maxload; - - if (!nr_cpus_node(this_node)) - return node; - this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1) - + (10 * atomic_read(&node_nr_running[this_node]) - / nr_cpus_node(this_node)); - this_rq()->prev_node_load[this_node] = this_load; - for_each_node_with_cpus(i) { - if (i == this_node) - continue; - load = (this_rq()->prev_node_load[i] >> 1) - + (10 * atomic_read(&node_nr_running[i]) - / nr_cpus_node(i)); - this_rq()->prev_node_load[i] = load; - if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) { - maxload = load; - node = i; + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_BALANCE_EXEC) + sd = tmp; + + if (sd) { + new_cpu = find_idlest_cpu(current, this_cpu, sd); + if (new_cpu != this_cpu) { + schedstat_inc(sd, sbe_pushed); + put_cpu(); + sched_migrate_task(current, new_cpu); + return; } } - return node; +out: + put_cpu(); } -#endif /* CONFIG_NUMA */ - -#ifdef CONFIG_SMP - /* - * double_lock_balance - lock the busiest runqueue - * - * this_rq is locked already. Recalculate nr_running if we have to - * drop the runqueue lock. + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
*/ -static inline -unsigned int double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest, - int this_cpu, int idle, - unsigned int nr_running) +static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) { if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); - /* Need to recalculate nr_running */ - if (idle || (this_rq->nr_running > - this_rq->prev_cpu_load[this_cpu])) - nr_running = this_rq->nr_running; - else - nr_running = this_rq->prev_cpu_load[this_cpu]; } else spin_lock(&busiest->lock); } - return nr_running; -} - -/* - * find_busiest_queue - find the busiest runqueue among the cpus in cpumask. - */ -static inline -runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, - int *imbalance, cpumask_t cpumask) -{ - int nr_running, load, max_load, i; - runqueue_t *busiest, *rq_src; - - /* - * We search all runqueues to find the most busy one. - * We do this lockless to reduce cache-bouncing overhead, - * we re-check the 'best' source CPU later on again, with - * the lock held. - * - * We fend off statistical fluctuations in runqueue lengths by - * saving the runqueue length (as seen by the balancing CPU) during - * the previous load-balancing operation and using the smaller one - * of the current and saved lengths. If a runqueue is long enough - * for a longer amount of time then we recognize it and pull tasks - * from it. - * - * The 'current runqueue length' is a statistical maximum variable, - * for that one we take the longer one - to avoid fluctuations in - * the other direction. So for a load-balance to happen it needs - * stable long runqueue on the target CPU and stable short runqueue - * on the local runqueue. - * - * We make an exception if this CPU is about to become idle - in - * that case we are less picky about moving a task across CPUs and - * take what can be taken. - */ - if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu])) - nr_running = this_rq->nr_running; - else - nr_running = this_rq->prev_cpu_load[this_cpu]; - - busiest = NULL; - max_load = 1; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpumask)) - continue; - - rq_src = cpu_rq(i); - if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i])) - load = rq_src->nr_running; - else - load = this_rq->prev_cpu_load[i]; - this_rq->prev_cpu_load[i] = rq_src->nr_running; - - if ((load > max_load) && (rq_src != this_rq)) { - busiest = rq_src; - max_load = load; - } - } - - if (likely(!busiest)) - goto out; - - *imbalance = max_load - nr_running; - - /* It needs an at least ~25% imbalance to trigger balancing. */ - if (!idle && ((*imbalance)*4 < max_load)) { - busiest = NULL; - goto out; - } - - nr_running = double_lock_balance(this_rq, busiest, this_cpu, - idle, nr_running); - /* - * Make sure nothing changed since we checked the - * runqueue length. 
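double_rq_lock() earlier and double_lock_balance() above both rely on a single global lock order (ascending lock address) to stay deadlock-free when two CPUs try to take each other's runqueue locks at once. A pthread model of the same discipline; comparing addresses of unrelated objects is formally unspecified in ISO C, but this mirrors the kernel's idiom:

#include <pthread.h>

/* 'held' is already locked; also take 'busiest' without risking an
 * AB-BA deadlock against a CPU doing the same in the other order. */
static void double_lock_model(pthread_mutex_t *held, pthread_mutex_t *busiest)
{
	if (pthread_mutex_trylock(busiest) != 0) {
		if (busiest < held) {
			/* global order: lower address first */
			pthread_mutex_unlock(held);
			pthread_mutex_lock(busiest);
			pthread_mutex_lock(held);
		} else {
			pthread_mutex_lock(busiest);
		}
	}
}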
- */ - if (busiest->nr_running <= nr_running) { - spin_unlock(&busiest->lock); - busiest = NULL; - } -out: - return busiest; } /* @@ -1252,13 +1501,13 @@ out: */ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, - runqueue_t *this_rq, int this_cpu) + runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) { dequeue_task(p, src_array); - nr_running_dec(src_rq); + src_rq->nr_running--; set_task_cpu(p, this_cpu); - nr_running_inc(this_rq); - enqueue_task(p, this_rq->active); + this_rq->nr_running++; + enqueue_task_head(p, this_array); p->timestamp = sched_clock() - (src_rq->timestamp_last_tick - p->timestamp); /* @@ -1266,193 +1515,570 @@ void pull_task(runqueue_t *src_rq, prio_ * to be always true for them. */ if (TASK_PREEMPTS_CURR(p, this_rq)) - set_need_resched(); + resched_task(this_rq->curr); } /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static inline -int can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle) +int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, + struct sched_domain *sd, enum idle_type idle) { - unsigned long delta = rq->timestamp_last_tick - tsk->timestamp; - /* * We do not migrate tasks that are: * 1) running (obviously), or * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (task_running(rq, tsk)) - return 0; - if (!cpu_isset(this_cpu, tsk->cpus_allowed)) + if (task_running(rq, p)) return 0; - if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks))) + if (!cpu_isset(this_cpu, p->cpus_allowed)) return 0; + + /* Aggressive migration if we've failed balancing */ + if (idle == NEWLY_IDLE || + sd->nr_balance_failed < sd->cache_nice_tries) { + if (task_hot(p, rq->timestamp_last_tick, sd)) + return -1; + } + +#ifdef CONFIG_SCHEDSTATS + if (!task_hot(p, rq->timestamp_last_tick, sd)) + schedstat_inc(sd, lb_pulled[idle]); + else + schedstat_inc(sd, lb_hot_pulled[idle]); +#endif + return 1; } /* - * Current runqueue is empty, or rebalance tick: if there is an - * inbalance (current runqueue is too short) then pull from - * busiest runqueue(s). + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. + */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + int ret, idx, pulled = 0; + task_t *tmp; + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + + /* We first consider active tasks. 
*/ + if (busiest->active->nr_active) { + array = busiest->active; + dst_array = this_rq->active; + } else { + array = busiest->expired; + dst_array = this_rq->expired; + } + +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == busiest->active && busiest->expired->nr_active) { + array = busiest->expired; + dst_array = this_rq->expired; + goto new_array; + } + goto out; + } + + head = array->queue + idx; + curr = head->next; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->next; + + ret = can_migrate_task(tmp, busiest, this_cpu, sd, idle); + if (ret == -1) { + idx++; + goto skip_bitmap; + } + if (!ret) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out: + return pulled; +} + +/* + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the number of tasks which should be + * moved to restore balance via the imbalance parameter. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum idle_type idle) +{ + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; + + max_load = this_load = total_load = total_pwr = 0; + + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + local_group = cpu_isset(this_cpu, group->cpumask); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto nextgroup; + + for_each_cpu_mask(i, tmp) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = target_load(i); + else + load = source_load(i); + + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + goto nextgroup; + } else if (avg_load > max_load) { + max_load = avg_load; + busiest = group; + } +nextgroup: + group = group->next; + } while (group != sd->groups); + + if (!busiest || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + if (this_load >= avg_load || + 100*max_load <= sd->imbalance_pct*this_load) + goto out_balanced; + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. 
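The skip_bitmap/skip_queue walk in move_tasks() above depends only on the priority-array invariant: bit p in the bitmap is set exactly when queue[p] is non-empty. A self-contained model of the bit scan, with a local stand-in for the sched_find_first_bit()/find_next_bit() pair:

#include <limits.h>

#define MAX_PRIO	140
#define LBITS		(CHAR_BIT * sizeof(unsigned long))

/* Return the first set bit at or after idx (lower index = higher
 * priority), or MAX_PRIO when this array is exhausted and the caller
 * switches over to the expired array or stops. */
static int next_set_prio(const unsigned long *bitmap, int idx)
{
	for (; idx < MAX_PRIO; idx++)
		if (bitmap[idx / LBITS] & (1UL << (idx % LBITS)))
			return idx;
	return MAX_PRIO;
}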
+ */ + *imbalance = min(max_load - avg_load, avg_load - this_load); + + /* How much load to actually move to equalise the imbalance */ + *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) + / SCHED_LOAD_SCALE; + + if (*imbalance < SCHED_LOAD_SCALE - 1) { + unsigned long pwr_now = 0, pwr_move = 0; + unsigned long tmp; + + if (max_load - this_load >= SCHED_LOAD_SCALE*2) { + *imbalance = 1; + return busiest; + } + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); + pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now /= SCHED_LOAD_SCALE; + + /* Amount of load we'd subtract */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + if (max_load > tmp) + pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, + max_load - tmp); + + /* Amount of load we'd add */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; + if (max_load < tmp) + tmp = max_load; + pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; + + /* Move if we gain another 8th of a CPU worth of throughput */ + if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) + goto out_balanced; + + *imbalance = 1; + return busiest; + } + + /* Get rid of the scaling factor, rounding down as we divide */ + *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; + + return busiest; + +out_balanced: + if (busiest && (idle == NEWLY_IDLE || + (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) { + *imbalance = 1; + return busiest; + } + + *imbalance = 0; + return NULL; +} + +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group. + */ +static runqueue_t *find_busiest_queue(struct sched_group *group) +{ + cpumask_t tmp; + unsigned long load, max_load = 0; + runqueue_t *busiest = NULL; + int i; + + cpus_and(tmp, group->cpumask, cpu_online_map); + for_each_cpu_mask(i, tmp) { + load = source_load(i); + + if (load > max_load) { + max_load = load; + busiest = cpu_rq(i); + } + } + + return busiest; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called with this_rq unlocked. 
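A worked instance of the clamped-imbalance arithmetic above, under assumed inputs (two single-CPU groups of equal cpu_power, SCHED_LOAD_SCALE = 128, loads already scaled):

/* Assumed loads: this_load = 128 (one task), max_load = 512 (four). */
static unsigned long tasks_to_pull(void)
{
	unsigned long this_load = 128, max_load = 512;
	unsigned long avg_load = 320;	/* 128 * (128 + 512) / 256 */
	unsigned long min_power = 128;	/* equal cpu_power on both sides */
	unsigned long imb;

	imb = max_load - avg_load < avg_load - this_load
		? max_load - avg_load : avg_load - this_load;	/* 192 */
	imb = imb * min_power / 128;		/* still 192, powers equal */
	return (imb + 1) / 128;			/* one task gets moved */
}

Moving that one task leaves 384 versus 256, both nearer the 320 average, which is exactly the no-overshoot property the comment above argues for.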
+ */ +static int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + struct sched_group *group; + runqueue_t *busiest; + unsigned long imbalance; + int nr_moved; + + spin_lock(&this_rq->lock); + schedstat_inc(sd, lb_cnt[idle]); + + group = find_busiest_group(sd, this_cpu, &imbalance, idle); + if (!group) { + schedstat_inc(sd, lb_balanced[idle]); + goto out_balanced; + } + + busiest = find_busiest_queue(group); + if (!busiest) { + schedstat_inc(sd, lb_balanced[idle]); + goto out_balanced; + } + + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } + schedstat_add(sd, lb_imbalance[idle], imbalance); + + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + + nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle); + spin_unlock(&this_rq->lock); + spin_unlock(&busiest->lock); + + if (!nr_moved) { + schedstat_inc(sd, lb_failed[idle]); + sd->nr_balance_failed++; + + if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { + int wake = 0; + + spin_lock(&busiest->lock); + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + wake = 1; + } + spin_unlock(&busiest->lock); + if (wake) + wake_up_process(busiest->migration_thread); + + /* + * We've kicked active balancing, reset the failure + * counter. + */ + sd->nr_balance_failed = sd->cache_nice_tries; + } + } else + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + + return nr_moved; + +out_balanced: + spin_unlock(&this_rq->lock); + + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. * - * We call this with the current runqueue locked, - * irqs disabled. + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * this_rq is locked. */ -static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask) +static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd) { - int imbalance, idx, this_cpu = smp_processor_id(); - runqueue_t *busiest; - prio_array_t *array; - struct list_head *head, *curr; - task_t *tmp; - - if (cpu_is_offline(this_cpu)) + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int nr_moved = 0; + + schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); + if (!group) { + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); goto out; + } - busiest = find_busiest_queue(this_rq, this_cpu, idle, - &imbalance, cpumask); - if (!busiest) + busiest = find_busiest_queue(group); + if (!busiest || busiest == this_rq) { + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); goto out; + } - /* - * We only want to steal a number of tasks equal to 1/2 the imbalance, - * otherwise we'll just shift the imbalance to the new queue: - */ - imbalance /= 2; + schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. 
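The failure bookkeeping in load_balance() above forms a small state machine: repeated failed attempts escalate to active balancing, any detected imbalance re-arms the aggressive minimum interval, and a balanced pass backs the interval off. A reduced sketch with local names; the struct is a stand-in for the relevant sched_domain fields, not kernel API:

struct balance_state {
	unsigned int nr_balance_failed;
	unsigned int cache_nice_tries;		/* small, e.g. 1 or 2 */
	unsigned int balance_interval;		/* ms */
	unsigned int min_interval, max_interval;
};

/* kick() is a caller-supplied stand-in for waking the busiest CPU's
 * migration thread with active_balance set. */
static void note_imbalanced_pass(struct balance_state *s, int nr_moved,
				 void (*kick)(void))
{
	if (!nr_moved) {
		if (++s->nr_balance_failed > s->cache_nice_tries + 2) {
			kick();	/* push a running task off the busiest CPU */
			/* don't immediately re-kick on the next failure */
			s->nr_balance_failed = s->cache_nice_tries;
		}
	} else
		s->nr_balance_failed = 0;

	/* an imbalance existed, so balance eagerly again soon */
	s->balance_interval = s->min_interval;
}

static void note_balanced_pass(struct balance_state *s)
{
	/* nothing to do: back off exponentially, up to a ceiling */
	if (s->balance_interval < s->max_interval)
		s->balance_interval *= 2;
}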
- */ - if (busiest->expired->nr_active) - array = busiest->expired; - else - array = busiest->active; + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); -new_array: - /* Start searching at priority 0: */ - idx = 0; -skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired) { - array = busiest->active; - goto new_array; - } - goto out_unlock; - } + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, NEWLY_IDLE); - head = array->queue + idx; - curr = head->prev; -skip_queue: - tmp = list_entry(curr, task_t, run_list); + spin_unlock(&busiest->lock); - curr = curr->prev; +out: + return nr_moved; +} - if (!can_migrate_task(tmp, busiest, this_cpu, idle)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } - pull_task(busiest, array, tmp, this_rq, this_cpu); +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static inline void idle_balance(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd; - /* Only migrate one task if we are idle */ - if (!idle && --imbalance) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; + if (unlikely(cpu_is_offline(this_cpu))) + return; + + for_each_domain(this_cpu, sd) { + if (sd->flags & SD_BALANCE_NEWIDLE) { + if (load_balance_newidle(this_cpu, this_rq, sd)) { + /* We've pulled tasks over so stop searching */ + break; + } + } } -out_unlock: - spin_unlock(&busiest->lock); -out: - ; } /* - * One of the idle_cpu_tick() and busy_cpu_tick() functions will - * get called every timer tick, on every CPU. Our balancing action - * frequency and balancing agressivity depends on whether the CPU is - * idle or not. + * active_load_balance is run by migration threads. It pushes a running + * task off the cpu. It can be required to correctly have at least 1 task + * running on each physical CPU where possible, and not have a physical / + * logical imbalance. * - * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on - * systems with HZ=100, every 10 msecs.) - * - * On NUMA, do a node-rebalance every 400 msecs. + * Called with busiest locked. 
*/ -#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) -#define BUSY_REBALANCE_TICK (HZ/5 ?: 1) -#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) -#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2) - -#ifdef CONFIG_NUMA -static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) +static void active_load_balance(runqueue_t *busiest, int busiest_cpu) { - int node = find_busiest_node(cpu_to_node(this_cpu)); + struct sched_domain *sd; + struct sched_group *group, *busy_group; + int i; + int moved = 0; + + if (busiest->nr_running <= 1) + return; - if (node >= 0) { - cpumask_t cpumask = node_to_cpumask(node); - cpu_set(this_cpu, cpumask); - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpumask); - spin_unlock(&this_rq->lock); + for_each_domain(busiest_cpu, sd) + if (cpu_isset(busiest->push_cpu, sd->span)) + break; + if (!sd) { + WARN_ON(1); + return; } + schedstat_inc(sd, alb_cnt); + + group = sd->groups; + while (!cpu_isset(busiest_cpu, group->cpumask)) + group = group->next; + busy_group = group; + + group = sd->groups; + do { + cpumask_t tmp; + runqueue_t *rq; + int push_cpu = 0; + + if (group == busy_group) + goto next_group; + + cpus_and(tmp, group->cpumask, cpu_online_map); + if (!cpus_weight(tmp)) + goto next_group; + + for_each_cpu_mask(i, tmp) { + if (!idle_cpu(i)) + goto next_group; + push_cpu = i; + } + + rq = cpu_rq(push_cpu); + double_lock_balance(busiest, rq); + moved += move_tasks(rq, push_cpu, busiest, 1, sd, IDLE); + spin_unlock(&rq->lock); +next_group: + group = group->next; + } while (group != sd->groups); + + if (moved) + schedstat_add(sd, alb_pushed, moved); + else + schedstat_inc(sd, alb_failed); } -#endif -static void rebalance_tick(runqueue_t *this_rq, int idle) +/* + * rebalance_tick will get called every timer tick, on every CPU. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ + +/* Don't have all balancing operations going off at once */ +#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) + +static void rebalance_tick(int this_cpu, runqueue_t *this_rq, + enum idle_type idle) { -#ifdef CONFIG_NUMA - int this_cpu = smp_processor_id(); -#endif - unsigned long j = jiffies; + unsigned long old_load, this_load; + unsigned long j = jiffies + CPU_OFFSET(this_cpu); + struct sched_domain *sd; - /* - * First do inter-node rebalancing, then intra-node rebalancing, - * if both events happen in the same tick. The inter-node - * rebalancing does not necessarily have to create a perfect - * balance within the node, since we load-balance the most loaded - * node with the current CPU. (ie. other CPUs in the local node - * are not balanced.) 
- */ - if (idle) { -#ifdef CONFIG_NUMA - if (!(j % IDLE_NODE_REBALANCE_TICK)) - balance_node(this_rq, idle, this_cpu); -#endif - if (!(j % IDLE_REBALANCE_TICK)) { - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); - spin_unlock(&this_rq->lock); - } + if (unlikely(cpu_is_offline(this_cpu))) return; - } -#ifdef CONFIG_NUMA - if (!(j % BUSY_NODE_REBALANCE_TICK)) - balance_node(this_rq, idle, this_cpu); -#endif - if (!(j % BUSY_REBALANCE_TICK)) { - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); - spin_unlock(&this_rq->lock); + + /* Update our load */ + old_load = this_rq->cpu_load; + this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + this_rq->cpu_load = (old_load + this_load) / 2; + + for_each_domain(this_cpu, sd) { + unsigned long interval = sd->balance_interval; + + if (idle != IDLE) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = MSEC_TO_JIFFIES(interval); + if (unlikely(!interval)) + interval = 1; + + if (j - sd->last_balance >= interval) { + if (load_balance(this_cpu, this_rq, sd, idle)) { + /* We've pulled tasks over so no longer idle */ + idle = NOT_IDLE; + } + sd->last_balance += interval; + } } } #else /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(runqueue_t *this_rq, int idle) +static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) { } +static inline void idle_balance(int cpu, runqueue_t *rq) +{ +} +#endif + +static inline int wake_priority_sleeper(runqueue_t *rq) +{ +#ifdef CONFIG_SCHED_SMT + /* + * If an SMT sibling task has been put to sleep for priority + * reasons reschedule the idle task to see if it can now run. + */ + if (rq->nr_running) { + resched_task(rq->idle); + return 1; + } #endif + return 0; +} DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -1507,7 +2133,9 @@ void scheduler_tick(int user_ticks, int cpustat->iowait += sys_ticks; else cpustat->idle += sys_ticks; - rebalance_tick(rq, 1); + if (wake_priority_sleeper(rq)) + goto out; + rebalance_tick(cpu, rq, IDLE); return; } if (TASK_NICE(p) > 0) @@ -1591,9 +2219,94 @@ void scheduler_tick(int user_ticks, int out_unlock: spin_unlock(&rq->lock); out: - rebalance_tick(rq, 0); + rebalance_tick(cpu, rq, NOT_IDLE); +} + +#ifdef CONFIG_SCHED_SMT +static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) +{ + int i; + struct sched_domain *sd = rq->sd; + cpumask_t sibling_map; + + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return; + + cpus_and(sibling_map, sd->span, cpu_online_map); + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq; + + if (i == cpu) + continue; + + smt_rq = cpu_rq(i); + + /* + * If an SMT sibling task is sleeping due to priority + * reasons wake it up now. 
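The per-CPU offset added to jiffies above, combined with busy_factor scaling of balance_interval, spreads the balancing work out in time instead of letting every CPU fire in the same tick. A small illustration; HZ, NR_CPUS and the interval figures here are assumptions, not values from this patch:

/* Illustration only. */
#define HZ_X		1000
#define NR_CPUS_X	4
#define CPU_OFFSET_X(cpu)	(HZ_X * (cpu) / NR_CPUS_X)

/* Offsets land at 0, 250, 500 and 750 jiffies, so four CPUs sharing
 * one interval balance in four phases.  A domain with
 * balance_interval = 100ms and busy_factor = 8 is checked roughly
 * every 100ms while idle but only every 800ms while busy. */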
+ */ + if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) + resched_task(smt_rq->idle); + } +} + +static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) +{ + struct sched_domain *sd = rq->sd; + cpumask_t sibling_map; + int ret = 0, i; + + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return 0; + + cpus_and(sibling_map, sd->span, cpu_online_map); + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq; + task_t *smt_curr; + + if (i == cpu) + continue; + + smt_rq = cpu_rq(i); + smt_curr = smt_rq->curr; + + /* + * If a user task with lower static priority than the + * running task on the SMT sibling is trying to schedule, + * delay it till there is proportionately less timeslice + * left of the sibling task to prevent a lower priority + * task from using an unfair proportion of the + * physical cpu's resources. -ck + */ + if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(p) || rt_task(smt_curr)) && + p->mm && smt_curr->mm && !rt_task(p)) + ret = 1; + + /* + * Reschedule a lower priority task on the SMT sibling, + * or wake it up if it has been put to sleep for priority + * reasons. + */ + if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(smt_curr) || rt_task(p)) && + smt_curr->mm && p->mm && !rt_task(smt_curr)) || + (smt_curr == smt_rq->idle && smt_rq->nr_running)) + resched_task(smt_curr); + } + return ret; +} +#else +static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) +{ } +static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) +{ + return 0; +} +#endif + /* * schedule() is the main scheduler function. */ @@ -1606,7 +2319,7 @@ asmlinkage void __sched schedule(void) struct list_head *queue; unsigned long long now; unsigned long run_time; - int idx; + int cpu, idx; /* * Test if we are atomic. Since do_exit() needs to call into @@ -1626,6 +2339,7 @@ need_resched: rq = this_rq(); release_kernel_lock(prev); + schedstat_inc(rq, sched_cnt); now = sched_clock(); if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) run_time = now - prev->timestamp; @@ -1656,13 +2370,14 @@ need_resched: deactivate_task(prev, rq); } + cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { -#ifdef CONFIG_SMP - load_balance(rq, 1, cpu_to_node_mask(smp_processor_id())); -#endif + idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + schedstat_inc(rq, sched_idle); goto switch_tasks; } } @@ -1672,6 +2387,7 @@ need_resched: /* * Switch the active and expired arrays. 
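Stripped of the runqueue plumbing, the fairness test dependent_sleeper() applies above compares a discounted remaining slice on the SMT sibling against the incoming task's full slice. A local sketch; per_cpu_gain = 25 is an assumed figure and these function names are not kernel API:

/* Delay a non-RT incoming task while the task running on the SMT
 * sibling still holds more than a per_cpu_gain-discounted share of
 * the incoming task's full slice; RT siblings always win.  (The
 * kernel additionally requires both tasks to be user tasks, i.e.
 * have ->mm set, which is elided here.) */
static int delay_incoming(unsigned int sibling_slice_left,
			  unsigned int incoming_full_slice,
			  int sibling_is_rt, int incoming_is_rt,
			  unsigned int per_cpu_gain /* assumed 25 */)
{
	if (incoming_is_rt)
		return 0;	/* RT tasks are never delayed */
	return sibling_is_rt ||
	       sibling_slice_left * (100 - per_cpu_gain) / 100 >
	       incoming_full_slice;
}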
*/ + schedstat_inc(rq, sched_switch); rq->active = rq->expired; rq->expired = array; array = rq->active; @@ -1683,6 +2399,11 @@ need_resched: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); + if (dependent_sleeper(cpu, rq, next)) { + next = rq->idle; + goto switch_tasks; + } + if (!rt_task(next) && next->activated > 0) { unsigned long long delta = now - next->timestamp; @@ -1836,15 +2557,16 @@ void fastcall __wake_up_locked(wait_queu void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { unsigned long flags; + int sync = 1; if (unlikely(!q)) return; + if (unlikely(!nr_exclusive)) + sync = 0; + spin_lock_irqsave(&q->lock, flags); - if (likely(nr_exclusive)) - __wake_up_common(q, mode, nr_exclusive, 1); - else - __wake_up_common(q, mode, nr_exclusive, 0); + __wake_up_common(q, mode, nr_exclusive, sync); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ @@ -2015,6 +2737,13 @@ out_unlock: EXPORT_SYMBOL(set_user_nice); +#if defined(CONFIG_KGDB) +struct task_struct * kgdb_get_idle(int this_cpu) +{ + return cpu_rq(this_cpu)->idle; +} +#endif + #ifndef __alpha__ /* @@ -2199,7 +2928,7 @@ static int setscheduler(pid_t pid, int p if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (p->prio < rq->curr->prio) + } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } @@ -2398,7 +3127,9 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; + prio_array_t *target = rq->expired; + schedstat_inc(rq, yld_cnt); /* * We implement yielding by moving the task into the expired * queue. @@ -2406,13 +3137,12 @@ asmlinkage long sys_sched_yield(void) * (special rule: RT tasks will just roundrobin in the active * array.) */ - if (likely(!rt_task(current))) { - dequeue_task(current, array); - enqueue_task(current, rq->expired); - } else { - list_del(&current->run_list); - list_add_tail(&current->run_list, array->queue + current->prio); - } + if (unlikely(rt_task(current))) + target = rq->active; + + dequeue_task(current, array); + enqueue_task(current, target); + /* * Since we are going to call schedule() anyway, there's * no need to preempt: @@ -2723,7 +3453,12 @@ int set_cpus_allowed(task_t *p, cpumask_ goto out; } - if (__set_cpus_allowed(p, new_mask, &req)) { + p->cpus_allowed = new_mask; + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpu_isset(task_cpu(p), new_mask)) + goto out; + + if (migrate_task(p, any_online_cpu(new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -2737,22 +3472,34 @@ out: EXPORT_SYMBOL_GPL(set_cpus_allowed); -/* Move (not current) task off this cpu, onto dest cpu. */ -static void move_task_away(struct task_struct *p, int dest_cpu) +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_balance_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + */ +static void __migrate_task(struct task_struct *p, int dest_cpu) { runqueue_t *rq_dest; rq_dest = cpu_rq(dest_cpu); double_rq_lock(this_rq(), rq_dest); + /* Already moved.
*/ if (task_cpu(p) != smp_processor_id()) - goto out; /* Already moved */ + goto out; + /* Affinity changed (again). */ + if (!cpu_isset(dest_cpu, p->cpus_allowed)) + goto out; set_task_cpu(p, dest_cpu); if (p->array) { deactivate_task(p, this_rq()); activate_task(p, rq_dest); - if (p->prio < rq_dest->curr->prio) + if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); } p->timestamp = rq_dest->timestamp_last_tick; @@ -2782,7 +3529,13 @@ static int migration_thread(void * data) refrigerator(PF_FREEZE); spin_lock_irq(&rq->lock); + if (rq->active_balance) { + active_load_balance(rq, cpu); + rq->active_balance = 0; + } + head = &rq->migration_queue; + current->state = TASK_INTERRUPTIBLE; if (list_empty(head)) { spin_unlock_irq(&rq->lock); @@ -2791,11 +3544,19 @@ static int migration_thread(void * data) } req = list_entry(head->next, migration_req_t, list); list_del_init(head->next); + spin_unlock(&rq->lock); - move_task_away(req->task, - any_online_cpu(req->task->cpus_allowed)); + if (req->type == REQ_MOVE_TASK) { + __migrate_task(req->task, req->dest_cpu); + } else if (req->type == REQ_SET_DOMAIN) { + rq->sd = req->sd; + } else { + WARN_ON(1); + } + local_irq_enable(); + complete(&req->done); } return 0; @@ -2851,7 +3612,7 @@ void migrate_all_tasks(void) tsk->pid, tsk->comm, src_cpu); } - move_task_away(tsk, dest_cpu); + __migrate_task(tsk, dest_cpu); } while_each_thread(t, tsk); write_unlock(&tasklist_lock); @@ -2930,23 +3691,299 @@ int __init migration_init(void) spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; EXPORT_SYMBOL(kernel_flag); +#ifdef CONFIG_SMP +/* Attach the domain 'sd' to 'cpu' as its base domain */ +void cpu_attach_domain(struct sched_domain *sd, int cpu) +{ + migration_req_t req; + unsigned long flags; + runqueue_t *rq = cpu_rq(cpu); + int local = 1; + + lock_cpu_hotplug(); + + spin_lock_irqsave(&rq->lock, flags); + + if (cpu == smp_processor_id() || cpu_is_offline(cpu)) { + rq->sd = sd; + } else { + init_completion(&req.done); + req.type = REQ_SET_DOMAIN; + req.sd = sd; + list_add(&req.list, &rq->migration_queue); + local = 0; + } + + spin_unlock_irqrestore(&rq->lock, flags); + + if (!local) { + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + } + + unlock_cpu_hotplug(); +} + +#ifdef ARCH_HAS_SCHED_DOMAIN +extern void __init arch_init_sched_domains(void); +#else +static struct sched_group sched_group_cpus[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +#ifdef CONFIG_NUMA +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static void __init arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_node = NULL, *last_node = NULL; + + /* Set up domains */ + for_each_cpu(i) { + int node = cpu_to_node(i); + cpumask_t nodemask = node_to_cpumask(node); + struct sched_domain *node_sd = &per_cpu(node_domains, i); + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + + *node_sd = SD_NODE_INIT; + node_sd->span = cpu_possible_map; + node_sd->groups = &sched_group_nodes[cpu_to_node(i)]; + + *cpu_sd = SD_CPU_INIT; + cpus_and(cpu_sd->span, nodemask, cpu_possible_map); + cpu_sd->groups = &sched_group_cpus[i]; + cpu_sd->parent = node_sd; + } + + /* Set up groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t tmp = node_to_cpumask(i); + cpumask_t nodemask; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + struct sched_group *node = &sched_group_nodes[i]; + int j; + + cpus_and(nodemask, tmp, cpu_possible_map); 
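+		/*
+		 * Each populated node becomes one sched_group; the per-CPU
+		 * groups inside it are linked into a ring below, and the
+		 * node groups are themselves chained into a ring, which is
+		 * what lets find_busiest_group() visit every group with a
+		 * do { } while (group != sd->groups) walk.
+		 */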
+ + if (cpus_empty(nodemask)) + continue; + + node->cpumask = nodemask; + node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask); + + for_each_cpu_mask(j, node->cpumask) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + if (!first_node) + first_node = node; + if (last_node) + last_node->next = node; + last_node = node; + } + last_node->next = first_node; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); + } +} + +#else /* !CONFIG_NUMA */ +static void __init arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + + *cpu_sd = SD_CPU_INIT; + cpu_sd->cache_nice_tries = 2; + cpu_sd->span = cpu_possible_map; + cpu_sd->groups = &sched_group_cpus[i]; + } + + /* Set up CPU groups */ + for_each_cpu_mask(i, cpu_possible_map) { + struct sched_group *cpu = &sched_group_cpus[i]; + + cpus_clear(cpu->cpumask); + cpu_set(i, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); /* domains were modified outside the lock */ + for_each_cpu(i) { + struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_sd, i); + } +} + +#endif /* CONFIG_NUMA */ +#endif /* ARCH_HAS_SCHED_DOMAIN */ + +#define SCHED_DOMAIN_DEBUG +#ifdef SCHED_DOMAIN_DEBUG +void sched_domain_debug(void) +{ + int i; + + for_each_cpu(i) { + runqueue_t *rq = cpu_rq(i); + struct sched_domain *sd; + int level = 0; + + sd = rq->sd; + + printk(KERN_DEBUG "CPU%d: %s\n", + i, (cpu_online(i) ? 
" online" : "offline")); + + do { + int j; + char str[NR_CPUS]; + struct sched_group *group = sd->groups; + cpumask_t groupmask, tmp; + + cpumask_scnprintf(str, NR_CPUS, sd->span); + cpus_clear(groupmask); + + printk(KERN_DEBUG); + for (j = 0; j < level + 1; j++) + printk(" "); + printk("domain %d: span %s\n", level, str); + + if (!cpu_isset(i, sd->span)) + printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); + if (!cpu_isset(i, group->cpumask)) + printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); + if (!group->cpu_power) + printk(KERN_DEBUG "ERROR domain->cpu_power not set\n"); + + printk(KERN_DEBUG); + for (j = 0; j < level + 2; j++) + printk(" "); + printk("groups:"); + do { + if (!group) { + printk(" ERROR: NULL"); + break; + } + + if (!cpus_weight(group->cpumask)) + printk(" ERROR empty group:"); + + cpus_and(tmp, groupmask, group->cpumask); + if (cpus_weight(tmp) > 0) + printk(" ERROR repeated CPUs:"); + + cpus_or(groupmask, groupmask, group->cpumask); + + cpumask_scnprintf(str, NR_CPUS, group->cpumask); + printk(" %s", str); + + group = group->next; + } while (group != sd->groups); + printk("\n"); + + if (!cpus_equal(sd->span, groupmask)) + printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); + + level++; + sd = sd->parent; + + if (sd) { + cpus_and(tmp, groupmask, sd->span); + if (!cpus_equal(tmp, groupmask)) + printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); + } + + } while (sd); + } +} +#else +#define sched_domain_debug() {} +#endif + +void __init sched_init_smp(void) +{ + arch_init_sched_domains(); + sched_domain_debug(); +} +#else +void __init sched_init_smp(void) +{ +} +#endif /* CONFIG_SMP */ + +int in_sched_functions(unsigned long addr) +{ + /* Linker adds these: start and end of __sched functions */ + extern char __sched_text_start[], __sched_text_end[]; + return addr >= (unsigned long)__sched_text_start + && addr < (unsigned long)__sched_text_end; +} + void __init sched_init(void) { runqueue_t *rq; int i, j, k; +#ifdef CONFIG_SMP + /* Set up an initial dummy domain for early boot */ + static struct sched_domain sched_domain_init; + static struct sched_group sched_group_init; + cpumask_t cpu_mask_all = CPU_MASK_ALL; + + memset(&sched_domain_init, 0, sizeof(struct sched_domain)); + sched_domain_init.span = cpu_mask_all; + sched_domain_init.groups = &sched_group_init; + sched_domain_init.last_balance = jiffies; + sched_domain_init.balance_interval = INT_MAX; /* Don't balance */ + + memset(&sched_group_init, 0, sizeof(struct sched_group)); + sched_group_init.cpumask = cpu_mask_all; + sched_group_init.next = &sched_group_init; + sched_group_init.cpu_power = SCHED_LOAD_SCALE; +#endif + for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; rq = cpu_rq(i); + spin_lock_init(&rq->lock); rq->active = rq->arrays; rq->expired = rq->arrays + 1; rq->best_expired_prio = MAX_PRIO; - spin_lock_init(&rq->lock); +#ifdef CONFIG_SMP + rq->sd = &sched_domain_init; + rq->cpu_load = 0; + rq->active_balance = 0; + rq->push_cpu = 0; + rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); +#endif atomic_set(&rq->nr_iowait, 0); - nr_running_init(rq); for (j = 0; j < 2; j++) { array = rq->arrays + j; @@ -2968,8 +4005,6 @@ void __init sched_init(void) set_task_cpu(current, smp_processor_id()); wake_up_forked_process(current); - init_timers(); - /* * The boot idle thread does lazy MMU switching as well: */ --- diff/kernel/signal.c 2004-04-21 10:45:36.012228808 +0100 +++ source/kernel/signal.c 2004-04-21 10:46:51.797707664 
+0100 @@ -1403,12 +1403,12 @@ static void __wake_up_parent(struct task * Fortunately this is not necessary for thread groups: */ if (p->tgid == tsk->tgid) { - wake_up_interruptible(&tsk->wait_chldexit); + wake_up_interruptible_sync(&tsk->wait_chldexit); return; } do { - wake_up_interruptible(&tsk->wait_chldexit); + wake_up_interruptible_sync(&tsk->wait_chldexit); tsk = next_thread(tsk); if (tsk->signal != parent->signal) BUG(); @@ -2568,7 +2568,5 @@ void __init signals_init(void) kmem_cache_create("sigqueue", sizeof(struct sigqueue), __alignof__(struct sigqueue), - 0, NULL, NULL); - if (!sigqueue_cachep) - panic("signals_init(): cannot create sigqueue SLAB cache"); + SLAB_PANIC, NULL, NULL); } --- diff/kernel/softirq.c 2004-04-21 10:45:36.013228656 +0100 +++ source/kernel/softirq.c 2004-04-21 10:46:51.798707512 +0100 @@ -132,11 +132,22 @@ EXPORT_SYMBOL(do_softirq); void local_bh_enable(void) { + if (in_irq()) { + printk("local_bh_enable() was called in hard irq context. " + "This is probably a bug\n"); + dump_stack(); + } + __local_bh_enable(); - WARN_ON(irqs_disabled()); - if (unlikely(!in_interrupt() && - local_softirq_pending())) + if (unlikely(!in_interrupt() && local_softirq_pending())) { + if (irqs_disabled()) { + printk("local_bh_enable() was called with local " + "interrupts disabled. This is probably a" + " bug\n"); + dump_stack(); + } invoke_softirq(); + } preempt_check_resched(); } EXPORT_SYMBOL(local_bh_enable); --- diff/kernel/sys.c 2004-04-21 10:45:36.014228504 +0100 +++ source/kernel/sys.c 2004-04-21 10:46:51.798707512 +0100 @@ -271,6 +271,9 @@ cond_syscall(compat_sys_mq_timedsend) cond_syscall(compat_sys_mq_timedreceive) cond_syscall(compat_sys_mq_notify) cond_syscall(compat_sys_mq_getsetattr) +cond_syscall(sys_mbind) +cond_syscall(sys_get_mempolicy) +cond_syscall(sys_set_mempolicy) /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read) --- diff/kernel/user.c 2003-08-20 14:16:34.000000000 +0100 +++ source/kernel/user.c 2004-04-21 10:46:51.799707360 +0100 @@ -138,10 +138,7 @@ static int __init uid_cache_init(void) int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), - 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if(!uid_cachep) - panic("Cannot create uid taskcount SLAB cache\n"); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_LIST_HEAD(uidhash_table + n); --- diff/kernel/workqueue.c 2004-04-21 10:45:36.016228200 +0100 +++ source/kernel/workqueue.c 2004-04-21 10:46:51.799707360 +0100 @@ -27,7 +27,7 @@ #include /* - * The per-CPU workqueue. + * The per-CPU workqueue (if single thread, we always use cpu 0's). * * The sequence counters are for flush_scheduled_work(). It wants to wait * until until all currently-scheduled works are completed, but it doesn't @@ -59,20 +59,19 @@ struct cpu_workqueue_struct { struct workqueue_struct { struct cpu_workqueue_struct cpu_wq[NR_CPUS]; const char *name; - struct list_head list; + struct list_head list; /* Empty if single thread */ }; -#ifdef CONFIG_HOTPLUG_CPU -/* All the workqueues on the system, for hotplug cpu to add/remove - threads to each one as cpus come/go. Protected by cpucontrol - sem. */ +/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove + threads to each one as cpus come/go. 
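The workqueue rework that starts here splits queues into two kinds: per-CPU ones (kept on the workqueues list for hotplug, one thread per online CPU) and single-threaded ones (off the list, always using cpu 0's area). A hypothetical caller-side sketch; only __create_workqueue() itself is visible in this patch, so treating it as directly callable with the singlethread flag is an assumption:

#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

/* Called from a driver's init path.  The second argument selects the
 * variant: 1 creates a single "example" thread; 0 would create
 * "example/0", "example/1", ... bound one per online CPU. */
static int __init example_init(void)
{
	example_wq = __create_workqueue("example", 1);
	return example_wq ? 0 : -ENOMEM;
}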
+static spinlock_t workqueue_lock = SPIN_LOCK_UNLOCKED; static LIST_HEAD(workqueues); -#define add_workqueue(wq) list_add(&(wq)->list, &workqueues) -#define del_workqueue(wq) list_del(&(wq)->list) -#else -#define add_workqueue(wq) -#define del_workqueue(wq) -#endif /* CONFIG_HOTPLUG_CPU */ + +/* If it's single threaded, it isn't in the list of workqueues. */ +static inline int is_single_threaded(struct workqueue_struct *wq) +{ + return list_empty(&wq->list); +} /* Preempt must be disabled. */ static void __queue_work(struct cpu_workqueue_struct *cwq, @@ -100,6 +99,8 @@ int fastcall queue_work(struct workqueue int ret = 0, cpu = get_cpu(); if (!test_and_set_bit(0, &work->pending)) { + if (unlikely(is_single_threaded(wq))) + cpu = 0; BUG_ON(!list_empty(&work->entry)); __queue_work(wq->cpu_wq + cpu, work); ret = 1; @@ -112,8 +113,12 @@ static void delayed_work_timer_fn(unsign { struct work_struct *work = (struct work_struct *)__data; struct workqueue_struct *wq = work->wq_data; + int cpu = smp_processor_id(); + + if (unlikely(is_single_threaded(wq))) + cpu = 0; - __queue_work(wq->cpu_wq + smp_processor_id(), work); + __queue_work(wq->cpu_wq + cpu, work); } int fastcall queue_delayed_work(struct workqueue_struct *wq, @@ -234,13 +239,14 @@ void fastcall flush_workqueue(struct wor might_sleep(); lock_cpu_hotplug(); - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_online_cpu(cpu) { DEFINE_WAIT(wait); long sequence_needed; - if (!cpu_online(cpu)) - continue; - cwq = wq->cpu_wq + cpu; + if (is_single_threaded(wq)) + cwq = wq->cpu_wq + 0; /* Always use cpu 0's area. */ + else + cwq = wq->cpu_wq + cpu; if (cwq->thread == current) { /* @@ -266,7 +272,8 @@ void fastcall flush_workqueue(struct wor unlock_cpu_hotplug(); } -static int create_workqueue_thread(struct workqueue_struct *wq, int cpu) +static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, + int cpu) { struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; struct task_struct *p; @@ -280,18 +287,22 @@ static int create_workqueue_thread(struc init_waitqueue_head(&cwq->more_work); init_waitqueue_head(&cwq->work_done); - p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu); + if (is_single_threaded(wq)) + p = kthread_create(worker_thread, cwq, "%s", wq->name); + else + p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu); if (IS_ERR(p)) - return PTR_ERR(p); + return NULL; cwq->thread = p; - kthread_bind(p, cpu); - return 0; + return p; } -struct workqueue_struct *create_workqueue(const char *name) +struct workqueue_struct *__create_workqueue(const char *name, + int singlethread) { int cpu, destroy = 0; struct workqueue_struct *wq; + struct task_struct *p; BUG_ON(strlen(name) > 10); @@ -303,15 +314,26 @@ struct workqueue_struct *create_workqueu wq->name = name; /* We don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); - for (cpu = 0; cpu < NR_CPUS; cpu++) { - if (!cpu_online(cpu)) - continue; - if (create_workqueue_thread(wq, cpu) < 0) + if (singlethread) { + INIT_LIST_HEAD(&wq->list); + p = create_workqueue_thread(wq, 0); + if (!p) destroy = 1; else - wake_up_process(wq->cpu_wq[cpu].thread); + wake_up_process(p); + } else { + spin_lock(&workqueue_lock); + list_add(&wq->list, &workqueues); + spin_unlock(&workqueue_lock); + for_each_online_cpu(cpu) { + p = create_workqueue_thread(wq, cpu); + if (p) { + kthread_bind(p, cpu); + wake_up_process(p); + } else + destroy = 1; + } } - add_workqueue(wq); /* * Was there any error during startup? If yes then clean up:
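
Continuing the hypothetical consumer from the sketch above: because the timer in delayed_work_timer_fn() may fire on any CPU, the cpu-0 re-routing in that hunk is what keeps delayed work on the single thread, with no extra care needed from the caller. Usage and teardown, under the same assumptions:

        static void my_kick_later(void)
        {
                /* Runs my_handler about one second from now; the timer
                 * callback re-routes to cpu 0 for single-thread queues. */
                queue_delayed_work(my_wq, &my_work, HZ);
        }

        static void my_module_exit(void)
        {
                flush_workqueue(my_wq);         /* wait out anything queued */
                destroy_workqueue(my_wq);       /* also reaps the worker thread */
        }
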
@@ -347,11 +369,15 @@ void destroy_workqueue(struct workqueue_ /* We don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); - for (cpu = 0; cpu < NR_CPUS; cpu++) { - if (cpu_online(cpu)) + if (is_single_threaded(wq)) + cleanup_workqueue_thread(wq, 0); + else { + for_each_online_cpu(cpu) cleanup_workqueue_thread(wq, cpu); + spin_lock(&workqueue_lock); + list_del(&wq->list); + spin_unlock(&workqueue_lock); } - del_workqueue(wq); unlock_cpu_hotplug(); kfree(wq); } @@ -467,7 +493,7 @@ void init_workqueues(void) BUG_ON(!keventd_wq); } -EXPORT_SYMBOL_GPL(create_workqueue); +EXPORT_SYMBOL_GPL(__create_workqueue); EXPORT_SYMBOL_GPL(queue_work); EXPORT_SYMBOL_GPL(queue_delayed_work); EXPORT_SYMBOL_GPL(flush_workqueue);
--- diff/lib/radix-tree.c 2004-04-21 10:45:36.019227744 +0100 +++ source/lib/radix-tree.c 2004-04-21 10:46:51.800707208 +0100 @@ -799,9 +799,7 @@ void __init radix_tree_init(void) { radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, - 0, radix_tree_node_ctor, NULL); - if (!radix_tree_node_cachep) - panic ("Failed to create radix_tree_node cache\n"); + SLAB_PANIC, radix_tree_node_ctor, NULL); radix_tree_init_maxindex(); hotcpu_notifier(radix_tree_callback, 0); }
--- diff/lib/rwsem-spinlock.c 2004-03-11 10:20:29.000000000 +0000 +++ source/lib/rwsem-spinlock.c 2004-04-21 10:46:51.801707056 +0100 @@ -1,5 +1,5 @@ -/* rwsem-spinlock.c: R/W semaphores: contention handling functions for generic spinlock - * implementation +/* rwsem-spinlock.c: R/W semaphores: contention handling functions for generic + * spinlock implementation * * Copyright (c) 2001 David Howells (dhowells@redhat.com). * - Derived partially from idea by Andrea Arcangeli @@ -8,11 +8,13 @@ #include #include #include +#include <linux/completion.h> struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - unsigned int flags; + struct list_head list; + struct task_struct *task; + struct completion granted; + unsigned int flags; #define RWSEM_WAITING_FOR_READ 0x00000001 #define RWSEM_WAITING_FOR_WRITE 0x00000002 }; @@ -22,7 +24,8 @@ void rwsemtrace(struct rw_semaphore *sem { if (sem->debug) printk("[%d] %s({%d,%d})\n", - current->pid,str,sem->activity,list_empty(&sem->wait_list)?0:1); + current->pid, str, sem->activity, + list_empty(&sem->wait_list) ? 0 : 1); } #endif
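
In the rwsem hunks that follow, each struct rwsem_waiter grows a struct completion, and the waker's flags/wake_up_process() dance becomes complete() paired with wait_for_completion() in the sleeper. A self-contained sketch of that handoff pattern (the names here are illustrative, not from the patch):

        #include <linux/completion.h>
        #include <linux/list.h>
        #include <linux/spinlock.h>

        struct my_waiter {
                struct list_head        list;
                struct completion       granted;
        };

        /* Sleeping side: enqueue under the lock, then block. */
        static void my_wait(spinlock_t *lock, struct list_head *queue)
        {
                struct my_waiter waiter;

                init_completion(&waiter.granted);
                spin_lock(lock);
                list_add_tail(&waiter.list, queue);
                spin_unlock(lock);

                /* Unlike a set_task_state()/schedule() loop there is no
                 * flag to re-test: a complete() issued between the unlock
                 * above and this call is remembered, not lost. */
                wait_for_completion(&waiter.granted);
        }

        /* Waking side; the caller holds the same lock. */
        static void my_wake_first(struct list_head *queue)
        {
                struct my_waiter *waiter;

                waiter = list_entry(queue->next, struct my_waiter, list);
                list_del(&waiter->list);
                complete(&waiter->granted);
        }

This also narrows the lifetime race the old sequence "waiter->flags = 0; wake_up_process(waiter->task);" left open: the waiter lives on the sleeper's stack, and complete() does its wakeup under the completion's own internal lock, so wait_for_completion() cannot return while the waker is still touching the structure.
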
@@ -40,22 +43,24 @@ void fastcall init_rwsem(struct rw_semap } /* - * handle the lock being released whilst there are processes blocked on it that can now run + * handle the lock being released whilst there are processes blocked on it + * that can now run * - if we come here, then: * - the 'active count' _reached_ zero * - the 'waiting count' is non-zero * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having flags zeroised + * - woken process blocks are discarded from the list after having flags zeroed * - writers are only woken if wakewrite is non-zero */ -static inline struct rw_semaphore *__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +static inline struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) { struct rwsem_waiter *waiter; int woken; - rwsemtrace(sem,"Entering __rwsem_do_wake"); + rwsemtrace(sem, "Entering __rwsem_do_wake"); - waiter = list_entry(sem->wait_list.next,struct rwsem_waiter,list); + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (!wakewrite) { if (waiter->flags & RWSEM_WAITING_FOR_WRITE) @@ -63,54 +68,53 @@ static inline struct rw_semaphore *__rws goto dont_wake_writers; } - /* if we are allowed to wake writers try to grant a single write lock if there's a - * writer at the front of the queue - * - we leave the 'waiting count' incremented to signify potential contention + /* if we are allowed to wake writers try to grant a single write lock + * if there's a writer at the front of the queue - we leave the + * 'waiting count' incremented to signify potential contention */ if (waiter->flags & RWSEM_WAITING_FOR_WRITE) { sem->activity = -1; list_del(&waiter->list); - waiter->flags = 0; - wake_up_process(waiter->task); + complete(&waiter->granted); goto out; } - /* grant an infinite number of read locks to the readers at the front of the queue */ - dont_wake_writers: + /* grant an infinite number of read locks to the readers at the front + * of the queue */ +dont_wake_writers: woken = 0; - while (waiter->flags&RWSEM_WAITING_FOR_READ) { + while (waiter->flags & RWSEM_WAITING_FOR_READ) { struct list_head *next = waiter->list.next; list_del(&waiter->list); - waiter->flags = 0; - wake_up_process(waiter->task); + complete(&waiter->granted); woken++; if (list_empty(&sem->wait_list)) break; - waiter = list_entry(next,struct rwsem_waiter,list); + waiter = list_entry(next, struct rwsem_waiter, list); } sem->activity += woken; - out: - rwsemtrace(sem,"Leaving __rwsem_do_wake"); +out: + rwsemtrace(sem, "Leaving __rwsem_do_wake"); return sem; } /* * wake a single writer */ -static inline struct rw_semaphore *__rwsem_wake_one_writer(struct rw_semaphore *sem) +static inline struct rw_semaphore * +__rwsem_wake_one_writer(struct rw_semaphore *sem) { struct rwsem_waiter *waiter; sem->activity = -1; - waiter = list_entry(sem->wait_list.next,struct rwsem_waiter,list); + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); list_del(&waiter->list); - waiter->flags = 0; - wake_up_process(waiter->task); + complete(&waiter->granted); return sem; } @@ -120,43 +124,32 @@ static inline struct rw_semaphore *__rws void fastcall __down_read(struct rw_semaphore *sem) { struct rwsem_waiter waiter; - struct task_struct *tsk; - rwsemtrace(sem,"Entering __down_read"); + rwsemtrace(sem, "Entering __down_read"); spin_lock(&sem->wait_lock); - if (sem->activity>=0 && list_empty(&sem->wait_list)) { + if (sem->activity >= 0 && list_empty(&sem->wait_list)) { /*
granted */ sem->activity++; spin_unlock(&sem->wait_lock); goto out; } - tsk = current; - set_task_state(tsk,TASK_UNINTERRUPTIBLE); - /* set up my own style of waitqueue */ - waiter.task = tsk; + init_completion(&waiter.granted); + waiter.task = current; waiter.flags = RWSEM_WAITING_FOR_READ; - list_add_tail(&waiter.list,&sem->wait_list); + list_add_tail(&waiter.list, &sem->wait_list); /* we don't need to touch the semaphore struct anymore */ spin_unlock(&sem->wait_lock); - /* wait to be given the lock */ - for (;;) { - if (!waiter.flags) - break; - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } + wait_for_completion(&waiter.granted); - tsk->state = TASK_RUNNING; - - out: - rwsemtrace(sem,"Leaving __down_read"); +out: + rwsemtrace(sem, "Leaving __down_read"); } /* @@ -165,11 +158,11 @@ void fastcall __down_read(struct rw_sema int fastcall __down_read_trylock(struct rw_semaphore *sem) { int ret = 0; - rwsemtrace(sem,"Entering __down_read_trylock"); + rwsemtrace(sem, "Entering __down_read_trylock"); spin_lock(&sem->wait_lock); - if (sem->activity>=0 && list_empty(&sem->wait_list)) { + if (sem->activity >= 0 && list_empty(&sem->wait_list)) { /* granted */ sem->activity++; ret = 1; @@ -177,54 +170,43 @@ int fastcall __down_read_trylock(struct spin_unlock(&sem->wait_lock); - rwsemtrace(sem,"Leaving __down_read_trylock"); + rwsemtrace(sem, "Leaving __down_read_trylock"); return ret; } /* * get a write lock on the semaphore - * - note that we increment the waiting count anyway to indicate an exclusive lock + * - note that we increment the waiting count anyway to indicate an exclusive + * lock */ void fastcall __down_write(struct rw_semaphore *sem) { struct rwsem_waiter waiter; - struct task_struct *tsk; - rwsemtrace(sem,"Entering __down_write"); + rwsemtrace(sem, "Entering __down_write"); spin_lock(&sem->wait_lock); - if (sem->activity==0 && list_empty(&sem->wait_list)) { + if (sem->activity == 0 && list_empty(&sem->wait_list)) { /* granted */ sem->activity = -1; spin_unlock(&sem->wait_lock); goto out; } - tsk = current; - set_task_state(tsk,TASK_UNINTERRUPTIBLE); - /* set up my own style of waitqueue */ - waiter.task = tsk; + init_completion(&waiter.granted); + waiter.task = current; waiter.flags = RWSEM_WAITING_FOR_WRITE; - list_add_tail(&waiter.list,&sem->wait_list); + list_add_tail(&waiter.list, &sem->wait_list); /* we don't need to touch the semaphore struct anymore */ spin_unlock(&sem->wait_lock); - /* wait to be given the lock */ - for (;;) { - if (!waiter.flags) - break; - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } - - tsk->state = TASK_RUNNING; - - out: - rwsemtrace(sem,"Leaving __down_write"); + wait_for_completion(&waiter.granted); +out: + rwsemtrace(sem, "Leaving __down_write"); } /* @@ -233,11 +215,11 @@ void fastcall __down_write(struct rw_sem int fastcall __down_write_trylock(struct rw_semaphore *sem) { int ret = 0; - rwsemtrace(sem,"Entering __down_write_trylock"); + rwsemtrace(sem, "Entering __down_write_trylock"); spin_lock(&sem->wait_lock); - if (sem->activity==0 && list_empty(&sem->wait_list)) { + if (sem->activity == 0 && list_empty(&sem->wait_list)) { /* granted */ sem->activity = -1; ret = 1; @@ -245,7 +227,7 @@ int fastcall __down_write_trylock(struct spin_unlock(&sem->wait_lock); - rwsemtrace(sem,"Leaving __down_write_trylock"); + rwsemtrace(sem, "Leaving __down_write_trylock"); return ret; } @@ -254,16 +236,16 @@ int fastcall __down_write_trylock(struct */ void fastcall __up_read(struct rw_semaphore *sem) { - rwsemtrace(sem,"Entering 
__up_read"); + rwsemtrace(sem, "Entering __up_read"); spin_lock(&sem->wait_lock); - if (--sem->activity==0 && !list_empty(&sem->wait_list)) + if (--sem->activity == 0 && !list_empty(&sem->wait_list)) sem = __rwsem_wake_one_writer(sem); spin_unlock(&sem->wait_lock); - rwsemtrace(sem,"Leaving __up_read"); + rwsemtrace(sem, "Leaving __up_read"); } /* @@ -271,7 +253,7 @@ void fastcall __up_read(struct rw_semaph */ void fastcall __up_write(struct rw_semaphore *sem) { - rwsemtrace(sem,"Entering __up_write"); + rwsemtrace(sem, "Entering __up_write"); spin_lock(&sem->wait_lock); @@ -281,7 +263,7 @@ void fastcall __up_write(struct rw_semap spin_unlock(&sem->wait_lock); - rwsemtrace(sem,"Leaving __up_write"); + rwsemtrace(sem, "Leaving __up_write"); } /* @@ -290,17 +272,17 @@ void fastcall __up_write(struct rw_semap */ void fastcall __downgrade_write(struct rw_semaphore *sem) { - rwsemtrace(sem,"Entering __downgrade_write"); + rwsemtrace(sem, "Entering __downgrade_write"); spin_lock(&sem->wait_lock); sem->activity = 1; if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem,0); + sem = __rwsem_do_wake(sem, 0); spin_unlock(&sem->wait_lock); - rwsemtrace(sem,"Leaving __downgrade_write"); + rwsemtrace(sem, "Leaving __downgrade_write"); } EXPORT_SYMBOL(init_rwsem); --- diff/lib/rwsem.c 2004-04-21 10:45:36.020227592 +0100 +++ source/lib/rwsem.c 2004-04-21 10:46:51.801707056 +0100 @@ -7,11 +7,13 @@ #include #include #include +#include struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - unsigned int flags; + struct list_head list; + struct task_struct *task; + struct completion granted; + unsigned int flags; #define RWSEM_WAITING_FOR_READ 0x00000001 #define RWSEM_WAITING_FOR_WRITE 0x00000002 }; @@ -20,100 +22,107 @@ struct rwsem_waiter { #undef rwsemtrace void rwsemtrace(struct rw_semaphore *sem, const char *str) { - printk("sem=%p\n",sem); - printk("(sem)=%08lx\n",sem->count); + printk("sem=%p\n", sem); + printk("(sem)=%08lx\n", sem->count); if (sem->debug) - printk("[%d] %s({%08lx})\n",current->pid,str,sem->count); + printk("[%d] %s({%08lx})\n", current->pid, str, sem->count); } #endif /* - * handle the lock being released whilst there are processes blocked on it that can now run + * handle the lock being released whilst there are processes blocked on it + * that can now run * - if we come here, then: - * - the 'active part' of the count (&0x0000ffff) reached zero but has been re-incremented - * - the 'waiting part' of the count (&0xffff0000) is negative (and will still be so) + * - the 'active part' of the count (&0x0000ffff) reached zero but has been + * re-incremented + * - the 'waiting part' of the count (&0xffff0000) is negative (and will + * still be so) * - there must be someone on the queue * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having flags zeroised + * - woken process blocks are discarded from the list after having flags + * zeroised * - writers are only woken if wakewrite is non-zero */ -static inline struct rw_semaphore *__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +static inline struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) { struct rwsem_waiter *waiter; struct list_head *next; signed long oldcount; int woken, loop; - rwsemtrace(sem,"Entering __rwsem_do_wake"); + rwsemtrace(sem, "Entering __rwsem_do_wake"); if (!wakewrite) goto dont_wake_writers; - /* only wake someone up if we can transition the active part of the count from 0 -> 1 */ - try_again: - 
oldcount = rwsem_atomic_update(RWSEM_ACTIVE_BIAS,sem) - RWSEM_ACTIVE_BIAS; + /* only wake someone if we can transition the active part of the count + * from 0 -> 1 */ +try_again: + oldcount = rwsem_atomic_update(RWSEM_ACTIVE_BIAS, sem) + - RWSEM_ACTIVE_BIAS; if (oldcount & RWSEM_ACTIVE_MASK) goto undo; - waiter = list_entry(sem->wait_list.next,struct rwsem_waiter,list); + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - /* try to grant a single write lock if there's a writer at the front of the queue - * - note we leave the 'active part' of the count incremented by 1 and the waiting part - * incremented by 0x00010000 + /* try to grant a single write lock if there's a writer at the front + * of the queue - note we leave the 'active part' of the count + * incremented by 1 and the waiting part incremented by 0x00010000 */ if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE)) goto readers_only; list_del(&waiter->list); - waiter->flags = 0; - wake_up_process(waiter->task); + complete(&waiter->granted); goto out; /* don't want to wake any writers */ - dont_wake_writers: - waiter = list_entry(sem->wait_list.next,struct rwsem_waiter,list); +dont_wake_writers: + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->flags & RWSEM_WAITING_FOR_WRITE) goto out; - /* grant an infinite number of read locks to the readers at the front of the queue - * - note we increment the 'active part' of the count by the number of readers (less one - * for the activity decrement we've already done) before waking any processes up + /* grant an infinite number of read locks to the readers at the front + * of the queue - note we increment the 'active part' of the count by + * the number of readers (less one for the activity decrement we've + * already done) before waking any processes up */ - readers_only: +readers_only: woken = 0; do { woken++; - if (waiter->list.next==&sem->wait_list) + if (waiter->list.next == &sem->wait_list) break; - waiter = list_entry(waiter->list.next,struct rwsem_waiter,list); + waiter = list_entry(waiter->list.next, + struct rwsem_waiter, list); } while (waiter->flags & RWSEM_WAITING_FOR_READ); loop = woken; - woken *= RWSEM_ACTIVE_BIAS-RWSEM_WAITING_BIAS; + woken *= RWSEM_ACTIVE_BIAS - RWSEM_WAITING_BIAS; woken -= RWSEM_ACTIVE_BIAS; - rwsem_atomic_add(woken,sem); + rwsem_atomic_add(woken, sem); next = sem->wait_list.next; - for (; loop>0; loop--) { - waiter = list_entry(next,struct rwsem_waiter,list); + for (; loop > 0; loop--) { + waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; - waiter->flags = 0; - wake_up_process(waiter->task); + complete(&waiter->granted); } sem->wait_list.next = next; next->prev = &sem->wait_list; - out: - rwsemtrace(sem,"Leaving __rwsem_do_wake"); +out: + rwsemtrace(sem, "Leaving __rwsem_do_wake"); return sem; /* undo the change to count, but check for a transition 1->0 */ - undo: - if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS,sem)!=0) +undo: + if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) != 0) goto out; goto try_again; } @@ -121,41 +130,35 @@ static inline struct rw_semaphore *__rws /* * wait for a lock to be granted */ -static inline struct rw_semaphore *rwsem_down_failed_common(struct rw_semaphore *sem, - struct rwsem_waiter *waiter, - signed long adjustment) +static struct rw_semaphore * +rwsem_down_failed_common(struct rw_semaphore *sem, + struct rwsem_waiter *waiter, signed long adjustment) { struct task_struct *tsk = current; signed long count; - set_task_state(tsk,TASK_UNINTERRUPTIBLE); 
- /* set up my own style of waitqueue */ spin_lock(&sem->wait_lock); waiter->task = tsk; + init_completion(&waiter->granted); - list_add_tail(&waiter->list,&sem->wait_list); + list_add_tail(&waiter->list, &sem->wait_list); - /* note that we're now waiting on the lock, but no longer actively read-locking */ - count = rwsem_atomic_update(adjustment,sem); + /* note that we're now waiting on the lock, but no longer actively + * read-locking */ + count = rwsem_atomic_update(adjustment, sem); - /* if there are no longer active locks, wake the front queued process(es) up - * - it might even be this process, since the waker takes a more active part + /* if there are no longer active locks, wake the front queued + * process(es) up - it might even be this process, since the waker + * takes a more active part */ if (!(count & RWSEM_ACTIVE_MASK)) - sem = __rwsem_do_wake(sem,1); + sem = __rwsem_do_wake(sem, 1); spin_unlock(&sem->wait_lock); /* wait to be given the lock */ - for (;;) { - if (!waiter->flags) - break; - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } - - tsk->state = TASK_RUNNING; + wait_for_completion(&waiter->granted); return sem; } @@ -163,32 +166,35 @@ static inline struct rw_semaphore *rwsem /* * wait for the read lock to be granted */ -struct rw_semaphore fastcall __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +struct rw_semaphore fastcall __sched * +rwsem_down_read_failed(struct rw_semaphore *sem) { struct rwsem_waiter waiter; - rwsemtrace(sem,"Entering rwsem_down_read_failed"); + rwsemtrace(sem, "Entering rwsem_down_read_failed"); waiter.flags = RWSEM_WAITING_FOR_READ; - rwsem_down_failed_common(sem,&waiter,RWSEM_WAITING_BIAS-RWSEM_ACTIVE_BIAS); + rwsem_down_failed_common(sem, &waiter, + RWSEM_WAITING_BIAS - RWSEM_ACTIVE_BIAS); - rwsemtrace(sem,"Leaving rwsem_down_read_failed"); + rwsemtrace(sem, "Leaving rwsem_down_read_failed"); return sem; } /* * wait for the write lock to be granted */ -struct rw_semaphore fastcall __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +struct rw_semaphore fastcall __sched * +rwsem_down_write_failed(struct rw_semaphore *sem) { struct rwsem_waiter waiter; - rwsemtrace(sem,"Entering rwsem_down_write_failed"); + rwsemtrace(sem, "Entering rwsem_down_write_failed"); waiter.flags = RWSEM_WAITING_FOR_WRITE; - rwsem_down_failed_common(sem,&waiter,-RWSEM_ACTIVE_BIAS); + rwsem_down_failed_common(sem, &waiter, -RWSEM_ACTIVE_BIAS); - rwsemtrace(sem,"Leaving rwsem_down_write_failed"); + rwsemtrace(sem, "Leaving rwsem_down_write_failed"); return sem; } @@ -198,39 +204,39 @@ struct rw_semaphore fastcall __sched *rw */ struct rw_semaphore fastcall *rwsem_wake(struct rw_semaphore *sem) { - rwsemtrace(sem,"Entering rwsem_wake"); + rwsemtrace(sem, "Entering rwsem_wake"); spin_lock(&sem->wait_lock); /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem,1); + sem = __rwsem_do_wake(sem, 1); spin_unlock(&sem->wait_lock); - rwsemtrace(sem,"Leaving rwsem_wake"); + rwsemtrace(sem, "Leaving rwsem_wake"); return sem; } /* * downgrade a write lock into a read lock - * - caller incremented waiting part of count, and discovered it to be still negative + * - caller incremented waiting part of count, and discovered it to be still -ve * - just wake up any readers at the front of the queue */ struct rw_semaphore fastcall *rwsem_downgrade_wake(struct rw_semaphore *sem) { - rwsemtrace(sem,"Entering rwsem_downgrade_wake"); + rwsemtrace(sem, "Entering rwsem_downgrade_wake"); spin_lock(&sem->wait_lock); /* do nothing 
if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem,0); + sem = __rwsem_do_wake(sem, 0); spin_unlock(&sem->wait_lock); - rwsemtrace(sem,"Leaving rwsem_downgrade_wake"); + rwsemtrace(sem, "Leaving rwsem_downgrade_wake"); return sem; } --- diff/mm/Makefile 2004-04-21 10:45:36.020227592 +0100 +++ source/mm/Makefile 2004-04-21 10:46:51.806706296 +0100 @@ -13,3 +13,4 @@ obj-y := bootmem.o filemap.o mempool.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o +obj-$(CONFIG_NUMA) += mempolicy.o --- diff/mm/filemap.c 2004-04-21 10:45:36.022227288 +0100 +++ source/mm/filemap.c 2004-04-21 10:46:51.804706600 +0100 @@ -55,17 +55,17 @@ /* * Lock ordering: * - * ->i_shared_sem (vmtruncate) + * ->i_shared_lock (vmtruncate) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_list_lock * ->swap_device_lock (exclusive_swap_page, others) * ->mapping->tree_lock * * ->i_sem - * ->i_shared_sem (truncate->unmap_mapping_range) + * ->i_shared_lock (truncate->unmap_mapping_range) * * ->mmap_sem - * ->i_shared_sem (various places) + * ->i_shared_lock (various places) * * ->mmap_sem * ->lock_page (access_process_vm) --- diff/mm/fremap.c 2004-04-21 10:45:36.022227288 +0100 +++ source/mm/fremap.c 2004-04-21 10:46:51.805706448 +0100 @@ -36,7 +36,7 @@ static inline void zap_pte(struct mm_str if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - page_remove_rmap(page, ptep); + page_remove_rmap(page); page_cache_release(page); mm->rss--; } @@ -49,7 +49,7 @@ static inline void zap_pte(struct mm_str } /* - * Install a page to a given virtual memory address, release any + * Install a file page to a given virtual memory address, release any * previously existing mapping. */ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, @@ -60,11 +60,13 @@ int install_page(struct mm_struct *mm, s pgd_t *pgd; pmd_t *pmd; pte_t pte_val; - struct pte_chain *pte_chain; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto err; + /* + * We use page_add_file_rmap below: if install_page is + * ever extended to anonymous pages, this will warn us. + */ + BUG_ON(!page_mapping(page)); + pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); @@ -81,18 +83,14 @@ int install_page(struct mm_struct *mm, s mm->rss++; flush_icache_page(vma, page); set_pte(pte, mk_pte(page, prot)); - pte_chain = page_add_rmap(page, pte, pte_chain); + page_add_file_rmap(page); pte_val = *pte; pte_unmap(pte); update_mmu_cache(vma, addr, pte_val); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); -err: return err; } EXPORT_SYMBOL(install_page); @@ -188,15 +186,18 @@ asmlinkage long sys_remap_file_pages(uns /* * Make sure the vma is shared, that it supports prefaulting, * and that the remapped range is valid and fully within - * the single existing vma: + * the single existing vma. vm_private_data is used as a + * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED + * or VM_LOCKED, but VM_LOCKED could be revoked later on). */ if (vma && (vma->vm_flags & VM_SHARED) && + (!vma->vm_private_data || (vma->vm_flags & VM_RESERVED)) && vma->vm_ops && vma->vm_ops->populate && end > start && start >= vma->vm_start && end <= vma->vm_end) { /* Must set VM_NONLINEAR before any pages are populated. 
*/ - if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff) + if (pgoff != linear_page_index(vma, start)) vma->vm_flags |= VM_NONLINEAR; /* ->populate can take a long time, so downgrade the lock. */ --- diff/mm/highmem.c 2004-03-11 10:20:29.000000000 +0000 +++ source/mm/highmem.c 2004-04-21 10:46:51.805706448 +0100 @@ -26,7 +26,6 @@ #include #include #include -#include #include static mempool_t *page_pool, *isa_page_pool; --- diff/mm/madvise.c 2004-04-21 10:45:36.023227136 +0100 +++ source/mm/madvise.c 2004-04-21 10:46:51.805706448 +0100 @@ -92,16 +92,14 @@ static long madvise_willneed(struct vm_a static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { - struct zap_details details; - if (vma->vm_flags & VM_LOCKED) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { - details.check_mapping = NULL; - details.nonlinear_vma = vma; - details.first_index = 0; - details.last_index = ULONG_MAX; + struct zap_details details = { + .nonlinear_vma = vma, + .last_index = ULONG_MAX, + }; zap_page_range(vma, start, end - start, &details); } else zap_page_range(vma, start, end - start, NULL); --- diff/mm/memory.c 2004-04-21 10:45:36.024226984 +0100 +++ source/mm/memory.c 2004-04-21 10:46:51.807706144 +0100 @@ -48,7 +48,6 @@ #include #include -#include #include #include #include @@ -105,7 +104,7 @@ static inline void free_one_pmd(struct m } page = pmd_page(*dir); pmd_clear(dir); - pgtable_remove_rmap(page); + dec_page_state(nr_page_table_pages); pte_free_tlb(tlb, page); } @@ -164,7 +163,7 @@ pte_t fastcall * pte_alloc_map(struct mm pte_free(new); goto out; } - pgtable_add_rmap(new, mm, address); + inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); } out: @@ -190,7 +189,6 @@ pte_t fastcall * pte_alloc_kernel(struct pte_free_kernel(new); goto out; } - pgtable_add_rmap(virt_to_page(new), mm, address); pmd_populate_kernel(mm, pmd, new); } out: @@ -217,20 +215,10 @@ int copy_page_range(struct mm_struct *ds unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; unsigned long cow; - struct pte_chain *pte_chain = NULL; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst, src, vma); - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (!pte_chain) { - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - } - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; @@ -329,32 +317,8 @@ skip_copy_pte_range: pte = pte_mkold(pte); get_page(page); dst->rss++; - set_pte(dst_pte, pte); - pte_chain = page_add_rmap(page, dst_pte, - pte_chain); - if (pte_chain) - goto cont_copy_pte_range_noset; - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (pte_chain) - goto cont_copy_pte_range_noset; - - /* - * pte_chain allocation failed, and we need to - * run page reclaim. 
- */ - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - spin_lock(&src->page_table_lock); - dst_pte = pte_offset_map(dst_pmd, address); - src_pte = pte_offset_map_nested(src_pmd, - address); + page_dup_rmap(page); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) { @@ -368,7 +332,7 @@ cont_copy_pte_range_noset: pte_unmap_nested(src_pte-1); pte_unmap(dst_pte-1); spin_unlock(&src->page_table_lock); - + cond_resched_lock(&dst->page_table_lock); cont_copy_pmd_range: src_pmd++; dst_pmd++; @@ -377,10 +341,8 @@ cont_copy_pmd_range: out_unlock: spin_unlock(&src->page_table_lock); out: - pte_chain_free(pte_chain); return 0; nomem: - pte_chain_free(pte_chain); return -ENOMEM; } @@ -446,7 +408,7 @@ static void zap_pte_range(struct mmu_gat if (pte_young(pte) && page_mapping(page)) mark_page_accessed(page); tlb->freed++; - page_remove_rmap(page, ptep); + page_remove_rmap(page); tlb_remove_page(tlb, page); continue; } @@ -556,6 +518,7 @@ int unmap_vmas(struct mmu_gather **tlbp, unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; int ret = 0; + int atomic = details && details->atomic; for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long start; @@ -593,7 +556,7 @@ int unmap_vmas(struct mmu_gather **tlbp, zap_bytes -= block; if ((long)zap_bytes > 0) continue; - if (need_resched()) { + if (!atomic && need_resched()) { int fullmm = tlb_is_full_mm(*tlbp); tlb_finish_mmu(*tlbp, tlb_start, start); cond_resched_lock(&mm->page_table_lock); @@ -621,8 +584,6 @@ void zap_page_range(struct vm_area_struc unsigned long end = address + size; unsigned long nr_accounted = 0; - might_sleep(); - if (is_vm_hugetlb_page(vma)) { zap_hugepage_range(vma, address, size); return; @@ -737,6 +698,7 @@ int get_user_pages(struct task_struct *t struct page **pages, struct vm_area_struct **vmas) { int i; + int vm_io; unsigned int flags; /* @@ -780,8 +742,10 @@ int get_user_pages(struct task_struct *t continue; } - if (!vma || (pages && (vma->vm_flags & VM_IO)) - || !(flags & vma->vm_flags)) + if (!vma) + return i ? : -EFAULT; + vm_io = vma->vm_flags & VM_IO; + if ((pages && vm_io) || !(flags & vma->vm_flags)) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { @@ -791,8 +755,15 @@ int get_user_pages(struct task_struct *t } spin_lock(&mm->page_table_lock); do { - struct page *map; + struct page *map = NULL; int lookup_write = write; + + /* + * We don't follow pagetables for VM_IO regions - they + * may have no pageframes. + */ + if (vm_io) + goto no_follow; while (!(map = follow_page(mm, start, lookup_write))) { /* * Shortcut for anonymous pages. 
We don't want @@ -844,6 +815,7 @@ int get_user_pages(struct task_struct *t if (!PageReserved(pages[i])) page_cache_get(pages[i]); } +no_follow: if (vmas) vmas[i] = vma; i++; @@ -1070,7 +1042,6 @@ static int do_wp_page(struct mm_struct * { struct page *old_page, *new_page; unsigned long pfn = pte_pfn(pte); - struct pte_chain *pte_chain; pte_t entry; if (unlikely(!pfn_valid(pfn))) { @@ -1109,10 +1080,7 @@ static int do_wp_page(struct mm_struct * page_cache_get(old_page); spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_pte_chain; - new_page = alloc_page(GFP_HIGHUSER); + new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) goto no_new_page; copy_cow_page(old_page,new_page,address); @@ -1125,10 +1093,11 @@ static int do_wp_page(struct mm_struct * if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; - page_remove_rmap(old_page, page_table); + else + page_remove_rmap(old_page); break_cow(vma, new_page, address, page_table); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); + page_add_anon_rmap(new_page, mm, address); /* Free the old page.. */ new_page = old_page; @@ -1137,12 +1106,9 @@ static int do_wp_page(struct mm_struct * page_cache_release(new_page); page_cache_release(old_page); spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); return VM_FAULT_MINOR; no_new_page: - pte_chain_free(pte_chain); -no_pte_chain: page_cache_release(old_page); return VM_FAULT_OOM; } @@ -1199,7 +1165,7 @@ static void unmap_mapping_range_list(str * but 0 when invalidating pagecache, don't throw away private data. */ void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows) + loff_t const holebegin, loff_t const holelen, int even_cows) { struct zap_details details; pgoff_t hba = holebegin >> PAGE_SHIFT; @@ -1217,10 +1183,11 @@ void unmap_mapping_range(struct address_ details.nonlinear_vma = NULL; details.first_index = hba; details.last_index = hba + hlen - 1; + details.atomic = 1; /* A spinlock is held */ if (details.last_index < details.first_index) details.last_index = ULONG_MAX; - down(&mapping->i_shared_sem); + spin_lock(&mapping->i_shared_lock); /* Protect against page fault */ atomic_inc(&mapping->truncate_count); if (unlikely(!list_empty(&mapping->i_mmap))) @@ -1231,7 +1198,7 @@ void unmap_mapping_range(struct address_ if (unlikely(!list_empty(&mapping->i_mmap_shared))) unmap_mapping_range_list(&mapping->i_mmap_shared, &details); - up(&mapping->i_shared_sem); + spin_unlock(&mapping->i_shared_lock); } EXPORT_SYMBOL(unmap_mapping_range); @@ -1280,9 +1247,17 @@ EXPORT_SYMBOL(vmtruncate); * (1 << page_cluster) entries in the swap area. This method is chosen * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... + * + * This has been extended to use the NUMA policies from the mm triggering + * the readahead. + * + * Caller must hold down_read on the vma->vm_mm if vma is not NULL. */ -void swapin_readahead(swp_entry_t entry) +void swapin_readahead(swp_entry_t entry, unsigned long addr, struct vm_area_struct *vma) { +#ifdef CONFIG_NUMA + struct vm_area_struct *next_vma = vma ?
vma->vm_next : NULL; +#endif int i, num; struct page *new_page; unsigned long offset; @@ -1294,10 +1269,31 @@ void swapin_readahead(swp_entry_t entry) for (i = 0; i < num; offset++, i++) { /* Ok, do the async read-ahead now */ new_page = read_swap_cache_async(swp_entry(swp_type(entry), - offset)); + offset), vma, addr); if (!new_page) break; page_cache_release(new_page); +#ifdef CONFIG_NUMA + /* + * Find the next applicable VMA for the NUMA policy. + */ + addr += PAGE_SIZE; + if (addr == 0) + vma = NULL; + if (vma) { + if (addr >= vma->vm_end) { + vma = next_vma; + next_vma = vma ? vma->vm_next : NULL; + } + if (vma && addr < vma->vm_start) + vma = NULL; + } else { + if (next_vma && addr >= next_vma->vm_start) { + vma = next_vma; + next_vma = vma->vm_next; + } + } +#endif } lru_add_drain(); /* Push any new pages onto the LRU now */ } @@ -1314,14 +1310,13 @@ static int do_swap_page(struct mm_struct swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; int ret = VM_FAULT_MINOR; - struct pte_chain *pte_chain = NULL; pte_unmap(page_table); spin_unlock(&mm->page_table_lock); page = lookup_swap_cache(entry); if (!page) { - swapin_readahead(entry); - page = read_swap_cache_async(entry); + swapin_readahead(entry, address, vma); + page = read_swap_cache_async(entry, vma, address); if (!page) { /* * Back out if somebody else faulted in this pte while @@ -1344,11 +1339,6 @@ static int do_swap_page(struct mm_struct } mark_page_accessed(page); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - ret = VM_FAULT_OOM; - goto out; - } lock_page(page); /* @@ -1374,20 +1364,28 @@ static int do_swap_page(struct mm_struct mm->rss++; pte = mk_pte(page, vma->vm_page_prot); - if (write_access && can_share_swap_page(page)) + if (write_access && can_share_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); + write_access = 0; + } unlock_page(page); flush_icache_page(vma, page); set_pte(page_table, pte); - pte_chain = page_add_rmap(page, page_table, pte_chain); + page_add_anon_rmap(page, mm, address); + + if (write_access || mremap_moved_anon_rmap(page, address)) { + if (do_wp_page(mm, vma, address, + page_table, pmd, pte) == VM_FAULT_OOM) + ret = VM_FAULT_OOM; + goto out; + } /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: - pte_chain_free(pte_chain); return ret; } @@ -1403,20 +1401,7 @@ do_anonymous_page(struct mm_struct *mm, { pte_t entry; struct page * page = ZERO_PAGE(addr); - struct pte_chain *pte_chain; - int ret; - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (!pte_chain) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); - } - /* Read-only mapping of ZERO_PAGE. 
*/ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); @@ -1426,7 +1411,7 @@ do_anonymous_page(struct mm_struct *mm, pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - page = alloc_page(GFP_HIGHUSER); + page = alloc_page_vma(GFP_HIGHUSER, vma, addr); if (!page) goto no_mem; clear_user_highpage(page, addr); @@ -1438,7 +1423,6 @@ do_anonymous_page(struct mm_struct *mm, pte_unmap(page_table); page_cache_release(page); spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MINOR; goto out; } mm->rss++; @@ -1447,24 +1431,19 @@ do_anonymous_page(struct mm_struct *mm, vma); lru_cache_add_active(page); mark_page_accessed(page); + page_add_anon_rmap(page, mm, addr); } set_pte(page_table, entry); - /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MINOR; - goto out; - -no_mem: - ret = VM_FAULT_OOM; out: - pte_chain_free(pte_chain); - return ret; + return VM_FAULT_MINOR; +no_mem: + return VM_FAULT_OOM; } /* @@ -1486,9 +1465,9 @@ do_no_page(struct mm_struct *mm, struct struct page * new_page; struct address_space *mapping = NULL; pte_t entry; - struct pte_chain *pte_chain; int sequence = 0; int ret = VM_FAULT_MINOR; + int anon = 0; if (!vma->vm_ops || !vma->vm_ops->nopage) return do_anonymous_page(mm, vma, page_table, @@ -1510,21 +1489,17 @@ retry: if (new_page == NOPAGE_OOM) return VM_FAULT_OOM; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto oom; - /* * Should we do an early C-O-W break? */ if (write_access && !(vma->vm_flags & VM_SHARED)) { - struct page * page = alloc_page(GFP_HIGHUSER); + struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!page) goto oom; copy_user_highpage(page, new_page, address); page_cache_release(new_page); - lru_cache_add_active(page); new_page = page; + anon = 1; } spin_lock(&mm->page_table_lock); @@ -1538,7 +1513,6 @@ retry: sequence = atomic_read(&mapping->truncate_count); spin_unlock(&mm->page_table_lock); page_cache_release(new_page); - pte_chain_free(pte_chain); goto retry; } page_table = pte_offset_map(pmd, address); @@ -1562,7 +1536,11 @@ retry: if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte(page_table, entry); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + if (anon) { + lru_cache_add_active(new_page); + page_add_anon_rmap(new_page, mm, address); + } else + page_add_file_rmap(new_page); pte_unmap(page_table); } else { /* One of our sibling threads was faster, back out. */ @@ -1580,7 +1558,6 @@ oom: page_cache_release(new_page); ret = VM_FAULT_OOM; out: - pte_chain_free(pte_chain); return ret; } --- diff/mm/mincore.c 2004-02-09 10:36:12.000000000 +0000 +++ source/mm/mincore.c 2004-04-21 10:46:51.809705840 +0100 @@ -14,7 +14,6 @@ #include #include -#include /* * Later we can get more picky about what "in core" means precisely. 
--- diff/mm/mmap.c 2004-04-21 10:45:36.025226832 +0100 +++ source/mm/mmap.c 2004-04-21 10:46:51.810705688 +0100 @@ -23,7 +23,6 @@ #include #include -#include #include /* @@ -63,7 +62,7 @@ EXPORT_SYMBOL(sysctl_max_map_count); EXPORT_SYMBOL(vm_committed_space); /* - * Requires inode->i_mapping->i_shared_sem + * Requires inode->i_mapping->i_shared_lock */ static inline void __remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode) @@ -84,9 +83,9 @@ static void remove_shared_vm_struct(stru if (file) { struct address_space *mapping = file->f_mapping; - down(&mapping->i_shared_sem); + spin_lock(&mapping->i_shared_lock); __remove_shared_vm_struct(vma, file->f_dentry->d_inode); - up(&mapping->i_shared_sem); + spin_unlock(&mapping->i_shared_lock); } } @@ -286,12 +285,12 @@ static void vma_link(struct mm_struct *m mapping = vma->vm_file->f_mapping; if (mapping) - down(&mapping->i_shared_sem); + spin_lock(&mapping->i_shared_lock); spin_lock(&mm->page_table_lock); __vma_link(mm, vma, prev, rb_link, rb_parent); spin_unlock(&mm->page_table_lock); if (mapping) - up(&mapping->i_shared_sem); + spin_unlock(&mapping->i_shared_lock); mark_mm_hugetlb(mm, vma); mm->map_count++; @@ -301,7 +300,7 @@ static void vma_link(struct mm_struct *m /* * Insert vm structure into process list sorted by address and into the inode's * i_mmap ring. The caller should hold mm->page_table_lock and - * ->f_mapping->i_shared_sem if vm_file is non-NULL. + * ->f_mapping->i_shared_lock if vm_file is non-NULL. */ static void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) @@ -333,8 +332,6 @@ static inline int is_mergeable_vma(struc return 0; if (vma->vm_flags != vm_flags) return 0; - if (vma->vm_private_data) - return 0; return 1; } @@ -389,11 +386,12 @@ static struct vm_area_struct *vma_merge( struct vm_area_struct *prev, struct rb_node *rb_parent, unsigned long addr, unsigned long end, unsigned long vm_flags, - struct file *file, unsigned long pgoff) + struct file *file, unsigned long pgoff, + struct mempolicy *policy) { spinlock_t *lock = &mm->page_table_lock; struct inode *inode = file ? file->f_dentry->d_inode : NULL; - struct semaphore *i_shared_sem; + spinlock_t *i_shared_lock; /* * We later require that vma->vm_flags == vm_flags, so this tests @@ -402,7 +400,7 @@ static struct vm_area_struct *vma_merge( if (vm_flags & VM_SPECIAL) return NULL; - i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL; + i_shared_lock = file ? &file->f_mapping->i_shared_lock : NULL; if (!prev) { prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb); @@ -413,13 +411,14 @@ static struct vm_area_struct *vma_merge( * Can it merge with the predecessor? */
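
The mpol_equal()/vma_mpol_equal() guards added in this and the following mmap.c, mprotect.c and mremap.c hunks all enforce one rule: adjacent vmas may only be merged when their NUMA policies match, since the merged vma can carry only one policy and alloc_page_vma() would otherwise apply the wrong placement to part of the range. vma_mpol_equal() itself is defined on the mempolicy side of the patch and is not quoted here; presumably it is no more than:

        /* Presumed helper (mempolicy side of this patch, not in these
         * hunks): merging is legal only when both vmas resolve to an
         * equal policy, e.g. never MPOL_BIND next to MPOL_INTERLEAVE. */
        static inline int vma_mpol_equal(struct vm_area_struct *a,
                                         struct vm_area_struct *b)
        {
                return mpol_equal(vma_policy(a), vma_policy(b));
        }
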
if (prev->vm_end == addr && + mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, file, pgoff)) { struct vm_area_struct *next; int need_up = 0; if (unlikely(file && prev->vm_next && prev->vm_next->vm_file == file)) { - down(i_shared_sem); + spin_lock(i_shared_lock); need_up = 1; } spin_lock(lock); @@ -430,6 +429,7 @@ static struct vm_area_struct *vma_merge( */ next = prev->vm_next; if (next && prev->vm_end == next->vm_start && + vma_mpol_equal(prev, next) && can_vma_merge_before(next, vm_flags, file, pgoff, (end - addr) >> PAGE_SHIFT)) { prev->vm_end = next->vm_end; @@ -437,17 +437,18 @@ static struct vm_area_struct *vma_merge( __remove_shared_vm_struct(next, inode); spin_unlock(lock); if (need_up) - up(i_shared_sem); + spin_unlock(i_shared_lock); if (file) fput(file); mm->map_count--; + mpol_free(vma_policy(next)); kmem_cache_free(vm_area_cachep, next); return prev; } spin_unlock(lock); if (need_up) - up(i_shared_sem); + spin_unlock(i_shared_lock); return prev; } @@ -457,18 +458,20 @@ static struct vm_area_struct *vma_merge( prev = prev->vm_next; if (prev) { merge_next: + if (!mpol_equal(policy, vma_policy(prev))) + return NULL; if (!can_vma_merge_before(prev, vm_flags, file, pgoff, (end - addr) >> PAGE_SHIFT)) return NULL; if (end == prev->vm_start) { if (file) - down(i_shared_sem); + spin_lock(i_shared_lock); spin_lock(lock); prev->vm_start = addr; prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; spin_unlock(lock); if (file) - up(i_shared_sem); + spin_unlock(i_shared_lock); return prev; } } @@ -633,7 +636,7 @@ munmap_back: /* Can we just expand an old anonymous mapping? */ if (!file && !(vm_flags & VM_SHARED) && rb_parent) if (vma_merge(mm, prev, rb_parent, addr, addr + len, - vm_flags, NULL, 0) + vm_flags, NULL, pgoff, NULL)) goto out; /* @@ -656,6 +659,7 @@ munmap_back: vma->vm_file = NULL; vma->vm_private_data = NULL; vma->vm_next = NULL; + mpol_set_vma_default(vma); INIT_LIST_HEAD(&vma->shared); if (file) { @@ -695,7 +699,9 @@ munmap_back: addr = vma->vm_start; if (!file || !rb_parent || !vma_merge(mm, prev, rb_parent, addr, - addr + len, vma->vm_flags, file, pgoff)) { + vma->vm_end, + vma->vm_flags, file, pgoff, + vma_policy(vma))) { vma_link(mm, vma, prev, rb_link, rb_parent); if (correct_wcount) atomic_inc(&inode->i_writecount); @@ -705,6 +711,7 @@ munmap_back: atomic_inc(&inode->i_writecount); fput(file); } + mpol_free(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); } out: @@ -1120,6 +1127,7 @@ static void unmap_vma(struct mm_struct * remove_shared_vm_struct(area); + mpol_free(vma_policy(area)); if (area->vm_ops && area->vm_ops->close) area->vm_ops->close(area); if (area->vm_file) @@ -1202,6 +1210,7 @@ detach_vmas_to_be_unmapped(struct mm_str int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, int new_below) { + struct mempolicy *pol; struct vm_area_struct *new; struct address_space *mapping = NULL; @@ -1224,6 +1233,13 @@ int split_vma(struct mm_struct * mm, str new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } + pol = mpol_copy(vma_policy(vma)); + if (IS_ERR(pol)) { + kmem_cache_free(vm_area_cachep, new); + return PTR_ERR(pol); + } + vma_set_policy(new, pol); + if (new->vm_file) get_file(new->vm_file); @@ -1234,7 +1250,7 @@ int split_vma(struct mm_struct * mm, str mapping = vma->vm_file->f_mapping; if (mapping) - down(&mapping->i_shared_sem); + spin_lock(&mapping->i_shared_lock); spin_lock(&mm->page_table_lock); if (new_below) { @@ -1247,7 +1263,7 @@ int split_vma(struct mm_struct * mm, str
spin_unlock(&mm->page_table_lock); if (mapping) - up(&mapping->i_shared_sem); + spin_unlock(&mapping->i_shared_lock); return 0; } @@ -1393,7 +1409,7 @@ unsigned long do_brk(unsigned long addr, /* Can we just expand an old anonymous mapping? */ if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len, - flags, NULL, 0)) + flags, NULL, 0, NULL)) goto out; /* @@ -1414,6 +1430,7 @@ unsigned long do_brk(unsigned long addr, vma->vm_pgoff = 0; vma->vm_file = NULL; vma->vm_private_data = NULL; + mpol_set_vma_default(vma); INIT_LIST_HEAD(&vma->shared); vma_link(mm, vma, prev, rb_link, rb_parent); @@ -1474,6 +1491,7 @@ void exit_mmap(struct mm_struct *mm) } if (vma->vm_file) fput(vma->vm_file); + mpol_free(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); vma = next; } @@ -1481,7 +1499,7 @@ void exit_mmap(struct mm_struct *mm) /* Insert vm structure into process list sorted by address * and into the inode's i_mmap ring. If vm_file is non-NULL - * then i_shared_sem is taken here. + * then i_shared_lock is taken here. */ void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) { @@ -1506,10 +1524,11 @@ struct vm_area_struct *copy_vma(struct v struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; + struct mempolicy *pol; find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); new_vma = vma_merge(mm, prev, rb_parent, addr, addr + len, - vma->vm_flags, vma->vm_file, pgoff); + vma->vm_flags, vma->vm_file, pgoff, vma_policy(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma @@ -1521,6 +1540,12 @@ struct vm_area_struct *copy_vma(struct v new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (new_vma) { *new_vma = *vma; + pol = mpol_copy(vma_policy(vma)); + if (IS_ERR(pol)) { + kmem_cache_free(vm_area_cachep, new_vma); + return NULL; + } + vma_set_policy(new_vma, pol); INIT_LIST_HEAD(&new_vma->shared); new_vma->vm_start = addr; new_vma->vm_end = addr + len; --- diff/mm/mprotect.c 2004-04-21 10:45:36.026226680 +0100 +++ source/mm/mprotect.c 2004-04-21 10:46:51.811705536 +0100 @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -125,6 +124,8 @@ mprotect_attempt_merge(struct vm_area_st return 0; if (vma->vm_file || (vma->vm_flags & VM_SHARED)) return 0; + if (!vma_mpol_equal(vma, prev)) + return 0; /* * If the whole area changes to the protection of the previous one @@ -136,6 +137,7 @@ mprotect_attempt_merge(struct vm_area_st __vma_unlink(mm, vma, prev); spin_unlock(&mm->page_table_lock); + mpol_free(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); mm->map_count--; return 1; @@ -318,12 +320,14 @@ sys_mprotect(unsigned long start, size_t if (next && prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags) && + vma_mpol_equal(prev, next) && !prev->vm_file && !(prev->vm_flags & VM_SHARED)) { spin_lock(&prev->vm_mm->page_table_lock); prev->vm_end = next->vm_end; __vma_unlink(prev->vm_mm, next, prev); spin_unlock(&prev->vm_mm->page_table_lock); + mpol_free(vma_policy(next)); kmem_cache_free(vm_area_cachep, next); prev->vm_mm->map_count--; } --- diff/mm/mremap.c 2004-04-21 10:45:36.027226528 +0100 +++ source/mm/mremap.c 2004-04-21 10:46:51.811705536 +0100 @@ -19,7 +19,6 @@ #include #include -#include #include #include @@ -79,21 +78,19 @@ static inline pte_t *alloc_one_pte_map(s return pte; } -static void -copy_one_pte(struct vm_area_struct *vma, unsigned long old_addr, - pte_t *src, pte_t *dst, struct pte_chain **pte_chainp) +static inline int 
+can_move_one_pte(pte_t *src, unsigned long new_addr) { - pte_t pte = ptep_clear_flush(vma, old_addr, src); - set_pte(dst, pte); - - if (pte_present(pte)) { - unsigned long pfn = pte_pfn(pte); + int move = 1; + if (pte_present(*src)) { + unsigned long pfn = pte_pfn(*src); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); - page_remove_rmap(page, src); - *pte_chainp = page_add_rmap(page, dst, *pte_chainp); + if (PageAnon(page)) + move = mremap_move_anon_rmap(page, new_addr); } } + return move; } static int @@ -103,13 +100,7 @@ move_one_page(struct vm_area_struct *vma struct mm_struct *mm = vma->vm_mm; int error = 0; pte_t *src, *dst; - struct pte_chain *pte_chain; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - error = -ENOMEM; - goto out; - } spin_lock(&mm->page_table_lock); src = get_one_pte_map_nested(mm, old_addr); if (src) { @@ -130,23 +121,26 @@ move_one_page(struct vm_area_struct *vma * page_table_lock, we should re-check the src entry... */ if (src) { - if (dst) - copy_one_pte(vma, old_addr, src, - dst, &pte_chain); - else + if (!dst) error = -ENOMEM; + else if (!can_move_one_pte(src, new_addr)) + error = -EAGAIN; + else { + pte_t pte; + pte = ptep_clear_flush(vma, old_addr, src); + set_pte(dst, pte); + } pte_unmap_nested(src); } pte_unmap(dst); } spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); -out: return error; } static int move_page_tables(struct vm_area_struct *vma, - unsigned long new_addr, unsigned long old_addr, unsigned long len) + unsigned long new_addr, unsigned long old_addr, + unsigned long len, int *cows) { unsigned long offset; @@ -158,8 +152,23 @@ static int move_page_tables(struct vm_ar * only a few pages.. This also makes error recovery easier. */ for (offset = 0; offset < len; offset += PAGE_SIZE) { - if (move_one_page(vma, old_addr+offset, new_addr+offset) < 0) + int ret = move_one_page(vma, old_addr+offset, new_addr+offset); + /* + * The anonmm objrmap can only track anon page movements + * if the page is exclusive to one mm. In the rare case + * when mremap move is applied to a shared page, break + * COW (take a copy of the page) to make it exclusive. + * If shared while on swap, page will be copied when + * brought back in (if it's still shared by then). + */ + if (ret == -EAGAIN) { + ret = make_page_exclusive(vma, old_addr+offset); + offset -= PAGE_SIZE; + (*cows)++; + } + if (ret) break; + cond_resched(); } return offset; } @@ -176,6 +185,7 @@ static unsigned long move_vma(struct vm_ unsigned long moved_len; unsigned long excess = 0; int split = 0; + int cows = 0; /* * We'd prefer to avoid failure later on in do_munmap: @@ -197,23 +207,26 @@ static unsigned long move_vma(struct vm_ * and we propagate stale pages into the dst afterward. */ mapping = vma->vm_file->f_mapping; - down(&mapping->i_shared_sem); + spin_lock(&mapping->i_shared_lock); } - moved_len = move_page_tables(vma, new_addr, old_addr, old_len); + moved_len = move_page_tables(vma, new_addr, old_addr, old_len, &cows); if (moved_len < old_len) { /* * On error, move entries back from new area to old, * which will succeed since page tables still there, * and then proceed to unmap new area instead of old. 
*/ - move_page_tables(new_vma, old_addr, new_addr, moved_len); + move_page_tables(new_vma, old_addr, new_addr, moved_len, &cows); vma = new_vma; old_len = new_len; old_addr = new_addr; new_addr = -ENOMEM; } + if (cows) /* Downgrade or remove this message later */ + printk(KERN_WARNING "%s: mremap moved %d cows\n", + current->comm, cows); if (mapping) - up(&mapping->i_shared_sem); + spin_unlock(&mapping->i_shared_lock); /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT) { --- diff/mm/msync.c 2004-02-09 10:36:12.000000000 +0000 +++ source/mm/msync.c 2004-04-21 10:46:51.811705536 +0100 @@ -13,7 +13,6 @@ #include #include -#include #include /* --- diff/mm/nommu.c 2004-02-09 10:36:12.000000000 +0000 +++ source/mm/nommu.c 2004-04-21 10:46:51.812705384 +0100 @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -567,7 +566,3 @@ unsigned long get_unmapped_area(struct f { return -ENOMEM; } - -void pte_chain_init(void) -{ -} --- diff/mm/page-writeback.c 2004-04-21 10:45:36.027226528 +0100 +++ source/mm/page-writeback.c 2004-04-21 10:46:51.813705232 +0100 @@ -91,6 +91,7 @@ int block_dump; * Flag that puts the machine in "laptop mode". */ int laptop_mode; +EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ @@ -580,6 +581,18 @@ int __set_page_dirty_nobuffers(struct pa EXPORT_SYMBOL(__set_page_dirty_nobuffers); /* + * When a writepage implementation decides that it doesn't want to write this + * page for some reason, it should redirty the locked page via + * redirty_page_for_writepage() and it should then unlock the page and return 0 + */ +int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) +{ + wbc->pages_skipped++; + return __set_page_dirty_nobuffers(page); +} +EXPORT_SYMBOL(redirty_page_for_writepage); + +/* * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. 
*/ --- diff/mm/page_alloc.c 2004-04-21 10:45:36.029226224 +0100 +++ source/mm/page_alloc.c 2004-04-21 10:46:51.813705232 +0100 @@ -460,6 +460,32 @@ void drain_local_pages(void) } #endif /* CONFIG_PM */ +static void zone_statistics(struct zonelist *zonelist, struct zone *z) +{ +#ifdef CONFIG_NUMA + unsigned long flags; + int cpu; + pg_data_t *pg = z->zone_pgdat; + pg_data_t *orig = zonelist->zones[0]->zone_pgdat; + struct per_cpu_pageset *p; + + local_irq_save(flags); + cpu = smp_processor_id(); + p = &z->pageset[cpu]; + if (pg == orig) { + z->pageset[cpu].numa_hit++; + } else { + p->numa_miss++; + zonelist->zones[0]->pageset[cpu].numa_foreign++; + } + if (pg == NODE_DATA(numa_node_id())) + p->local_node++; + else + p->other_node++; + local_irq_restore(flags); +#endif +} + /* * Free a 0-order page */ @@ -593,8 +619,10 @@ __alloc_pages(unsigned int gfp_mask, uns if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { page = buffered_rmqueue(z, order, gfp_mask); - if (page) + if (page) { + zone_statistics(zonelist, z); goto got_pg; + } } } @@ -616,8 +644,10 @@ __alloc_pages(unsigned int gfp_mask, uns if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { page = buffered_rmqueue(z, order, gfp_mask); - if (page) + if (page) { + zone_statistics(zonelist, z); goto got_pg; + } } } @@ -630,8 +660,10 @@ rebalance: struct zone *z = zones[i]; page = buffered_rmqueue(z, order, gfp_mask); - if (page) + if (page) { + zone_statistics(zonelist, z); goto got_pg; + } } goto nopage; } @@ -658,8 +690,10 @@ rebalance: if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { page = buffered_rmqueue(z, order, gfp_mask); - if (page) + if (page) { + zone_statistics(zonelist, z); goto got_pg; + } } } --- diff/mm/pdflush.c 2004-04-21 10:45:36.030226072 +0100 +++ source/mm/pdflush.c 2004-04-21 10:46:51.814705080 +0100 @@ -88,6 +88,8 @@ struct pdflush_work { unsigned long when_i_went_to_sleep; }; +static int wakeup_count = 100; + static int __pdflush(struct pdflush_work *my_work) { current->flags |= PF_FLUSHER; @@ -114,7 +116,10 @@ static int __pdflush(struct pdflush_work spin_lock_irq(&pdflush_lock); if (!list_empty(&my_work->list)) { - printk("pdflush: bogus wakeup!\n"); + if (wakeup_count > 0) { + wakeup_count--; + printk("pdflush: bogus wakeup!\n"); + } my_work->fn = NULL; continue; } @@ -190,6 +195,7 @@ int pdflush_operation(void (*fn)(unsigne { unsigned long flags; int ret = 0; + static int poke_count = 0; if (fn == NULL) BUG(); /* Hard to diagnose if it's deferred */ @@ -198,9 +204,19 @@ int pdflush_operation(void (*fn)(unsigne if (list_empty(&pdflush_list)) { spin_unlock_irqrestore(&pdflush_lock, flags); ret = -1; + if (wakeup_count < 100 && poke_count < 10) { + printk("%s: no threads\n", __FUNCTION__); + dump_stack(); + poke_count++; + } } else { struct pdflush_work *pdf; + if (wakeup_count < 100 && poke_count < 10) { + printk("%s: found a thread\n", __FUNCTION__); + dump_stack(); + poke_count++; + } pdf = list_entry(pdflush_list.next, struct pdflush_work, list); list_del_init(&pdf->list); if (list_empty(&pdflush_list)) --- diff/mm/rmap.c 2004-04-21 10:45:36.031225920 +0100 +++ source/mm/rmap.c 2004-04-21 10:46:51.816704776 +0100 @@ -4,17 +4,14 @@ * Copyright 2001, Rik van Riel * Released under the General Public License (GPL). * - * - * Simple, low overhead pte-based reverse mapping scheme. - * This is kept modular because we may want to experiment - * with object-based reverse mapping schemes. Please try - * to keep this thing as modular as possible. 
+ * Simple, low overhead reverse mapping scheme. + * Please try to keep this thing as modular as possible. */ /* * Locking: - * - the page->pte.chain is protected by the PG_maplock bit, - * which nests within the the mm->page_table_lock, + * - the page->mapcount field is protected by the PG_maplock bit, + * which nests within the mm->page_table_lock, * which nests within the page lock. * - because swapout locking is opposite to the locking order * in the page fault path, the swapout path uses trylocks @@ -27,106 +24,349 @@ #include #include #include -#include -#include -#include -#include -#include #include /* - * Something oopsable to put for now in the page->mapping - * of an anonymous page, to test that it is ignored. - */ -#define ANON_MAPPING_DEBUG ((struct address_space *) 0xADB) + * struct anonmm: to track a bundle of anonymous memory mappings. + * + * Could be embedded in mm_struct, but mm_struct is rather heavyweight, + * and we may need the anonmm to stay around long after the mm_struct + * and its pgd have been freed: because pages originally faulted into + * that mm have been duped into forked mms, and still need tracking. + */ +struct anonmm { + atomic_t count; /* ref count, including 1 per page */ + spinlock_t lock; /* head's locks list; others unused */ + struct mm_struct *mm; /* assoc mm_struct, NULL when gone */ + struct anonmm *head; /* exec starts new chain from head */ + struct list_head list; /* chain of associated anonmms */ +}; +static kmem_cache_t *anonmm_cachep; -static inline void clear_page_anon(struct page *page) +/** + ** Functions for creating and destroying struct anonmm. + **/ + +void __init init_rmap(void) { - BUG_ON(page->mapping != ANON_MAPPING_DEBUG); - page->mapping = NULL; - ClearPageAnon(page); + anonmm_cachep = kmem_cache_create("anonmm", + sizeof(struct anonmm), 0, SLAB_PANIC, NULL, NULL); } -/* - * Shared pages have a chain of pte_chain structures, used to locate - * all the mappings to this page. We only need a pointer to the pte - * here, the page struct for the page table page contains the process - * it belongs to and the offset within that process. - * - * We use an array of pte pointers in this structure to minimise cache misses - * while traversing reverse maps. - */ -#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t)) - -/* - * next_and_idx encodes both the address of the next pte_chain and the - * offset of the lowest-index used pte in ptes[] (which is equal also - * to the offset of the highest-index unused pte in ptes[], plus one). 
- */ -struct pte_chain { - unsigned long next_and_idx; - pte_addr_t ptes[NRPTE]; -} ____cacheline_aligned; +int exec_rmap(struct mm_struct *mm) +{ + struct anonmm *anonmm; -kmem_cache_t *pte_chain_cache; + anonmm = kmem_cache_alloc(anonmm_cachep, SLAB_KERNEL); + if (unlikely(!anonmm)) + return -ENOMEM; -static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain) -{ - return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE); + atomic_set(&anonmm->count, 2); /* ref by mm and head */ + anonmm->lock = SPIN_LOCK_UNLOCKED; /* this lock is used */ + anonmm->mm = mm; + anonmm->head = anonmm; + INIT_LIST_HEAD(&anonmm->list); + mm->anonmm = anonmm; + return 0; } -static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr) +int dup_rmap(struct mm_struct *mm, struct mm_struct *oldmm) { - return (struct pte_chain *)(pte_chain_addr & ~NRPTE); + struct anonmm *anonmm; + struct anonmm *anonhd = oldmm->anonmm->head; + + anonmm = kmem_cache_alloc(anonmm_cachep, SLAB_KERNEL); + if (unlikely(!anonmm)) + return -ENOMEM; + + /* + * copy_mm calls us before dup_mmap has reset the mm fields, + * so reset rss ourselves before adding to anonhd's list, + * to keep away from this mm until it's worth examining. + */ + mm->rss = 0; + + atomic_set(&anonmm->count, 1); /* ref by mm */ + anonmm->lock = SPIN_LOCK_UNLOCKED; /* this lock is not used */ + anonmm->mm = mm; + anonmm->head = anonhd; + spin_lock(&anonhd->lock); + atomic_inc(&anonhd->count); /* ref by anonmm's head */ + list_add_tail(&anonmm->list, &anonhd->list); + spin_unlock(&anonhd->lock); + mm->anonmm = anonmm; + return 0; +} + +void exit_rmap(struct mm_struct *mm) +{ + struct anonmm *anonmm = mm->anonmm; + struct anonmm *anonhd = anonmm->head; + + mm->anonmm = NULL; + spin_lock(&anonhd->lock); + anonmm->mm = NULL; + if (atomic_dec_and_test(&anonmm->count)) { + BUG_ON(anonmm == anonhd); + list_del(&anonmm->list); + kmem_cache_free(anonmm_cachep, anonmm); + if (atomic_dec_and_test(&anonhd->count)) + BUG(); + } + spin_unlock(&anonhd->lock); + if (atomic_read(&anonhd->count) == 1) { + BUG_ON(anonhd->mm); + BUG_ON(!list_empty(&anonhd->list)); + kmem_cache_free(anonmm_cachep, anonhd); + } } -static inline int pte_chain_idx(struct pte_chain *pte_chain) -{ - return pte_chain->next_and_idx & NRPTE; +static void free_anonmm(struct anonmm *anonmm) +{ + struct anonmm *anonhd = anonmm->head; + + BUG_ON(anonmm->mm); + BUG_ON(anonmm == anonhd); + spin_lock(&anonhd->lock); + list_del(&anonmm->list); + if (atomic_dec_and_test(&anonhd->count)) + BUG(); + spin_unlock(&anonhd->lock); + kmem_cache_free(anonmm_cachep, anonmm); } -static inline unsigned long -pte_chain_encode(struct pte_chain *pte_chain, int idx) +static inline void clear_page_anon(struct page *page) { - return (unsigned long)pte_chain | idx; + struct anonmm *anonmm = (struct anonmm *) page->mapping; + + page->mapping = NULL; + ClearPageAnon(page); + if (atomic_dec_and_test(&anonmm->count)) + free_anonmm(anonmm); } +/** + ** VM stuff below this comment + **/ + /* - * pte_chain list management policy: - * - * - If a page has a pte_chain list then it is shared by at least two processes, - * because a single sharing uses PageDirect. (Well, this isn't true yet, - * coz this code doesn't collapse singletons back to PageDirect on the remove - * path). - * - A pte_chain list has free space only in the head member - all succeeding - * members are 100% full. - * - If the head element has free space, it occurs in its leading slots. 
- * - All free space in the pte_chain is at the start of the head member. - * - Insertion into the pte_chain puts a pte pointer in the last free slot of - * the head member. - * - Removal from a pte chain moves the head pte of the head member onto the - * victim pte and frees the head member if it became empty. + * At what user virtual address is pgoff expected in file-backed vma? */ +static inline +unsigned long vma_address(struct vm_area_struct *vma, pgoff_t pgoff) +{ + unsigned long address; + + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + return (address >= vma->vm_start && address < vma->vm_end)? + address: -EFAULT; +} /** - ** VM stuff below this comment + ** Subfunctions of page_referenced: page_referenced_one called + ** repeatedly from either page_referenced_anon or page_referenced_file. **/ +static int page_referenced_one(struct page *page, + struct mm_struct *mm, unsigned long address, + unsigned int *mapcount, int *failed) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + int referenced = 0; + + if (!spin_trylock(&mm->page_table_lock)) { + /* + * For debug we're currently warning if not all found, + * but in this case that's expected: suppress warning. + */ + (*failed)++; + return 0; + } + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out_unlock; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out_unlock; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + if (ptep_test_and_clear_young(pte)) + referenced++; + + (*mapcount)--; + +out_unmap: + pte_unmap(pte); + +out_unlock: + spin_unlock(&mm->page_table_lock); + return referenced; +} + +static inline int page_referenced_anon(struct page *page) +{ + unsigned int mapcount = page->mapcount; + struct anonmm *anonmm = (struct anonmm *) page->mapping; + struct anonmm *anonhd = anonmm->head; + struct anonmm *new_anonmm = anonmm; + struct list_head *seek_head; + int referenced = 0; + int failed = 0; + + spin_lock(&anonhd->lock); + /* + * First try the indicated mm, it's the most likely. + * Make a note to migrate the page if this mm is extinct. + */ + if (!anonmm->mm) + new_anonmm = NULL; + else if (anonmm->mm->rss) { + referenced += page_referenced_one(page, + anonmm->mm, page->index, &mapcount, &failed); + if (!mapcount) + goto out; + } + + /* + * Then down the rest of the list, from that as the head. Stop + * when we reach anonhd? No: although a page cannot get dup'ed + * into an older mm, once swapped, its indicated mm may not be + * the oldest, just the first into which it was faulted back. + * If original mm now extinct, note first to contain the page. + */ + seek_head = &anonmm->list; + list_for_each_entry(anonmm, seek_head, list) { + if (!anonmm->mm || !anonmm->mm->rss) + continue; + referenced += page_referenced_one(page, + anonmm->mm, page->index, &mapcount, &failed); + if (!new_anonmm && mapcount < page->mapcount) + new_anonmm = anonmm; + if (!mapcount) { + anonmm = (struct anonmm *) page->mapping; + if (new_anonmm == anonmm) + goto out; + goto migrate; + } + } + + /* + * The warning below may appear if page_referenced catches the + * page in between page_add_rmap and its replacement demanded + * by mremap_moved_anon_page: so remove the warning once we're + * convinced that anonmm rmap really is finding its pages. 
+ */ + WARN_ON(!failed); +out: + spin_unlock(&anonhd->lock); + return referenced; + +migrate: + /* + * Migrate pages away from an extinct mm, so that its anonmm + * can be freed in due course: we could leave this to happen + * through the natural attrition of try_to_unmap, but that + * would miss locked pages and frequently referenced pages. + */ + spin_unlock(&anonhd->lock); + page->mapping = (void *) new_anonmm; + atomic_inc(&new_anonmm->count); + if (atomic_dec_and_test(&anonmm->count)) + free_anonmm(anonmm); + return referenced; +} + +/** + * page_referenced_file - referenced check for object-based rmap + * @page: the page we're checking references on. + * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + * + * The spinlock address_space->i_shared_lock is tried. If it can't be gotten, + * assume a reference count of 0, so try_to_unmap will then have a go. + */ +static inline int page_referenced_file(struct page *page) +{ + unsigned int mapcount = page->mapcount; + struct address_space *mapping = page->mapping; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct vm_area_struct *vma; + unsigned long address; + int referenced = 0; + int failed = 0; + + if (!spin_trylock(&mapping->i_shared_lock)) + return 0; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + address = vma_address(vma, pgoff); + if (address == -EFAULT) + continue; + if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) + == (VM_LOCKED|VM_MAYSHARE)) { + referenced++; + goto out; + } + if (vma->vm_mm->rss) { + referenced += page_referenced_one(page, + vma->vm_mm, address, &mapcount, &failed); + if (!mapcount) + goto out; + } + } + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + failed++; + continue; + } + address = vma_address(vma, pgoff); + if (address == -EFAULT) + continue; + if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) { + referenced++; + goto out; + } + if (vma->vm_mm->rss) { + referenced += page_referenced_one(page, + vma->vm_mm, address, &mapcount, &failed); + if (!mapcount) + goto out; + } + } + + WARN_ON(!failed); +out: + spin_unlock(&mapping->i_shared_lock); + return referenced; +} + /** * page_referenced - test if the page was referenced * @page: the page to test * * Quick test_and_clear_referenced for all mappings to a page, - * returns the number of processes which referenced the page. + * returns the number of ptes which referenced the page. * Caller needs to hold the rmap lock. - * - * If the page has a single-entry pte_chain, collapse that back to a PageDirect - * representation. This way, it's only done under memory pressure. */ -int fastcall page_referenced(struct page * page) +int fastcall page_referenced(struct page *page) { - struct pte_chain *pc; int referenced = 0; if (page_test_and_clear_young(page)) @@ -135,218 +375,175 @@ int fastcall page_referenced(struct page if (TestClearPageReferenced(page)) referenced++; - if (PageDirect(page)) { - pte_t *pte = rmap_ptep_map(page->pte.direct); - if (ptep_test_and_clear_young(pte)) - referenced++; - rmap_ptep_unmap(pte); - } else { - int nr_chains = 0; - - /* Check all the page tables mapping this page. 
*/ - for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) { - int i; - - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pte_paddr = pc->ptes[i]; - pte_t *p; - - p = rmap_ptep_map(pte_paddr); - if (ptep_test_and_clear_young(p)) - referenced++; - rmap_ptep_unmap(p); - nr_chains++; - } - } - if (nr_chains == 1) { - pc = page->pte.chain; - page->pte.direct = pc->ptes[NRPTE-1]; - SetPageDirect(page); - pc->ptes[NRPTE-1] = 0; - __pte_chain_free(pc); - } + if (page->mapcount && page->mapping) { + if (PageAnon(page)) + referenced += page_referenced_anon(page); + else + referenced += page_referenced_file(page); } return referenced; } /** - * page_add_rmap - add reverse mapping entry to a page - * @page: the page to add the mapping to - * @ptep: the page table entry mapping this page + * page_add_anon_rmap - add pte mapping to an anonymous page + * @page: the page to add the mapping to + * @mm: the mm in which the mapping is added + * @address: the user virtual address mapped * - * Add a new pte reverse mapping to a page. * The caller needs to hold the mm->page_table_lock. */ -struct pte_chain * fastcall -page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain) +void fastcall page_add_anon_rmap(struct page *page, + struct mm_struct *mm, unsigned long address) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *cur_pte_chain; + struct anonmm *anonmm = mm->anonmm; - if (PageReserved(page)) - return pte_chain; + BUG_ON(PageReserved(page)); + BUG_ON(page_mapping(page)); rmap_lock(page); - - if (page->pte.direct == 0) { - page->pte.direct = pte_paddr; - SetPageDirect(page); - if (!page->mapping) { - SetPageAnon(page); - page->mapping = ANON_MAPPING_DEBUG; - } + if (!page->mapcount) { + SetPageAnon(page); + page->index = address & PAGE_MASK; + page->mapping = (void *) anonmm; + atomic_inc(&anonmm->count); inc_page_state(nr_mapped); - goto out; } - - if (PageDirect(page)) { - /* Convert a direct pointer into a pte_chain */ - ClearPageDirect(page); - pte_chain->ptes[NRPTE-1] = page->pte.direct; - pte_chain->ptes[NRPTE-2] = pte_paddr; - pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2); - page->pte.direct = 0; - page->pte.chain = pte_chain; - pte_chain = NULL; /* We consumed it */ - goto out; - } - - cur_pte_chain = page->pte.chain; - if (cur_pte_chain->ptes[0]) { /* It's full */ - pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain, - NRPTE - 1); - page->pte.chain = pte_chain; - pte_chain->ptes[NRPTE-1] = pte_paddr; - pte_chain = NULL; /* We consumed it */ - goto out; - } - cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr; - cur_pte_chain->next_and_idx--; -out: + page->mapcount++; rmap_unlock(page); - return pte_chain; } /** - * page_remove_rmap - take down reverse mapping to a page - * @page: page to remove mapping from - * @ptep: page table entry to remove + * page_add_file_rmap - add pte mapping to a file page + * @page: the page to add the mapping to * - * Removes the reverse mapping from the pte_chain of the page, - * after that the caller can clear the page table entry and free - * the page. - * Caller needs to hold the mm->page_table_lock. + * The caller needs to hold the mm->page_table_lock. 
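+ *
+ * (Illustration only, not from the patch: the usual caller is the
+ * fault path, roughly
+ *
+ *	spin_lock(&mm->page_table_lock);
+ *	set_pte(pte, mk_pte(page, vma->vm_page_prot));
+ *	page_add_file_rmap(page);
+ *	spin_unlock(&mm->page_table_lock);
+ *
+ * with the PG_maplock rmap lock taken inside page_add_file_rmap
+ * itself.)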
*/ -void fastcall page_remove_rmap(struct page *page, pte_t *ptep) +void fastcall page_add_file_rmap(struct page *page) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *pc; - + BUG_ON(PageAnon(page)); if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) return; rmap_lock(page); + if (!page->mapcount) + inc_page_state(nr_mapped); + page->mapcount++; + rmap_unlock(page); +} - if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ +/** + * page_remove_rmap - take down pte mapping from a page + * @page: page to remove mapping from + * + * Caller needs to hold the mm->page_table_lock. + */ +void fastcall page_remove_rmap(struct page *page) +{ + BUG_ON(PageReserved(page)); + BUG_ON(!page->mapcount); - if (PageDirect(page)) { - if (page->pte.direct == pte_paddr) { - page->pte.direct = 0; - ClearPageDirect(page); - goto out; - } - } else { - struct pte_chain *start = page->pte.chain; - struct pte_chain *next; - int victim_i = pte_chain_idx(start); - - for (pc = start; pc; pc = next) { - int i; - - next = pte_chain_next(pc); - if (next) - prefetch(next); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pa = pc->ptes[i]; - - if (pa != pte_paddr) - continue; - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - if (victim_i == NRPTE-1) { - /* Emptied a pte_chain */ - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - } else { - start->next_and_idx++; - } - goto out; - } - } - } -out: - if (!page_mapped(page)) { + rmap_lock(page); + page->mapcount--; + if (!page->mapcount) { if (page_test_and_clear_dirty(page)) set_page_dirty(page); if (PageAnon(page)) clear_page_anon(page); dec_page_state(nr_mapped); } -out_unlock: rmap_unlock(page); } /** - * try_to_unmap_one - worker function for try_to_unmap - * @page: page to unmap - * @ptep: page table entry to unmap from page - * - * Internal helper function for try_to_unmap, called for each page - * table entry mapping a page. Because locking order here is opposite - * to the locking order used by the page fault path, we use trylocks. - * Locking: - * page lock shrink_list(), trylock - * rmap lock shrink_list() - * mm->page_table_lock try_to_unmap_one(), trylock - */ -static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr) -{ - pte_t *ptep = rmap_ptep_map(paddr); - unsigned long address = ptep_to_address(ptep); - struct mm_struct * mm = ptep_to_mm(ptep); - struct vm_area_struct * vma; - pte_t pte; - int ret; + * mremap_move_anon_rmap - try to note new address of anonymous page + * @page: page about to be moved + * @address: user virtual address at which it is going to be mapped + * + * Returns boolean, true if page is not shared, so address updated. + * + * For mremap's can_move_one_page: to update address when vma is moved, + * provided that anon page is not shared with a parent or child mm. + * If it is shared, then caller must take a copy of the page instead: + * not very clever, but too rare a case to merit cleverness. + */ +int fastcall mremap_move_anon_rmap(struct page *page, unsigned long address) +{ + int move = 0; + if (page->mapcount == 1) { + rmap_lock(page); + if (page->mapcount == 1) { + page->index = address & PAGE_MASK; + move = 1; + } + rmap_unlock(page); + } + return move; +} - if (!mm) - BUG(); +/** + ** Subfunctions of try_to_unmap: try_to_unmap_one called + ** repeatedly from either try_to_unmap_anon or try_to_unmap_file. 
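+ **
+ ** (Locking reminder, as in the comment at the top of this file: the
+ ** pageout path already holds the page lock and the rmap lock, and then
+ ** takes mm->page_table_lock only by trylock, because the page fault
+ ** path acquires these in the opposite order.)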
+ **/ + +static int try_to_unmap_one(struct page *page, + struct mm_struct *mm, unsigned long address, + unsigned int *mapcount, struct vm_area_struct *vma) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + pte_t pteval; + int ret = SWAP_AGAIN; /* * We need the page_table_lock to protect us from page faults, * munmap, fork, etc... */ - if (!spin_trylock(&mm->page_table_lock)) { - rmap_ptep_unmap(ptep); - return SWAP_AGAIN; - } + if (!spin_trylock(&mm->page_table_lock)) + goto out; - /* unmap_vmas drops page_table_lock with vma unlinked */ - vma = find_vma(mm, address); - if (!vma) { - ret = SWAP_FAIL; + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out_unlock; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) goto out_unlock; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + (*mapcount)--; + + if (!vma) { + vma = find_vma(mm, address); + /* unmap_vmas drops page_table_lock with vma unlinked */ + if (!vma) + goto out_unmap; } - /* The page is mlock()d, we cannot swap it out. */ - if (vma->vm_flags & VM_LOCKED) { + /* + * If the page is mlock()d, we cannot swap it out. + * If it's recently referenced (perhaps page_referenced + * skipped over this mm) then we should reactivate it. + */ + if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || + ptep_test_and_clear_young(pte)) { ret = SWAP_FAIL; - goto out_unlock; + goto out_unmap; } /* Nuke the page table entry. */ flush_cache_page(vma, address); - pte = ptep_clear_flush(vma, address, ptep); + pteval = ptep_clear_flush(vma, address, pte); + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pteval)) + set_page_dirty(page); if (PageAnon(page)) { swp_entry_t entry = { .val = page->private }; @@ -356,189 +553,311 @@ static int fastcall try_to_unmap_one(str */ BUG_ON(!PageSwapCache(page)); swap_duplicate(entry); - set_pte(ptep, swp_entry_to_pte(entry)); - BUG_ON(pte_file(*ptep)); - } else { - /* - * If a nonlinear mapping then store the file page offset - * in the pte. - */ - BUG_ON(!page->mapping); - if (page->index != linear_page_index(vma, address)) { - set_pte(ptep, pgoff_to_pte(page->index)); - BUG_ON(!pte_file(*ptep)); - } + set_pte(pte, swp_entry_to_pte(entry)); + BUG_ON(pte_file(*pte)); } - /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pte)) - set_page_dirty(page); - mm->rss--; + BUG_ON(!page->mapcount); + page->mapcount--; page_cache_release(page); - ret = SWAP_SUCCESS; + +out_unmap: + pte_unmap(pte); out_unlock: - rmap_ptep_unmap(ptep); spin_unlock(&mm->page_table_lock); + +out: return ret; } -/** - * try_to_unmap - try to remove all page table mappings to a page - * @page: the page to get unmapped - * - * Tries to remove all the page table entries which are mapping this - * page, used in the pageout path. Caller must hold the page lock - * and its rmap lock. Return values are: - * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a trylock, try again later - * SWAP_FAIL - the page is unswappable +/* + * try_to_unmap_cluster is only used on VM_NONLINEAR shared object vmas, + * in which objrmap is unable to predict where a page will be found. 
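+ *
+ * (Illustration only, not from the patch: such vmas are created by
+ * remap_file_pages(2), e.g.
+ *
+ *	addr = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+ *	remap_file_pages(addr, pagesz, 0, pgoff, 0);
+ *
+ * after which page->index no longer determines a page's virtual
+ * address, so the vma is swept in CLUSTER_SIZE chunks instead.)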
*/ -int fastcall try_to_unmap(struct page * page) +#define CLUSTER_SIZE (32 * PAGE_SIZE) +#if CLUSTER_SIZE > PMD_SIZE +#undef CLUSTER_SIZE +#define CLUSTER_SIZE PMD_SIZE +#endif +#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) + +static int try_to_unmap_cluster(struct mm_struct *mm, unsigned long cursor, + unsigned int *mapcount, struct vm_area_struct *vma) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + pte_t pteval; + struct page *page; + unsigned long address; + unsigned long end; + unsigned long pfn; + + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... + */ + if (!spin_trylock(&mm->page_table_lock)) + return SWAP_FAIL; + + address = (vma->vm_start + cursor) & CLUSTER_MASK; + end = address + CLUSTER_SIZE; + if (address < vma->vm_start) + address = vma->vm_start; + if (end > vma->vm_end) + end = vma->vm_end; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out_unlock; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out_unlock; + + for (pte = pte_offset_map(pmd, address); + address < end; pte++, address += PAGE_SIZE) { + + if (!pte_present(*pte)) + continue; + + pfn = pte_pfn(*pte); + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + BUG_ON(PageAnon(page)); + if (PageReserved(page)) + continue; + + if (ptep_test_and_clear_young(pte)) + continue; + + /* Nuke the page table entry. */ + flush_cache_page(vma, address); + pteval = ptep_clear_flush(vma, address, pte); + + /* If nonlinear, store the file page offset in the pte. */ + if (page->index != linear_page_index(vma, address)) + set_pte(pte, pgoff_to_pte(page->index)); + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pteval)) + set_page_dirty(page); + + page_remove_rmap(page); + page_cache_release(page); + mm->rss--; + (*mapcount)--; + } + + pte_unmap(pte); + +out_unlock: + spin_unlock(&mm->page_table_lock); + return SWAP_AGAIN; +} + +static inline int try_to_unmap_anon(struct page *page) { - struct pte_chain *pc, *next_pc, *start; - int ret = SWAP_SUCCESS; - int victim_i; + unsigned int mapcount = page->mapcount; + struct anonmm *anonmm = (struct anonmm *) page->mapping; + struct anonmm *anonhd = anonmm->head; + struct list_head *seek_head; + int ret = SWAP_AGAIN; - /* This page should not be on the pageout lists. */ - if (PageReserved(page)) - BUG(); - if (!PageLocked(page)) - BUG(); + spin_lock(&anonhd->lock); + /* + * First try the indicated mm, it's the most likely. + */ + if (anonmm->mm && anonmm->mm->rss) { + ret = try_to_unmap_one(page, + anonmm->mm, page->index, &mapcount, NULL); + if (ret == SWAP_FAIL || !mapcount) + goto out; + } - if (PageDirect(page)) { - ret = try_to_unmap_one(page, page->pte.direct); - if (ret == SWAP_SUCCESS) { - page->pte.direct = 0; - ClearPageDirect(page); - } - goto out; + /* + * Then down the rest of the list, from that as the head. Stop + * when we reach anonhd? No: although a page cannot get dup'ed + * into an older mm, once swapped, its indicated mm may not be + * the oldest, just the first into which it was faulted back. 
+ */ + seek_head = &anonmm->list; + list_for_each_entry(anonmm, seek_head, list) { + if (!anonmm->mm || !anonmm->mm->rss) + continue; + ret = try_to_unmap_one(page, + anonmm->mm, page->index, &mapcount, NULL); + if (ret == SWAP_FAIL || !mapcount) + goto out; + } +out: + spin_unlock(&anonhd->lock); + return ret; +} - start = page->pte.chain; - victim_i = pte_chain_idx(start); - for (pc = start; pc; pc = next_pc) { - int i; - - next_pc = pte_chain_next(pc); - if (next_pc) - prefetch(next_pc); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pte_paddr = pc->ptes[i]; - - switch (try_to_unmap_one(page, pte_paddr)) { - case SWAP_SUCCESS: - /* - * Release a slot. If we're releasing the - * first pte in the first pte_chain then - * pc->ptes[i] and start->ptes[victim_i] both - * refer to the same thing. It works out. - */ - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - victim_i++; - if (victim_i == NRPTE) { - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - start = page->pte.chain; - victim_i = 0; - } else { - start->next_and_idx++; - } - break; - case SWAP_AGAIN: - /* Skip this pte, remembering status. */ - ret = SWAP_AGAIN; +/** + * try_to_unmap_file - unmap file page using the object-based rmap method + * @page: the page to unmap + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * This function is only called from try_to_unmap for object-based pages. + * + * The spinlock address_space->i_shared_lock is tried. If it can't be gotten, + * return a temporary error. + */ +static inline int try_to_unmap_file(struct page *page) +{ + unsigned int mapcount = page->mapcount; + struct address_space *mapping = page->mapping; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct vm_area_struct *vma; + unsigned long address; + int ret = SWAP_AGAIN; + unsigned long cursor; + unsigned long max_nl_cursor = 0; + unsigned long max_nl_size = 0; + + if (!spin_trylock(&mapping->i_shared_lock)) + return ret; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + if (vma->vm_mm->rss) { + address = vma_address(vma, pgoff); + if (address == -EFAULT) continue; - case SWAP_FAIL: - ret = SWAP_FAIL; + ret = try_to_unmap_one(page, + vma->vm_mm, address, &mapcount, vma); + if (ret == SWAP_FAIL || !mapcount) goto out; - } } } -out: - if (!page_mapped(page)) { - if (page_test_and_clear_dirty(page)) - set_page_dirty(page); - if (PageAnon(page)) - clear_page_anon(page); - dec_page_state(nr_mapped); - ret = SWAP_SUCCESS; + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + /* + * Defer unmapping nonlinear to the next loop, + * but take notes while we're here e.g. don't + * want to loop again when no nonlinear vmas. + */ + if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) + continue; + cursor = (unsigned long) vma->vm_private_data; + if (cursor > max_nl_cursor) + max_nl_cursor = cursor; + cursor = vma->vm_end - vma->vm_start; + if (cursor > max_nl_size) + max_nl_size = cursor; + continue; + } + if (vma->vm_mm->rss) { + address = vma_address(vma, pgoff); + if (address == -EFAULT) + continue; + ret = try_to_unmap_one(page, + vma->vm_mm, address, &mapcount, vma); + if (ret == SWAP_FAIL || !mapcount) + goto out; + } } - return ret; -} -/** - ** No more VM stuff below this comment, only pte_chain helper - ** functions. 
- **/ + if (max_nl_size == 0) /* no nonlinear vmas of this file */ + goto out; -static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags) -{ - struct pte_chain *pc = p; + /* + * We don't try to search for this page in the nonlinear vmas, + * and page_referenced wouldn't have found it anyway. Instead + * just walk the nonlinear vmas trying to age and unmap some. + * The mapcount of the page we came in with is irrelevant, + * but even so use it as a guide to how hard we should try? + */ + rmap_unlock(page); - memset(pc, 0, sizeof(*pc)); -} + max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; + if (max_nl_cursor == 0) + max_nl_cursor = CLUSTER_SIZE; + + do { + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (VM_NONLINEAR != (vma->vm_flags & + (VM_NONLINEAR|VM_LOCKED|VM_RESERVED))) + continue; + cursor = (unsigned long) vma->vm_private_data; + while (vma->vm_mm->rss && + cursor < max_nl_cursor && + cursor < vma->vm_end - vma->vm_start) { + ret = try_to_unmap_cluster(vma->vm_mm, + cursor, &mapcount, vma); + if (ret == SWAP_FAIL) + break; + cursor += CLUSTER_SIZE; + vma->vm_private_data = (void *) cursor; + if ((int)mapcount <= 0) + goto relock; + cond_resched(); + } + if (ret != SWAP_FAIL) + vma->vm_private_data = + (void *) max_nl_cursor; + ret = SWAP_AGAIN; + } + max_nl_cursor += CLUSTER_SIZE; + } while (max_nl_cursor <= max_nl_size); -DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0; + /* + * Don't loop forever (perhaps all the remaining pages are + * in locked vmas). Reset cursor on all unreserved nonlinear + * vmas, now forgetting on which ones it had fallen behind. + */ + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if ((vma->vm_flags & (VM_NONLINEAR|VM_RESERVED)) == + VM_NONLINEAR) + vma->vm_private_data = 0; + } +relock: + rmap_lock(page); +out: + spin_unlock(&mapping->i_shared_lock); + return ret; +} /** - * __pte_chain_free - free pte_chain structure - * @pte_chain: pte_chain struct to free + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold the page lock + * and its rmap lock. Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable */ -void __pte_chain_free(struct pte_chain *pte_chain) +int fastcall try_to_unmap(struct page *page) { - struct pte_chain **pte_chainp; + int ret; - pte_chainp = &get_cpu_var(local_pte_chain); - if (pte_chain->next_and_idx) - pte_chain->next_and_idx = 0; - if (*pte_chainp) - kmem_cache_free(pte_chain_cache, *pte_chainp); - *pte_chainp = pte_chain; - put_cpu_var(local_pte_chain); -} + BUG_ON(PageReserved(page)); + BUG_ON(!PageLocked(page)); + BUG_ON(!page->mapcount); + + if (PageAnon(page)) + ret = try_to_unmap_anon(page); + else + ret = try_to_unmap_file(page); -/* - * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap(). - * - * The caller of page_add_rmap() must perform the allocation because - * page_add_rmap() is invariably called under spinlock. Often, page_add_rmap() - * will not actually use the pte_chain, because there is space available in one - * of the existing pte_chains which are attached to the page. So the case of - * allocating and then freeing a single pte_chain is specially optimised here, - * with a one-deep per-cpu cache. 
- */ -struct pte_chain *pte_chain_alloc(int gfp_flags) -{ - struct pte_chain *ret; - struct pte_chain **pte_chainp; - - might_sleep_if(gfp_flags & __GFP_WAIT); - - pte_chainp = &get_cpu_var(local_pte_chain); - if (*pte_chainp) { - ret = *pte_chainp; - *pte_chainp = NULL; - put_cpu_var(local_pte_chain); - } else { - put_cpu_var(local_pte_chain); - ret = kmem_cache_alloc(pte_chain_cache, gfp_flags); + if (!page->mapcount) { + if (page_test_and_clear_dirty(page)) + set_page_dirty(page); + if (PageAnon(page)) + clear_page_anon(page); + dec_page_state(nr_mapped); + ret = SWAP_SUCCESS; } return ret; } - -void __init pte_chain_init(void) -{ - pte_chain_cache = kmem_cache_create( "pte_chain", - sizeof(struct pte_chain), - sizeof(struct pte_chain), - 0, - pte_chain_ctor, - NULL); - - if (!pte_chain_cache) - panic("failed to create pte_chain cache!\n"); -} --- diff/mm/shmem.c 2004-04-21 10:45:36.032225768 +0100 +++ source/mm/shmem.c 2004-04-21 10:46:51.817704624 +0100 @@ -8,6 +8,7 @@ * 2002 Red Hat Inc. * Copyright (C) 2002-2003 Hugh Dickins. * Copyright (C) 2002-2003 VERITAS Software Corporation. + * Copyright (C) 2004 Andi Kleen, SuSE Labs * * This file is released under the GPL. */ @@ -37,8 +38,10 @@ #include #include #include +#include #include #include +#include /* This magic number is used in glibc for posix shared memory */ #define TMPFS_MAGIC 0x01021994 @@ -783,6 +786,74 @@ redirty: return WRITEPAGE_ACTIVATE; /* Return with the page locked */ } +#ifdef CONFIG_NUMA +static struct page *shmem_swapin_async(struct shared_policy *p, + swp_entry_t entry, unsigned long idx) +{ + struct page *page; + struct vm_area_struct pvma; + + /* Create a pseudo vma that just contains the policy */ + memset(&pvma, 0, sizeof(struct vm_area_struct)); + pvma.vm_end = PAGE_SIZE; + pvma.vm_pgoff = idx; + pvma.vm_policy = mpol_shared_policy_lookup(p, idx); + page = read_swap_cache_async(entry, &pvma, 0); + mpol_free(pvma.vm_policy); + return page; +} + +struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry, + unsigned long idx) +{ + struct shared_policy *p = &info->policy; + int i, num; + struct page *page; + unsigned long offset; + + num = valid_swaphandles(entry, &offset); + for (i = 0; i < num; offset++, i++) { + page = shmem_swapin_async(p, + swp_entry(swp_type(entry), offset), idx); + if (!page) + break; + page_cache_release(page); + } + lru_add_drain(); /* Push any new pages onto the LRU now */ + return shmem_swapin_async(p, entry, idx); +} + +static struct page * +shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info, + unsigned long idx) +{ + struct vm_area_struct pvma; + struct page *page; + + memset(&pvma, 0, sizeof(struct vm_area_struct)); + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); + pvma.vm_pgoff = idx; + pvma.vm_end = PAGE_SIZE; + page = alloc_page_vma(gfp, &pvma, 0); + mpol_free(pvma.vm_policy); + return page; +} +#else +static inline struct page * +shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx) +{ + swapin_readahead(entry, 0, NULL); + return read_swap_cache_async(entry, NULL, 0); +} + +static inline struct page * +shmem_alloc_page(unsigned long gfp,struct shmem_inode_info *info, + unsigned long idx) +{ + return alloc_page(gfp); +} +#endif + /* * shmem_getpage - either get the page from swap or allocate a new one * @@ -790,7 +861,8 @@ redirty: * vm. 
If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache */ -static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **pagep, enum sgp_type sgp, int *type) +static int shmem_getpage(struct inode *inode, unsigned long idx, + struct page **pagep, enum sgp_type sgp, int *type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); @@ -840,8 +912,7 @@ repeat: if (majmin == VM_FAULT_MINOR && type) inc_page_state(pgmajfault); majmin = VM_FAULT_MAJOR; - swapin_readahead(swap); - swappage = read_swap_cache_async(swap); + swappage = shmem_swapin(info, swap, idx); if (!swappage) { spin_lock(&info->lock); entry = shmem_swp_alloc(info, idx, sgp); @@ -946,7 +1017,9 @@ repeat: if (!filepage) { spin_unlock(&info->lock); - filepage = page_cache_alloc(mapping); + filepage = shmem_alloc_page(mapping_gfp_mask(mapping), + info, + idx); if (!filepage) { shmem_unacct_blocks(info->flags, 1); shmem_free_block(inode); @@ -1069,6 +1142,24 @@ static int shmem_populate(struct vm_area return 0; } +#ifdef CONFIG_NUMA +int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +{ + struct inode *i = vma->vm_file->f_dentry->d_inode; + return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); +} + +struct mempolicy * +shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) +{ + struct inode *i = vma->vm_file->f_dentry->d_inode; + unsigned long idx; + + idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); +} +#endif + void shmem_lock(struct file *file, int lock) { struct inode *inode = file->f_dentry->d_inode; @@ -1117,6 +1208,7 @@ shmem_get_inode(struct super_block *sb, info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); + mpol_shared_policy_init(&info->policy); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -1792,6 +1884,7 @@ static struct inode *shmem_alloc_inode(s static void shmem_destroy_inode(struct inode *inode) { + mpol_free_shared_policy(&SHMEM_I(inode)->policy); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } @@ -1808,9 +1901,9 @@ static void init_once(void *foo, kmem_ca static int init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", - sizeof(struct shmem_inode_info), - 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, - init_once, NULL); + sizeof(struct shmem_inode_info), + 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, + init_once, NULL); if (shmem_inode_cachep == NULL) return -ENOMEM; return 0; @@ -1876,6 +1969,10 @@ static struct super_operations shmem_ops static struct vm_operations_struct shmem_vm_ops = { .nopage = shmem_nopage, .populate = shmem_populate, +#ifdef CONFIG_NUMA + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, +#endif }; static struct super_block *shmem_get_sb(struct file_system_type *fs_type, --- diff/mm/slab.c 2004-04-21 10:45:36.034225464 +0100 +++ source/mm/slab.c 2004-04-21 10:46:51.818704472 +0100 @@ -135,11 +135,11 @@ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ SLAB_NO_REAP | SLAB_CACHE_DMA | \ SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ - SLAB_RECLAIM_ACCOUNT ) + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ - SLAB_RECLAIM_ACCOUNT) + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC) #endif /* @@ -1367,9 +1367,11 @@ next: up(&cache_chain_sem); 
unlock_cpu_hotplug(); opps: + if (!cachep && (flags & SLAB_PANIC)) + panic("kmem_cache_create(): failed to create slab `%s'\n", + name); return cachep; } - EXPORT_SYMBOL(kmem_cache_create); static inline void check_irq_off(void) @@ -1988,6 +1990,15 @@ cache_alloc_debugcheck_after(kmem_cache_ *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } + { + int objnr; + struct slab *slabp; + + slabp = GET_PAGE_SLAB(virt_to_page(objp)); + + objnr = (objp - slabp->s_mem) / cachep->objsize; + slab_bufctl(slabp)[objnr] = (unsigned long)caller; + } objp += obj_dbghead(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) { unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; @@ -2049,12 +2060,14 @@ static void free_block(kmem_cache_t *cac objnr = (objp - slabp->s_mem) / cachep->objsize; check_slabp(cachep, slabp); #if DEBUG +#if 0 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n", cachep->name, objp); BUG(); } #endif +#endif slab_bufctl(slabp)[objnr] = slabp->free; slabp->free = objnr; STATS_DEC_ACTIVE(cachep); @@ -2835,6 +2848,29 @@ struct seq_operations slabinfo_op = { .show = s_show, }; +static void do_dump_slabp(kmem_cache_t *cachep) +{ +#if DEBUG + struct list_head *q; + + check_irq_on(); + spin_lock_irq(&cachep->spinlock); + list_for_each(q,&cachep->lists.slabs_full) { + struct slab *slabp; + int i; + slabp = list_entry(q, struct slab, list); + for (i = 0; i < cachep->num; i++) { + unsigned long sym = slab_bufctl(slabp)[i]; + + printk("obj %p/%d: %p", slabp, i, (void *)sym); + print_symbol(" <%s>", sym); + printk("\n"); + } + } + spin_unlock_irq(&cachep->spinlock); +#endif +} + #define MAX_SLABINFO_WRITE 128 /** * slabinfo_write - Tuning for the slab allocator @@ -2875,9 +2911,11 @@ ssize_t slabinfo_write(struct file *file batchcount < 1 || batchcount > limit || shared < 0) { - res = -EINVAL; + do_dump_slabp(cachep); + res = 0; } else { - res = do_tune_cpucache(cachep, limit, batchcount, shared); + res = do_tune_cpucache(cachep, limit, + batchcount, shared); } break; } --- diff/mm/swap_state.c 2004-04-21 10:45:36.035225312 +0100 +++ source/mm/swap_state.c 2004-04-21 10:46:51.820704168 +0100 @@ -325,7 +325,8 @@ struct page * lookup_swap_cache(swp_entr * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. */ -struct page * read_swap_cache_async(swp_entry_t entry) +struct page *read_swap_cache_async(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) { struct page *found_page, *new_page = NULL; int err; @@ -349,7 +350,7 @@ struct page * read_swap_cache_async(swp_ * Get a new page to read into from swap. 
*/ if (!new_page) { - new_page = alloc_page(GFP_HIGHUSER); + new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr); if (!new_page) break; /* Out of memory */ } --- diff/mm/swapfile.c 2004-04-21 10:45:36.037225008 +0100 +++ source/mm/swapfile.c 2004-04-21 10:46:51.819704320 +0100 @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -455,19 +456,19 @@ void free_swap_and_cache(swp_entry_t ent /* vma->vm_mm->page_table_lock is held */ static void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { vma->vm_mm->rss++; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); - *pte_chainp = page_add_rmap(page, dir, *pte_chainp); + page_add_anon_rmap(page, vma->vm_mm, address); swap_free(entry); } /* vma->vm_mm->page_table_lock is held */ -static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, +static unsigned long unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long size, unsigned long offset, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { pte_t * pte; unsigned long end; @@ -492,10 +493,10 @@ static int unuse_pmd(struct vm_area_stru * Test inline before going to call unuse_pte. */ if (unlikely(pte_same(*pte, swp_pte))) { - unuse_pte(vma, offset + address, pte, - entry, page, pte_chainp); + unuse_pte(vma, offset + address, pte, entry, page); pte_unmap(pte); - return 1; + /* add 1 since address may be 0 */ + return 1 + offset + address; } address += PAGE_SIZE; pte++; @@ -505,12 +506,13 @@ static int unuse_pmd(struct vm_area_stru } /* vma->vm_mm->page_table_lock is held */ -static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, +static unsigned long unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long size, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { pmd_t * pmd; unsigned long offset, end; + unsigned long foundaddr; if (pgd_none(*dir)) return 0; @@ -528,9 +530,10 @@ static int unuse_pgd(struct vm_area_stru if (address >= end) BUG(); do { - if (unuse_pmd(vma, pmd, address, end - address, - offset, entry, page, pte_chainp)) - return 1; + foundaddr = unuse_pmd(vma, pmd, address, end - address, + offset, entry, page); + if (foundaddr) + return foundaddr; address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -538,17 +541,19 @@ static int unuse_pgd(struct vm_area_stru } /* vma->vm_mm->page_table_lock is held */ -static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) +static unsigned long unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, + swp_entry_t entry, struct page *page) { unsigned long start = vma->vm_start, end = vma->vm_end; + unsigned long foundaddr; if (start >= end) BUG(); do { - if (unuse_pgd(vma, pgdir, start, end - start, - entry, page, pte_chainp)) - return 1; + foundaddr = unuse_pgd(vma, pgdir, start, end - start, + entry, page); + if (foundaddr) + return foundaddr; start = (start + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } while (start && (start < end)); @@ -559,24 +564,27 @@ static int unuse_process(struct mm_struc swp_entry_t entry, struct page* page) { struct vm_area_struct* vma; - struct pte_chain *pte_chain; - - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - return -ENOMEM; + 
unsigned long foundaddr = 0; + int ret = 0; /* * Go through process' page directory. */ + down_read(&mm->mmap_sem); spin_lock(&mm->page_table_lock); for (vma = mm->mmap; vma; vma = vma->vm_next) { - pgd_t * pgd = pgd_offset(mm, vma->vm_start); - if (unuse_vma(vma, pgd, entry, page, &pte_chain)) - break; + if (!is_vm_hugetlb_page(vma)) { + pgd_t * pgd = pgd_offset(mm, vma->vm_start); + foundaddr = unuse_vma(vma, pgd, entry, page); + if (foundaddr) + break; + } } spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; + if (foundaddr && mremap_moved_anon_rmap(page, foundaddr)) + ret = make_page_exclusive(vma, foundaddr); + up_read(&mm->mmap_sem); + return ret; } /* @@ -677,7 +685,7 @@ static int try_to_unuse(unsigned int typ */ swap_map = &si->swap_map[i]; entry = swp_entry(type, i); - page = read_swap_cache_async(entry); + page = read_swap_cache_async(entry, NULL, 0); if (!page) { /* * Either swap_duplicate() failed because entry --- diff/mm/vmalloc.c 2004-02-18 08:54:13.000000000 +0000 +++ source/mm/vmalloc.c 2004-04-21 10:46:51.820704168 +0100 @@ -17,7 +17,6 @@ #include #include -#include #include --- diff/mm/vmscan.c 2004-04-21 10:45:36.038224856 +0100 +++ source/mm/vmscan.c 2004-04-21 10:46:51.821704016 +0100 @@ -33,7 +33,6 @@ #include #include -#include #include #include --- diff/net/Kconfig 2004-04-21 10:45:36.040224552 +0100 +++ source/net/Kconfig 2004-04-21 10:46:51.823703712 +0100 @@ -650,18 +650,17 @@ endmenu endmenu +config KGDBOE + def_bool X86 && KGDB + config NETPOLL - def_bool NETCONSOLE + def_bool NETCONSOLE || KGDBOE config NETPOLL_RX - bool "Netpoll support for trapping incoming packets" - default n - depends on NETPOLL + def_bool KGDBOE config NETPOLL_TRAP - bool "Netpoll traffic trapping" - default n - depends on NETPOLL + def_bool KGDBOE config NET_POLL_CONTROLLER def_bool NETPOLL --- diff/net/core/dev.c 2004-04-21 10:45:36.053222576 +0100 +++ source/net/core/dev.c 2004-04-21 10:46:51.822703864 +0100 @@ -1523,7 +1523,6 @@ static void sample_queue(unsigned long d } #endif - /** * netif_rx - post buffer to the network code * @skb: buffer to post @@ -1843,7 +1842,6 @@ static void net_rx_action(struct softirq unsigned long start_time = jiffies; int budget = netdev_max_backlog; - local_irq_disable(); while (!list_empty(&queue->poll_list)) { @@ -1869,6 +1867,10 @@ static void net_rx_action(struct softirq dev_put(dev); local_irq_disable(); } + +#ifdef CONFIG_KGDBOE + kgdb_process_breakpoint(); +#endif } out: local_irq_enable(); --- diff/net/socket.c 2004-03-11 10:20:29.000000000 +0000 +++ source/net/socket.c 2004-04-21 10:46:51.824703560 +0100 @@ -308,9 +308,9 @@ static void init_once(void * foo, kmem_c static int init_inodecache(void) { sock_inode_cachep = kmem_cache_create("sock_inode_cache", - sizeof(struct socket_alloc), - 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, - init_once, NULL); + sizeof(struct socket_alloc), + 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, + init_once, NULL); if (sock_inode_cachep == NULL) return -ENOMEM; return 0; --- diff/scripts/Makefile.modpost 2004-04-21 10:45:36.115213152 +0100 +++ source/scripts/Makefile.modpost 2004-04-21 10:46:51.824703560 +0100 @@ -1,34 +1,71 @@ # =========================================================================== # Module versions # =========================================================================== +# +# Stage one of module building created the following: +# a) The individual .o files used for the module +# b) A .o file which is the .o files above linked together +# c) A .mod file in $(MODVERDIR)/, listing the name of +# the preliminary .o file, plus all .o files + +# Stage 2 is handled by this file and does the following +# 1) Find all modules from the files listed in $(MODVERDIR)/ +# 2) modpost is then used to +# 3) create one .mod.c file per module +# 4) create one Module.symvers file with CRC for all exported symbols +# 5) compile all .mod.c files +# 6) final link of the module to a .ko file + +# Step 3 is used to place certain information in the module's ELF +# section, including information such as: +# Version magic (see include/vermagic.h for full details) +# - Kernel release +# - SMP is CONFIG_SMP +# - PREEMPT is CONFIG_PREEMPT +# - GCC Version +# Module info +# - Module version (MODULE_VERSION) +# - Module aliases (MODULE_ALIAS) +# - Module license (MODULE_LICENSE) +# - See include/linux/module.h for more details + +# Step 4 is solely used to allow module versioning in external modules, +# where the CRC of each module is retrieved from the Module.symvers file. -.PHONY: __modversions -__modversions: +.PHONY: _modpost +_modpost: __modpost include .config include scripts/Makefile.lib -# +symverfile := $(objtree)/Module.symvers +# Step 1), find all modules listed in $(MODVERDIR)/ __modules := $(shell head -q -n1 /dev/null $(wildcard $(MODVERDIR)/*.mod)) -modules := $(patsubst %.o,%.ko,$(wildcard $(__modules:.ko=.o))) +modules := $(patsubst %.o,%.ko, $(wildcard $(__modules:.ko=.o))) -__modversions: $(modules) - @: +_modpost: $(modules) -# The final module link -quiet_cmd_ld_ko_o = LD [M] $@ - cmd_ld_ko_o = $(LD) $(LDFLAGS) $(LDFLAGS_MODULE) -o $@ \ - $(filter-out FORCE,$^) +# Step 2), invoke modpost +# Includes steps 3 and 4 quiet_cmd_modpost = MODPOST + cmd_modpost = scripts/modpost \ + $(if $(KBUILD_EXTMOD),-i,-o) $(symverfile) \ + $(filter-out FORCE,$^) -$(modules): %.ko :%.o %.mod.o FORCE - $(call if_changed,ld_ko_o) +.PHONY: __modpost +__modpost: $(wildcard vmlinux) $(modules:.ko=.o) FORCE + $(call cmd,modpost) + +# Declare generated files as targets for modpost +$(symverfile): __modpost ; +$(modules:.ko=.mod.c): __modpost ; -targets += $(modules) -# Compile version info for unresolved symbols +# Step 5), compile all *.mod.c files +# modname is set to make c_flags define KBUILD_MODNAME modname = $(*F) quiet_cmd_cc_o_c = CC $@ @@ -40,23 +77,16 @@ $(modules:.ko=.mod.o): %.mod.o: %.mod.c targets += $(modules:.ko=.mod.o) -# All the .mod.c files are generated using the helper "modpost" - -.PHONY: __modpost - -$(modules:.ko=.mod.c): __modpost ; - -# Extract all checksums for all exported symbols +# Step 6), final link of the modules quiet_cmd_ld_ko_o = LD [M] $@ + cmd_ld_ko_o = $(LD) $(LDFLAGS) $(LDFLAGS_MODULE) -o $@ \ + $(filter-out FORCE,$^) -quiet_cmd_modpost = MODPOST - cmd_modpost = scripts/modpost \ - $(if $(filter vmlinux,$^),-o,-i) $(objtree)/Module.symvers \ - $(filter-out FORCE,$^) +$(modules): %.ko :%.o %.mod.o FORCE + $(call if_changed,ld_ko_o) -__modpost: $(if $(KBUILD_EXTMOD),,$(wildcard vmlinux)) $(modules:.ko=.o) FORCE - $(call if_changed,modpost) +targets += $(modules) -targets += __modpost # Add FORCE to the prerequisites of a target to force it to be always rebuilt. 
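
# (Illustration only, beyond what the comments above state: an external
# module build that consumes Module.symvers is typically driven by
#
#	make -C <path-to-kernel-tree> SUBDIRS=$PWD modules
#
# so step 4 is what lets such a build pick up CRCs for symbols exported
# by the kernel tree it was prepared against.)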
# --------------------------------------------------------------------------- --- diff/scripts/modpost.c 2004-04-21 10:45:36.117212848 +0100 +++ source/scripts/modpost.c 2004-04-21 10:46:51.825703408 +0100 @@ -625,10 +625,9 @@ read_dump(const char *fname) void *file = grab_file(fname, &size); char *line; - if (!file) { - perror(fname); - abort(); - } + if (!file) + /* No symbol versions, silently ignore */ + return; while ((line = get_next_line(&pos, file, size))) { char *symname, *modname, *d; --- diff/security/capability.c 2004-02-09 10:36:12.000000000 +0000 +++ source/security/capability.c 2004-04-21 10:46:51.825703408 +0100 @@ -35,7 +35,7 @@ static struct security_operations capabi .netlink_send = cap_netlink_send, .netlink_recv = cap_netlink_recv, - .bprm_compute_creds = cap_bprm_compute_creds, + .bprm_apply_creds = cap_bprm_apply_creds, .bprm_set_security = cap_bprm_set_security, .bprm_secureexec = cap_bprm_secureexec, --- diff/security/commoncap.c 2004-04-05 12:57:08.000000000 +0100 +++ source/security/commoncap.c 2004-04-21 10:46:51.825703408 +0100 @@ -115,13 +115,15 @@ int cap_bprm_set_security (struct linux_ return 0; } -/* Copied from fs/exec.c */ static inline int must_not_trace_exec (struct task_struct *p) { - return (p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP); + return ((p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP)) + || atomic_read(&p->fs->count) > 1 + || atomic_read(&p->files->count) > 1 + || atomic_read(&p->sighand->count) > 1; } -void cap_bprm_compute_creds (struct linux_binprm *bprm) +void cap_bprm_apply_creds (struct linux_binprm *bprm) { /* Derived from fs/exec.c:compute_creds. */ kernel_cap_t new_permitted, working; @@ -132,18 +134,26 @@ void cap_bprm_compute_creds (struct linu new_permitted = cap_combine (new_permitted, working); task_lock(current); + + if (bprm->e_uid != current->uid || bprm->e_gid != current->gid) { + current->mm->dumpable = 0; + + if (must_not_trace_exec(current) && !capable(CAP_SETUID)) { + bprm->e_uid = current->uid; + bprm->e_gid = current->gid; + } + } + + current->suid = current->euid = current->fsuid = bprm->e_uid; + current->sgid = current->egid = current->fsgid = bprm->e_gid; + if (!cap_issubset (new_permitted, current->cap_permitted)) { current->mm->dumpable = 0; - if (must_not_trace_exec (current) - || atomic_read (¤t->fs->count) > 1 - || atomic_read (¤t->files->count) > 1 - || atomic_read (¤t->sighand->count) > 1) { - if (!capable (CAP_SETPCAP)) { - new_permitted = cap_intersect (new_permitted, - current-> - cap_permitted); - } + if (must_not_trace_exec (current) && !capable (CAP_SETPCAP)) { + new_permitted = cap_intersect (new_permitted, + current-> + cap_permitted); } } @@ -315,7 +325,7 @@ int cap_vm_enough_memory(long pages) vm_acct_memory(pages); - /* + /* * Sometimes we want to use more memory than we have */ if (sysctl_overcommit_memory == 1) @@ -377,7 +387,7 @@ EXPORT_SYMBOL(cap_capget); EXPORT_SYMBOL(cap_capset_check); EXPORT_SYMBOL(cap_capset_set); EXPORT_SYMBOL(cap_bprm_set_security); -EXPORT_SYMBOL(cap_bprm_compute_creds); +EXPORT_SYMBOL(cap_bprm_apply_creds); EXPORT_SYMBOL(cap_bprm_secureexec); EXPORT_SYMBOL(cap_inode_setxattr); EXPORT_SYMBOL(cap_inode_removexattr); --- diff/security/dummy.c 2004-04-05 12:57:08.000000000 +0100 +++ source/security/dummy.c 2004-04-21 10:46:51.826703256 +0100 @@ -26,6 +26,8 @@ #include #include #include +#include +#include static int dummy_ptrace (struct task_struct *parent, struct task_struct *child) { @@ -116,7 +118,7 @@ static int dummy_vm_enough_memory(long p 
vm_acct_memory(pages); - /* + /* * Sometimes we want to use more memory than we have */ if (sysctl_overcommit_memory == 1) @@ -169,9 +171,30 @@ static void dummy_bprm_free_security (st return; } -static void dummy_bprm_compute_creds (struct linux_binprm *bprm) +static inline int must_not_trace_exec (struct task_struct *p) { - return; + return ((p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP)) + || atomic_read(&p->fs->count) > 1 + || atomic_read(&p->files->count) > 1 + || atomic_read(&p->sighand->count) > 1; +} + +static void dummy_bprm_apply_creds (struct linux_binprm *bprm) +{ + task_lock(current); + if (bprm->e_uid != current->uid || bprm->e_gid != current->gid) { + current->mm->dumpable = 0; + + if (must_not_trace_exec(current) && !capable(CAP_SETUID)) { + bprm->e_uid = current->uid; + bprm->e_gid = current->gid; + } + } + + current->suid = current->euid = current->fsuid = bprm->e_uid; + current->sgid = current->egid = current->fsgid = bprm->e_gid; + + task_unlock(current); } static int dummy_bprm_set_security (struct linux_binprm *bprm) @@ -887,7 +910,7 @@ void security_fixup_ops (struct security set_to_dummy_if_null(ops, vm_enough_memory); set_to_dummy_if_null(ops, bprm_alloc_security); set_to_dummy_if_null(ops, bprm_free_security); - set_to_dummy_if_null(ops, bprm_compute_creds); + set_to_dummy_if_null(ops, bprm_apply_creds); set_to_dummy_if_null(ops, bprm_set_security); set_to_dummy_if_null(ops, bprm_check_security); set_to_dummy_if_null(ops, bprm_secureexec); --- diff/security/root_plug.c 2003-06-30 10:07:24.000000000 +0100 +++ source/security/root_plug.c 2004-04-21 10:46:51.826703256 +0100 @@ -90,7 +90,7 @@ static struct security_operations rootpl .capset_set = cap_capset_set, .capable = cap_capable, - .bprm_compute_creds = cap_bprm_compute_creds, + .bprm_apply_creds = cap_bprm_apply_creds, .bprm_set_security = cap_bprm_set_security, .task_post_setuid = cap_task_post_setuid, --- diff/security/selinux/Kconfig 2004-04-21 10:45:36.118212696 +0100 +++ source/security/selinux/Kconfig 2004-04-21 10:46:51.828702952 +0100 @@ -24,6 +24,21 @@ config SECURITY_SELINUX_BOOTPARAM If you are unsure how to answer this question, answer N. +config SECURITY_SELINUX_DISABLE + bool "NSA SELinux runtime disable" + depends on SECURITY_SELINUX + default n + help + This option enables writing to a selinuxfs node 'disable', which + allows SELinux to be disabled at runtime prior to the policy load. + SELinux will then remain disabled until the next boot. + This option is similar to the selinux=0 boot parameter, but is to + support runtime disabling of SELinux, e.g. from /sbin/init, for + portability across platforms where boot parameters are difficult + to employ. + + If you are unsure how to answer this question, answer N. 
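To illustrate the intended use from early userspace, a minimal sketch follows. The assumptions here are not in the patch itself: selinuxfs is mounted at /selinux, the caller is root, and no policy has been loaded yet (after the initial policy load the write fails and SELinux stays enabled):

    /* disable_selinux.c - sketch; writes "1" to the selinuxfs
     * 'disable' node, which triggers selinux_disable(). */
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/selinux/disable", O_WRONLY);

            if (fd < 0)
                    return 1;   /* selinuxfs not mounted, or node not configured */
            if (write(fd, "1", 1) != 1) {   /* any non-zero value disables */
                    close(fd);
                    return 1;
            }
            close(fd);
            return 0;
    }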
+ config SECURITY_SELINUX_DEVELOP bool "NSA SELinux Development Support" depends on SECURITY_SELINUX --- diff/security/selinux/hooks.c 2004-04-21 10:45:36.122212088 +0100 +++ source/security/selinux/hooks.c 2004-04-21 10:46:51.828702952 +0100 @@ -753,6 +753,7 @@ static int inode_doinit_with_dentry(stru inode->i_ino); goto out; } + BUG_ON(inode != dentry->d_inode); len = INITCONTEXTLEN; context = kmalloc(len, GFP_KERNEL); @@ -1745,7 +1746,7 @@ static inline void flush_unauthorized_fi spin_unlock(&files->file_lock); } -static void selinux_bprm_compute_creds(struct linux_binprm *bprm) +static void selinux_bprm_apply_creds(struct linux_binprm *bprm) { struct task_security_struct *tsec, *psec; struct bprm_security_struct *bsec; @@ -1755,7 +1756,7 @@ static void selinux_bprm_compute_creds(s struct rlimit *rlim, *initrlim; int rc, i; - secondary_ops->bprm_compute_creds(bprm); + secondary_ops->bprm_apply_creds(bprm); tsec = current->security; @@ -2560,7 +2561,7 @@ static int selinux_task_setrlimit(unsign /* Control the ability to change the hard limit (whether lowering or raising it), so that the hard limit can later be used as a safe reset point for the soft limit - upon context transitions. See selinux_bprm_compute_creds. */ + upon context transitions. See selinux_bprm_apply_creds. */ if (old_rlim->rlim_max != new_rlim->rlim_max) return task_has_perm(current, current, PROCESS__SETRLIMIT); @@ -3971,7 +3972,7 @@ struct security_operations selinux_ops = .bprm_alloc_security = selinux_bprm_alloc_security, .bprm_free_security = selinux_bprm_free_security, - .bprm_compute_creds = selinux_bprm_compute_creds, + .bprm_apply_creds = selinux_bprm_apply_creds, .bprm_set_security = selinux_bprm_set_security, .bprm_check_security = selinux_bprm_check_security, .bprm_secureexec = selinux_bprm_secureexec, @@ -4216,4 +4217,57 @@ out: __initcall(selinux_nf_ip_init); +#ifdef CONFIG_SECURITY_SELINUX_DISABLE +static void selinux_nf_ip_exit(void) +{ + printk(KERN_INFO "SELinux: Unregistering netfilter hooks\n"); + + nf_unregister_hook(&selinux_ipv4_op); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + nf_unregister_hook(&selinux_ipv6_op); +#endif /* IPV6 */ +} +#endif + +#else /* CONFIG_SECURITY_NETWORK && CONFIG_NETFILTER */ + +#ifdef CONFIG_SECURITY_SELINUX_DISABLE +#define selinux_nf_ip_exit() +#endif + #endif /* CONFIG_SECURITY_NETWORK && CONFIG_NETFILTER */ + +#ifdef CONFIG_SECURITY_SELINUX_DISABLE +int selinux_disable(void) +{ + extern void exit_sel_fs(void); + static int selinux_disabled = 0; + + if (ss_initialized) { + /* Not permitted after initial policy load. */ + return -EINVAL; + } + + if (selinux_disabled) { + /* Only do this once. */ + return -EINVAL; + } + + printk(KERN_INFO "SELinux: Disabled at runtime.\n"); + + selinux_disabled = 1; + + /* Reset security_ops to the secondary module, dummy or capability. */ + security_ops = secondary_ops; + + /* Unregister netfilter hooks. */ + selinux_nf_ip_exit(); + + /* Unregister selinuxfs. 
*/ + exit_sel_fs(); + + return 0; +} +#endif + + --- diff/security/selinux/selinuxfs.c 2004-04-21 10:45:36.123211936 +0100 +++ source/security/selinux/selinuxfs.c 2004-04-21 10:46:51.829702800 +0100 @@ -46,6 +46,8 @@ int task_has_security(struct task_struct struct task_security_struct *tsec; tsec = tsk->security; + if (!tsec) + return -EACCES; return avc_has_perm(tsec->sid, SECINITSID_SECURITY, SECCLASS_SECURITY, perms, NULL, NULL); @@ -61,8 +63,9 @@ enum sel_inos { SEL_RELABEL, /* compute relabeling decision */ SEL_USER, /* compute reachable user contexts */ SEL_POLICYVERS, /* return policy version for this kernel */ - SEL_COMMIT_BOOLS, - SEL_MLS /* return if MLS policy is enabled */ + SEL_COMMIT_BOOLS, /* commit new boolean values */ + SEL_MLS, /* return if MLS policy is enabled */ + SEL_DISABLE /* disable SELinux until next reboot */ }; static ssize_t sel_read_enforce(struct file *filp, char *buf, @@ -151,6 +154,53 @@ static struct file_operations sel_enforc .write = sel_write_enforce, }; +#ifdef CONFIG_SECURITY_SELINUX_DISABLE +static ssize_t sel_write_disable(struct file * file, const char * buf, + size_t count, loff_t *ppos) + +{ + char *page; + ssize_t length; + int new_value; + extern int selinux_disable(void); + + if (count < 0 || count >= PAGE_SIZE) + return -ENOMEM; + if (*ppos != 0) { + /* No partial writes. */ + return -EINVAL; + } + page = (char*)__get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + memset(page, 0, PAGE_SIZE); + length = -EFAULT; + if (copy_from_user(page, buf, count)) + goto out; + + length = -EINVAL; + if (sscanf(page, "%d", &new_value) != 1) + goto out; + + if (new_value) { + length = selinux_disable(); + if (length < 0) + goto out; + } + + length = count; +out: + free_page((unsigned long) page); + return length; +} +#else +#define sel_write_disable NULL +#endif + +static struct file_operations sel_disable_ops = { + .write = sel_write_disable, +}; + static ssize_t sel_read_policyvers(struct file *filp, char *buf, size_t count, loff_t *ppos) { @@ -1005,6 +1055,7 @@ static int sel_fill_super(struct super_b [SEL_POLICYVERS] = {"policyvers", &sel_policyvers_ops, S_IRUGO}, [SEL_COMMIT_BOOLS] = {"commit_pending_bools", &sel_commit_bools_ops, S_IWUSR}, [SEL_MLS] = {"mls", &sel_mls_ops, S_IRUGO}, + [SEL_DISABLE] = {"disable", &sel_disable_ops, S_IWUSR}, /* last one */ {""} }; ret = simple_fill_super(sb, SELINUX_MAGIC, selinux_files); @@ -1054,3 +1105,10 @@ static int __init init_sel_fs(void) } __initcall(init_sel_fs); + +#ifdef CONFIG_SECURITY_SELINUX_DISABLE +void exit_sel_fs(void) +{ + unregister_filesystem(&sel_fs_type); +} +#endif --- diff/security/selinux/ss/services.c 2004-04-21 10:45:36.125211632 +0100 +++ source/security/selinux/ss/services.c 2004-04-21 10:46:51.830702648 +0100 @@ -456,9 +456,7 @@ int security_context_to_sid(char *sconte goto out; } } - printk(KERN_ERR "security_context_to_sid: called before " - "initial load_policy on unknown context %s\n", scontext); - rc = -EINVAL; + *sid = SECINITSID_KERNEL; goto out; } *sid = SECSID_NULL; @@ -1343,8 +1341,6 @@ int security_get_user_sids(u32 fromsid, if (!ebitmap_get_bit(&role->types, j)) continue; usercon.type = j+1; - if (usercon.type == fromcon->type) - continue; mls_for_user_ranges(user,usercon) { rc = context_struct_compute_av(fromcon, &usercon, SECCLASS_PROCESS, --- diff/sound/oss/dmasound/dmasound_atari.c 2003-09-17 12:28:13.000000000 +0100 +++ source/sound/oss/dmasound/dmasound_atari.c 2004-04-21 10:46:51.831702496 +0100 @@ -22,7 +22,6 @@ #include #include -#include #include #include 
#include --- diff/sound/pci/au88x0/au8820.c 2004-04-05 12:57:08.000000000 +0100 +++ source/sound/pci/au88x0/au8820.c 2004-04-21 10:46:51.831702496 +0100 @@ -1,7 +1,7 @@ #include "au8820.h" #include "au88x0.h" static struct pci_device_id snd_vortex_ids[] = { - {PCI_VENDOR_ID_AUREAL, PCI_DEVICE_ID_AUREAL_VORTEX, + {PCI_VENDOR_ID_AUREAL, PCI_DEVICE_ID_AUREAL_VORTEX_1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0,}, {0,} }; --- diff/sound/pci/au88x0/au8830.c 2004-04-05 12:57:08.000000000 +0100 +++ source/sound/pci/au88x0/au8830.c 2004-04-21 10:46:51.832702344 +0100 @@ -1,7 +1,7 @@ #include "au8830.h" #include "au88x0.h" static struct pci_device_id snd_vortex_ids[] = { - {PCI_VENDOR_ID_AUREAL, PCI_DEVICE_ID_AUREAL_VORTEX2, + {PCI_VENDOR_ID_AUREAL, PCI_DEVICE_ID_AUREAL_VORTEX_2, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0,}, {0,} }; --- diff/sound/pci/au88x0/au88x0.h 2004-04-05 12:57:08.000000000 +0100 +++ source/sound/pci/au88x0/au88x0.h 2004-04-21 10:46:51.832702344 +0100 @@ -80,8 +80,8 @@ #define VORTEX_IS_QUAD(x) ((x->codec == NULL) ? 0 : (x->codec->ext_id|0x80)) /* Check if chip has bug. */ #define IS_BAD_CHIP(x) (\ - (x->rev < 3 && x->device == PCI_DEVICE_ID_AUREAL_VORTEX) || \ - (x->rev < 0xfe && x->device == PCI_DEVICE_ID_AUREAL_VORTEX2) || \ + (x->rev < 3 && x->device == PCI_DEVICE_ID_AUREAL_VORTEX_1) || \ + (x->rev < 0xfe && x->device == PCI_DEVICE_ID_AUREAL_VORTEX_2) || \ (x->rev < 0xfe && x->device == PCI_DEVICE_ID_AUREAL_ADVANTAGE)) --- diff/sound/pci/intel8x0.c 2004-04-05 12:57:08.000000000 +0100 +++ source/sound/pci/intel8x0.c 2004-04-21 10:46:51.833702192 +0100 @@ -445,6 +445,7 @@ struct _snd_intel8x0 { #ifdef CONFIG_PM int in_suspend; + u32 pci_state[64 / sizeof(u32)]; #endif }; @@ -2223,11 +2224,13 @@ static int snd_intel8x0_suspend(struct p { intel8x0_t *chip = snd_magic_cast(intel8x0_t, pci_get_drvdata(dev), return -ENXIO); intel8x0_suspend(chip); + pci_save_state(dev, chip->pci_state); return 0; } static int snd_intel8x0_resume(struct pci_dev *dev) { intel8x0_t *chip = snd_magic_cast(intel8x0_t, pci_get_drvdata(dev), return -ENXIO); + pci_restore_state(dev, chip->pci_state); intel8x0_resume(chip); return 0; } --- diff/Documentation/i386/kgdb/andthen 1970-01-01 01:00:00.000000000 +0100 +++ source/Documentation/i386/kgdb/andthen 2004-04-21 10:46:51.330778648 +0100 @@ -0,0 +1,100 @@ + +define set_andthen + set var $thp=0 + set var $thp=(struct kgdb_and_then_struct *)&kgdb_data[0] + set var $at_size = (sizeof kgdb_data)/(sizeof *$thp) + set var $at_oc=kgdb_and_then_count + set var $at_cc=$at_oc +end + +define andthen_next + set var $at_cc=$arg0 +end + +define andthen + andthen_set_edge + if ($at_cc >= $at_oc) + printf "Outside window. Window size is %d\n",($at_oc-$at_low) + else + printf "%d: ",$at_cc + output *($thp+($at_cc++ % $at_size )) + printf "\n" + end +end +define andthen_set_edge + set var $at_oc=kgdb_and_then_count + set var $at_low = $at_oc - $at_size + if ($at_low < 0 ) + set var $at_low = 0 + end + if (( $at_cc > $at_oc) || ($at_cc < $at_low)) + printf "Count outside of window, setting count to " + if ($at_cc >= $at_oc) + set var $at_cc = $at_oc + else + set var $at_cc = $at_low + end + printf "%d\n",$at_cc + end +end + +define beforethat + andthen_set_edge + if ($at_cc <= $at_low) + printf "Outside window. Window size is %d\n",($at_oc-$at_low) + else + printf "%d: ",$at_cc-1 + output *($thp+(--$at_cc % $at_size )) + printf "\n" + end +end + +document andthen_next + andthen_next + . sets the number of the event to display next. If this event + . 
is not in the event pool, either andthen or beforethat will + . correct it to the nearest event pool edge. The event pool + . ends at the last event recorded and begins + . prior to that. If beforethat is used next, it will display + . event -1. +. + andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + + +document andthen + andthen +. displays the next event in the list. sets up to display +. the oldest saved event first. +. (optional) count of the event to display. +. note the number of events saved is specified at configure time. +. if events are saved between calls to andthen the index will change +. but the displayed event will be the next one (unless the event buffer +. is overrun). +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + +document set_andthen + set_andthen +. sets up to use the and commands. +. if you have defined your own struct, use the above and +. then enter the following: +. p $thp=(struct kgdb_and_then_structX *)&kgdb_data[0] +. where is the name of your structure. +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + +document beforethat + beforethat +. displays the next prior event in the list. sets up to +. display the last occuring event first. +. +. note the number of events saved is specified at configure time. +. if events are saved between calls to beforethat the index will change +. but the displayed event will be the next one (unless the event buffer +. is overrun). +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end --- diff/Documentation/i386/kgdb/debug-nmi.txt 1970-01-01 01:00:00.000000000 +0100 +++ source/Documentation/i386/kgdb/debug-nmi.txt 2004-04-21 10:46:51.330778648 +0100 @@ -0,0 +1,37 @@ +Subject: Debugging with NMI +Date: Mon, 12 Jul 1999 11:28:31 -0500 +From: David Grothe +Organization: Gcom, Inc +To: David Grothe + +Kernel hackers: + +Maybe this is old hat, but it is new to me -- + +On an ISA bus machine, if you short out the A1 and B1 pins of an ISA +slot you will generate an NMI to the CPU. This interrupts even a +machine that is hung in a loop with interrupts disabled. Used in +conjunction with kgdb < +ftp://ftp.gcom.com/pub/linux/src/kgdb-2.3.35/kgdb-2.3.35.tgz > you can +gain debugger control of a machine that is hung in the kernel! Even +without kgdb the kernel will print a stack trace so you can find out +where it was hung. + +The A1/B1 pins are directly opposite one another and the farthest pins +towards the bracket end of the ISA bus socket. You can stick a paper +clip or multi-meter probe between them to short them out. + +I had a spare ISA bus to PC104 bus adapter around. The PC104 end of the +board consists of two rows of wire wrap pins. So I wired a push button +between the A1/B1 pins and now have an ISA board that I can stick into +any ISA bus slot for debugger entry. + +Microsoft has a circuit diagram of a PCI card at +http://www.microsoft.com/hwdev/DEBUGGING/DMPSW.HTM. If you want to +build one you will have to mail them and ask for the PAL equations. +Nobody makes one comercially. + +[THIS TIP COMES WITH NO WARRANTY WHATSOEVER. It works for me, but if +your machine catches fire, it is your problem, not mine.] + +-- Dave (the kgdb guy) --- diff/Documentation/i386/kgdb/gdb-globals.txt 1970-01-01 01:00:00.000000000 +0100 +++ source/Documentation/i386/kgdb/gdb-globals.txt 2004-04-21 10:46:51.331778496 +0100 @@ -0,0 +1,71 @@ +Sender: akale@veritas.com +Date: Fri, 23 Jun 2000 19:26:35 +0530 +From: "Amit S. 
Kale" +Organization: Veritas Software (India) +To: Dave Grothe , linux-kernel@vger.rutgers.edu +CC: David Milburn , + "Edouard G. Parmelan" , + ezannoni@cygnus.com, Keith Owens +Subject: Re: Module debugging using kgdb + +Dave Grothe wrote: +> +> Amit: +> +> There is a 2.4.0 version of kgdb on our ftp site: +> ftp://ftp.gcom.com/pub/linux/src/kgdb. I mirrored your version of gdb +> and loadmodule.sh there. +> +> Have a look at the README file and see if I go it right. If not, send +> me some corrections and I will update it. +> +> Does your version of gdb solve the global variable problem? + +Yes. +Thanks to Elena Zanoni, gdb (developement version) can now calculate +correctly addresses of dynamically loaded object files. I have not been +following gdb developement for sometime and am not sure when symbol +address calculation fix is going to appear in a gdb stable version. + +Elena, any idea when the fix will make it to a prebuilt gdb from a +redhat release? + +For the time being I have built a gdb developement version. It can be +used for module debugging with loadmodule.sh script. + +The problem with calculating of module addresses with previous versions +of gdb was as follows: +gdb did not use base address of a section while calculating address of +a symbol in the section in an object file loaded via 'add-symbol-file'. +It used address of .text segment instead. Due to this addresses of +symbols in .data, .bss etc. (e.g. global variables) were calculated incorrectly. + +Above mentioned fix allow gdb to use base address of a segment while +calculating address of a symbol in it. It adds a parameter '-s' to +'add-symbol-file' command for specifying base address of a segment. + +loadmodule.sh script works as follows. + +1. Copy a module file to target machine. +2. Load the module on the target machine using insmod with -m parameter. +insmod produces a module load map which contains base addresses of all +sections in the module and addresses of symbols in the module file. +3. Find all sections and their base addresses in the module from +the module map. +4. Generate a script that loads the module file. The script uses +'add-symbol-file' and specifies address of text segment followed by +addresses of all segments in the module. + +Here is an example gdb script produced by loadmodule.sh script. + +add-symbol-file foo 0xd082c060 -s .text.lock 0xd08cbfb5 +-s .fixup 0xd08cfbdf -s .rodata 0xd08cfde0 -s __ex_table 0xd08e3b38 +-s .data 0xd08e3d00 -s .bss 0xd08ec8c0 -s __ksymtab 0xd08ee838 + +With this command gdb can calculate addresses of symbols in ANY segment +in a module file. + +Regards. 
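To see why per-section base addresses matter, consider a hypothetical module (all names invented for illustration) with data outside .text:

    /* sections.c - illustration only */
    #include <linux/init.h>
    #include <linux/module.h>

    MODULE_LICENSE("GPL");

    int counter = 42;          /* initialized data: placed in .data */
    char scratch[4096];        /* zero-filled data: placed in .bss  */

    static int __init sections_init(void)
    {
            return 0;
    }

    static void __exit sections_exit(void)
    {
    }

    module_init(sections_init);
    module_exit(sections_exit);

With an add-symbol-file command that only knows the .text address, gdb would compute &counter and scratch relative to .text and print garbage; with per-section -s entries like the generated script above, both resolve to their real addresses.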
+-- +Amit Kale +Veritas Software ( http://www.veritas.com ) --- diff/Documentation/i386/kgdb/gdbinit 1970-01-01 01:00:00.000000000 +0100 +++ source/Documentation/i386/kgdb/gdbinit 2004-04-21 10:46:51.331778496 +0100 @@ -0,0 +1,14 @@ +shell echo -e "\003" >/dev/ttyS0 +set remotebaud 38400 +target remote /dev/ttyS0 +define si +stepi +printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx +printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp +x/i $eip +end +define ni +nexti +printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx +printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp +x/i $eip --- diff/Documentation/i386/kgdb/gdbinit-modules 1970-01-01 01:00:00.000000000 +0100 +++ source/Documentation/i386/kgdb/gdbinit-modules 2004-04-21 10:46:51.332778344 +0100 @@ -0,0 +1,146 @@ +# +# Useful GDB user-commands to debug Linux kernel modules with gdbstub. +# +# This doesn't work for Linux-2.0 or older. +# +# Author Edouard G. Parmelan +# +# +# Fri Apr 30 20:33:29 CEST 1999 +# First public release. +# +# Major cleanup after experimenting with a Linux-2.0 kernel without success. +# Symbols of a module are not in the correct order, I can't explain +# why :( +# +# Fri Mar 19 15:41:40 CET 1999 +# Initial version. +# +# Thu Jan 6 16:29:03 CST 2000 +# A little fixing by Dave Grothe +# +# Mon Jun 19 09:33:13 CDT 2000 +# Alignment changes from Edouard Parmelan +# +# The basic idea is to find where insmod loads the module and inform +# GDB to load the symbol table of the module with the GDB command +# ``add-symbol-file <object-file> <address>
''. +# +# The Linux kernel holds the list of all loaded modules in module_list, +# this list ends with &kernel_module (exactly with module->next == NULL, +# but the last module is not a real module). +# +# Insmod allocates the struct module before the object file. Since +# Linux-2.1, this structure contains its size. The real address of +# the object file is then (char*)module + module->size_of_struct. +# +# You can use three user functions ``mod-list'', ``mod-print-symbols'' +# and ``mod-add-symbols''. +# +# mod-list lists all loaded modules with the format: +# <module-address> <module-name> +# +# As soon as you have found the address of your module, you can +# print its exported symbols (mod-print-symbols) or inform GDB to add +# symbols from your module file (mod-add-symbols). +# +# The argument that you give to mod-print-symbols or mod-add-symbols +# is the <module-address> from the mod-list command. +# +# When using the mod-add-symbols command you must also give the full +# pathname of the modules object code file. +# +# The command mod-add-lis is an example of how to make this easier. +# You can edit this macro to contain the path name of your own +# favorite module and then use it as a shorthand to load it. You +# still need the module-address, however. +# +# The internal function ``mod-validate'' sets the GDB variable $mod +# as a ``struct module*'' if the kernel knows the module, otherwise +# $mod is set to NULL. This ensures that symbols are not added for a wrong +# address. +# +# Have a nice hacking day ! +# +# +define mod-list + set $mod = (struct module*)module_list + # the last module is the kernel, ignore it + while $mod != &kernel_module + printf "%p\t%s\n", (long)$mod, ($mod)->name + set $mod = $mod->next + end +end +document mod-list +List all modules in the form: <module-address> <module-name> +Use the <module-address> as the argument for the other +mod-commands: mod-print-symbols, mod-add-symbols. +end + +define mod-validate + set $mod = (struct module*)module_list + while ($mod != $arg0) && ($mod != &kernel_module) + set $mod = $mod->next + end + if $mod == &kernel_module + set $mod = 0 + printf "%p is not a module\n", $arg0 + end +end +document mod-validate +mod-validate <module-address> +Internal user-command used to validate the module parameter. +If <module-address> is a real loaded module, set $mod to it otherwise set $mod to 0. +end + +define mod-print-symbols + mod-validate $arg0 + if $mod != 0 + set $i = 0 + while $i < $mod->nsyms + set $sym = $mod->syms[$i] + printf "%p\t%s\n", $sym->value, $sym->name + set $i = $i + 1 + end + end +end +document mod-print-symbols +mod-print-symbols <module-address> +Print all exported symbols of the module. see mod-list +end + +define mod-add-symbols-align + mod-validate $arg0 + if $mod != 0 + set $mod_base = ($mod->size_of_struct + (long)$mod) + if ($arg2 != 0) && (($mod_base & ($arg2 - 1)) != 0) + set $mod_base = ($mod_base | ($arg2 - 1)) + 1 + end + add-symbol-file $arg1 $mod_base + end +end +document mod-add-symbols-align +mod-add-symbols-align <module-address> <object-file> <align> +Load the symbols table of the module from the object file where +first section alignment is <align>. +To retrieve alignment, use `objdump -h <object-file>'. +end + +define mod-add-symbols + mod-add-symbols-align $arg0 $arg1 sizeof(long) +end +document mod-add-symbols +mod-add-symbols <module-address> <object-file> +Load the symbols table of the module from the object file. +Default alignment is 4. See mod-add-symbols-align.
+end + +define mod-add-lis + mod-add-symbols-align $arg0 /usr/src/LiS/streams.o 16 +end +document mod-add-lis +mod-add-lis +Does mod-add-symbols /usr/src/LiS/streams.o +end --- diff/Documentation/i386/kgdb/gdbinit.hw 1970-01-01 01:00:00.000000000 +0100 +++ source/Documentation/i386/kgdb/gdbinit.hw 2004-04-21 10:46:51.331778496 +0100 @@ -0,0 +1,117 @@ + +#Using ia-32 hardware breakpoints. +# +#4 hardware breakpoints are available in ia-32 processors. These breakpoints +#do not need code modification. They are set using debug registers. +# +#Each hardware breakpoint can be of one of the +#three types: execution, write, access. +#1. An Execution breakpoint is triggered when code at the breakpoint address is +#executed. +#2. A write breakpoint ( aka watchpoints ) is triggered when memory location +#at the breakpoint address is written. +#3. An access breakpoint is triggered when memory location at the breakpoint +#address is either read or written. +# +#As hardware breakpoints are available in limited number, use software +#breakpoints ( br command in gdb ) instead of execution hardware breakpoints. +# +#Length of an access or a write breakpoint defines length of the datatype to +#be watched. Length is 1 for char, 2 short , 3 int. +# +#For placing execution, write and access breakpoints, use commands +#hwebrk, hwwbrk, hwabrk +#To remove a breakpoint use hwrmbrk command. +# +#These commands take following types of arguments. For arguments associated +#with each command, use help command. +#1. breakpointno: 0 to 3 +#2. length: 1 to 3 +#3. address: Memory location in hex ( without 0x ) e.g c015e9bc +# +#Use the command exinfo to find which hardware breakpoint occured. + +#hwebrk breakpointno address +define hwebrk + maintenance packet Y$arg0,0,0,$arg1 +end +document hwebrk + hwebrk
<breakpointno> <address> + Places a hardware execution breakpoint + <breakpointno> = 0 - 3 + <address>
= Hex digits without leading "0x". +end + +#hwwbrk breakpointno length address +define hwwbrk + maintenance packet Y$arg0,1,$arg1,$arg2 +end +document hwwbrk + hwwbrk <breakpointno> <length> <address>
+ Places a hardware write breakpoint + <breakpointno> = 0 - 3 + <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte) + <address>
= Hex digits without leading "0x". +end + +#hwabrk breakpointno length address +define hwabrk + maintenance packet Y$arg0,1,$arg1,$arg2 +end +document hwabrk + hwabrk <breakpointno> <length> <address>
+ Places a hardware access breakpoint + <breakpointno> = 0 - 3 + <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte) + <address>
= Hex digits without leading "0x". +end + +#hwrmbrk breakpointno +define hwrmbrk + maintenance packet y$arg0 +end +document hwrmbrk + hwrmbrk + = 0 - 3 + Removes a hardware breakpoint +end + +define reboot + maintenance packet r +end +#exinfo +define exinfo + maintenance packet qE +end +document exinfo + exinfo + Gives information about a breakpoint. +end +define get_th + p $th=(struct thread_info *)((int)$esp & ~8191) +end +document get_th + get_tu + Gets and prints the current thread_info pointer, Defines th to be it. +end +define get_cu + p $cu=((struct thread_info *)((int)$esp & ~8191))->task +end +document get_cu + get_cu + Gets and print the "current" value. Defines $cu to be it. +end +define int_off + set var $flags=$eflags + set $eflags=$eflags&~0x200 + end +define int_on + set var $eflags|=$flags&0x200 + end +document int_off + saves the current interrupt state and clears the processor interrupt + flag. Use int_on to restore the saved flag. +end +document int_on + Restores the interrupt flag saved by int_off. +end --- diff/Documentation/i386/kgdb/kgdb.txt 1970-01-01 01:00:00.000000000 +0100 +++ source/Documentation/i386/kgdb/kgdb.txt 2004-04-21 10:46:51.334778040 +0100 @@ -0,0 +1,775 @@ +Last edit: <20030806.1637.12> +This file has information specific to the i386 kgdb option. Other +platforms with the kgdb option may behave in a similar fashion. + +New features: +============ +20030806.1557.37 +This version was made against the 2.6.0-test2 kernel. We have made the +following changes: + +- The getthread() code in the stub calls find_task_by_pid(). It fails + if we are early in the bring up such that the pid arrays have yet to + be allocated. We have added a line to kernel/pid.c to make + "kgdb_pid_init_done" true once the arrays are allocated. This way the + getthread() code knows not to call. This is only used by the thread + debugging stuff and threads will not yet exist at this point in the + boot. + +- For some reason, gdb was not asking for a new thread list when the + "info thread" command was given. We changed to the newer version of + the thread info command and gdb now seems to ask when needed. Result, + we now get all threads in the thread list. + +- We now respond to the ThreadExtraInfo request from gdb with the thread + name from task_struct .comm. This then appears in the thread list. + Thoughts on additional options for this are welcome. Things such as + "has BKL" and "Preempted" come to mind. I think we could have a flag + word that could enable different bits of info here. + +- We now honor, sort of, the C and S commands. These are continue and + single set after delivering a signal. We ignore the signal and do the + requested action. This only happens when we told gdb that a signal + was the reason for entry, which is only done on memory faults. The + result is that you can now continue into the Oops. + +- We changed the -g to -gdwarf-2. This seems to be the same as -ggdb, + but it is more exact on what language to use. + +- We added two dwarf2 include files and a bit of code at the end of + entry.S. This does not yet work, so it is disabled. Still we want to + keep track of the code and "maybe" someone out there can fix it. + +- Randy Dunlap sent some fix ups for this file which are now merged. + +- Hugh Dickins sent a fix to a bit of code in traps.c that prevents a + compiler warning if CONFIG_KGDB is off (now who would do that :). + +- Andrew Morton sent a fix for the serial driver which is now merged. 
+ +- Andrew also sent a change to the stub around the cpu managment code + which is also merged. + +- Andrew also sent a patch to make "f" as well as "g" work as SysRq + commands to enter kgdb, merged. + +- If CONFIG_KGDB and CONFIG_DEBUG_SPINLOCKS are both set we added a + "who" field to the spinlock data struct. This is filled with + "current" when ever the spinlock suceeds. Useful if you want to know + who has the lock. + +_ And last, but not least, we fixed the "get_cu" macro to properly get + the current value of "current". + +New features: +============ +20030505.1827.27 +We are starting to align with the sourceforge version, at least in +commands. To this end, the boot command string to start kgdb at +boot time has been changed from "kgdb" to "gdb". + +Andrew Morton sent a couple of patches which are now included as follows: +1.) We now return a flag to the interrupt handler. +2.) We no longer use smp_num_cpus (a conflict with the lock meter). +3.) And from William Lee Irwin III code to make + sure high-mem is set up before we attempt to register our interrupt + handler. +We now include asm/kgdb.h from config.h so you will most likely never +have to include it. It also 'NULLS' the kgdb macros you might have in +your code when CONFIG_KGDB is not defined. This allows you to just +turn off CONFIG_KGDB to turn off all the kgdb_ts() calls and such. +This include is conditioned on the machine being an x86 so as to not +mess with other archs. + +20020801.1129.03 +This is currently the version for the 2.4.18 (and beyond?) kernel. + +We have several new "features" beginning with this version: + +1.) Kgdb now syncs the "other" CPUs with a cross-CPU NMI. No more + waiting and it will pull that guy out of an IRQ off spin lock :) + +2.) We doctored up the code that tells where a task is waiting and + included it so that the "info thread" command will show a bit more + than "schedule()". Try it... + +3.) Added the ability to call a function from gdb. All the standard gdb + issues apply, i.e. if you hit a breakpoint in the function, you are + not allowed to call another (gdb limitation, not kgdb). To help + this capability we added a memory allocation function. Gdb does not + return this memory (it is used for strings that you pass to that function + you are calling from gdb) so we fixed up a way to allow you to + manually return the memory (see below). + +4.) Kgdb time stamps (kgdb_ts()) are enhanced to expand what was the + interrupt flag to now also include the preemption count and the + "in_interrupt" info. The flag is now called "with_pif" to indicate + the order, preempt_count, in_interrupt, flag. The preempt_count is + shifted left by 4 bits so you can read the count in hex by dropping + the low order digit. In_interrupt is in bit 1, and the flag is in + bit 0. + +5.) The command: "p kgdb_info" is now expanded and prints something + like: +(gdb) p kgdb_info +$2 = {used_malloc = 0, called_from = 0xc0107506, entry_tsc = 67468627259, + errcode = 0, vector = 3, print_debug_info = 0, hold_on_sstep = 1, + cpus_waiting = {{task = 0xc027a000, pid = 32768, hold = 0, + regs = 0xc027bf84}, {task = 0x0, pid = 0, hold = 0, regs = 0x0}}} + + Things to note here: a.) used_malloc is the amount of memory that + has been malloc'ed to do calls from gdb. You can reclaim this + memory like this: "p kgdb_info.used_malloc=0" Cool, huh? b.) + cpus_waiting is now "sized" by the number of CPUs you enter at + configure time in the kgdb configure section. 
This is NOT used + anywhere else in the system, but it is "nice" here. c.) The task's + "pid" is now in the structure. This is the pid you will need to use + to decode to the thread id to get gdb to look at that thread. + Remember that the "info thread" command prints a list of threads + wherein it numbers each thread with its reference number followed + by the thread's pid. Note that the per-CPU idle threads actually + have pids of 0 (yes, there is more than one pid 0 in an SMP system). + To avoid confusion, kgdb numbers these threads with numbers beyond + the MAX_PID. That is why you see 32768 and above. + +6.) A subtle change, we now provide the complete register set for tasks + that are active on the other CPUs. This allows better trace back on + those tasks. + + And, let's mention what we could not fix. Back-trace from all but the + thread that we trapped will, most likely, have a bogus entry in it. + The problem is that gdb does not recognize the entry code for + functions that use "current" near (at all?) the entry. The compiler + is putting the "current" decode as the first two instructions of the + function where gdb expects to find %ebp changing code. Back trace + also has trouble with interrupt frames. I am talking with Daniel + Jacobowitz about some way to fix this, but don't hold your breath. + +20011220.0050.35 +Major enhancement with this version is the ability to hold one or more +CPUs in an SMP system while allowing the others to continue. Also, by +default only the current CPU is enabled on single-step commands (please +note that gdb issues single-step commands at times other than when you +use the si command). + +Another change is to collect some useful information in +a global structure called "kgdb_info". You should be able to just: + +p kgdb_info + +although I have seen cases where the first time this is done gdb just +prints the first member but prints the whole structure if you then enter +CR (carriage return or enter). This also works: + +p *&kgdb_info + +Here is a sample: +(gdb) p kgdb_info +$4 = {called_from = 0xc010732c, entry_tsc = 32804123790856, errcode = 0, + vector = 3, print_debug_info = 0} + +"Called_from" is the return address from the current entry into kgdb. +Sometimes it is useful to know why you are in kgdb, for example, was +it an NMI or a real breakpoint? The simple way to interrogate this +return address is: + +l *0xc010732c + +which will print the surrounding few lines of source code. + +"Entry_tsc" is the CPU TSC on entry to kgdb (useful to compare to the +kgdb_ts entries). + +"errcode" and "vector" are other entry parameters which may be helpful on +some traps. + +"print_debug_info" is the internal debugging kgdb print enable flag. Yes, +you can modify it. + +In SMP systems kgdb_info also includes the "cpus_waiting" structure and +"hold_on_step": + +(gdb) p kgdb_info +$7 = {called_from = 0xc0112739, entry_tsc = 1034936624074, errcode = 0, + vector = 2, print_debug_info = 0, hold_on_sstep = 1, cpus_waiting = {{ + task = 0x0, hold = 0, regs = 0x0}, {task = 0xc71b8000, hold = 0, + regs = 0xc71b9f70}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}}} + +"Cpus_waiting" has an entry for each CPU other than the current one that +has been stopped. Each entry contains the task_struct address for that +CPU, the address of the regs for that task and a hold flag. 
All these +have the proper typing so that, for example: + +p *kgdb_info.cpus_waiting[1].regs + +will print the registers for CPU 1. + +"Hold_on_sstep" is a new feature with this version and comes up set or +true. What this means is that whenever kgdb is asked to single-step all +other CPUs are held (i.e. not allowed to execute). The flag applies to +all but the current CPU and, again, can be changed: + +p kgdb_info.hold_on_sstep=0 + +restores the old behavior of letting all CPUs run during single-stepping. + +Likewise, each CPU has a "hold" flag, which if set, locks that CPU out +of execution. Note that this has some risk in cases where the CPUs need +to communicate with each other. If kgdb finds no CPU available on exit, +it will push a message thru gdb and stay in kgdb. Note that it is legal +to hold the current CPU as long as at least one CPU can execute. + +20010621.1117.09 +This version implements an event queue. Events are signaled by calling +a function in the kgdb stub and may be examined from gdb. See EVENTS +below for details. This version also tightens up the interrupt and SMP +handling to not allow interrupts on the way to kgdb from a breakpoint +trap. It is fine to allow these interrupts for user code, but not +system debugging. + +Version +======= + +This version of the kgdb package was developed and tested on +kernel version 2.4.16. It will not install on any earlier kernels. +It is possible that it will continue to work on later versions +of 2.4 and then versions of 2.5 (I hope). + + +Debugging Setup +=============== + +Designate one machine as the "development" machine. This is the +machine on which you run your compiles and which has your source +code for the kernel. Designate a second machine as the "target" +machine. This is the machine that will run your experimental +kernel. + +The two machines will be connected together via a serial line out +one or the other of the COM ports of the PC. You will need the +appropriate modem eliminator (null modem) cable(s) for this. + +Decide on which tty port you want the machines to communicate, then +connect them up back-to-back using the null modem cable. COM1 is +/dev/ttyS0 and COM2 is /dev/ttyS1. You should test this connection +with the two machines prior to trying to debug a kernel. Once you +have it working, on the TARGET machine, enter: + +setserial /dev/ttyS0 (or what ever tty you are using) + +and record the port address and the IRQ number. + +On the DEVELOPMENT machine you need to apply the patch for the kgdb +hooks. You have probably already done that if you are reading this +file. + +On your DEVELOPMENT machine, go to your kernel source directory and do +"make Xconfig" where X is one of "x", "menu", or "". If you are +configuring in the standard serial driver, it must not be a module. +Either yes or no is ok, but making the serial driver a module means it +will initialize after kgdb has set up the UART interrupt code and may +cause a failure of the control-C option discussed below. The configure +question for the serial driver is under the "Character devices" heading +and is: + +"Standard/generic (8250/16550 and compatible UARTs) serial support" + +Go down to the kernel debugging menu item and open it up. Enable the +kernel kgdb stub code by selecting that item. You can also choose to +turn on the "-ggdb -O1" compile options. The -ggdb causes the compiler +to put more debug info (like local symbols) in the object file. On the +i386 -g and -ggdb are the same so this option just reduces to "O1". 
The +-O1 reduces the optimization level. This may be helpful in some cases, +be aware, however, that this may also mask the problem you are looking +for. + +The baud rate. Default is 115200. What ever you choose be sure that +the host machine is set to the same speed. I recommend the default. + +The port. This is the I/O address of the serial UART that you should +have gotten using setserial as described above. The standard COM1 port +(3f8) using IRQ 4 is default. COM2 is 2f8 which by convention uses IRQ +3. + +The port IRQ (see above). + +Stack overflow test. This option makes a minor change in the trap, +system call and interrupt code to detect stack overflow and transfer +control to kgdb if it happens. (Some platforms have this in the +baseline code, but the i386 does not.) + +You can also configure the system to recognize the boot option +"console=kgdb" which if given will cause all console output during +booting to be put thru gdb as well as other consoles. This option +requires that gdb and kgdb be connected prior to sending console output +so, if they are not, a breakpoint is executed to force the connection. +This will happen before any kernel output (it is going thru gdb, right), +and will stall the boot until the connection is made. + +You can also configure in a patch to SysRq to enable the kGdb SysRq. +This request generates a breakpoint. Since the serial port IRQ line is +set up after any serial drivers, it is possible that this command will +work when the control-C will not. + +Save and exit the Xconfig program. Then do "make clean" , "make dep" +and "make bzImage" (or whatever target you want to make). This gets the +kernel compiled with the "-g" option set -- necessary for debugging. + +You have just built the kernel on your DEVELOPMENT machine that you +intend to run on your TARGET machine. + +To install this new kernel, use the following installation procedure. +Remember, you are on the DEVELOPMENT machine patching the kernel source +for the kernel that you intend to run on the TARGET machine. + +Copy this kernel to your target machine using your usual procedures. I +usually arrange to copy development: +/usr/src/linux/arch/i386/boot/bzImage to /vmlinuz on the TARGET machine +via a LAN based NFS access. That is, I run the cp command on the target +and copy from the development machine via the LAN. Run Lilo (see "man +lilo" for details on how to set this up) on the new kernel on the target +machine so that it will boot! Then boot the kernel on the target +machine. + +On the DEVELOPMENT machine, create a file called .gdbinit in the +directory /usr/src/linux. An example .gdbinit file looks like this: + +shell echo -e "\003" >/dev/ttyS0 +set remotebaud 38400 (or what ever speed you have chosen) +target remote /dev/ttyS0 + + +Change the "echo" and "target" definition so that it specifies the tty +port that you intend to use. Change the "remotebaud" definition to +match the data rate that you are going to use for the com line. + +You are now ready to try it out. + +Boot your target machine with "kgdb" in the boot command i.e. something +like: + +lilo> test kgdb + +or if you also want console output thru gdb: + +lilo> test kgdb console=kgdb + +You should see the lilo message saying it has loaded the kernel and then +all output stops. The kgdb stub is trying to connect with gdb. Start +gdb something like this: + + +On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux". 
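As an aside, the control-C that the .gdbinit "shell echo" line pushes out the port can also be sent by a small helper program on the development machine. This is only a sketch, assuming the debug link is on /dev/ttyS0 and the port speed has already been set:

    /* brk.c - host-side helper, equivalent to: echo -e "\003" >/dev/ttyS0 */
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            char ctrl_c = '\003';   /* control-C wakes the kgdb stub */
            int fd = open("/dev/ttyS0", O_WRONLY | O_NOCTTY);

            if (fd < 0)
                    return 1;
            if (write(fd, &ctrl_c, 1) != 1) {
                    close(fd);
                    return 1;
            }
            return close(fd) != 0;
    }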
+When gdb gets the symbols loaded it will read your .gdbinit file and, if +everything is working correctly, you should see gdb print out a few +lines indicating that a breakpoint has been taken. It will actually +show a line of code in the target kernel inside the kgdb activation +code. + +The gdb interaction should look something like this: + + linux-dev:/usr/src/linux# gdb vmlinux + GDB is free software and you are welcome to distribute copies of it + under certain conditions; type "show copying" to see the conditions. + There is absolutely no warranty for GDB; type "show warranty" for details. + GDB 4.15.1 (i486-slackware-linux), + Copyright 1995 Free Software Foundation, Inc... + breakpoint () at i386-stub.c:750 + 750 } + (gdb) + +You can now use whatever gdb commands you like to set breakpoints. +Enter "continue" to start your target machine executing again. At this +point the target system will run at full speed until it encounters +your breakpoint or gets a segment violation in the kernel, or whatever. + +If you have the kgdb console enabled when you continue, gdb will print +out all the console messages. + +The above example caused a breakpoint relatively early in the boot +process. For the i386 kgdb it is possible to code a break instruction +as the first C-language point in init/main.c, i.e. as the first instruction +in start_kernel(). This could be done as follows: + +#include + breakpoint(); + +This breakpoint() is really a function that sets up the breakpoint and +single-step hardware trap cells and then executes a breakpoint. Any +early hard coded breakpoint will need to use this function. Once the +trap cells are set up they need not be set again, but doing it again +does not hurt anything, so you don't need to be concerned about which +breakpoint is hit first. Once the trap cells are set up (and the kernel +sets them up in due course even if breakpoint() is never called) the +macro: + +BREAKPOINT; + +will generate an inline breakpoint. This may be more useful as it stops +the processor at the instruction instead of in a function a step removed +from the location of interest. In either case must be +included to define both breakpoint() and BREAKPOINT. + +Triggering kgdbstub at other times +================================== + +Often you don't need to enter the debugger until much later in the boot +or even after the machine has been running for some time. Once the +kernel is booted and interrupts are on, you can force the system to +enter the debugger by sending a control-C to the debug port. This is +what the first line of the recommended .gdbinit file does. This allows +you to start gdb any time after the system is up as well as when the +system is already at a breakpoint. (In the case where the system is +already at a breakpoint the control-C is not needed, however, it will +be ignored by the target so no harm is done. Also note the the echo +command assumes that the port speed is already set. This will be true +once gdb has connected, but it is best to set the port speed before you +run gdb.) + +Another simple way to do this is to put the following file in you ~/bin +directory: + +#!/bin/bash +echo -e "\003" > /dev/ttyS0 + +Here, the ttyS0 should be replaced with what ever port you are using. +The "\003" is control-C. Once you are connected with gdb, you can enter +control-C at the command prompt. + +An alternative way to get control to the debugger is to enable the kGdb +SysRq command. 
Then you would enter Alt-SysRq-g (all three keys at the +same time, but push them down in the order given). To refresh your +memory of the available SysRq commands try Alt-SysRq-=. Actually any +undefined command could replace the "=", but I like to KNOW that what I +am pushing will never be defined. + +Debugging hints +=============== + +You can break into the target machine at any time from the development +machine by typing ^C (see above paragraph). If the target machine has +interrupts enabled this will stop it in the kernel and enter the +debugger. + +There is unfortunately no way of breaking into the kernel if it is +in a loop with interrupts disabled, so if this happens to you then +you need to place exploratory breakpoints or printk's into the kernel +to find out where it is looping. The exploratory breakpoints can be +entered either thru gdb or hard coded into the source. This is very +handy if you do something like: + +if () BREAKPOINT; + + +There is a copy of an e-mail in the Documentation/i386/kgdb/ directory +(debug-nmi.txt) which describes how to create an NMI on an ISA bus +machine using a paper clip. I have a sophisticated version of this made +by wiring a push button switch into a PC104/ISA bus adapter card. The +adapter card nicely furnishes wire wrap pins for all the ISA bus +signals. + +When you are done debugging the kernel on the target machine it is a +good idea to leave it in a running state. This makes reboots faster, +bypassing the fsck. So do a gdb "continue" as the last gdb command if +this is possible. To terminate gdb itself on the development machine +and leave the target machine running, first clear all breakpoints and +continue, then type ^Z to suspend gdb and then kill it with "kill %1" or +something similar. + +If gdbstub Does Not Work +======================== + +If it doesn't work, you will have to troubleshoot it. Do the easy +things first like double checking your cabling and data rates. You +might try some non-kernel based programs to see if the back-to-back +connection works properly. Just something simple like cat /etc/hosts +>/dev/ttyS0 on one machine and cat /dev/ttyS0 on the other will tell you +if you can send data from one machine to the other. Make sure it works +in both directions. There is no point in tearing out your hair in the +kernel if the line doesn't work. + +All of the real action takes place in the file +/usr/src/linux/arch/i386/kernel/kgdb_stub.c. That is the code on the target +machine that interacts with gdb on the development machine. In gdb you can +turn on a debug switch with the following command: + + set remotedebug + +This will print out the protocol messages that gdb is exchanging with +the target machine. + +Another place to look is /usr/src/arch/i386/lib/kgdb_serial.c. This is +the code that talks to the serial port on the target side. There might +be a problem there. In particular there is a section of this code that +tests the UART which will tell you what UART you have if you define +"PRNT" (just remove "_off" from the #define PRNT_off). To view this +report you will need to boot the system without any beakpoints. This +allows the kernel to run to the point where it calls kgdb to set up +interrupts. At this time kgdb will test the UART and print out the type +it finds. (You need to wait so that the printks are actually being +printed. Early in the boot they are cached, waiting for the console to +be enabled. 
Also, if kgdb is entered thru a breakpoint it is possible +to cause a dead lock by calling printk when the console is locked. The +stub thus avoids doing printks from breakpoints, especially in the +serial code.) At this time, if the UART fails to do the expected thing, +kgdb will print out (using printk) information on what failed. (These +messages will be buried in all the other boot up messages. Look for +lines that start with "gdb_hook_interrupt:". You may want to use dmesg +once the system is up to view the log. If this fails or if you still +don't connect, review your answers for the port address. Use: + +setserial /dev/ttyS0 + +to get the current port and IRQ information. This command will also +tell you what the system found for the UART type. The stub recognizes +the following UART types: + +16450, 16550, and 16550A + +If you are really desperate you can use printk debugging in the +kgdbstub code in the target kernel until you get it working. In particular, +there is a global variable in /usr/src/linux/arch/i386/kernel/kgdb_stub.c +named "remote_debug". Compile your kernel with this set to 1, rather +than 0 and the debug stub will print out lots of stuff as it does +what it does. Likewise there are debug printks in the kgdb_serial.c +code that can be turned on with simple changes in the macro defines. + + +Debugging Loadable Modules +========================== + +This technique comes courtesy of Edouard Parmelan + + +When you run gdb, enter the command + +source gdbinit-modules + +This will read in a file of gdb macros that was installed in your +kernel source directory when kgdb was installed. This file implements +the following commands: + +mod-list + Lists the loaded modules in the form + +mod-print-symbols + Prints all the symbols in the indicated module. + +mod-add-symbols + Loads the symbols from the object file and associates them + with the indicated module. + +After you have loaded the module that you want to debug, use the command +mod-list to find the of your module. Then use that +address in the mod-add-symbols command to load your module's symbols. +From that point onward you can debug your module as if it were a part +of the kernel. + +The file gdbinit-modules also contains a command named mod-add-lis as +an example of how to construct a command of your own to load your +favorite module. The idea is to "can" the pathname of the module +in the command so you don't have to type so much. + +Threads +======= + +Each process in a target machine is seen as a gdb thread. gdb thread +related commands (info threads, thread n) can be used. + +ia-32 hardware breakpoints +========================== + +kgdb stub contains support for hardware breakpoints using debugging features +of ia-32(x86) processors. These breakpoints do not need code modification. +They use debugging registers. 4 hardware breakpoints are available in ia-32 +processors. + +Each hardware breakpoint can be of one of the following three types. + +1. Execution breakpoint - An Execution breakpoint is triggered when code + at the breakpoint address is executed. + + As limited number of hardware breakpoints are available, it is + advisable to use software breakpoints ( break command ) instead + of execution hardware breakpoints, unless modification of code + is to be avoided. + +2. Write breakpoint - A write breakpoint is triggered when memory + location at the breakpoint address is written. + + A write or can be placed for data of variable length. 
Length of + a write breakpoint indicates length of the datatype to be + watched. Length is 1 for 1 byte data , 2 for 2 byte data, 3 for + 4 byte data. + +3. Access breakpoint - An access breakpoint is triggered when memory + location at the breakpoint address is either read or written. + + Access breakpoints also have lengths similar to write breakpoints. + +IO breakpoints in ia-32 are not supported. + +Since gdb stub at present does not use the protocol used by gdb for hardware +breakpoints, hardware breakpoints are accessed through gdb macros. gdb macros +for hardware breakpoints are described below. + +hwebrk - Places an execution breakpoint + hwebrk breakpointno address +hwwbrk - Places a write breakpoint + hwwbrk breakpointno length address +hwabrk - Places an access breakpoint + hwabrk breakpointno length address +hwrmbrk - Removes a breakpoint + hwrmbrk breakpointno +exinfo - Tells whether a software or hardware breakpoint has occurred. + Prints number of the hardware breakpoint if a hardware breakpoint has + occurred. + +Arguments required by these commands are as follows +breakpointno - 0 to 3 +length - 1 to 3 +address - Memory location in hex digits ( without 0x ) e.g c015e9bc + +SMP support +========== + +When a breakpoint occurs or user issues a break ( Ctrl + C ) to gdb +client, all the processors are forced to enter the debugger. Current +thread corresponds to the thread running on the processor where +breakpoint occurred. Threads running on other processor(s) appear +similar to other non-running threads in the 'info threads' output. +Within the kgdb stub there is a structure "waiting_cpus" in which kgdb +records the values of "current" and "regs" for each CPU other than the +one that hit the breakpoint. "current" is a pointer to the task +structure for the task that CPU is running, while "regs" points to the +saved registers for the task. This structure can be examined with the +gdb "p" command. + +ia-32 hardware debugging registers on all processors are set to same +values. Hence any hardware breakpoints may occur on any processor. + +gdb troubleshooting +=================== + +1. gdb hangs +Kill it. restart gdb. Connect to target machine. + +2. gdb cannot connect to target machine (after killing a gdb and +restarting another) If the target machine was not inside debugger when +you killed gdb, gdb cannot connect because the target machine won't +respond. In this case echo "Ctrl+C"(ASCII 3) to the serial line. +e.g. echo -e "\003" > /dev/ttyS1 +This forces that target machine into the debugger, after which you +can connect. + +3. gdb cannot connect even after echoing Ctrl+C into serial line +Try changing serial line settings min to 1 and time to 0 +e.g. stty min 1 time 0 < /dev/ttyS1 +Try echoing again + +Check serial line speed and set it to correct value if required +e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 + +EVENTS +====== + +Ever want to know the order of things happening? Which CPU did what and +when? How did the spinlock get the way it is? Then events are for +you. Events are defined by calls to an event collection interface and +saved for later examination. In this case, kgdb events are saved by a +very fast bit of code in kgdb which is fully SMP and interrupt protected +and they are examined by using gdb to display them. Kgdb keeps only +the last N events, where N must be a power of two and is defined at +configure time. 
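The power-of-two size is what keeps event recording cheap: a running count indexes the pool, and the modulo reduces to a mask. Here is a minimal sketch of the scheme (this is not the kgdb code itself; the real per-event record, shown below, carries more fields such as the CPU, TSC and source line):

    #define N_EVENTS 128            /* must be a power of two */

    struct event {
            int data0;
            int data1;
    };

    static struct event pool[N_EVENTS];
    static unsigned int event_count;        /* total events ever recorded */

    static void record_event(int d0, int d1)
    {
            /* equivalent to event_count % N_EVENTS for a power of two */
            struct event *e = &pool[event_count++ & (N_EVENTS - 1)];

            e->data0 = d0;
            e->data1 = d1;
    }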
+
+SMP support
+===========
+
+When a breakpoint occurs or the user issues a break (Ctrl-C) to the gdb
+client, all the processors are forced to enter the debugger. The current
+thread corresponds to the thread running on the processor where the
+breakpoint occurred. Threads running on other processor(s) appear
+similar to other non-running threads in the 'info threads' output.
+Within the kgdb stub there is a structure "waiting_cpus" in which kgdb
+records the values of "current" and "regs" for each CPU other than the
+one that hit the breakpoint. "current" is a pointer to the task
+structure for the task that CPU is running, while "regs" points to the
+saved registers for the task. This structure can be examined with the
+gdb "p" command.
+
+The ia-32 hardware debugging registers on all processors are set to the
+same values, so a hardware breakpoint may trigger on any processor.
+
+gdb troubleshooting
+===================
+
+1. gdb hangs
+Kill it and restart gdb. Connect to the target machine.
+
+2. gdb cannot connect to the target machine (after killing a gdb and
+restarting another). If the target machine was not inside the debugger
+when you killed gdb, gdb cannot connect because the target machine won't
+respond. In this case echo a Ctrl-C (ASCII 3) to the serial line,
+e.g. echo -e "\003" > /dev/ttyS1
+This forces the target machine into the debugger, after which you
+can connect.
+
+3. gdb cannot connect even after echoing Ctrl-C into the serial line.
+Try changing the serial line settings, min to 1 and time to 0,
+e.g. stty min 1 time 0 < /dev/ttyS1
+Try echoing again.
+
+Check the serial line speed and set it to the correct value if required,
+e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1
+
+EVENTS
+======
+
+Ever want to know the order of things happening? Which CPU did what and
+when? How did the spinlock get the way it is? Then events are for
+you. Events are defined by calls to an event collection interface and
+saved for later examination. In this case, kgdb events are saved by a
+very fast bit of code in kgdb which is fully SMP and interrupt protected,
+and they are examined by using gdb to display them. Kgdb keeps only
+the last N events, where N must be a power of two and is defined at
+configure time.
+
+
+Events are signaled to kgdb by calling:
+
+kgdb_ts(data0, data1)
+
+For each call, kgdb records the call in an array along with other
+information. Here is the array definition:
+
+struct kgdb_and_then_struct {
+#ifdef CONFIG_SMP
+	int	on_cpu;
+#endif
+	long long at_time;
+	int	from_ln;
+	char	*in_src;
+	void	*from;
+	int	with_if;
+	int	data0;
+	int	data1;
+};
+
+For SMP machines the CPU is recorded; for all machines the TSC is
+recorded (as a time stamp) as well as the line number and source file
+the call was made from. The caller's address (from), the "if" (interrupt
+flag) and the two data items are also recorded. The macro kgdb_ts casts
+the types to int, so you can put any 32-bit values here. There is a
+configure option to select the number of events you want to keep. A
+nice number might be 128, but you can keep up to 1024 if you want. The
+number must be a power of two. An "andthen" macro library is provided
+for gdb to help you look at these events. It is also possible to define
+a different structure for the event storage and cast the data to this
+structure. For example the following structure is defined in kgdb:
+
+struct kgdb_and_then_struct2 {
+#ifdef CONFIG_SMP
+	int	on_cpu;
+#endif
+	long long at_time;
+	int	from_ln;
+	char	*in_src;
+	void	*from;
+	int	with_if;
+	struct task_struct	*t1;
+	struct task_struct	*t2;
+};
+
+If you use this for display, the data elements will be displayed as
+pointers to task_struct entries. You may want to define your own
+structure to use in casting. You should only change the last two items
+and you must keep the structure size the same. Kgdb will handle these
+as 32-bit ints, but within that constraint you can define a structure to
+cast to any 32-bit quantity. This need only be available to gdb and is
+only used for casting in the display code.
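+
+As a sketch of how code might be instrumented with this interface (the
+call sites, the lock, and the values passed are invented for
+illustration; kgdb_ts itself records the source file, line, caller and
+TSC for you):
+
+	/* suspect code path: tag who we are before taking the lock */
+	kgdb_ts(current->pid, 0);
+	spin_lock(&my_lock);
+	kgdb_ts(current->pid, 1);	/* marker: lock acquired */
+
+The "andthen" gdb macros can then walk the saved ring of events to show
+the interleaving across CPUs.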
+
+Final Items
+===========
+
+I picked up this code from Amit S. Kale and enhanced it.
+
+If you make some really cool modification to this stuff, or if you
+fix a bug, please let me know.
+
+George Anzinger
+
+
+Amit S. Kale
+
+
+(First kgdb by David Grothe)
+
+(modified by Tigran Aivazian)
+	Putting gdbstub into the kernel config menu.
+
+(modified by Scott Foehner)
+	Hooks for entering gdbstub at boot time.
+
+(modified by Amit S. Kale)
+	Threads, ia-32 hw debugging, mp support, console support,
+	nmi watchdog handling.
+
+(modified by George Anzinger)
+	Extended threads to include the idle threads.
+	Enhancements to allow breakpoint() at first C code.
+	Use of module_init() and __setup() to automate the configure.
+	Enhanced the cpu "collection" code to work in early bring-up.
+	Added ability to call functions from gdb.
+	Print info thread stuff without going back to schedule().
+	Now collect the "other" cpus with an IPI/NMI.
--- diff/Documentation/i386/kgdb/kgdbeth.txt	1970-01-01 01:00:00.000000000 +0100
+++ source/Documentation/i386/kgdb/kgdbeth.txt	2004-04-21 10:46:51.332778344 +0100
@@ -0,0 +1,92 @@
+KGDB over ethernet
+==================
+
+Authors
+-------
+
+Robert Walsh (2.6 port)
+wangdi (2.6 port)
+Matt Mackall (netpoll api)
+San Mehat (original 2.4 code)
+
+
+Introduction
+------------
+
+KGDB supports debugging over ethernet (kgdboe) via polling of a given
+network interface. Most cards should be supported automatically.
+Debugging facilities are available as soon as the network driver and
+kgdboe have initialized. Unfortunately, this is too late in the boot
+process for debugging some issues, but it works quite well for many
+others. This should not interfere with normal network usage and
+doesn't require a dedicated NIC.
+
+Terminology
+-----------
+
+This document uses the following terms:
+
+  TARGET: the machine being debugged.
+  HOST:   the machine running gdb.
+
+
+Usage
+-----
+
+You need to use the following command-line option on the TARGET kernel:
+
+  kgdboe=[tgt-port]@<tgt-ip>/[dev],[host-port]@<host-ip>/[host-macaddr]
+
+    where
+	tgt-port      source for UDP packets (defaults to 6443)
+	tgt-ip        source IP to use (interface address)
+	dev           network interface (eth0)
+	host-port     HOST UDP port (6442) (not really used)
+	host-ip       IP address for HOST machine
+	host-macaddr  ethernet MAC address for HOST (ff:ff:ff:ff:ff:ff)
+
+    examples:
+
+  kgdboe=7000@192.168.0.1/eth1,7001@192.168.0.2/00:05:3C:04:47:5D
+	this machine is 192.168.0.1 on eth1
+	remote machine is 192.168.0.2 with MAC address 00:05:3C:04:47:5D
+	listen for gdb packets on port 7000
+	send unsolicited gdb packets to port 7001
+
+  kgdboe=@192.168.0.1/,@192.168.0.2/
+	this machine is 192.168.0.1 on default interface eth0
+	remote machine is 192.168.0.2, use default broadcast MAC address
+	listen for gdb packets on default port 6443
+	send unsolicited gdb packets to port 6442
+
+Only packets originating from the configured HOST IP address will be
+accepted by the debugger.
+
+On the HOST side, run gdb as normal and use a remote UDP host as the
+target:
+
+  % gdb ./vmlinux
+  GNU gdb Red Hat Linux (5.3post-0.20021129.18rh)
+  Copyright 2003 Free Software Foundation, Inc.
+  GDB is free software, covered by the GNU General Public License, and you are
+  welcome to change it and/or distribute copies of it under certain conditions.
+  Type "show copying" to see the conditions.
+  There is absolutely no warranty for GDB.  Type "show warranty" for details.
+  This GDB was configured as "i386-redhat-linux-gnu"...
+  (gdb) target remote udp:HOSTNAME:6443
+
+You can now continue as if you were debugging over a serial line.
+
+Limitations
+-----------
+
+The current release of this code cannot be used together with kgdb on a
+serial interface, so you must boot without the kgdboe option to use
+serial debugging. Trying to debug the network driver while using it
+will prove interesting.
+
+Bug reports
+-----------
+
+Send bug reports to Robert Walsh and Matt Mackall.
--- diff/Documentation/i386/kgdb/loadmodule.sh	1970-01-01 01:00:00.000000000 +0100
+++ source/Documentation/i386/kgdb/loadmodule.sh	2004-04-21 10:46:51.334778040 +0100
@@ -0,0 +1,78 @@
+#!/bin/sh
+# This script loads a module on a target machine and generates a gdb script.
+# Source the generated gdb script to load the module file at the appropriate
+# addresses in gdb.
+#
+# Usage:
+# Loading the module on the target machine and generating the gdb script:
+#	[foo]$ loadmodule.sh <modulefile>
+#
+# Loading the module file into gdb:
+#	(gdb) source <gdbscript>
+#
+# Modify the following variables according to your setup.
+#	TESTMACHINE - Name of the target machine
+#	GDBSCRIPTS  - The directory where a gdb script will be generated
+#
+# Author: Amit S. Kale (akale@veritas.com).
+#
+# If you run into problems, please check the files pointed to by the
+# following variables.
+#	ERRFILE   - /tmp/<modulefile>.errs contains stderr output of insmod
+#	MAPFILE   - /tmp/<modulefile>.map contains stdout output of insmod
+#	GDBSCRIPT - $GDBSCRIPTS/load<modulefile> gdb script
+ +TESTMACHINE=foo +GDBSCRIPTS=/home/bar + +if [ $# -lt 1 ] ; then { + echo Usage: $0 modulefile + exit +} ; fi + +MODULEFILE=$1 +MODULEFILEBASENAME=`basename $1` + +if [ $MODULEFILE = $MODULEFILEBASENAME ] ; then { + MODULEFILE=`pwd`/$MODULEFILE +} fi + +ERRFILE=/tmp/$MODULEFILEBASENAME.errs +MAPFILE=/tmp/$MODULEFILEBASENAME.map +GDBSCRIPT=$GDBSCRIPTS/load$MODULEFILEBASENAME + +function findaddr() { + local ADDR=0x$(echo "$SEGMENTS" | \ + grep "$1" | sed 's/^[^ ]*[ ]*[^ ]*[ ]*//' | \ + sed 's/[ ]*[^ ]*$//') + echo $ADDR +} + +function checkerrs() { + if [ "`cat $ERRFILE`" != "" ] ; then { + cat $ERRFILE + exit + } fi +} + +#load the module +echo Copying $MODULEFILE to $TESTMACHINE +rcp $MODULEFILE root@${TESTMACHINE}: + +echo Loading module $MODULEFILE +rsh -l root $TESTMACHINE /sbin/insmod -m ./`basename $MODULEFILE` \ + > $MAPFILE 2> $ERRFILE +checkerrs + +SEGMENTS=`head -n 11 $MAPFILE | tail -n 10` +TEXTADDR=$(findaddr "\\.text[^.]") +LOADSTRING="add-symbol-file $MODULEFILE $TEXTADDR" +SEGADDRS=`echo "$SEGMENTS" | awk '//{ + if ($1 != ".text" && $1 != ".this" && + $1 != ".kstrtab" && $1 != ".kmodtab") { + print " -s " $1 " 0x" $3 " " + } +}'` +LOADSTRING="$LOADSTRING $SEGADDRS" +echo Generating script $GDBSCRIPT +echo $LOADSTRING > $GDBSCRIPT --- diff/Documentation/must-fix.txt 1970-01-01 01:00:00.000000000 +0100 +++ source/Documentation/must-fix.txt 2004-04-21 10:46:51.346776216 +0100 @@ -0,0 +1,288 @@ + +Must-fix bugs +============= + +drivers/char/ +~~~~~~~~~~~~~ + +o TTY locking is broken. + + o see FIXME in do_tty_hangup(). This causes ppp BUGs in local_bh_enable() + + o Other problems: aviro, dipankar, Alan have details. + + o somebody will have to document the tty driver and ldisc API + +drivers/tty +~~~~~~~~~~~ + +o viro: tty_driver refcounting, tty/misc/upper levels of sound still not + completely fixed. + +drivers/block/ +~~~~~~~~~~~~~~ + +o loop.c: Concurrent write access on block devices might cause a deadlock + of the complete system. See: + http://marc.theaimsgroup.com/?l=linux-kernel&m=106275365925769&w== + http://bugzilla.kernel.org/show_bug.cgi?id=1198 + Thread of possible fix: + http://www.kerneli.org/pipermail/cryptoapi-devel/2003-October/000676.html + + (Fruhwirth Clemens) + +o ideraid hasn't been ported to 2.5 at all yet. + + We need to understand whether the proposed BIO split code will suffice + for this. + +drivers/input/ +~~~~~~~~~~~~~~ + +o rmk: unconverted keyboard/mouse drivers (there's a deadline of 2.6.0 + currently on these remaining in my/Linus' tree.) + +o viro: large absence of locking. + +o viro: parport is nearly as bad as that and there the code is more hairy. + IMO parport is more of "figure out what API changes are needed for its + users, get them done ASAP, then fix generic layer at leisure" + +o (Albert Cahalan) Lots of people (check Google) get this message from the + kernel: + + psmouse.c: Lost synchronization, throwing 2 bytes away. + + (the number of bytes will be 1, 2, or 3) + + At work, I get it when there is heavy NFS traffic. The mouse goes crazy, + jumping around and doing random cut-and-paste all over everything. This + is with a decently fast and modern PC. + +o There seem to be too many reports of keyboards and mice failing or acting + strangely. + + +drivers/misc/ +~~~~~~~~~~~~~ + +o rmk: UCB1[23]00 drivers, currently sitting in drivers/misc in the ARM + tree. (touchscreen, audio, gpio, type device.) + + These need to be moved out of drivers/misc/ and into real places + +o viro: actually, misc.c has a good chance to die. 
With cdev-cidr that's
+  trivial.
+
+drivers/net/
+~~~~~~~~~~~~
+
+drivers/net/irda/
+~~~~~~~~~~~~~~~~~
+
+  (Jean Tourrilhes)
+
+o irport needs to be converted to sir-kthread
+
+o dongle drivers need to be converted to sir-dev (in progress)
+
+o new drivers (irtty-sir/smsc-ircc2/donauboe) need more testing (in progress)
+
+
+drivers/pci/
+~~~~~~~~~~~~
+
+o alan: Some cardbus cards crash the system
+
+  (bugzilla, please?)
+
+drivers/pcmcia/
+~~~~~~~~~~~~~~~
+
+o alan: This is a locking disaster.
+
+  (rmk, brodo: in progress)
+
+drivers/pld/
+~~~~~~~~~~~~
+
+o rmk: EPXA (ARM platform) PLD hotswap drivers (drivers/pld)
+
+  (rmk: will work out what to do here. maybe drivers/arm/)
+
+drivers/video/
+~~~~~~~~~~~~~~
+
+o Lots of drivers don't compile, others do but don't work.
+
+drivers/scsi/
+~~~~~~~~~~~~~
+
+o Convert am53c974, dpt_i2o, initio and pci2220i to DMA-mapping
+
+o Make inia100, cpqfc, pci2000 and dc390t compile
+
+o Convert
+
+   wd33c93 based: a2091 a3000 gvp11 mvme147 sgiwd93
+
+   53c7xx based: amiga7xxx bvme6000 mvme16x initio am53c974 pci2000
+   pci2220i dc390t
+
+  to the new error handling
+
+  It also might be possible to shift the 53c7xx based drivers over to
+  53c700, which does the new EH stuff, but I don't have the hardware to
+  check such a shift.
+
+  For the non-compiling stuff, I've probably missed a few that just aren't
+  compilable on my platforms, so any updates would be welcome. Also, are
+  some of our non-compiling or unconverted drivers obsolete?
+
+fs/
+~~~
+
+o AIO/direct-IO writes can race with truncate and wreck filesystems.
+  (Badari has a patch)
+
+o viro: fs/char_dev.c needs removal of aeb stuff and merge of cdev-cidr.
+  In progress.
+
+o forward-port sct's O_DIRECT fixes (Badari has a patch)
+
+o viro: there is some generic stuff for namei/namespace/super, but that's a
+  slow-merge and can go in 2.6 just fine
+
+o trond: NFS has a mmap-versus-truncate problem (fixed? needs testing)
+
+o trond: NFSv4 client, bugs in lockd, RPCSEC_GSS for NFSv[23], some atomic
+  open bits. more info:
+  http://www.fys.uio.no/~trondmy/src/Linux-2.6.x/2.6.0-test11/
+
+kernel/sched.c
+~~~~~~~~~~~~~~
+
+o Starvation, general interactivity need close monitoring.
+
+o SMT aware scheduler (Ingo, Rusty, Nick have implementations)
+
+kernel/
+~~~~~~~
+
+o Alan: 32bit uid support is *still* broken for process accounting.
+
+  Create a 32bit uid, turn accounting on. Shock, horror: it doesn't work,
+  because the field is 16bit. We need an acct structure flag day for 2.6
+  IMHO
+
+  (alan has patch)
+
+o viro: core sysctl code is racy, and so is its interaction with sysfs
+
+o (ingo) rwsems (on x86) are limited to 32766 waiting processes. This
+  means that setting pid_max to above 32K is unsafe :-(
+
+  An option is to use the CONFIG_RWSEM_GENERIC_SPINLOCK variant all the
+  time, for all archs, and not inline any part of the ops.
+
+lib/kobject.c
+~~~~~~~~~~~~~
+
+o kobject refcounting (comments from Al Viro):
+
+  _anything_ can grab a temporary reference to a kobject. IOW, if a kobject
+  is embedded into something that could be freed - it _MUST_ have a
+  destructor and that destructor _MUST_ be the destructor for the
+  containing object.
+
+  Any violation of the above (and we already have a bunch of those) is a
+  user-triggerable memory corruption.
+
+  We can tolerate it for a while in 2.5 (e.g. during work on a subsystem we
+  can decide to switch to that way of handling objects and have the
+  subsystem vulnerable for a while), but all such windows must be closed
+  before 2.6 and during 2.6 we can't open them at all.
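+
+  To make the rule concrete, a minimal sketch of the required pattern
+  (the names are invented; hooking the release function up via the
+  ktype is elided):
+
+	struct foo {
+		struct kobject kobj;
+		/* ... other fields ... */
+	};
+
+	/* destructor for the WHOLE containing object */
+	static void foo_release(struct kobject *kobj)
+	{
+		kfree(container_of(kobj, struct foo, kobj));
+	}
+
+  Freeing a struct foo directly while someone still holds a temporary
+  reference to its kobject is exactly the corruption described above.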
+
+o All block drivers which control multiple gendisks with a single
+  request_queue are broken, due to one-to-one assumptions in the request
+  queue sysfs hookup.
+
+mm/
+~~~
+
+o GFP_DMA32 (or something like that). Lots of ideas. jejb, zaitcev,
+  willy, arjan, wli.
+
+  Specifically, 64-bit systems need to be able to enforce 32-bit addressing
+  limits for device metadata like network cards' ring buffers and SCSI
+  command descriptors.
+
+o access_process_vm() doesn't flush right. We probably need new flushing
+  primitives to do this (davem?)
+
+
+modules
+~~~~~~~
+
+  (Rusty)
+
+net/
+~~~~
+
+  (davem)
+
+o UDP apps can in theory deadlock, because the ip_append_data path can end
+  up sleeping while the socket lock is held.
+
+  It is OK to sleep with the socket lock held, normally. But in this case
+  the sleep happens while waiting for socket memory/space to become
+  available; if another context needs to take the socket lock to free up
+  the space, we could hang.
+
+  I sent a rough patch on how to fix this to Alexey, and he is analyzing
+  the situation. I expect a final fix from him next week or so.
+
+o Semantics for IPSEC during operations such as TCP connect suck currently.
+
+  When we first try to connect to a destination, we may need to ask the
+  IPSEC key management daemon to resolve the IPSEC routes for us. For the
+  purposes of what the kernel needs to do, you can think of it like ARP. We
+  can't send the packet out properly until we resolve the path.
+
+  What happens now for IPSEC is basically this:
+
+  O_NONBLOCK: returns -EAGAIN over and over until the route is resolved
+
+  !O_NONBLOCK: sleeps until the route is resolved
+
+  These semantics are total crap. The solution, which Alexey is working
+  on, is to allow incomplete routes to exist. These "incomplete" routes
+  merely put the packet onto a "resolution queue", and once the key manager
+  does its thing we finish the output of the packet. This is precisely how
+  ARP works.
+
+  I don't know when Alexey will be done with this.
+
+net/*/netfilter/
+~~~~~~~~~~~~~~~~
+
+  (Rusty)
+
+sound/
+~~~~~~
+
+global
+~~~~~~
+
+o viro: 64-bit dev_t (not a mustfix for 2.6.0). 32-bit dev_t is done, 64-bit
+  means extra work on nfsd/raid/etc.
+
+o alan: Forward port 2.4 fixes
+  - Chris Wright: Security fixes including execve holes, execve vs proc races
+
+o There are about 60 or 70 security related checks that need doing
+  (copy_user etc) from Stanford tools. (badari is looking into this, and
+  hollisb)
+
+o A couple of hundred real looking bugzilla bugs
+
+o viro: cdev rework. Mostly done.
+
--- diff/Documentation/sched-domains.txt	1970-01-01 01:00:00.000000000 +0100
+++ source/Documentation/sched-domains.txt	2004-04-21 10:46:51.346776216 +0100
@@ -0,0 +1,55 @@
+Each CPU has a "base" scheduling domain (struct sched_domain). These are
+accessed via the cpu_sched_domain(i) and this_sched_domain() macros. The
+domain hierarchy is built from these base domains via the ->parent pointer.
+->parent MUST be NULL terminated, and domain structures should be per-CPU
+as they are locklessly updated.
+
+Each scheduling domain spans a number of CPUs (stored in the ->span field).
+A domain's span MUST be a superset of its child's span, and the base domain
+for CPU i MUST span at least i. The top domain for each CPU will generally
+span all CPUs in the system, although strictly it doesn't have to; if it
+doesn't, some CPUs may never be given tasks to run unless the CPUs-allowed
+mask is explicitly set. A sched domain's span means "balance process load
+among these CPUs".
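+
+As a sketch of what this structure gives you (using only the accessors
+described above; the loop body is a placeholder):
+
+	struct sched_domain *sd;
+
+	/* walk from the base domain to the top; ->parent is NULL terminated */
+	for (sd = cpu_sched_domain(cpu); sd; sd = sd->parent)
+		inspect(sd->span);	/* each level spans a superset */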
+
+Each scheduling domain must have one or more CPU groups (struct sched_group)
+which are organised as a circular one-way linked list from the ->groups
+pointer. The union of cpumasks of these groups MUST be the same as the
+domain's span. The intersection of cpumasks from any two of these groups
+MUST be the empty set. The group pointed to by the ->groups pointer MUST
+contain the CPU to which the domain belongs. Groups may be shared among
+CPUs as they contain read-only data after they have been set up.
+
+Balancing within a sched domain occurs between groups. That is, each group
+is treated as one entity. The load of a group is defined as the sum of the
+load of each of its member CPUs, and only when the load of a group becomes
+out of balance are tasks moved between groups.
+
+In kernel/sched.c, rebalance_tick is run periodically on each CPU. This
+function takes its CPU's base sched domain and checks to see if it has
+reached its rebalance interval. If so, then it will run load_balance on
+that domain. rebalance_tick then checks the parent sched_domain (if it
+exists), and the parent of the parent and so forth.
+
+*** Implementing sched domains ***
+The "base" domain will "span" the first level of the hierarchy. In the case
+of SMT, you'll span all siblings of the physical CPU, with each group being
+a single virtual CPU.
+
+In SMP, the parent of the base domain will span all physical CPUs in the
+node, with each group being a single physical CPU. Then with NUMA, the
+parent of the SMP domain will span the entire machine, with each group
+having the cpumask of a node. Or you could do multi-level NUMA; Opteron,
+for example, might have just one domain covering its one NUMA level.
+
+The implementor should read comments in include/linux/sched.h:
+struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of
+the specifics and what to tune.
+
+Implementors should change the line
+#undef SCHED_DOMAIN_DEBUG
+to
+#define SCHED_DOMAIN_DEBUG
+in kernel/sched.c, as this enables an error-checking parse of the sched
+domains which should catch most possible errors (described above). It also
+prints out the domain structure in a visual format.
--- diff/Documentation/should-fix.txt	1970-01-01 01:00:00.000000000 +0100
+++ source/Documentation/should-fix.txt	2004-04-21 10:46:51.347776064 +0100
@@ -0,0 +1,545 @@
+Not-ready features and speedups
+===============================
+
+Legend:
+
+PRI1: We're totally lame if this doesn't get in
+PRI2: Would be nice
+PRI3: Not very important
+
+drivers/block/
+~~~~~~~~~~~~~~
+
+o viro: paride drivers need a big cleanup. Partially done, but ATAPI drivers
+  need serious work and bug fixing.
+
+  PRI2
+
+drivers/char/rtc/
+~~~~~~~~~~~~~~~~~
+
+o rmk, trini: add support for alarms to the existing generic rtc driver.
+
+  PRI2
+
+console drivers
+~~~~~~~~~~~~~~~
+  (Pavel Machek)
+
+o There are a few must-fix bugs in cursor handling.
+
+o Play with gpm selection for a while and your cursor gets corrupted with
+  random dots. Ouch.
+
+device mapper
+~~~~~~~~~~~~~
+
+o ioctl interface cleanup patch is ready (redo the structure layouts)
+
+  PRI1
+
+o A port of the 2.4 snapshot and mirror targets is in progress
+
+  PRI1
+
+o the fs interface to dm needs to be redone. gregkh was going to work on
+  this. viro is interested in seeing the work thus far.
+
+  PRI2
+
+drivers/net/wireless/
+~~~~~~~~~~~~~~~~~~~~~
+
+  (Jean Tourrilhes)
+
+o get the HostAP driver into the kernel.
+  No consolidation of the 802.11
+  management across drivers can happen until this one is in (which is
+  probably 2.7.X material). I think Jouni is mostly ready but didn't find
+  time for it.
+
+  PRI2
+
+o get more wireless drivers into the kernel. The most "integrable" drivers
+  at this point seem to be the NWN driver and Pavel's Spectrum driver.
+
+  PRI1
+
+drivers/usb/gadget/
+~~~~~~~~~~~~~~~~~~~
+
+o rmk: SA11xx USB client/gadget code (David B has been doing some work on
+  this, and keeps trying to prod me, but unfortunately I haven't had the
+  time to look at his work, sorry David.)
+
+  PRI3
+
+fs/
+~~~
+
+o ext3 and ext2 block allocators have serious failure modes - interleaved
+  allocations.
+
+  PRI3
+
+o Integrate Chris Mason's 2.4 reiserfs ordered data and data journaling
+  patches. They make reiserfs a lot safer.
+
+  Ordered: PRI2
+  data journalled: PRI3
+
+o viro: convert more filesystems to use lib/parser.c for options.
+
+  PRI2
+
+o aio: fs IO isn't async at present. suparna has restart patches, they're
+  in -mm. Need to get Ben to review/comment.
+
+  PRI1.
+
+o drepper: various filesystems use ->pid wrongly
+
+  PRI1
+
+o hch: devfs: there's a fundamental lookup vs devfsd race that's only
+  fixable by introducing a lookup vs devfs deadlock. I can't see how this
+  is fixable without getting rid of the current devfsd design. Mandrake
+  seems to have a workaround for this so this is at least not triggered so
+  easily, but that's not what I'd consider a fix.
+
+  PRI2
+
+kernel/
+~~~~~~~
+
+o rusty: Zippel's reference count simplification. Tricky code, but cuts
+  about 120 lines from module.c. Patch exists, needs stressing.
+
+  PRI3
+
+o rusty: Fix module-failed-init races by starting the module "disabled".
+  Patch exists, requires some subsystems (i.e. add_partition) to explicitly
+  say "make module live now". Without the patch we are no worse off than
+  2.4 etc.
+
+  PRI1
+
+o Integrate the userspace irq balancing daemon.
+
+  PRI2
+
+o kexec. Seems to work, was in -mm.
+
+  PRI3
+
+o rmk: lib/inflate.c must not use static variables (causes these to be
+  referenced via GOTOFF relocations in the PIC decompressor. We have a PIC
+  decompressor to avoid having to hard code a per platform zImage link
+  address into the makefiles.)
+
+  PRI2
+
+o klibc merge?
+
+  PRI2
+
+mm/
+~~~
+
+o dropbehind for large files
+
+  PRI2
+
+net/
+~~~~
+
+  (davem)
+
+o Real serious use of IPSEC is hampered by lack of MPLS support. MPLS is a
+  switching technology that works by switching based upon fixed length
+  labels prepended to packets. Many people use this and IPSEC to implement
+  VPNs over public networks; it is also used for things like traffic
+  engineering.
+
+  A good reference site is:
+
+  http://www.mplsrc.com/
+
+  Anyway, a (crappy) implementation already exists. I've almost completed
+  a rewrite; I should have something in the tree next week.
+
+  PRI1
+
+o Sometimes we generate IP fragments when it truly isn't necessary.
+
+  The way IP fragmentation is specified, each fragment must be a multiple
+  of 8 bytes in length. So suppose the device has an MTU that is not 0
+  modulo 8; ethernet even falls into this category: 1500 == (8 * 187) + 4
+
+  Our IP fragmenting engine can fragment packets that are sized within the
+  last (MTU modulo 8) bytes of the MTU. This happens in obscure cases, but
+  it does happen.
+
+  I've proposed a fix to Alexey, whereby very late in the output path we
+  check the packet; if we fragmented but the data length would fit into
+  the MTU, we unfragment the packet.
+
+  This is low priority, because technically it creates suboptimal behavior
+  rather than mis-operation.
+
+  PRI1
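+
+  To make the arithmetic concrete (an illustrative reading of the numbers
+  above, not an exact description of the code path): with a 1500-byte MTU,
+  1500 == (8 * 187) + 4, so a fragment can carry at most 1496 data bytes.
+  A packet whose size lands in that last 4-byte window below the MTU would
+  fit in a single frame, yet the fragmenting path would split it into a
+  1496-byte fragment plus a tiny trailer - which is what the proposed
+  late "unfragment" check detects and undoes.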
+
+net/*/netfilter/
+~~~~~~~~~~~~~~~~
+
+o Lots of misc. cleanups, which are happening slowly.
+
+  PRI2
+
+power management
+~~~~~~~~~~~~~~~~
+
+o Pat and Pavel disagree over swsusp. Need to sort that out.
+
+  PRI2
+
+o Frame buffer restore codepaths (that requires some deep PCI magic)
+
+  PRI2
+
+o XFree86 hooks
+
+  PRI2
+
+o AGP restoration
+
+  PRI2
+
+o DRI restoration
+
+  (davej/Alan: not super-critical, can crash laptop on restore. davej
+  looking into it.)
+
+  PRI2
+
+o IDE suspend/resume without races (Ben is looking at this a little)
+
+  PRI2
+
+o Pat: There are already CPU device structures; MTRRs should be a
+  dynamically registered interface of CPUs, which implies there needs
+  to be some other glue to know that there are MTRRs that need to be
+  saved/restored.
+
+  PRI1
+
+global
+~~~~~~
+
+o We need a kernel side API for reporting error events to userspace (could
+  be async to 2.6 itself)
+
+  (Prototype core based on netlink exists)
+
+  PRI2
+
+o Kai: Introduce a sane, easy and standard way to build external modules
+  - make clean and make modules_install are both broken
+
+  PRI2
+
+drivers
+~~~~~~~
+
+o Alan: Cardbus/PCMCIA requires that all of Russell's stuff be merged to
+  do multiheader right and so on
+
+  PRI1
+
+drivers/acpi/
+~~~~~~~~~~~~~
+
+o Fix acpi for all newer IBM Thinkpads; see
+  http://bugme.osdl.org/show_bug.cgi?id=1038 for more information
+
+o alan: VIA APIC stuff is one bit of this; there are also some other
+  reports that were caused by ACPI not setting level vs. edge trigger
+  sometimes
+
+  PRI1
+
+o mochel: it seems the acpi irq routing code could use a serious rewrite.
+
+  grover: The problem is the ACPI irq routing code is trying to piggyback
+  on the existing MPS-specific data structures, and it's generally a hack.
+  So yes mochel is right, but it is also purging MPS-ities from common code
+  as well. I've done some preliminary work in this area and it doesn't seem
+  to break anything (yet) but a rewrite in this area imho should not be
+  rushed out the door. And, I think the above bugs can be fixed w/o the
+  rewrite.
+
+  PRI2
+
+o mochel: ACPI suspend doesn't work. Important, not critical. Pat is
+  working it.
+
+  PRI2
+
+drivers/block/
+~~~~~~~~~~~~~~
+
+o More testing of floppy
+
+  PRI3
+
+drivers/char/
+~~~~~~~~~~~~~
+
+
+drivers/ide/
+~~~~~~~~~~~~
+
+  (Alan)
+
+o IDE PIO has occasional unexplained PIO disk eating reports
+
+  PRI1
+
+o IDE has multiple zillions of races/hangs in 2.5 still
+
+  PRI1
+
+o IDE scsi needs rewriting
+
+  PRI2
+
+o IDE needs significant reworking to handle Simplex right
+
+  PRI2
+
+o IDE hotplug handling for 2.5 is completely broken still
+
+  PRI2
+
+o There are lots of other IDE bugs that won't go away until the taskfile
+  stuff is included, the locking bugs that allow any user to hang the IDE
+  layer in 2.5 are fixed, and some other updates are forward ported
+  (esp. HPT372N).
+
+  PRI1
+
+drivers/isdn/
+~~~~~~~~~~~~~
+
+  (Kai, rmk)
+
+o isdn_tty locking is completely broken (cli() and friends)
+
+  PRI2
+
+o fix other drivers
+
+  PRI2
+
+o lots more cleanups, adaption to recent APIs etc
+
+  PRI3
+
+o fixup tty-based ISDN drivers which provide TIOCM* ioctls (see my recent
+  3-set patch for serial stuff)
+
+  Alternatively, we could re-introduce the fallback to driver ioctl parsing
+  for these if not enough drivers get updated.
+
+  PRI3
+
+drivers/net/
+~~~~~~~~~~~~
+
+o davej: Either wireless network drivers or PCMCIA broke at some point.
A + configuration that worked fine under 2.4 doesn't receive any packets. Need + to look into this more to make sure I don't have any misconfiguration that + just 'happened to work' under 2.4 + + PRI1 + +drivers/scsi/ +~~~~~~~~~~~~~ + +o jejb: qlogic - + + o Merge the feral driver. It covers all qlogic chips: 1020 all the way + up to 23xxx. http://linux-scsi.bkbits.net/scsi-isp-2.5 + + o qla2xxx: only for FC chips. Has significant build issues. hch + promises to send me a "must fix" list for this. + http://linux-scsi.bkbits.net/scsi-qla2xxx-2.5 + + PRI2 + +o hch, Mike Anderson, Badari Pulavarty: scsi locking issues + + o there are lots of members of struct Scsi_Host/scsi_device/scsi_cmnd + with very unclear locking, many of them probably want to become + atomic_t's or bitmaps (for the 1bit bitfields). + + o there's lots of volatile abuse in the scsi code that needs to be + thought about. + + o there's some global variables incremented without any locks + + PRI2 + +sound/ +~~~~~~ + +o rmk: several OSS drivers for SA11xx-based hardware in need of + ALSA-ification and L3 bus support code for these. + +o rmk: need to complete ALSA-ification of the WaveArtist driver for both + NetWinder and other stuff (there's some fairly fundamental differences in + the way the mixer needs to be handled for the NetWinder.) + + (Issues with forward-porting 2.4 bugfixes.) + (Killing off OSS is 2.7 material) + +PRI2 + +arch/i386/ +~~~~~~~~~~ + +o Also PC9800 merge needs finishing to the point we want for 2.6 (not all). + + PRI3 + +o davej: PAT support (for mtrr exhaustion w/ AGP) + + PRI2 + +o 2.5.x won't boot on some 440GX + + alan: Problem understood now, feasible fix in 2.4/2.4-ac. (440GX has two + IRQ routers, we use the $PIR table with the PIIX, but the 440GX doesnt use + the PIIX for its IRQ routing). Fall back to BIOS for 440GX works and Intel + concurs. + + PRI1 + +o 2.5.x doesn't handle VIA APIC right yet. + + 1. We must write the PCI_INTERRUPT_LINE + + 2. We have quirk handlers that seem to trash it. + + PRI1 + +o ECC driver questions are not yet sorted (DaveJ is working on this) (Dan + Hollis) + + alan: ECC - I have some test bits from Dan's stuff - they need no kernel + core changes for most platforms. That means we can treat it as a random + driver merge. + + PRI3 + +o alan: 2.4 has some fixes for tsc handling bugs. One where some bioses in + SMM mode mess up our toggle on the time high/low or mangle the counter and + one where a few chips need religious use of _p for timer access and we + don't do that. This is forward porting little bits of fixup. + + ACPI HZ stuff we can't trap - a lot of ACPI is implemented as outb's + triggering SMM traps + + PRI1 + +arch/x86_64/ +~~~~~~~~~~~~ + + (Andi) + +o time handling is broken. Need to move up 2.4 time.c code. + + PRI1 + +o NMI watchdog seems to tick too fast + + PRI2 + +o need to coredump 64bit vsyscall code with dwarf2 + + PRI2 + +o move 64bit signal trampolines into vsyscall code and add dwarf2 for it. + (in progress) + + PRI1 + +o describe kernel assembly with dwarf2 annotations for kgdb + + PRI3 + +arch/alpha/ +~~~~~~~~~~~ + +o rth: Ptrace writes are broken. This means we can't (reliably) set + breakpoints or modify variables from gdb. + + PRI1 + +arch/arm/ +~~~~~~~~~ + +o rmk: missing raw keyboard translation tables for all ARM machines. + Haven't even looked into this at all. This could be messy since there + isn't an ARM architecture standard. I'm presently hoping that it won't be + an issue. 
If it is, I guess we'll see drivers/char/keyboard.c explode.
+
+  PRI2
+
+arch/others/
+~~~~~~~~~~~~
+
+o SH needs resyncing, as do some other ports. SH64 needs merging.
+  No impact on mainstream platforms hopefully.
+
+  PRI2
+
+arch/s390/
+~~~~~~~~~~
+
+o A nasty memory management problem causes random crashes. These appear
+  to be fixed/hidden by the objrmap patch; more investigation is needed.
+
+  PRI1
+
+drivers/s390/
+~~~~~~~~~~~~~
+
+o Early userspace and 64 bit dev_t will allow the removal of most of
+  dasd_devmap.c and dasd_genhd.c.
+
+  PRI2
+
+o The 3270 console driver needs to be replaced with a working one
+  (prototype is there, needs to be finished).
+
+  PRI2
+
+o Minor interface changes are pending in cio/ when the z990 machines are
+  out.
+
+  PRI2
+
+o Jan Glauber is working on a fix for the timer issues related to running
+  on virtualized CPUs (wall-clock vs. cpu time).
+
+  PRI1
+
+o a block device driver for ramdisks shared among virtual machines
+
+  PRI3
+
+o driver for crypto hardware
+
+  PRI3
+
+o 'claw' network device driver
+
+  PRI3
+
--- diff/arch/i386/kernel/kgdb_stub.c	1970-01-01 01:00:00.000000000 +0100
+++ source/arch/i386/kernel/kgdb_stub.c	2004-04-21 10:46:51.145806768 +0100
@@ -0,0 +1,2454 @@
+/*
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ */
+
+/*
+ * Copyright (c) 2000 VERITAS Software Corporation.
+ *
+ */
+/****************************************************************************
+ *  Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $
+ *
+ *  Module name: remcom.c $
+ *  Revision: 1.34 $
+ *  Date: 91/03/09 12:29:49 $
+ *  Contributor:     Lake Stevens Instrument Division$
+ *
+ *  Description:     low level support for gdb debugger. $
+ *
+ *  Considerations:  only works on target hardware $
+ *
+ *  Written by:      Glenn Engel $
+ *  Updated by:      David Grothe
+ *  Updated by:      Robert Walsh
+ *  Updated by:      wangdi
+ *  ModuleState:     Experimental $
+ *
+ *  NOTES:           See Below $
+ *
+ *  Modified for 386 by Jim Kingdon, Cygnus Support.
+ *  Compatibility with 2.1.xx kernel by David Grothe
+ *
+ *  Changes to allow auto initialization. All that is needed is that it
+ *  be linked with the kernel and a break point (int 3) be executed.
+ *  The header file defines BREAKPOINT to allow one to do
+ *  this. It should also be possible, once the interrupt system is up, to
+ *  call putDebugChar("+"). Once this is done, the remote debugger should
+ *  get our attention by sending a ^C in a packet. George Anzinger
+ *
+ *  Integrated into 2.2.5 kernel by Tigran Aivazian
+ *  Added thread support, support for multiple processors,
+ *  support for ia-32(x86) hardware debugging.
+ *  Amit S. Kale (akale@veritas.com)
+ *
+ *  Modified to support debugging over ethernet by Robert Walsh and
+ *  wangdi, based on code by San Mehat.
+ *
+ *
+ *  To enable debugger support, two things need to happen. One, a
+ *  call to set_debug_traps() is necessary in order to allow any breakpoints
+ *  or error conditions to be properly intercepted and reported to gdb.
+ *  Two, a breakpoint needs to be generated to begin communication. This
+ *  is most easily accomplished by a call to breakpoint(). Breakpoint()
+ *  simulates a breakpoint by executing an int 3.
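+ *
+ *  For example, a minimal bring-up sequence might look like this (a
+ *  sketch only; where these calls are placed is up to the integrator):
+ *
+ *	set_debug_traps();	(install the gdb trap handlers)
+ *	breakpoint();		(gdb takes control at this int 3)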
+ *
+ *************
+ *
+ *    The following gdb commands are supported:
+ *
+ * command          function                               Return value
+ *
+ *    g             return the value of the CPU registers  hex data or ENN
+ *    G             set the value of the CPU registers     OK or ENN
+ *
+ *    mAA..AA,LLLL  Read LLLL bytes at address AA..AA      hex data or ENN
+ *    MAA..AA,LLLL: Write LLLL bytes at address AA.AA      OK or ENN
+ *
+ *    c             Resume at current address              SNN   (signal NN)
+ *    cAA..AA       Continue at address AA..AA             SNN
+ *
+ *    s             Step one instruction                   SNN
+ *    sAA..AA       Step one instruction from AA..AA       SNN
+ *
+ *    k             kill
+ *
+ *    ?             What was the last sigval ?             SNN   (signal NN)
+ *
+ * All commands and responses are sent with a packet which includes a
+ * checksum. A packet consists of
+ *
+ * $<packet info>#<checksum>.
+ *
+ * where
+ * <packet info> :: <characters representing the command or response>
+ * <checksum>    :: <two hex digits computed as modulo 256 sum of <packet info>>
+ *
+ * When a packet is received, it is first acknowledged with either '+' or '-'.
+ * '+' indicates a successful transfer. '-' indicates a failed transfer.
+ *
+ * Example:
+ *
+ * Host:                  Reply:
+ * $m0,10#2a               +$00010203040506070809101112131415#42
+ *
+ ****************************************************************************/
+#define KGDB_VERSION "<20030915.1651.33>"
+#include
+#include
+#include	/* for strcpy */
+#include
+#include
+#include
+#include
+#include	/* for linux pt_regs struct */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/************************************************************************
+ *
+ * external low-level support routines
+ */
+typedef void (*Function) (void);	/* pointer to a function */
+
+/* Thread reference */
+typedef unsigned char threadref[8];
+
+extern int tty_putDebugChar(int);	/* write a single character */
+extern int tty_getDebugChar(void);	/* read and return a single char */
+extern void tty_flushDebugChar(void);	/* flush pending characters */
+extern int eth_putDebugChar(int);	/* write a single character */
+extern int eth_getDebugChar(void);	/* read and return a single char */
+extern void eth_flushDebugChar(void);	/* flush pending characters */
+
+/************************************************************************/
+/* BUFMAX defines the maximum number of characters in inbound/outbound
+ * buffers. At least NUMREGBYTES*2 are needed for register packets.
+ * A longer buffer is needed to list all threads. */
+#define BUFMAX 400
+
+char *kgdb_version = KGDB_VERSION;
+
+/* debug > 0 prints ill-formed commands in valid packets & checksum errors */
+int debug_regs = 0;		/* set to non-zero to print registers */
+
+/* filled in by an external module */
+char *gdb_module_offsets;
+
+static const char hexchars[] = "0123456789abcdef";
+
+/* Number of bytes of registers. */
+#define NUMREGBYTES 64
+/*
+ * Note that this register image is in a different order than
+ * the register image that Linux produces at interrupt time.
+ *
+ * Linux's register image is defined by struct pt_regs in ptrace.h.
+ * Just why GDB uses a different order is a historical mystery.
+ */ +enum regnames { _EAX, /* 0 */ + _ECX, /* 1 */ + _EDX, /* 2 */ + _EBX, /* 3 */ + _ESP, /* 4 */ + _EBP, /* 5 */ + _ESI, /* 6 */ + _EDI, /* 7 */ + _PC /* 8 also known as eip */ , + _PS /* 9 also known as eflags */ , + _CS, /* 10 */ + _SS, /* 11 */ + _DS, /* 12 */ + _ES, /* 13 */ + _FS, /* 14 */ + _GS /* 15 */ +}; + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* + * Put the error code here just in case the user cares. + * Likewise, the vector number here (since GDB only gets the signal + * number through the usual means, and that's not very specific). + * The called_from is the return address so he can tell how we entered kgdb. + * This will allow him to seperate out the various possible entries. + */ +#define REMOTE_DEBUG 0 /* set != to turn on printing (also available in info) */ + +#define PID_MAX PID_MAX_DEFAULT + +#ifdef CONFIG_SMP +void smp_send_nmi_allbutself(void); +#define IF_SMP(x) x +#undef MAX_NO_CPUS +#ifndef CONFIG_NO_KGDB_CPUS +#define CONFIG_NO_KGDB_CPUS 2 +#endif +#if CONFIG_NO_KGDB_CPUS > NR_CPUS +#define MAX_NO_CPUS NR_CPUS +#else +#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS +#endif +#define hold_init hold_on_sstep: 1, +#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL) +#define NUM_CPUS num_online_cpus() +#else +#define IF_SMP(x) +#define hold_init +#undef MAX_NO_CPUS +#define MAX_NO_CPUS 1 +#define NUM_CPUS 1 +#endif +#define NOCPU (struct task_struct *)0xbad1fbad +/* *INDENT-OFF* */ +struct kgdb_info { + int used_malloc; + void *called_from; + long long entry_tsc; + int errcode; + int vector; + int print_debug_info; +#ifdef CONFIG_SMP + int hold_on_sstep; + struct { + volatile struct task_struct *task; + int pid; + int hold; + struct pt_regs *regs; + } cpus_waiting[MAX_NO_CPUS]; +#endif +} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1}; + +/* *INDENT-ON* */ + +#define used_m kgdb_info.used_malloc +/* + * This is little area we set aside to contain the stack we + * need to build to allow gdb to call functions. We use one + * per cpu to avoid locking issues. We will do all this work + * with interrupts off so that should take care of the protection + * issues. + */ +#define LOOKASIDE_SIZE 200 /* should be more than enough */ +#define MALLOC_MAX 200 /* Max malloc size */ +struct { + unsigned int esp; + int array[LOOKASIDE_SIZE]; +} fn_call_lookaside[MAX_NO_CPUS]; + +static int trap_cpu; +static unsigned int OLD_esp; + +#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE] +#define IF_BIT 0x200 +#define TF_BIT 0x100 + +#define MALLOC_ROUND 8-1 + +static char malloc_array[MALLOC_MAX]; +IF_SMP(static void to_gdb(const char *mess)); +void * +malloc(int size) +{ + + if (size <= (MALLOC_MAX - used_m)) { + int old_used = used_m; + used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND)); + return &malloc_array[old_used]; + } else { + return NULL; + } +} + +/* + * I/O dispatch functions... + * Based upon kgdboe, either call the ethernet + * handler or the serial one.. + */ +void +putDebugChar(int c) +{ + if (!kgdboe) { + tty_putDebugChar(c); + } else { + eth_putDebugChar(c); + } +} + +int +getDebugChar(void) +{ + if (!kgdboe) { + return tty_getDebugChar(); + } else { + return eth_getDebugChar(); + } +} + +void +flushDebugChar(void) +{ + if (!kgdboe) { + tty_flushDebugChar(); + } else { + eth_flushDebugChar(); + } +} + +/* + * Gdb calls functions by pushing agruments, including a return address + * on the stack and the adjusting EIP to point to the function. 
The + * whole assumption in GDB is that we are on a different stack than the + * one the "user" i.e. code that hit the break point, is on. This, of + * course is not true in the kernel. Thus various dodges are needed to + * do the call without directly messing with EIP (which we can not change + * as it is just a location and not a register. To adjust it would then + * require that we move every thing below EIP up or down as needed. This + * will not work as we may well have stack relative pointer on the stack + * (such as the pointer to regs, for example). + + * So here is what we do: + * We detect gdb attempting to store into the stack area and instead, store + * into the fn_call_lookaside.array at the same relative location as if it + * were the area ESP pointed at. We also trap ESP modifications + * and uses these to adjust fn_call_lookaside.esp. On entry + * fn_call_lookaside.esp will be set to point at the last entry in + * fn_call_lookaside.array. This allows us to check if it has changed, and + * if so, on exit, we add the registers we will use to do the move and a + * trap/ interrupt return exit sequence. We then adjust the eflags in the + * regs array (remember we now have a copy in the fn_call_lookaside.array) to + * kill the interrupt bit, AND we change EIP to point at our set up stub. + * As part of the register set up we preset the registers to point at the + * begining and end of the fn_call_lookaside.array, so all the stub needs to + * do is move words from the array to the stack until ESP= the desired value + * then do the rti. This will then transfer to the desired function with + * all the correct registers. Nifty huh? + */ +extern asmlinkage void fn_call_stub(void); +extern asmlinkage void fn_rtn_stub(void); +/* *INDENT-OFF* */ +__asm__("fn_rtn_stub:\n\t" + "movl %eax,%esp\n\t" + "fn_call_stub:\n\t" + "1:\n\t" + "addl $-4,%ebx\n\t" + "movl (%ebx), %eax\n\t" + "pushl %eax\n\t" + "cmpl %esp,%ecx\n\t" + "jne 1b\n\t" + "popl %eax\n\t" + "popl %ebx\n\t" + "popl %ecx\n\t" + "iret \n\t"); +/* *INDENT-ON* */ +#define gdb_i386vector kgdb_info.vector +#define gdb_i386errcode kgdb_info.errcode +#define waiting_cpus kgdb_info.cpus_waiting +#define remote_debug kgdb_info.print_debug_info +#define hold_cpu(cpu) kgdb_info.cpus_waiting[cpu].hold +/* gdb locks */ + +#ifdef CONFIG_SMP +static int in_kgdb_called; +static spinlock_t waitlocks[MAX_NO_CPUS] = + {[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED }; +/* + * The following array has the thread pointer of each of the "other" + * cpus. We make it global so it can be seen by gdb. + */ +volatile int in_kgdb_entry_log[MAX_NO_CPUS]; +volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS]; +/* +static spinlock_t continuelocks[MAX_NO_CPUS]; +*/ +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +/* waiters on our spinlock plus us */ +static atomic_t spinlock_waiters = ATOMIC_INIT(1); +static int spinlock_count = 0; +static int spinlock_cpu = 0; +/* + * Note we use nested spin locks to account for the case where a break + * point is encountered when calling a function by user direction from + * kgdb. Also there is the memory exception recursion to account for. + * Well, yes, but this lets other cpus thru too. Lets add a + * cpu id to the lock. + */ +#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \ + spinlock_cpu != smp_processor_id()){\ + atomic_inc(&spinlock_waiters); \ + while (! 
spin_trylock(x)) {\ + in_kgdb(®s);\ + }\ + atomic_dec(&spinlock_waiters); \ + spinlock_count = 1; \ + spinlock_cpu = smp_processor_id(); \ + }else{ \ + spinlock_count++; \ + } +#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x) +#else +unsigned kgdb_spinlock = 0; +#define KGDB_SPIN_LOCK(x) --*x +#define KGDB_SPIN_UNLOCK(x) ++*x +#endif + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $# */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + if ((remote_debug) && (checksum != xmitcsum)) { + printk + ("bad checksum. My count = 0x%x, sent=0x%x. buf=%s\n", + checksum, xmitcsum, buffer); + } + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + + if (remote_debug) + printk("R:%s\n", buffer); + flushDebugChar(); +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. */ + + if (!kgdboe) { + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + putDebugChar(ch); + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + + } while ((getDebugChar() & 0x7f) != '+'); + } else { + /* + * For udp, we can not transfer too much bytes once. 
+ * We only transfer MAX_SEND_COUNT size bytes each time + */ + +#define MAX_SEND_COUNT 30 + + int send_count = 0, i = 0; + char send_buf[MAX_SEND_COUNT]; + + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + send_count = 0; + while ((ch = buffer[count])) { + if (send_count >= MAX_SEND_COUNT) { + for(i = 0; i < MAX_SEND_COUNT; i++) { + putDebugChar(send_buf[i]); + } + flushDebugChar(); + send_count = 0; + } else { + send_buf[send_count] = ch; + checksum += ch; + count ++; + send_count++; + } + } + for(i = 0; i < send_count; i++) + putDebugChar(send_buf[i]); + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + } while ((getDebugChar() & 0x7f) != '+'); + } +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static short error; + +void +debug_error(char *format, char *parm) +{ + if (remote_debug) + printk(format, parm); +} + +static void +print_regs(struct pt_regs *regs) +{ + printk("EAX=%08lx ", regs->eax); + printk("EBX=%08lx ", regs->ebx); + printk("ECX=%08lx ", regs->ecx); + printk("EDX=%08lx ", regs->edx); + printk("\n"); + printk("ESI=%08lx ", regs->esi); + printk("EDI=%08lx ", regs->edi); + printk("EBP=%08lx ", regs->ebp); + printk("ESP=%08lx ", (long) ®s->esp); + printk("\n"); + printk(" DS=%08x ", regs->xds); + printk(" ES=%08x ", regs->xes); + printk(" SS=%08x ", __KERNEL_DS); + printk(" FL=%08lx ", regs->eflags); + printk("\n"); + printk(" CS=%08x ", regs->xcs); + printk(" IP=%08lx ", regs->eip); +#if 0 + printk(" FS=%08x ", regs->fs); + printk(" GS=%08x ", regs->gs); +#endif + printk("\n"); + +} /* print_regs */ + +#define NEW_esp fn_call_lookaside[trap_cpu].esp + +static void +regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_EAX] = regs->eax; + gdb_regs[_EBX] = regs->ebx; + gdb_regs[_ECX] = regs->ecx; + gdb_regs[_EDX] = regs->edx; + gdb_regs[_ESI] = regs->esi; + gdb_regs[_EDI] = regs->edi; + gdb_regs[_EBP] = regs->ebp; + gdb_regs[_DS] = regs->xds; + gdb_regs[_ES] = regs->xes; + gdb_regs[_PS] = regs->eflags; + gdb_regs[_CS] = regs->xcs; + gdb_regs[_PC] = regs->eip; + /* Note, as we are a debugging the kernel, we will always + * trap in kernel code, this means no priviledge change, + * and so the pt_regs structure is not completely valid. In a non + * privilege change trap, only EFLAGS, CS and EIP are put on the stack, + * SS and ESP are not stacked, this means that the last 2 elements of + * pt_regs is not valid (they would normally refer to the user stack) + * also, using regs+1 is no good because you end up will a value that is + * 2 longs (8) too high. This used to cause stepping over functions + * to fail, so my fix is to use the address of regs->esp, which + * should point at the end of the stack frame. Note I have ignored + * completely exceptions that cause an error code to be stacked, such + * as double fault. Stuart Hughes, Zentropix. + * original code: gdb_regs[_ESP] = (int) (regs + 1) ; + + * this is now done on entry and moved to OLD_esp (as well as NEW_esp). 
+ */ + gdb_regs[_ESP] = NEW_esp; + gdb_regs[_SS] = __KERNEL_DS; + gdb_regs[_FS] = 0xFFFF; + gdb_regs[_GS] = 0xFFFF; +} /* regs_to_gdb_regs */ + +static void +gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) +{ + regs->eax = gdb_regs[_EAX]; + regs->ebx = gdb_regs[_EBX]; + regs->ecx = gdb_regs[_ECX]; + regs->edx = gdb_regs[_EDX]; + regs->esi = gdb_regs[_ESI]; + regs->edi = gdb_regs[_EDI]; + regs->ebp = gdb_regs[_EBP]; + regs->xds = gdb_regs[_DS]; + regs->xes = gdb_regs[_ES]; + regs->eflags = gdb_regs[_PS]; + regs->xcs = gdb_regs[_CS]; + regs->eip = gdb_regs[_PC]; + NEW_esp = gdb_regs[_ESP]; /* keep the value */ +#if 0 /* can't change these */ + regs->esp = gdb_regs[_ESP]; + regs->xss = gdb_regs[_SS]; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif + +} /* gdb_regs_to_regs */ + +int thread_list = 0; + +void +get_gdb_regs(struct task_struct *p, struct pt_regs *regs, int *gdb_regs) +{ + unsigned long stack_page; + int count = 0; + IF_SMP(int i); + if (!p || p == current) { + regs_to_gdb_regs(gdb_regs, regs); + return; + } +#ifdef CONFIG_SMP + for (i = 0; i < MAX_NO_CPUS; i++) { + if (p == kgdb_info.cpus_waiting[i].task) { + regs_to_gdb_regs(gdb_regs, + kgdb_info.cpus_waiting[i].regs); + gdb_regs[_ESP] = + (int) &kgdb_info.cpus_waiting[i].regs->esp; + + return; + } + } +#endif + memset(gdb_regs, 0, NUMREGBYTES); + gdb_regs[_ESP] = p->thread.esp; + gdb_regs[_PC] = p->thread.eip; + gdb_regs[_EBP] = *(int *) gdb_regs[_ESP]; + gdb_regs[_EDI] = *(int *) (gdb_regs[_ESP] + 4); + gdb_regs[_ESI] = *(int *) (gdb_regs[_ESP] + 8); + +/* + * This code is to give a more informative notion of where a process + * is waiting. It is used only when the user asks for a thread info + * list. If he then switches to the thread, s/he will find the task + * is in schedule, but a back trace should show the same info we come + * up with. This code was shamelessly purloined from process.c. It was + * then enhanced to provide more registers than simply the program + * counter. + */ + + if (!thread_list) { + return; + } + + if (p->state == TASK_RUNNING) + return; + stack_page = (unsigned long) p->thread_info; + if (gdb_regs[_ESP] < stack_page || gdb_regs[_ESP] > + THREAD_SIZE - sizeof(long) + stack_page) + return; + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ + do { + if (gdb_regs[_EBP] < stack_page || + gdb_regs[_EBP] > THREAD_SIZE - 2*sizeof(long) + stack_page) + return; + gdb_regs[_PC] = *(unsigned long *) (gdb_regs[_EBP] + 4); + gdb_regs[_ESP] = gdb_regs[_EBP] + 8; + gdb_regs[_EBP] = *(unsigned long *) gdb_regs[_EBP]; + if (!in_sched_functions(gdb_regs[_PC])) + return; + } while (count++ < 16); + return; +} + +/* Indicate to caller of mem2hex or hex2mem that there has been an + error. 
*/ +static volatile int mem_err = 0; +static volatile int mem_err_expected = 0; +static volatile int mem_err_cnt = 0; +static int garbage_loc = -1; + +int +get_char(char *addr) +{ + return *addr; +} + +void +set_char(char *addr, int val, int may_fault) +{ + /* + * This code traps references to the area mapped to the kernel + * stack as given by the regs and, instead, stores to the + * fn_call_lookaside[cpu].array + */ + if (may_fault && + (unsigned int) addr < OLD_esp && + ((unsigned int) addr > (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) { + addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr); + } + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set mem_err in response to + a fault; if zero treat a fault like any other fault in the stub. */ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + /* printk("%lx = ", mem) ; */ + + ch = get_char(mem++); + + /* printk("%02x\n", ch & 0xFF) ; */ + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault fetching from addr %lx\n", + (long) (mem - 1)); + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + mem_err_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +/* NOTE: We use the may fault flag to also indicate if the write is to + * the registers (0) or "other" memory (!=0) + */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch, may_fault); + + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault storing to addr %lx\n", + (long) (mem - 1)); + return (mem); + } + } + if (may_fault) + mem_err_expected = 0; + return (mem); +} + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToInt(char **ptr, int *intValue) +{ + int numChars = 0; + int hexValue; + + *intValue = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *intValue = (*intValue << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#define stubhex(h) hex(h) +#ifdef old_thread_list + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +#ifdef old_thread_list +static char * +unpack_byte(char *buf, int 
*value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} +int +int_to_hex_v(unsigned char * id, int value) +{ + unsigned char *start = id; + int shift; + int ch; + + for (shift = 28; shift >= 0; shift -= 4) { + if ((ch = (value >> shift) & 0xf) || (id != start)) { + *id = hexchars[ch]; + id++; + } + } + if (id == start) + *id++ = '0'; + return id - start; +} +#ifdef old_thread_list + +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} +#endif +static int +cmp_str(char *s1, char *s2, int count) +{ + while (count--) { + if (*s1++ != *s2++) + return 0; + } + return 1; +} + +#if 1 /* this is a hold over from 2.4 where O(1) was "sometimes" */ +extern struct task_struct *kgdb_get_idle(int cpu); +#define idle_task(cpu) kgdb_get_idle(cpu) +#else +#define idle_task(cpu) init_tasks[cpu] +#endif + +extern int kgdb_pid_init_done; + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + if (pid >= PID_MAX && pid <= (PID_MAX + MAX_NO_CPUS)) { + + return idle_task(pid - PID_MAX); + } else { + /* + * find_task_by_pid is relatively safe all the time + * Other pid functions require lock downs which imply + * that we may be interrupting them (as we get here + * in the middle of most any lock down). + * Still we don't want to call until the table exists! 
+ */ + if (kgdb_pid_init_done){ + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } + } + } + return NULL; +} +/* *INDENT-OFF* */ +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned addr; +} breakinfo[4] = { {enabled:0}, + {enabled:0}, + {enabled:0}, + {enabled:0}}; +/* *INDENT-ON* */ +unsigned hw_breakpoint_status; +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned dr7; + + asm volatile ("movl %%db7, %0\n":"=r" (dr7) + :); + /* *INDENT-OFF* */ + do { + unsigned addr0, addr1, addr2, addr3; + asm volatile ("movl %%db0, %0\n" + "movl %%db1, %1\n" + "movl %%db2, %2\n" + "movl %%db3, %3\n" + :"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3) + :); + } while (0); + /* *INDENT-ON* */ + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movl %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movl %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movl %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movl %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movl %0, %%db7\n"::"r" (dr7)); + } +} + +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +#ifdef CONFIG_SMP +static int in_kgdb_console = 0; + +int +in_kgdb(struct pt_regs *regs) +{ + unsigned flags; + int cpu = smp_processor_id(); + in_kgdb_called = 1; + if (!spin_is_locked(&kgdb_spinlock)) { + if (in_kgdb_here_log[cpu] || /* we are holding this cpu */ + in_kgdb_console) { /* or we are doing slow i/o */ + return 1; + } + return 0; + } + + /* As I see it the only reason not to let all cpus spin on + * the same spin_lock is to allow selected ones to proceed. + * This would be a good thing, so we leave it this way. + * Maybe someday.... Done ! + + * in_kgdb() is called from an NMI so we don't pretend + * to have any resources, like printk() for example. + */ + + kgdb_local_irq_save(flags); /* only local here, to avoid hanging */ + /* + * log arival of this cpu + * The NMI keeps on ticking. Protect against recurring more + * than once, and ignor the cpu that has the kgdb lock + */ + in_kgdb_entry_log[cpu]++; + in_kgdb_here_log[cpu] = regs; + if (cpu == spinlock_cpu || waiting_cpus[cpu].task) + goto exit_in_kgdb; + + /* + * For protection of the initilization of the spin locks by kgdb + * it locks the kgdb spinlock before it gets the wait locks set + * up. We wait here for the wait lock to be taken. If the + * kgdb lock goes away first?? 
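/*
 * A user-space sketch of the DR7 bit arithmetic that
 * correct_hw_break() performs above, pulled out of the inline asm:
 * slot n owns enable bit (2n + 1) and a four-bit R/W + LEN field
 * starting at bit 16 + 4n.  The encodings in the comment follow the
 * IA-32 debug-register layout; the example values are illustrative.
 */
#include <stdio.h>

/* type: 0 = execute, 1 = write, 3 = read/write
 * len:  0 = 1 byte, 1 = 2 bytes, 3 = 4 bytes (field encoding) */
static unsigned int dr7_bits(unsigned int breakno, unsigned int type,
                             unsigned int len)
{
        unsigned int dr7 = 0;

        dr7 |= 2 << (breakno << 1);     /* enable bit for this slot */
        dr7 |= (((len << 2) | type) << 16) << (breakno << 2);
        return dr7;
}

int main(void)
{
        /* 4-byte write watchpoint in slot 1 */
        printf("dr7 = %#x\n", dr7_bits(1, 1, 3));       /* 0xd00008 */
        return 0;
}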
Well, it could be a slow exit + * sequence where the wait lock is removed prior to the kgdb lock + * so if kgdb gets unlocked, we just exit. + */ + + while (spin_is_locked(&kgdb_spinlock) && + !spin_is_locked(waitlocks + cpu)) ; + if (!spin_is_locked(&kgdb_spinlock)) + goto exit_in_kgdb; + + waiting_cpus[cpu].task = current; + waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu); + waiting_cpus[cpu].regs = regs; + + spin_unlock_wait(waitlocks + cpu); + + /* + * log departure of this cpu + */ + waiting_cpus[cpu].task = 0; + waiting_cpus[cpu].pid = 0; + waiting_cpus[cpu].regs = 0; + correct_hw_break(); + exit_in_kgdb: + in_kgdb_here_log[cpu] = 0; + kgdb_local_irq_restore(flags); + return 1; + /* + spin_unlock(continuelocks + smp_processor_id()); + */ +} + +void +smp__in_kgdb(struct pt_regs regs) +{ + ack_APIC_irq(); + in_kgdb(®s); +} +#else +int +in_kgdb(struct pt_regs *regs) +{ + return (kgdb_spinlock); +} +#endif + +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + return; +} + +/* + * This function does all command procesing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. + * + * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +kgdb_handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + struct task_struct *thread_list_start = 0, *thread = NULL; + int addr, length; + int breakno, breaktype; + char *ptr; + int newPC; + threadref thref; + int threadid; + int thread_min = PID_MAX + MAX_NO_CPUS; +#ifdef old_thread_list + int maxthreads; +#endif + int nothreads; + unsigned long flags; + int gdb_regs[NUMREGBYTES / 4]; + int dr6; + IF_SMP(int entry_state = 0); /* 0, ok, 1, no nmi, 2 sync failed */ +#define NO_NMI 1 +#define NO_SYNC 2 +#define regs (*linux_regs) +#define NUMREGS NUMREGBYTES/4 + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { + printk("ignoring non-kernel exception\n"); + print_regs(®s); + return (0); + } + /* + * If we're using eth mode, set the 'mode' in the netdevice. + */ + + if (kgdboe) + netpoll_set_trap(1); + + kgdb_local_irq_save(flags); + + /* Get kgdb spinlock */ + + KGDB_SPIN_LOCK(&kgdb_spinlock); + rdtscll(kgdb_info.entry_tsc); + /* + * We depend on this spinlock and the NMI watch dog to control the + * other cpus. 
They will arrive at "in_kgdb()" as a result of the + * NMI and will wait there for the following spin locks to be + * released. + */ +#ifdef CONFIG_SMP + +#if 0 + if (cpu_callout_map & ~MAX_CPU_MASK) { + printk("kgdb : too many cpus, possibly not mapped" + " in contiguous space, change MAX_NO_CPUS" + " in kgdb_stub and make new kernel.\n" + " cpu_callout_map is %lx\n", cpu_callout_map); + goto exit_just_unlock; + } +#endif + if (spinlock_count == 1) { + int time = 0, end_time, dum = 0; + int i; + int cpu_logged_in[MAX_NO_CPUS] = {[0 ... MAX_NO_CPUS - 1] = (0) + }; + if (remote_debug) { + printk("kgdb : cpu %d entry, syncing others\n", + smp_processor_id()); + } + for (i = 0; i < MAX_NO_CPUS; i++) { + /* + * Use trylock as we may already hold the lock if + * we are holding the cpu. Net result is all + * locked. + */ + spin_trylock(&waitlocks[i]); + } + for (i = 0; i < MAX_NO_CPUS; i++) + cpu_logged_in[i] = 0; + /* + * Wait for their arrival. We know the watch dog is active if + * in_kgdb() has ever been called, as it is always called on a + * watchdog tick. + */ + rdtsc(dum, time); + end_time = time + 2; /* Note: we use the High order bits! */ + i = 1; + if (num_online_cpus() > 1) { + int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()]; + smp_send_nmi_allbutself(); + + while (i < num_online_cpus() && time != end_time) { + int j; + for (j = 0; j < MAX_NO_CPUS; j++) { + if (waiting_cpus[j].task && + waiting_cpus[j].task != NOCPU && + !cpu_logged_in[j]) { + i++; + cpu_logged_in[j] = 1; + if (remote_debug) { + printk + ("kgdb : cpu %d arrived at kgdb\n", + j); + } + break; + } else if (!waiting_cpus[j].task && + !cpu_online(j)) { + waiting_cpus[j].task = NOCPU; + cpu_logged_in[j] = 1; + waiting_cpus[j].hold = 1; + break; + } + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + + int wait = 100000; + while (wait--) ; + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + printk + ("kgdb : cpu %d stall" + " in in_kgdb\n", + j); + i++; + cpu_logged_in[j] = 1; + waiting_cpus[j].task = + (struct task_struct + *) 1; + } + } + } + + if (in_kgdb_entry_log[smp_processor_id()] > + (me_in_kgdb + 10)) { + break; + } + + rdtsc(dum, time); + } + if (i < num_online_cpus()) { + printk + ("kgdb : time out, proceeding without sync\n"); +#if 0 + printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n", + waiting_cpus[0].task != 0, + waiting_cpus[1].task != 0); + printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n", + cpu_logged_in[0], cpu_logged_in[1]); + printk + ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n", + in_kgdb_here_log[0] != 0, + in_kgdb_here_log[1] != 0); +#endif + entry_state = NO_SYNC; + } else { +#if 0 + int ent = + in_kgdb_entry_log[smp_processor_id()] - + me_in_kgdb; + printk("kgdb : sync after %d entries\n", ent); +#endif + } + } else { + if (remote_debug) { + printk + ("kgdb : %d cpus, but watchdog not active\n" + "proceeding without locking down other cpus\n", + num_online_cpus()); + entry_state = NO_NMI; + } + } + } +#endif + + if (remote_debug) { + unsigned long *lp = (unsigned long *) &linux_regs; + + printk("handle_exception(exceptionVector=%d, " + "signo=%d, err_code=%d, linux_regs=%p)\n", + exceptionVector, signo, err_code, linux_regs); + if (debug_regs) { + print_regs(®s); + printk("Stk: %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[0], lp[1], lp[2], lp[3], + lp[4], lp[5], lp[6], lp[7]); + printk(" %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[8], lp[9], lp[10], lp[11], + lp[12], lp[13], lp[14], lp[15]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[16], 
lp[17], lp[18], lp[19], + lp[20], lp[21], lp[22], lp[23]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[24], lp[25], lp[26], lp[27], + lp[28], lp[29], lp[30], lp[31]); + } + } + + /* Disable hardware debugging while we are in kgdb */ + /* Get the debug register status register */ +/* *INDENT-OFF* */ + __asm__("movl %0,%%db7" + : /* no output */ + :"r"(0)); + + asm volatile ("movl %%db6, %0\n" + :"=r" (hw_breakpoint_status) + :); + +/* *INDENT-ON* */ + switch (exceptionVector) { + case 0: /* divide error */ + case 1: /* debug exception */ + case 2: /* NMI */ + case 3: /* breakpoint */ + case 4: /* overflow */ + case 5: /* bounds check */ + case 6: /* invalid opcode */ + case 7: /* device not available */ + case 8: /* double fault (errcode) */ + case 10: /* invalid TSS (errcode) */ + case 12: /* stack fault (errcode) */ + case 16: /* floating point error */ + case 17: /* alignment check (errcode) */ + default: /* any undocumented */ + break; + case 11: /* segment not present (errcode) */ + case 13: /* general protection (errcode) */ + case 14: /* page fault (special errcode) */ + case 19: /* cache flush denied */ + if (mem_err_expected) { + /* + * This fault occured because of the + * get_char or set_char routines. These + * two routines use either eax of edx to + * indirectly reference the location in + * memory that they are working with. + * For a page fault, when we return the + * instruction will be retried, so we + * have to make sure that these + * registers point to valid memory. + */ + mem_err = 1; /* set mem error flag */ + mem_err_expected = 0; + mem_err_cnt++; /* helps in debugging */ + /* make valid address */ + regs.eax = (long) &garbage_loc; + /* make valid address */ + regs.edx = (long) &garbage_loc; + if (remote_debug) + printk("Return after memory error: " + "mem_err_cnt=%d\n", mem_err_cnt); + if (debug_regs) + print_regs(®s); + goto exit_kgdb; + } + break; + } + if (remote_debug) + printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id()); + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + kgdb_info.called_from = __builtin_return_address(0); +#ifdef CONFIG_SMP + /* + * OK, we can now communicate, lets tell gdb about the sync. + * but only if we had a problem. + */ + switch (entry_state) { + case NO_NMI: + to_gdb("NMI not active, other cpus not stopped\n"); + break; + case NO_SYNC: + to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n"); + default:; + } + +#endif +/* + * Set up the gdb function call area. + */ + trap_cpu = smp_processor_id(); + OLD_esp = NEW_esp = (int) (&linux_regs->esp); + + IF_SMP(once_again:) + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'd': + remote_debug = !(remote_debug); /* toggle debug flag */ + printk("Remote debug %s\n", + remote_debug ? 
"on" : "off"); + break; + case 'g': /* return the value of the CPU registers */ + get_gdb_regs(usethread, ®s, gdb_regs); + mem2hex((char *) gdb_regs, + remcomOutBuffer, NUMREGBYTES, 0); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], + (char *) gdb_regs, NUMREGBYTES, 0); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + case 'P':{ /* set the value of a single CPU register - + return OK */ + /* + * For some reason, gdb wants to talk about psudo + * registers (greater than 15). These may have + * meaning for ptrace, but for us it is safe to + * ignor them. We do this by dumping them into + * _GS which we also ignor, but do have memory for. + */ + int regno; + + ptr = &remcomInBuffer[1]; + regs_to_gdb_regs(gdb_regs, ®s); + if ((!usethread || usethread == current) && + hexToInt(&ptr, ®no) && + *ptr++ == '=' && (regno >= 0)) { + regno = + (regno >= NUMREGS ? _GS : regno); + hex2mem(ptr, (char *) &gdb_regs[regno], + 4, 0); + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + break; + } + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr) && + (*(ptr++) == ',') && (hexToInt(&ptr, &length))) { + ptr = 0; + /* + * hex doubles the byte count + */ + if (length > (BUFMAX / 2)) + length = BUFMAX / 2; + mem2hex((char *) addr, + remcomOutBuffer, length, 1); + if (mem_err) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + debug_error + ("malformed read memory command: %s\n", + remcomInBuffer); + } + break; + + /* MAA..AA,LLLL: + Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr) && + (*(ptr++) == ',') && + (hexToInt(&ptr, &length)) && (*(ptr++) == ':')) { + hex2mem(ptr, (char *) addr, length, 1); + + if (mem_err) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } else { + strcpy(remcomOutBuffer, "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + debug_error + ("malformed write memory command: %s\n", + remcomInBuffer); + } + break; + case 'S': + remcomInBuffer[0] = 's'; + case 'C': + /* Csig;AA..AA where ;AA..AA is optional + * continue with signal + * Since signals are meaning less to us, delete that + * part and then fall into the 'c' code. + */ + ptr = &remcomInBuffer[1]; + length = 2; + while (*ptr && *ptr != ';') { + length++; + ptr++; + } + if (*ptr) { + do { + ptr++; + *(ptr - length++) = *ptr; + } while (*ptr); + } else { + remcomInBuffer[1] = 0; + } + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + /* D detach, reply OK and then continue */ + case 'c': + case 's': + case 'D': + + /* try to read optional parameter, + pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) { + if (remote_debug) + printk("Changing EIP to 0x%x\n", addr); + + regs.eip = addr; + } + + newPC = regs.eip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 0x100; + + /* detach is a friendly version of continue. 
Note that + debugging is still enabled (e.g hit control C) + */ + if (remcomInBuffer[0] == 'D') { + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + } + + if (remote_debug) { + printk("Resuming execution\n"); + print_regs(®s); + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno) && + (breakinfo[breakno].type == 0)) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + + if (kgdboe) + netpoll_set_trap(0); + + correct_hw_break(); + asm volatile ("movl %0, %%db6\n"::"r" (0)); + goto exit_kgdb; + + /* kill the program */ + case 'k': /* do nothing */ + break; + + /* query */ + case 'q': + nothreads = 0; + switch (remcomInBuffer[1]) { + case 'f': + threadid = 1; + thread_list = 2; + thread_list_start = (usethread ? : current); + case 's': + if (!cmp_str(&remcomInBuffer[2], + "ThreadInfo", 10)) + break; + + remcomOutBuffer[nothreads++] = 'm'; + for (; threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + nothreads += int_to_hex_v( + &remcomOutBuffer[ + nothreads], + threadid); + if (thread_min > threadid) + thread_min = threadid; + remcomOutBuffer[ + nothreads] = ','; + nothreads++; + if (nothreads > BUFMAX - 10) + break; + } + } + if (remcomOutBuffer[nothreads - 1] == 'm') { + remcomOutBuffer[nothreads - 1] = 'l'; + } else { + nothreads--; + } + remcomOutBuffer[nothreads] = 0; + break; + +#ifdef old_thread_list /* Old thread info request */ + case 'L': + /* List threads */ + thread_list = 2; + thread_list_start = (usethread ? : current); + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + do { + int buf_thread_limit = + (BUFMAX - 22) / BUF_THREAD_ID_SIZE; + if (maxthreads > buf_thread_limit) { + maxthreads = buf_thread_limit; + } + } while (0); + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads && + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + if (thread_min > threadid) + thread_min = threadid; + } + } + + if (threadid == PID_MAX + MAX_NO_CPUS) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; +#endif + case 'C': + /* Current thread id */ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + if (!threadid) { + /* + * idle thread + */ + for (threadid = PID_MAX; + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + if (current == + idle_task(threadid - + PID_MAX)) + break; + } + } + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, + err_code, remcomOutBuffer); + break; + case 'T':{ + char * nptr; + /* Thread extra info */ + if (!cmp_str(&remcomInBuffer[2], + "hreadExtraInfo,", 15)) { + break; + } + ptr = &remcomInBuffer[17]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + nptr = &thread->comm[0]; + length = 0; + ptr = &remcomOutBuffer[0]; + do { + length++; + ptr = pack_hex_byte(ptr, *nptr++); + } while (*nptr && length < 16); + /* + * would like that 
16 to be the size of + * task_struct.comm but don't know the + * syntax.. + */ + *ptr = 0; + } + } + break; + + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + case 'g': + ptr = &remcomInBuffer[2]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + /* + * Just in case I forget what this is all about, + * the "thread info" command to gdb causes it + * to ask for a thread list. It then switches + * to each thread and asks for the registers. + * For this (and only this) usage, we want to + * fudge the registers of tasks not on the run + * list (i.e. waiting) to show the routine that + * called schedule. Also, gdb, is a minimalist + * in that if the current thread is the last + * it will not re-read the info when done. + * This means that in this case we must show + * the real registers. So here is how we do it: + * Each entry we keep track of the min + * thread in the list (the last that gdb will) + * get info for. We also keep track of the + * starting thread. + * "thread_list" is cleared when switching back + * to the min thread if it is was current, or + * if it was not current, thread_list is set + * to 1. When the switch to current comes, + * if thread_list is 1, clear it, else do + * nothing. + */ + usethread = thread; + if ((thread_list == 1) && + (thread == thread_list_start)) { + thread_list = 0; + } + if (thread_list && (threadid == thread_min)) { + if (thread == thread_list_start) { + thread_list = 0; + } else { + thread_list = 1; + } + } + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + if (thread_min > threadid) + thread_min = threadid; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; + + case 'Y': /* set up a hardware breakpoint */ + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + ptr++; + hexToInt(&ptr, &breaktype); + ptr++; + hexToInt(&ptr, &length); + ptr++; + hexToInt(&ptr, &addr); + if (set_hw_break(breakno & 0x3, + breaktype & 0x3, + length & 0x3, addr) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + case 'r': /* reboot */ + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + /*to_gdb("Rebooting\n"); */ + /* triplefault no return from here */ + { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt[0])); + BREAKPOINT; + } + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + } /* while(1==1) */ + /* + * reached by goto only. + */ + exit_kgdb: + /* + * Here is where we set up to trap a gdb function call. NEW_esp + * will be changed if we are trying to do this. We handle both + * adding and subtracting, thus allowing gdb to put grung on + * the stack which it removes later. 
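/*
 * A user-space sketch of how the 'm' (read memory) packet is parsed
 * earlier in the command loop above: consume hex digits for the
 * address, demand a comma, consume digits for the length.  hexval()
 * and hex_to_int() stand in for the stub's hex() and hexToInt(); the
 * sample packet is illustrative.
 */
#include <stdio.h>

static int hexval(char ch)
{
        if (ch >= '0' && ch <= '9')
                return ch - '0';
        if (ch >= 'a' && ch <= 'f')
                return ch - 'a' + 10;
        if (ch >= 'A' && ch <= 'F')
                return ch - 'A' + 10;
        return -1;
}

static int hex_to_int(char **ptr, unsigned int *val)
{
        int n = 0, v;

        *val = 0;
        while ((v = hexval(**ptr)) >= 0) {      /* stops at ',' or NUL */
                *val = (*val << 4) | v;
                (*ptr)++;
                n++;
        }
        return n;       /* digits consumed; 0 means malformed */
}

int main(void)
{
        char packet[] = "mc0100000,40"; /* $...#cs framing already stripped */
        char *p = &packet[1];
        unsigned int addr, len;

        if (hex_to_int(&p, &addr) && *p++ == ',' && hex_to_int(&p, &len))
                printf("read %#x bytes at %#x\n", len, addr);
        else
                printf("E01 malformed\n");
        return 0;
}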
+ */ + if (NEW_esp != OLD_esp) { + int *ptr = END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) + ptr -= (OLD_esp - NEW_esp) / sizeof (int); + *--ptr = linux_regs->eflags; + *--ptr = linux_regs->xcs; + *--ptr = linux_regs->eip; + *--ptr = linux_regs->ecx; + *--ptr = linux_regs->ebx; + *--ptr = linux_regs->eax; + linux_regs->ecx = NEW_esp - (sizeof (int) * 6); + linux_regs->ebx = (unsigned int) END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) { + linux_regs->eip = (unsigned int) fn_call_stub; + } else { + linux_regs->eip = (unsigned int) fn_rtn_stub; + linux_regs->eax = NEW_esp; + } + linux_regs->eflags &= ~(IF_BIT | TF_BIT); + } +#ifdef CONFIG_SMP + /* + * Release gdb wait locks + * Sanity check time. Must have at least one cpu to run. Also single + * step must not be done if the current cpu is on hold. + */ + if (spinlock_count == 1) { + int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep; + int cpu_avail = 0; + int i; + + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!cpu_online(i)) + break; + if (!hold_cpu(i)) { + cpu_avail = 1; + } + } + /* + * Early in the bring up there will be NO cpus on line... + */ + if (!cpu_avail && !cpus_empty(cpu_online_map)) { + to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n"); + goto once_again; + } + if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) { + to_gdb + ("Current cpu must be unblocked to single step\n"); + goto once_again; + } + if (!(ss_hold)) { + int i; + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!hold_cpu(i)) { + spin_unlock(&waitlocks[i]); + } + } + } else { + spin_unlock(&waitlocks[smp_processor_id()]); + } + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + /* + * If this cpu is on hold, this is where we + * do it. Note, the NMI will pull us out of here, + * but will return as the above lock is not held. + * We will stay here till another cpu releases the lock for us. + */ + spin_unlock_wait(waitlocks + smp_processor_id()); + kgdb_local_irq_restore(flags); + return (0); + } +#if 0 +exit_just_unlock: +#endif +#endif + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + kgdb_local_irq_restore(flags); + return (0); +} + +/* this function is used to set up exception handlers for tracing and + * breakpoints. + * This function is not needed as the above line does all that is needed. + * We leave it for backward compatitability... + */ +void +set_debug_traps(void) +{ + /* + * linux_debug_hook is defined in traps.c. We store a pointer + * to our own exception handler into it. + + * But really folks, every hear of labeled common, an old Fortran + * concept. Lots of folks can reference it and it is define if + * anyone does. Only one can initialize it at link time. We do + * this with the hook. See the statement above. No need for any + * executable code and it is ready as soon as the kernel is + * loaded. Very desirable in kernel debugging. + + linux_debug_hook = handle_exception ; + */ + + /* In case GDB is started before us, ack any packets (presumably + "$?#xx") sitting there. + putDebugChar ('+'); + + initialized = 1; + */ +} + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. */ +/* But really, just use the BREAKPOINT macro. We will handle the int stuff + */ + +#ifdef later +/* + * possibly we should not go thru the traps.c code at all? Someday. 
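/*
 * A user-space sketch of the lookaside trick above: instead of
 * growing the real kernel stack for a gdb-initiated function call,
 * the exit path builds the resume frame at the end of a per-cpu side
 * array, in the same push order exit_kgdb uses.  The array size and
 * register values here are illustrative stand-ins.
 */
#include <stdio.h>

#define LOOKASIDE_SIZE 8

int main(void)
{
        unsigned int lookaside[LOOKASIDE_SIZE];
        unsigned int *ptr = &lookaside[LOOKASIDE_SIZE]; /* END_OF_LOOKASIDE */

        *--ptr = 0x202;                 /* eflags */
        *--ptr = 0x10;                  /* xcs */
        *--ptr = 0xc0100000;            /* eip */
        *--ptr = 0;                     /* ecx */
        *--ptr = 0;                     /* ebx */
        *--ptr = 0;                     /* eax */
        printf("frame uses %d of %d slots\n",
               (int) (&lookaside[LOOKASIDE_SIZE] - ptr), LOOKASIDE_SIZE);
        return 0;
}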
+ */ +void +do_kgdb_int3(struct pt_regs *regs, long error_code) +{ + kgdb_handle_exception(3, 5, error_code, regs); + return; +} +#endif +#undef regs +#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS +asmlinkage void +bad_sys_call_exit(int stuff) +{ + struct pt_regs *regs = (struct pt_regs *) &stuff; + printk("Sys call %d return with %x preempt_count\n", + (int) regs->orig_eax, preempt_count()); +} +#endif +#ifdef CONFIG_STACK_OVERFLOW_TEST +#include +asmlinkage void +stack_overflow(void) +{ +#ifdef BREAKPOINT + BREAKPOINT; +#else + printk("Kernel stack overflow, looping forever\n"); +#endif + while (1) { + } +} +#endif + +#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE) +char gdbconbuf[BUFMAX]; + +static void +kgdb_gdb_message(const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + /* + * This takes care of NMI while spining out chars to gdb + */ + IF_SMP(in_kgdb_console = 1); + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } + IF_SMP(in_kgdb_console = 0); +} +#endif +#ifdef CONFIG_SMP +static void +to_gdb(const char *s) +{ + int count = 0; + while (s[count] && (count++ < BUFMAX)) ; + kgdb_gdb_message(s, count); +} +#endif +#ifdef CONFIG_KGDB_CONSOLE +#include +#include +#include +#include +#include + +void +kgdb_console_write(struct console *co, const char *s, unsigned count) +{ + + if (gdb_i386vector == -1) { + /* + * We have not yet talked to gdb. What to do... + * lets break, on continue we can do the write. + * But first tell him whats up. Uh, well no can do, + * as this IS the console. Oh well... + * We do need to wait or the messages will be lost. + * Other option would be to tell the above code to + * ignore this breakpoint and do an auto return, + * but that might confuse gdb. Also this happens + * early enough in boot up that we don't have the traps + * set up yet, so... + */ + breakpoint(); + } + kgdb_gdb_message(s, count); +} + +/* + * ------------------------------------------------------------ + * Serial KGDB driver + * ------------------------------------------------------------ + */ + +static struct console kgdbcons = { + name:"kgdb", + write:kgdb_console_write, +#ifdef CONFIG_KGDB_USER_CONSOLE + device:kgdb_console_device, +#endif + flags:CON_PRINTBUFFER | CON_ENABLED, + index:-1, +}; + +/* + * The trick here is that this file gets linked before printk.o + * That means we get to peer at the console info in the command + * line before it does. If we are up, we register, otherwise, + * do nothing. By returning 0, we allow printk to look also. + */ +static int kgdb_console_enabled; + +int __init +kgdb_console_init(char *str) +{ + if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) { + register_console(&kgdbcons); + kgdb_console_enabled = 1; + } + return 0; /* let others look at the string */ +} + +__setup("console=", kgdb_console_init); + +#ifdef CONFIG_KGDB_USER_CONSOLE +static kdev_t kgdb_console_device(struct console *c); +/* This stuff sort of works, but it knocks out telnet devices + * we are leaving it here in case we (or you) find time to figure it out + * better.. + */ + +/* + * We need a real char device as well for when the console is opened for user + * space activities. 
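/*
 * A user-space sketch of the console pass-through built by
 * kgdb_gdb_message() above: printk text is wrapped in a gdb 'O'
 * packet payload, one hex pair per character.  hexchars matches the
 * stub; the message and buffer size are illustrative.
 */
#include <stdio.h>

static const char hexchars[] = "0123456789abcdef";

int main(void)
{
        const char *msg = "ok\n";
        char pkt[32];
        int n = 0;

        pkt[n++] = 'O';
        while (*msg) {
                pkt[n++] = hexchars[(*msg >> 4) & 0xf];
                pkt[n++] = hexchars[*msg & 0xf];
                msg++;
        }
        pkt[n] = 0;
        printf("%s\n", pkt);    /* prints O6f6b0a */
        return 0;
}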
+ */ + +static int +kgdb_consdev_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static ssize_t +kgdb_consdev_write(struct file *file, const char *buf, + size_t count, loff_t * ppos) +{ + int size, ret = 0; + static char kbuf[128]; + static DECLARE_MUTEX(sem); + + /* We are not reentrant... */ + if (down_interruptible(&sem)) + return -ERESTARTSYS; + + while (count > 0) { + /* need to copy the data from user space */ + size = count; + if (size > sizeof (kbuf)) + size = sizeof (kbuf); + if (copy_from_user(kbuf, buf, size)) { + ret = -EFAULT; + break;; + } + kgdb_console_write(&kgdbcons, kbuf, size); + count -= size; + ret += size; + buf += size; + } + + up(&sem); + + return ret; +} + +struct file_operations kgdb_consdev_fops = { + open:kgdb_consdev_open, + write:kgdb_consdev_write +}; +static kdev_t +kgdb_console_device(struct console *c) +{ + return MKDEV(TTYAUX_MAJOR, 1); +} + +/* + * This routine gets called from the serial stub in the i386/lib + * This is so it is done late in bring up (just before the console open). + */ +void +kgdb_console_finit(void) +{ + if (kgdb_console_enabled) { + char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1)); + char *cp = cptr; + while (*cptr && *cptr != '(') + cptr++; + *cptr = 0; + unregister_chrdev(TTYAUX_MAJOR, cp); + register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops); + } +} +#endif +#endif +#ifdef CONFIG_KGDB_TS +#include /* time stamp code */ +#include /* in_interrupt */ +#ifdef CONFIG_KGDB_TS_64 +#define DATA_POINTS 64 +#endif +#ifdef CONFIG_KGDB_TS_128 +#define DATA_POINTS 128 +#endif +#ifdef CONFIG_KGDB_TS_256 +#define DATA_POINTS 256 +#endif +#ifdef CONFIG_KGDB_TS_512 +#define DATA_POINTS 512 +#endif +#ifdef CONFIG_KGDB_TS_1024 +#define DATA_POINTS 1024 +#endif +#ifndef DATA_POINTS +#define DATA_POINTS 128 /* must be a power of two */ +#endif +#define INDEX_MASK (DATA_POINTS - 1) +#if (INDEX_MASK & DATA_POINTS) +#error "CONFIG_KGDB_TS_COUNT must be a power of 2" +#endif +struct kgdb_and_then_struct { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + int data0; + int data1; +}; +struct kgdb_and_then_struct2 { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + struct task_struct *t1; + struct task_struct *t2; +}; +struct kgdb_and_then_struct kgdb_data[DATA_POINTS]; + +struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0]; +int kgdb_and_then_count; + +void +kgdb_tstamp(int line, char *source, int data0, int data1) +{ + static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED; + int flags; + kgdb_local_irq_save(flags); + spin_lock(&ts_spin); + rdtscll(kgdb_and_then->at_time); +#ifdef CONFIG_SMP + kgdb_and_then->on_cpu = smp_processor_id(); +#endif + kgdb_and_then->task = current; + kgdb_and_then->from_ln = line; + kgdb_and_then->in_src = source; + kgdb_and_then->from = __builtin_return_address(0); + kgdb_and_then->with_shpf = (int *) (((flags & IF_BIT) >> 9) | + (preempt_count() << 8)); + kgdb_and_then->data0 = data0; + kgdb_and_then->data1 = data1; + kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK]; + spin_unlock(&ts_spin); + kgdb_local_irq_restore(flags); +#ifdef CONFIG_PREEMPT + +#endif + return; +} +#endif +typedef int gdb_debug_hook(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs); +gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception; /* histerical reasons... 
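/*
 * A user-space sketch of the ring indexing the time-stamp code above
 * depends on: with a power-of-two DATA_POINTS, masking a free-running
 * counter with (DATA_POINTS - 1) wraps it, so the newest entries
 * silently overwrite the oldest.  The sizes here are shrunk for
 * illustration; the compile-time check mirrors the one in the stub.
 */
#include <stdio.h>

#define DATA_POINTS 8                   /* must be a power of two */
#define INDEX_MASK (DATA_POINTS - 1)
#if (INDEX_MASK & DATA_POINTS)
#error "DATA_POINTS must be a power of 2"
#endif

int main(void)
{
        int slot[DATA_POINTS], count = 0, i;

        for (i = 0; i < 11; i++)        /* log 11 events into 8 slots */
                slot[count++ & INDEX_MASK] = i;
        for (i = 0; i < DATA_POINTS; i++)
                printf("%d ", slot[i]); /* 8 9 10 3 4 5 6 7 */
        printf("\n");
        return 0;
}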
*/ + +static int kgdb_need_breakpoint[NR_CPUS]; + +void kgdb_schedule_breakpoint(void) +{ + kgdb_need_breakpoint[smp_processor_id()] = 1; +} + +void kgdb_process_breakpoint(void) +{ + /* + * Handle a breakpoint queued from inside network driver code + * to avoid reentrancy issues + */ + if (kgdb_need_breakpoint[smp_processor_id()]) { + kgdb_need_breakpoint[smp_processor_id()] = 0; + BREAKPOINT; + } +} + --- diff/arch/i386/lib/kgdb_serial.c 1970-01-01 01:00:00.000000000 +0100 +++ source/arch/i386/lib/kgdb_serial.c 2004-04-21 10:46:51.171802816 +0100 @@ -0,0 +1,499 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * Modified to allow invokation early in boot see also + * kgdb.h for instructions by George Anzinger(george@mvista.com) + * Modified to handle debugging over ethernet by Robert Walsh + * and wangdi , based on + * code by San Mehat. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_KGDB_USER_CONSOLE +extern void kgdb_console_finit(void); +#endif +#define PRNT_off +#define TEST_EXISTANCE +#ifdef PRNT +#define dbprintk(s) printk s +#else +#define dbprintk(s) +#endif +#define TEST_INTERRUPT_off +#ifdef TEST_INTERRUPT +#define intprintk(s) printk s +#else +#define intprintk(s) +#endif + +#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT) + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +struct async_struct *gdb_async_info; +static int gdb_async_irq; + +#define outb_px(a,b) outb_p(b,a) + +static void program_uart(struct async_struct *info); +static void write_char(struct async_struct *info, int chr); +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(struct async_struct *info) +{ + char it = inb_p(info->port + UART_LSR); + + if (it & UART_LSR_DR) + return (inb_p(info->port + UART_RX)); + /* + * If we have a framing error assume somebody messed with + * our uart. Reprogram it and send '-' both ways... + */ + if (it & 0xc) { + program_uart(info); + write_char(info, '-'); + return ('-'); + } + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + + * Locking here is a bit of a problem. We MUST not lock out communication + * if we are trying to talk to gdb about a kgdb entry. ON the other hand + * we can loose chars in the console pass thru if we don't lock. It is also + * possible that we could hold the lock or be waiting for it when kgdb + * NEEDS to talk. Since kgdb locks down the world, it does not need locks. + * We do, of course have possible issues with interrupting a uart operation, + * but we will just depend on the uart status to help keep that straight. 
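/*
 * A user-space sketch of the byte queue declared above (gdb_buf with
 * gdb_buf_in_inx / gdb_buf_out_inx / gdb_buf_in_cnt): the interrupt
 * handler produces, read_char() consumes, and the power-of-two
 * GDB_BUF_SIZE lets both indexes wrap with a mask.  The queue is
 * shrunk and the atomic_t count is a plain int here.
 */
#include <stdio.h>

#define GDB_BUF_SIZE 8                  /* power of 2, please */

static char gdb_buf[GDB_BUF_SIZE];
static int in_inx, out_inx, in_cnt;

static void produce(char c)             /* the gdb_interrupt() side */
{
        gdb_buf[in_inx++] = c;
        in_inx &= GDB_BUF_SIZE - 1;
        in_cnt++;
}

static int consume(void)                /* the read_char() side */
{
        char c;

        if (!in_cnt)
                return -1;              /* caller falls back to the UART */
        c = gdb_buf[out_inx++];
        out_inx &= GDB_BUF_SIZE - 1;
        in_cnt--;
        return c;
}

int main(void)
{
        int a, b, c;

        produce('+');
        produce('$');
        a = consume();
        b = consume();
        c = consume();
        printf("%c %c %d\n", a, b, c);  /* prints "+ $ -1" */
        return 0;
}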
+ + */ +static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +#endif + +static int +read_char(struct async_struct *info) +{ + int chr; + unsigned long flags; + local_irq_save(flags); +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_lock(&uart_interrupt_lock); + } +#endif + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + } else { + chr = read_data_bfr(info); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_unlock(&uart_interrupt_lock); + } +#endif + local_irq_restore(flags); + return (chr); +} + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(struct async_struct *info, int chr) +{ + while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ; + + outb_p(chr, info->port + UART_TX); + +} /* write_char */ + +/* + * Mostly we don't need a spinlock, but since the console goes + * thru here with interrutps on, well, we need to catch those + * chars. + */ +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine tty_getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via tty_putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static irqreturn_t +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct async_struct *info; + unsigned long flags; + + info = gdb_async_info; + if (!info || !info->tty || irq != gdb_async_irq) + return IRQ_NONE; + + local_irq_save(flags); + spin_lock(&uart_interrupt_lock); + do { + int chr = read_data_bfr(info); + intprintk(("Debug char on int: %x hex\n", chr)); + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + BREAKPOINT; + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { + /* buffer overflow tosses early char */ + read_char(info); + } + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI); + spin_unlock(&uart_interrupt_lock); + local_irq_restore(flags); + return IRQ_HANDLED; +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. + */ +void +gdb_null(void) +{ +} /* gdb_null */ + +/* These structure are filled in with values defined in asm/kgdb_local.h + */ +static struct serial_state state = SB_STATE; +static struct async_struct local_info = SB_INFO; +static int ok_to_enable_ints = 0; +static void kgdb_enable_ints_now(void); + +extern char *kgdb_version; +/* + * Hook an IRQ for KGDB. + * + * This routine is called from tty_putDebugChar, below. 
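/*
 * Ctrl-C in the interrupt handler above trips BREAKPOINT on the spot;
 * code that must not re-enter the trap path (the network-driver case
 * served by kgdb_schedule_breakpoint() and kgdb_process_breakpoint()
 * in the stub) instead raises a flag and lets a safe point fire the
 * trap later.  A user-space sketch of that defer-then-fire pattern,
 * with printf standing in for BREAKPOINT:
 */
#include <stdio.h>

static int need_breakpoint;             /* a per-cpu array in the stub */

static void schedule_breakpoint(void)
{
        need_breakpoint = 1;            /* cheap and reentrancy-safe */
}

static void process_breakpoint(void)
{
        if (need_breakpoint) {
                need_breakpoint = 0;
                printf("BREAKPOINT fires here\n");
        }
}

int main(void)
{
        schedule_breakpoint();
        process_breakpoint();           /* fires once */
        process_breakpoint();           /* now a no-op */
        return 0;
}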
+ */ +static int ints_disabled = 1; +int +gdb_hook_interrupt(struct async_struct *info, int verb) +{ + struct serial_state *state = info->state; + unsigned long flags; + int port; +#ifdef TEST_EXISTANCE + int scratch, scratch2; +#endif + + /* The above fails if memory managment is not set up yet. + * Rather than fail the set up, just keep track of the fact + * and pick up the interrupt thing later. + */ + gdb_async_info = info; + port = gdb_async_info->port; + gdb_async_irq = state->irq; + if (verb) { + printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n", + kgdb_version, + port, + gdb_async_irq, gdb_async_info->state->custom_divisor); + } + local_irq_save(flags); +#ifdef TEST_EXISTANCE + /* Existance test */ + /* Should not need all this, but just in case.... */ + + scratch = inb_p(port + UART_IER); + outb_px(port + UART_IER, 0); + outb_px(0xff, 0x080); + scratch2 = inb_p(port + UART_IER); + outb_px(port + UART_IER, scratch); + if (scratch2) { + printk + ("gdb_hook_interrupt: Could not clear IER, not a UART!\n"); + local_irq_restore(flags); + return 1; /* We failed; there's nothing here */ + } + scratch2 = inb_p(port + UART_LCR); + outb_px(port + UART_LCR, 0xBF); /* set up for StarTech test */ + outb_px(port + UART_EFR, 0); /* EFR is the same as FCR */ + outb_px(port + UART_LCR, 0); + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO); + scratch = inb_p(port + UART_IIR) >> 6; + if (scratch == 1) { + printk("gdb_hook_interrupt: Undefined UART type!" + " Not a UART! \n"); + local_irq_restore(flags); + return 1; + } else { + dbprintk(("gdb_hook_interrupt: UART type " + "is %d where 0=16450, 2=16550 3=16550A\n", scratch)); + } + scratch = inb_p(port + UART_MCR); + outb_px(port + UART_MCR, UART_MCR_LOOP | scratch); + outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A); + scratch2 = inb_p(port + UART_MSR) & 0xF0; + outb_px(port + UART_MCR, scratch); + if (scratch2 != 0x90) { + printk("gdb_hook_interrupt: " + "Loop back test failed! Not a UART!\n"); + local_irq_restore(flags); + return scratch2 + 1000; /* force 0 to fail */ + } +#endif /* test existance */ + program_uart(info); + local_irq_restore(flags); + + return (0); + +} /* gdb_hook_interrupt */ + +static void +program_uart(struct async_struct *info) +{ + int port = info->port; + + (void) inb_p(port + UART_RX); + outb_px(port + UART_IER, 0); + + (void) inb_p(port + UART_RX); /* serial driver comments say */ + (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */ + (void) inb_p(port + UART_MSR); + outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB); + outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */ + outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */ + outb_px(port + UART_MCR, info->MCR); + + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */ + outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */ + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */ + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + return; +} + +/* + * tty_getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. 
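/*
 * A user-space sketch of the divisor arithmetic behind the
 * custom_divisor that program_uart() above loads through UART_DLL and
 * UART_DLM: assuming the standard PC 1.8432 MHz UART clock, the latch
 * value is 115200 divided by the configured baud rate.
 */
#include <stdio.h>

int main(void)
{
        int bauds[] = { 9600, 19200, 38400, 57600, 115200 };
        int i;

        for (i = 0; i < 5; i++) {
                int div = 115200 / bauds[i];

                printf("baud %6d -> DLL %#04x DLM %#04x\n",
                       bauds[i], div & 0xff, div >> 8);
        }
        return 0;
}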
In the + */ +int kgdb_in_isr = 0; +int kgdb_in_lsr = 0; +extern spinlock_t kgdb_spinlock; + +/* Caller takes needed protections */ + +int +tty_getDebugChar(void) +{ + volatile int chr, dum, time, end_time; + + dbprintk(("tty_getDebugChar(port %x): ", gdb_async_info->port)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + /* + * This trick says if we wait a very long time and get + * no char, return the -1 and let the upper level deal + * with it. + */ + rdtsc(dum, time); + end_time = time + 2; + while (((chr = read_char(gdb_async_info)) == -1) && + (end_time - time) > 0) { + rdtsc(dum, time); + }; + /* + * This covers our butts if some other code messes with + * our uart, hay, it happens :o) + */ + if (chr == -1) + program_uart(gdb_async_info); + + dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? chr : ' ')); + return (chr); + +} /* tty_getDebugChar */ + +static int count = 3; +static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; + +static int __init +kgdb_enable_ints(void) +{ + if (kgdboe) { + return 0; + } + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 1); + } + ok_to_enable_ints = 1; + kgdb_enable_ints_now(); +#ifdef CONFIG_KGDB_USER_CONSOLE + kgdb_console_finit(); +#endif + return 0; +} + +#ifdef CONFIG_SERIAL_8250 +void shutdown_for_kgdb(struct async_struct *gdb_async_info); +#endif + +#ifdef CONFIG_DISCONTIGMEM +static inline int kgdb_mem_init_done(void) +{ + return highmem_start_page != NULL; +} +#else +static inline int kgdb_mem_init_done(void) +{ + return max_mapnr != 0; +} +#endif + +static void +kgdb_enable_ints_now(void) +{ + if (!spin_trylock(&one_at_atime)) + return; + if (!ints_disabled) + goto exit; + if (kgdb_mem_init_done() && + ints_disabled) { /* don't try till mem init */ +#ifdef CONFIG_SERIAL_8250 + /* + * The ifdef here allows the system to be configured + * without the serial driver. + * Don't make it a module, however, it will steal the port + */ + shutdown_for_kgdb(gdb_async_info); +#endif + ints_disabled = request_irq(gdb_async_info->state->irq, + gdb_interrupt, + IRQ_T(gdb_async_info), + "KGDB-stub", NULL); + intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); + } + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + exit: + spin_unlock(&one_at_atime); +} + +/* + * tty_putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. Caller takes needed protections. + */ +void +tty_putDebugChar(int chr) +{ + dbprintk(("tty_putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n", + gdb_async_info->port, + chr, + chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + + write_char(gdb_async_info, chr); /* this routine will wait */ + count = (chr == '#') ? 0 : count + 1; + if ((count == 2)) { /* try to enable after */ + if (ints_disabled & ok_to_enable_ints) + kgdb_enable_ints_now(); /* try to enable after */ + + /* We do this a lot because, well we really want to get these + * interrupts. The serial driver will clear these bits when it + * initializes the chip. Every thing else it does is ok, + * but this. 
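/*
 * A user-space sketch of the coarse time-out in tty_getDebugChar()
 * above: rdtsc(dum, time) keeps only the HIGH word of the TSC, so
 * "end_time = time + 2" means "poll until the upper 32 bits tick
 * twice", on the order of 2^33 cycles.  A fake counter stands in for
 * the real rdtsc here.
 */
#include <stdio.h>

static unsigned long long fake_tsc;

static unsigned int tsc_high(void)
{
        fake_tsc += 0x40000000ULL;      /* pretend cycles pass per poll */
        return fake_tsc >> 32;
}

int main(void)
{
        unsigned int time = tsc_high();
        unsigned int end_time = time + 2;
        int polls = 0;

        while ((end_time - time) > 0) { /* no char ever arrives here */
                time = tsc_high();
                polls++;
        }
        printf("gave up after %d polls\n", polls);
        return 0;
}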
+ */ + if (!ints_disabled) { + outb_px(gdb_async_info->port + UART_IER, + gdb_async_info->IER); + } + } + +} /* tty_putDebugChar */ + +/* + * This does nothing for the serial port, since it doesn't buffer. + */ + +void tty_flushDebugChar(void) +{ +} + +module_init(kgdb_enable_ints); --- diff/arch/x86_64/Kconfig.kgdb 1970-01-01 01:00:00.000000000 +0100 +++ source/arch/x86_64/Kconfig.kgdb 2004-04-21 10:46:51.318780472 +0100 @@ -0,0 +1,176 @@ +config KGDB + bool "Include kgdb kernel debugger" + depends on DEBUG_KERNEL + select DEBUG_INFO + help + If you say Y here, the system will be compiled with the debug + option (-g) and a debugging stub will be included in the + kernel. This stub communicates with gdb on another (host) + computer via a serial port. The host computer should have + access to the kernel binary file (vmlinux) and a serial port + that is connected to the target machine. Gdb can be made to + configure the serial port or you can use stty and setserial to + do this. See the 'target' command in gdb. This option also + configures in the ability to request a breakpoint early in the + boot process. To request the breakpoint just include 'kgdb' + as a boot option when booting the target machine. The system + will then break as soon as it looks at the boot options. This + option also installs a breakpoint in panic and sends any + kernel faults to the debugger. For more information see the + Documentation/i386/kgdb.txt file. + +choice + depends on KGDB + prompt "Debug serial port BAUD" + default KGDB_115200BAUD + help + Gdb and the kernel stub need to agree on the baud rate to be + used. Some systems (x86 family at this writing) allow this to + be configured. + +config KGDB_9600BAUD + bool "9600" + +config KGDB_19200BAUD + bool "19200" + +config KGDB_38400BAUD + bool "38400" + +config KGDB_57600BAUD + bool "57600" + +config KGDB_115200BAUD + bool "115200" +endchoice + +config KGDB_PORT + hex "hex I/O port address of the debug serial port" + depends on KGDB + default 3f8 + help + Some systems (x86 family at this writing) allow the port + address to be configured. The number entered is assumed to be + hex, don't put 0x in front of it. The standard address are: + COM1 3f8 , irq 4 and COM2 2f8 irq 3. Setserial /dev/ttySx + will tell you what you have. It is good to test the serial + connection with a live system before trying to debug. + +config KGDB_IRQ + int "IRQ of the debug serial port" + depends on KGDB + default 4 + help + This is the irq for the debug port. If everything is working + correctly and the kernel has interrupts on a control C to the + port should cause a break into the kernel debug stub. + +config DEBUG_INFO + bool + depends on KGDB + default y + +config KGDB_MORE + bool "Add any additional compile options" + depends on KGDB + default n + help + Saying yes here turns on the ability to enter additional + compile options. + + +config KGDB_OPTIONS + depends on KGDB_MORE + string "Additional compile arguments" + default "-O1" + help + This option allows you enter additional compile options for + the whole kernel compile. Each platform will have a default + that seems right for it. For example on PPC "-ggdb -O1", and + for i386 "-O1". Note that by configuring KGDB "-g" is already + turned on. In addition, on i386 platforms + "-fomit-frame-pointer" is deleted from the standard compile + options. + +config NO_KGDB_CPUS + int "Number of CPUs" + depends on KGDB && SMP + default NR_CPUS + help + + This option sets the number of cpus for kgdb ONLY. 
It is used + to prune some internal structures so they look "nice" when + displayed with gdb. This is to overcome possibly larger + numbers that may have been entered above. Enter the real + number to get nice clean kgdb_info displays. + +config KGDB_TS + bool "Enable kgdb time stamp macros?" + depends on KGDB + default n + help + Kgdb event macros allow you to instrument your code with calls + to the kgdb event recording function. The event log may be + examined with gdb at a break point. Turning on this + capability also allows you to choose how many events to + keep. Kgdb always keeps the lastest events. + +choice + depends on KGDB_TS + prompt "Max number of time stamps to save?" + default KGDB_TS_128 + +config KGDB_TS_64 + bool "64" + +config KGDB_TS_128 + bool "128" + +config KGDB_TS_256 + bool "256" + +config KGDB_TS_512 + bool "512" + +config KGDB_TS_1024 + bool "1024" + +endchoice + +config STACK_OVERFLOW_TEST + bool "Turn on kernel stack overflow testing?" + depends on KGDB + default n + help + This option enables code in the front line interrupt handlers + to check for kernel stack overflow on interrupts and system + calls. This is part of the kgdb code on x86 systems. + +config KGDB_CONSOLE + bool "Enable serial console thru kgdb port" + depends on KGDB + default n + help + This option enables the command line "console=kgdb" option. + When the system is booted with this option in the command line + all kernel printk output is sent to gdb (as well as to other + consoles). For this to work gdb must be connected. For this + reason, this command line option will generate a breakpoint if + gdb has not yet connected. After the gdb continue command is + given all pent up console output will be printed by gdb on the + host machine. Neither this option, nor KGDB require the + serial driver to be configured. + +config KGDB_SYSRQ + bool "Turn on SysRq 'G' command to do a break?" + depends on KGDB + default y + help + This option includes an option in the SysRq code that allows + you to enter SysRq G which generates a breakpoint to the KGDB + stub. This will work if the keyboard is alive and can + interrupt the system. Because of constraints on when the + serial port interrupt can be enabled, this code may allow you + to interrupt the system before the serial port control C is + available. Just say yes here. + --- diff/arch/x86_64/kernel/kgdb_stub.c 1970-01-01 01:00:00.000000000 +0100 +++ source/arch/x86_64/kernel/kgdb_stub.c 2004-04-21 10:46:51.323779712 +0100 @@ -0,0 +1,2591 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (c) 2000 VERITAS Software Corporation. + * + */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. 
$ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: David Grothe + * Updated by: Robert Walsh + * Updated by: wangdi + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Compatibility with 2.1.xx kernel by David Grothe + * + * Changes to allow auto initilization. All that is needed is that it + * be linked with the kernel and a break point (int 3) be executed. + * The header file defines BREAKPOINT to allow one to do + * this. It should also be possible, once the interrupt system is up, to + * call putDebugChar("+"). Once this is done, the remote debugger should + * get our attention by sending a ^C in a packet. George Anzinger + * + * Integrated into 2.2.5 kernel by Tigran Aivazian + * Added thread support, support for multiple processors, + * support for ia-32(x86) hardware debugging. + * Amit S. Kale ( akale@veritas.com ) + * + * Modified to support debugging over ethernet by Robert Walsh + * and wangdi , based on + * code by San Mehat. + * + * X86_64 changes from Andi Kleen's patch merged by Jim Houston + * (jim.houston@ccur.com). If it works thank Andi if its broken + * blame me. + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $#. + * + * where + * :: + * :: < two hex digits computed as modulo 256 sum of > + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer. + * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ +#define KGDB_VERSION "<20030915.1651.33>" +#include +#include +#include /* for strcpy */ +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define Dearly_printk(x...) 
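/*
 * A user-space check of the packet framing described above: the
 * checksum is the modulo-256 sum of the payload characters, sent as
 * two hex digits after '#'.  It reproduces the $m0,10#2a example
 * from the table.
 */
#include <stdio.h>

int main(void)
{
        const char *payload = "m0,10";
        const char *p;
        unsigned char sum = 0;

        for (p = payload; *p; p++)
                sum += *p;              /* unsigned char wraps mod 256 */
        printf("$%s#%02x\n", payload, sum);     /* prints $m0,10#2a */
        return 0;
}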
+#define KGDB_VERSION "<20030915.1651.33>"
+#include
+#include
+#include <linux/string.h>	/* for strcpy */
+#include
+#include
+#include
+#include <asm/ptrace.h>		/* for linux pt_regs struct */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define Dearly_printk(x...)
+int kgdb_enabled = 0;
+
+/************************************************************************
+ *
+ * external low-level support routines
+ */
+typedef void (*Function) (void);	/* pointer to a function */
+
+/* Thread reference */
+typedef unsigned char threadref[8];
+
+extern int tty_putDebugChar(int);	/* write a single character */
+extern int tty_getDebugChar(void);	/* read and return a single char */
+extern void tty_flushDebugChar(void);	/* flush pending characters */
+extern int eth_putDebugChar(int);	/* write a single character */
+extern int eth_getDebugChar(void);	/* read and return a single char */
+extern void eth_flushDebugChar(void);	/* flush pending characters */
+
+/************************************************************************/
+/* BUFMAX defines the maximum number of characters in inbound/outbound buffers */
+/* at least NUMREGBYTES*2 are needed for register packets */
+/* Longer buffer is needed to list all threads */
+#define BUFMAX 400
+
+char *kgdb_version = KGDB_VERSION;
+
+/* debug > 0 prints ill-formed commands in valid packets & checksum errors */
+int debug_regs = 0;		/* set to non-zero to print registers */
+
+/* filled in by an external module */
+char *gdb_module_offsets;
+
+static const char hexchars[] = "0123456789abcdef";
+
+/* Number of bytes of registers. */
+#define NUMREGBYTES (NUMREGS * sizeof(unsigned long))
+/*
+ * Note that this register image is in a different order than
+ * the register image that Linux produces at interrupt time.
+ *
+ * Linux's register image is defined by struct pt_regs in ptrace.h.
+ * Just why GDB uses a different order is a historical mystery.
+ *
+ * Could add XMM and segment registers here.
+ */
+enum regnames { _RAX,
+	_RBX,
+	_RCX,
+	_RDX,
+	_RSI,
+	_RDI,
+	_RBP,
+	_RSP,
+	_R8,
+	_R9,
+	_R10,
+	_R11,
+	_R12,
+	_R13,
+	_R14,
+	_R15,
+	_PC,
+	_PS,
+	NUMREGS };
+
+/*************************** ASSEMBLY CODE MACROS *************************/
+/*
+ * Put the error code here just in case the user cares.
+ * Likewise, the vector number here (since GDB only gets the signal
+ * number through the usual means, and that's not very specific).
+ * The called_from is the return address so the user can tell how
+ * we entered kgdb.  This allows the various possible entries to
+ * be separated out.
+ */
+#define REMOTE_DEBUG 0		/* set != 0 to turn on printing (also available in info) */
+
+#define PID_MAX PID_MAX_DEFAULT
+
+#ifdef CONFIG_SMP
+void smp_send_nmi_allbutself(void);
+#define IF_SMP(x) x
+#undef MAX_NO_CPUS
+#ifndef CONFIG_NO_KGDB_CPUS
+#define CONFIG_NO_KGDB_CPUS 2
+#endif
+#if CONFIG_NO_KGDB_CPUS > NR_CPUS
+#define MAX_NO_CPUS NR_CPUS
+#else
+#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS
+#endif
+#define hold_init hold_on_sstep: 1,
+#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL)
+#define NUM_CPUS num_online_cpus()
+#else
+#define IF_SMP(x)
+#define hold_init
+#undef MAX_NO_CPUS
+#define MAX_NO_CPUS 1
+#define NUM_CPUS 1
+#endif
+#define NOCPU (struct task_struct *)0xbad1fbad
+/* *INDENT-OFF* */
+struct kgdb_info {
+	int used_malloc;
+	void *called_from;
+	long long entry_tsc;
+	int errcode;
+	int vector;
+	int print_debug_info;
+#ifdef CONFIG_SMP
+	int hold_on_sstep;
+	struct {
+		volatile struct task_struct *task;
+		int pid;
+		int hold;
+		struct pt_regs *regs;
+	} cpus_waiting[MAX_NO_CPUS];
+#endif
+} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1};
+
+/* *INDENT-ON* */
+
+#define used_m kgdb_info.used_malloc
+/*
+ * This is a little area we set aside to contain the stack we
+ * need to build to allow gdb to call functions.  We use one
+ * per cpu to avoid locking issues.  We will do all this work
+ * with interrupts off so that should take care of the protection
+ * issues.
+ */
+#define LOOKASIDE_SIZE 200	/* should be more than enough */
+#define MALLOC_MAX 200		/* Max malloc size */
+struct {
+	unsigned long rsp;
+	unsigned long array[LOOKASIDE_SIZE];
+} fn_call_lookaside[MAX_NO_CPUS];
+
+static int trap_cpu;
+static unsigned long OLD_esp;
+
+#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE]
+#define IF_BIT 0x200
+#define TF_BIT 0x100
+
+#define MALLOC_ROUND (8-1)
+
+static char malloc_array[MALLOC_MAX];
+IF_SMP(static void to_gdb(const char *mess));
+void *
+malloc(int size)
+{
+
+	if (size <= (MALLOC_MAX - used_m)) {
+		int old_used = used_m;
+		used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND));
+		return &malloc_array[old_used];
+	} else {
+		return NULL;
+	}
+}
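A worked example of the allocator's rounding (illustration only; note the parentheses added to the MALLOC_ROUND define above — without them ~MALLOC_ROUND would expand to ~8-1 and mask the wrong bits):

    /* (size + 7) & ~7 rounds up to the next 8-byte boundary:   */
    /*   size  5 -> ( 5 + 7) & ~7 ==  8                          */
    /*   size 16 -> (16 + 7) & ~7 == 16  (already aligned)       */
    /* so successive malloc() results stay 8-byte aligned within */
    /* malloc_array[], and used_m advances in multiples of 8.    */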
+
+/*
+ * I/O dispatch functions...
+ * Based upon kgdboe, either call the ethernet
+ * handler or the serial one..
+ */
+void
+putDebugChar(int c)
+{
+	if (!kgdboe) {
+		tty_putDebugChar(c);
+	} else {
+		eth_putDebugChar(c);
+	}
+}
+
+int
+getDebugChar(void)
+{
+	if (!kgdboe) {
+		return tty_getDebugChar();
+	} else {
+		return eth_getDebugChar();
+	}
+}
+
+void
+flushDebugChar(void)
+{
+	if (!kgdboe) {
+		tty_flushDebugChar();
+	} else {
+		eth_flushDebugChar();
+	}
+}
+
+/*
+ * Gdb calls functions by pushing arguments, including a return address,
+ * on the stack and then adjusting EIP to point to the function.  The
+ * whole assumption in GDB is that we are on a different stack than the
+ * one the "user", i.e. the code that hit the break point, is on.  This,
+ * of course, is not true in the kernel.  Thus various dodges are needed
+ * to do the call without directly messing with EIP (which we cannot
+ * change, as it is just a location and not a register; to adjust it
+ * would require that we move everything below EIP up or down as needed,
+ * which will not work as we may well have stack-relative pointers on
+ * the stack, such as the pointer to regs, for example).
+
+ * So here is what we do:
+ * We detect gdb attempting to store into the stack area and instead
+ * store into the fn_call_lookaside.array at the same relative location
+ * as if it were the area ESP pointed at.  We also trap ESP modifications
+ * and use these to adjust fn_call_lookaside.esp.  On entry
+ * fn_call_lookaside.esp will be set to point at the last entry in
+ * fn_call_lookaside.array.  This allows us to check if it has changed, and
+ * if so, on exit, we add the registers we will use to do the move and a
+ * trap/interrupt return exit sequence.  We then adjust the eflags in the
+ * regs array (remember we now have a copy in the fn_call_lookaside.array) to
+ * kill the interrupt bit, AND we change EIP to point at our set-up stub.
+ * As part of the register set-up we preset the registers to point at the
+ * beginning and end of the fn_call_lookaside.array, so all the stub needs to
+ * do is move words from the array to the stack until ESP == the desired value,
+ * then do the rti.  This will then transfer to the desired function with
+ * all the correct registers.  Nifty huh?
+ */
+extern asmlinkage void fn_call_stub(void);
+extern asmlinkage void fn_rtn_stub(void);
+/* *INDENT-OFF* */
+__asm__("fn_rtn_stub:\n\t"
+	"movq %rax,%rsp\n\t"
+	"fn_call_stub:\n\t"
+	"1:\n\t"
+	"addq $-8,%rbx\n\t"
+	"movq (%rbx), %rax\n\t"
+	"pushq %rax\n\t"
+	"cmpq %rsp,%rcx\n\t"
+	"jne 1b\n\t"
+	"popq %rax\n\t"
+	"popq %rbx\n\t"
+	"popq %rcx\n\t"
+	"iret \n\t");
+/* *INDENT-ON* */
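For readability, here is a C paraphrase of the fn_call_stub copy loop above (illustration only, not code from the patch; variable names are illustrative): %rbx walks down the lookaside array, %rcx holds the target stack pointer, and fn_rtn_stub first restores RSP from RAX for the return path.

    static void fn_call_stub_model(unsigned long *end_of_lookaside,
                                   unsigned long *current_rsp,
                                   unsigned long *target_rsp)
    {
            unsigned long *rbx = end_of_lookaside;  /* source walks down */
            unsigned long *rsp = current_rsp;       /* live stack pointer */

            do {
                    *--rsp = *--rbx;  /* addq $-8,%rbx; movq (%rbx),%rax; pushq %rax */
            } while (rsp != target_rsp);  /* cmpq %rsp,%rcx; jne 1b */
            /* the real stub then pops rax/rbx/rcx and irets into the function */
    }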
+#define gdb_i386vector	kgdb_info.vector
+#define gdb_i386errcode	kgdb_info.errcode
+#define waiting_cpus	kgdb_info.cpus_waiting
+#define remote_debug	kgdb_info.print_debug_info
+#define hold_cpu(cpu)	kgdb_info.cpus_waiting[cpu].hold
+/* gdb locks */
+
+#ifdef CONFIG_SMP
+static int in_kgdb_called;
+static spinlock_t waitlocks[MAX_NO_CPUS] =
+	{[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED };
+/*
+ * The following array has the thread pointer of each of the "other"
+ * cpus.  We make it global so it can be seen by gdb.
+ */
+volatile int in_kgdb_entry_log[MAX_NO_CPUS];
+volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS];
+/*
+static spinlock_t continuelocks[MAX_NO_CPUS];
+*/
+spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED;
+/* waiters on our spinlock plus us */
+static atomic_t spinlock_waiters = ATOMIC_INIT(1);
+static int spinlock_count = 0;
+static int spinlock_cpu = 0;
+/*
+ * Note we use nested spin locks to account for the case where a break
+ * point is encountered when calling a function by user direction from
+ * kgdb.  Also there is the memory exception recursion to account for.
+ * Well, yes, but this lets other cpus through too.  Let's add a
+ * cpu id to the lock.
+ */
+#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \
+			      spinlock_cpu != smp_processor_id()){\
+				      atomic_inc(&spinlock_waiters); \
+				      while (!spin_trylock(x)) {\
+					      in_kgdb(&regs);\
+				      }\
+				      atomic_dec(&spinlock_waiters); \
+				      spinlock_count = 1; \
+				      spinlock_cpu = smp_processor_id(); \
+			      }else{  \
+				      spinlock_count++; \
+			      }
+#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x)
+#else
+unsigned kgdb_spinlock = 0;
+#define KGDB_SPIN_LOCK(x) --*x
+#define KGDB_SPIN_UNLOCK(x) ++*x
+#endif
+
+int
+hex(char ch)
+{
+	if ((ch >= 'a') && (ch <= 'f'))
+		return (ch - 'a' + 10);
+	if ((ch >= '0') && (ch <= '9'))
+		return (ch - '0');
+	if ((ch >= 'A') && (ch <= 'F'))
+		return (ch - 'A' + 10);
+	return (-1);
+}
+
+/* scan for the sequence $<data>#<checksum> */
+void
+getpacket(char *buffer)
+{
+	unsigned char checksum;
+	unsigned char xmitcsum;
+	int i;
+	int count;
+	char ch;
+
+	do {
+		/* wait around for the start character, ignore all other characters */
+		while ((ch = (getDebugChar() & 0x7f)) != '$') ;
+		checksum = 0;
+		xmitcsum = -1;
+
+		count = 0;
+
+		/* now, read until a # or end of buffer is found */
+		while (count < BUFMAX) {
+			ch = getDebugChar() & 0x7f;
+			if (ch == '#')
+				break;
+			checksum = checksum + ch;
+			buffer[count] = ch;
+			count = count + 1;
+		}
+		buffer[count] = 0;
+
+		if (ch == '#') {
+			xmitcsum = hex(getDebugChar() & 0x7f) << 4;
+			xmitcsum += hex(getDebugChar() & 0x7f);
+			if ((remote_debug) && (checksum != xmitcsum)) {
+				printk
+				    ("bad checksum.  My count = 0x%x, sent=0x%x. buf=%s\n",
+				     checksum, xmitcsum, buffer);
+			}
+
+			if (checksum != xmitcsum)
+				putDebugChar('-');	/* failed checksum */
+			else {
+				putDebugChar('+');	/* successful transfer */
+				/* if a sequence char is present, reply the sequence ID */
+				if (buffer[2] == ':') {
+					putDebugChar(buffer[0]);
+					putDebugChar(buffer[1]);
+					/* remove sequence chars from buffer */
+					count = strlen(buffer);
+					for (i = 3; i <= count; i++)
+						buffer[i - 3] = buffer[i];
+				}
+			}
+		}
+	} while (checksum != xmitcsum);
+
+	if (remote_debug)
+		printk("R:%s\n", buffer);
+	flushDebugChar();
+}
+
+/* send the packet in buffer. */
+
+void
+putpacket(char *buffer)
+{
+	unsigned char checksum;
+	int count;
+	char ch;
+
+	/* $<packet info>#<checksum>. */
+
+	if (!kgdboe) {
+		do {
+			if (remote_debug)
+				printk("T:%s\n", buffer);
+			putDebugChar('$');
+			checksum = 0;
+			count = 0;
+
+			while ((ch = buffer[count])) {
+				putDebugChar(ch);
+				checksum += ch;
+				count += 1;
+			}
+
+			putDebugChar('#');
+			putDebugChar(hexchars[checksum >> 4]);
+			putDebugChar(hexchars[checksum % 16]);
+			flushDebugChar();
+
+		} while ((getDebugChar() & 0x7f) != '+');
+	} else {
+		/*
+		 * For UDP we cannot transfer too many bytes at once;
+		 * we transfer at most MAX_SEND_COUNT bytes each time.
+		 */
+
+#define MAX_SEND_COUNT 30
+
+		int send_count = 0, i = 0;
+		char send_buf[MAX_SEND_COUNT];
+
+		do {
+			if (remote_debug)
+				printk("T:%s\n", buffer);
+			putDebugChar('$');
+			checksum = 0;
+			count = 0;
+			send_count = 0;
+			while ((ch = buffer[count])) {
+				if (send_count >= MAX_SEND_COUNT) {
+					for(i = 0; i < MAX_SEND_COUNT; i++) {
+						putDebugChar(send_buf[i]);
+					}
+					flushDebugChar();
+					send_count = 0;
+				} else {
+					send_buf[send_count] = ch;
+					checksum += ch;
+					count ++;
+					send_count++;
+				}
+			}
+			for(i = 0; i < send_count; i++)
+				putDebugChar(send_buf[i]);
+			putDebugChar('#');
+			putDebugChar(hexchars[checksum >> 4]);
+			putDebugChar(hexchars[checksum % 16]);
+			flushDebugChar();
+		} while ((getDebugChar() & 0x7f) != '+');
+	}
+}
+
+static char remcomInBuffer[BUFMAX];
+static char remcomOutBuffer[BUFMAX];
+static char lbuf[BUFMAX];
+static short error;
+
+void
+debug_error(char *format, char *parm)
+{
+	if (remote_debug)
+		printk(format, parm);
+}
+
+static void
+print_regs(struct pt_regs *regs)
+{
+	printk("RAX=%016lx RBX=%016lx RCX=%016lx\n",
+	       regs->rax, regs->rbx, regs->rcx);
+	printk("RDX=%016lx RSI=%016lx RDI=%016lx\n",
+	       regs->rdx, regs->rsi, regs->rdi);
+	printk("RBP=%016lx PS=%016lx PC=%016lx\n",
+	       regs->rbp, regs->eflags, regs->rip);
+	printk("R8=%016lx R9=%016lx R10=%016lx\n",
+	       regs->r8, regs->r9, regs->r10);
+	printk("R11=%016lx R12=%016lx R13=%016lx\n",
+	       regs->r11, regs->r12, regs->r13);
+	printk("R14=%016lx R15=%016lx RSP=%016lx\n",
+	       regs->r14, regs->r15, regs->rsp);
+}
+
+#define NEW_esp fn_call_lookaside[trap_cpu].rsp
+
+static void
+regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+	gdb_regs[_RAX] = regs->rax;
+	gdb_regs[_RBX] = regs->rbx;
+	gdb_regs[_RCX] = regs->rcx;
+	gdb_regs[_RDX] = regs->rdx;
+	gdb_regs[_RSI] = regs->rsi;
+	gdb_regs[_RDI] = regs->rdi;
+	gdb_regs[_RBP] = regs->rbp;
+	gdb_regs[ _PS] = regs->eflags;
+	gdb_regs[ _PC] = regs->rip;
+	gdb_regs[ _R8] = regs->r8;
+	gdb_regs[ _R9] = regs->r9;
+	gdb_regs[_R10] = regs->r10;
+	gdb_regs[_R11] = regs->r11;
+	gdb_regs[_R12] = regs->r12;
+	gdb_regs[_R13] = regs->r13;
+	gdb_regs[_R14] = regs->r14;
+	gdb_regs[_R15] = regs->r15;
+	gdb_regs[_RSP] = regs->rsp;
+
+	/* Note: as we are debugging the kernel, we will always trap in
+	 * kernel code, which means no privilege change, and so the
+	 * pt_regs structure is not completely valid.  In a
+	 * non-privilege-change trap only EFLAGS, CS and EIP are put on
+	 * the stack; SS and ESP are not stacked.  This means that the
+	 * last 2 elements of pt_regs are not valid (they would normally
+	 * refer to the user stack).  Also, using regs+1 is no good
+	 * because you end up with a value that is 2 longs (8) too high.
+	 * This used to cause stepping over functions to fail, so my fix
+	 * is to use the address of regs->esp, which should point at the
+	 * end of the stack frame.  Note: I have completely ignored
+	 * exceptions that cause an error code to be stacked, such as
+	 * double fault.  Stuart Hughes, Zentropix.
+	 * original code: gdb_regs[_ESP] = (int) (regs + 1) ;
+
+	 * this is now done on entry and moved to OLD_esp (as well as NEW_esp).
+	 */
+}
+
+static void
+gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+	regs->rax = gdb_regs[_RAX] ;
+	regs->rbx = gdb_regs[_RBX] ;
+	regs->rcx = gdb_regs[_RCX] ;
+	regs->rdx = gdb_regs[_RDX] ;
+	regs->rsi = gdb_regs[_RSI] ;
+	regs->rdi = gdb_regs[_RDI] ;
+	regs->rbp = gdb_regs[_RBP] ;
+	regs->eflags = gdb_regs[ _PS] ;
+	regs->rip = gdb_regs[ _PC] ;
+	regs->r8 = gdb_regs[ _R8] ;
+	regs->r9 = gdb_regs[ _R9] ;
+	regs->r10 = gdb_regs[ _R10] ;
+	regs->r11 = gdb_regs[ _R11] ;
+	regs->r12 = gdb_regs[ _R12] ;
+	regs->r13 = gdb_regs[ _R13] ;
+	regs->r14 = gdb_regs[ _R14] ;
+	regs->r15 = gdb_regs[ _R15] ;
+#if 0			/* can't change these */
+	regs->rsp = gdb_regs[_RSP] ;
+	regs->ss = gdb_regs[ _SS] ;
+	regs->fs = gdb_regs[_FS];
+	regs->gs = gdb_regs[_GS];
+#endif
+}	/* gdb_regs_to_regs */
+
+int thread_list = 0;
+extern void thread_return(void);
+
+void
+get_gdb_regs(struct task_struct *p, struct pt_regs *regs, unsigned long *gdb_regs)
+{
+	unsigned long **rbp, *rsp, *rsp0, pc;
+	int count = 0;
+	IF_SMP(int i);
+	if (!p || p == current) {
+		regs_to_gdb_regs(gdb_regs, regs);
+		return;
+	}
+#ifdef CONFIG_SMP
+	for (i = 0; i < MAX_NO_CPUS; i++) {
+		if (p == kgdb_info.cpus_waiting[i].task) {
+			regs_to_gdb_regs(gdb_regs,
+					 kgdb_info.cpus_waiting[i].regs);
+			gdb_regs[_RSP] =
+			    (unsigned long)&kgdb_info.cpus_waiting[i].regs->rsp;
+
+			return;
+		}
+	}
+#endif
+	memset(gdb_regs, 0, NUMREGBYTES);
+	rsp = (unsigned long *)p->thread.rsp;
+	rbp = (unsigned long **)rsp[0];
+	rsp += 2;
+	gdb_regs[_PC] = (unsigned long)thread_return;
+	gdb_regs[_RBP] = (unsigned long)rbp;
+	gdb_regs[_RSP] = (unsigned long)rsp;
+
+/*
+ * This code is to give a more informative notion of where a process
+ * is waiting.  It is used only when the user asks for a thread info
+ * list.  If the user then switches to the thread, they will find the
+ * task is in schedule, but a back trace should show the same info we
+ * come up with.  This code was shamelessly purloined from process.c.
+ * It was then enhanced to provide more registers than simply the
+ * program counter.
+ */
+
+	if (!thread_list) {
+		return;
+	}
+
+	if (p->state == TASK_RUNNING)
+		return;
+	rsp0 = (unsigned long *)p->thread.rsp0;
+	if (rsp < (unsigned long *) p->thread_info || rsp > rsp0)
+		return;
+	/* include/asm-i386/system.h:switch_to() pushes ebp last. */
+	do {
+		if (*rbp < rsp || *rbp > rsp0)
+			break;
+		rbp = (unsigned long **)*rbp;
+		rsp = (unsigned long *)rbp;
+		pc = rsp[1];
+
+		if (!in_sched_functions(pc))
+			break;
+		gdb_regs[_PC] = (unsigned long)pc;
+		gdb_regs[_RSP] = (unsigned long)rsp;
+		gdb_regs[_RBP] = (unsigned long)rbp;
+	} while (count++ < 16);
+	return;
+}
+
+/* convert the memory pointed to by mem into hex, placing result in buf */
+/* returns nonzero if any memory access fails. */
+int mem2hex( char* mem, char* buf, int count)
+{
+	int i;
+	unsigned char ch;
+	int ret = 0;
+
+	for (i = 0; i < count; i++) {
+		ch = 0;
+		ret |= __get_user(ch, mem);
+		mem++;
+		*buf++ = hexchars[ch >> 4];
+		*buf++ = hexchars[ch % 16];
+	}
+	*buf = 0;
+	if (ret) {
+		Dearly_printk("mem2hex: fault at accessing %p\n", mem);
+	}
+	return(ret);
+}
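The encoding mem2hex() produces is simply two hex digits per byte, high nibble first; a user-space sketch of the same transform (without the kernel __get_user() fault handling) looks like this:

    /* User-space model of the mem2hex() encoding; no fault handling. */
    static const char hexchars[] = "0123456789abcdef";

    static void mem2hex_nofault(const unsigned char *mem, char *buf, int count)
    {
            int i;

            for (i = 0; i < count; i++) {
                    *buf++ = hexchars[mem[i] >> 4]; /* high nibble first */
                    *buf++ = hexchars[mem[i] % 16]; /* then low nibble */
            }
            *buf = 0;
    }
    /* e.g. the bytes { 0x12, 0xab } encode as the string "12ab" */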
+
+/* convert the hex array pointed to by buf into binary to be placed in mem */
+/* return nonzero if any memory access fails. */
+int hex2mem( char* buf, char* mem, int count)
+{
+	int i;
+	unsigned char ch;
+	int ret = 0;
+
+	for (i = 0; i < count; i++) {
+		ch = hex(*buf++) << 4;
+		ch |= hex(*buf++);
+		ret |= __put_user(ch, mem);
+		mem++;
+	}
+	if (ret) {
+		Dearly_printk("hex2mem: fault at accessing %p\n", mem);
+	}
+	return(ret);
+}
+
+#if 0	/* compiled-out fault-trapping access helpers, kept for reference */
+static volatile int mem_err_expected;
+static volatile int mem_err;
+static volatile int mem_err_cnt;
+static int garbage_loc;
+
+int
+get_char(char *addr)
+{
+	return *addr;
+}
+
+void
+set_char(char *addr, int val, int may_fault)
+{
+	/*
+	 * Redirect writes aimed at the gdb function-call stack area
+	 * into the lookaside buffer (see fn_call_lookaside above).
+	 */
+	if (may_fault &&
+	    ((unsigned long) addr < OLD_esp) &&
+	    ((unsigned long) addr > (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) {
+		addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr);
+	}
+	*addr = val;
+}
+
+/* convert the memory pointed to by mem into hex, placing result in buf */
+/* return a pointer to the last char put in buf (null) */
+/* If MAY_FAULT is non-zero, then we should set mem_err in response to
+   a fault; if zero treat a fault like any other fault in the stub.  */
+char *
+mem2hex(char *mem, char *buf, int count, int may_fault)
+{
+	int i;
+	unsigned char ch;
+
+	if (may_fault) {
+		mem_err_expected = 1;
+		mem_err = 0;
+	}
+	for (i = 0; i < count; i++) {
+		/* printk("%lx = ", mem) ; */
+
+		ch = get_char(mem++);
+
+		/* printk("%02x\n", ch & 0xFF) ; */
+		if (may_fault && mem_err) {
+			if (remote_debug)
+				printk("Mem fault fetching from addr %lx\n",
+				       (long) (mem - 1));
+			*buf = 0;	/* truncate buffer */
+			return (buf);
+		}
+		*buf++ = hexchars[ch >> 4];
+		*buf++ = hexchars[ch % 16];
+	}
+	*buf = 0;
+	if (may_fault)
+		mem_err_expected = 0;
+	return (buf);
+}
+
+/* convert the hex array pointed to by buf into binary to be placed in mem */
+/* return a pointer to the character AFTER the last byte written */
+/* NOTE: We use the may fault flag to also indicate if the write is to
+ * the registers (0) or "other" memory (!=0)
+ */
+char *
+hex2mem(char *buf, char *mem, int count, int may_fault)
+{
+	int i;
+	unsigned char ch;
+
+	if (may_fault) {
+		mem_err_expected = 1;
+		mem_err = 0;
+	}
+	for (i = 0; i < count; i++) {
+		ch = hex(*buf++) << 4;
+		ch = ch + hex(*buf++);
+		set_char(mem++, ch, may_fault);
+
+		if (may_fault && mem_err) {
+			if (remote_debug)
+				printk("Mem fault storing to addr %lx\n",
+				       (long) (mem - 1));
+			return (mem);
+		}
+	}
+	if (may_fault)
+		mem_err_expected = 0;
+	return (mem);
+}
+#endif
+
+/**********************************************/
+/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */
+/* RETURN NUMBER OF CHARS PROCESSED           */
+/**********************************************/
+int
+hexToLong(char **ptr, unsigned long *value)
+{
+	int numChars = 0;
+	int hexValue;
+
+	*value = 0;
+
+	while (**ptr) {
+		hexValue = hex(**ptr);
+		if (hexValue >= 0) {
+			*value = (*value << 4) | hexValue;
+			numChars++;
+		} else
+			break;
+
+		(*ptr)++;
+	}
+
+	return (numChars);
+}
+
+#define stubhex(h) hex(h)
+#ifdef old_thread_list
+
+static int
+stub_unpack_int(char *buff, int fieldlength)
+{
+	int nibble;
+	int retval = 0;
+
+	while (fieldlength) {
+		nibble = stubhex(*buff++);
+		retval |= nibble;
+		fieldlength--;
+		if (fieldlength)
+			retval = retval << 4;
+	}
+	return retval;
+}
+#endif
+static char *
+pack_hex_byte(char *pkt, int byte)
+{
+	*pkt++ = hexchars[(byte >> 4) & 0xf];
+	*pkt++ = hexchars[(byte & 0xf)];
+	return pkt;
+}
+
+#define BUF_THREAD_ID_SIZE 16
+
+static char *
+pack_threadid(char *pkt, threadref * id)
+{
+	char *limit;
+	unsigned char *altid;
+
+	altid = (unsigned char *) id;
+	limit = pkt + BUF_THREAD_ID_SIZE;
+	while (pkt < limit)
+		pkt = pack_hex_byte(pkt, *altid++);
+	return pkt;
+}
+
+#ifdef old_thread_list
+static char *
+unpack_byte(char *buf, int *value)
+{
+	*value = stub_unpack_int(buf, 2);
+	return buf + 2;
+}
+
+static char *
+unpack_threadid(char *inbuf, threadref * id)
+{
+	char *altref;
+	char *limit = inbuf + BUF_THREAD_ID_SIZE;
+	int x, y;
+
+	altref = (char *) id;
+
+	while (inbuf < limit) {
+		x = stubhex(*inbuf++);
+		y = stubhex(*inbuf++);
+		*altref++ = (x << 4) | y;
+	}
+	return inbuf;
+}
+#endif
+void
+int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} +int +int_to_hex_v(unsigned char * id, int value) +{ + unsigned char *start = id; + int shift; + int ch; + + for (shift = 28; shift >= 0; shift -= 4) { + if ((ch = (value >> shift) & 0xf) || (id != start)) { + *id = hexchars[ch]; + id++; + } + } + if (id == start) + *id++ = '0'; + return id - start; +} +#ifdef old_thread_list + +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} +#endif +static int +cmp_str(char *s1, char *s2, int count) +{ + while (count--) { + if (*s1++ != *s2++) + return 0; + } + return 1; +} + +#if 1 /* this is a hold over from 2.4 where O(1) was "sometimes" */ +extern struct task_struct *kgdb_get_idle(int cpu); +#define idle_task(cpu) kgdb_get_idle(cpu) +#else +#define idle_task(cpu) init_tasks[cpu] +#endif + +extern int kgdb_pid_init_done; + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + if (pid >= PID_MAX && pid <= (PID_MAX + MAX_NO_CPUS)) { + if (!cpu_online(pid - PID_MAX)) + return NULL; + + return idle_task(pid - PID_MAX); + } else { + /* + * find_task_by_pid is relatively safe all the time + * Other pid functions require lock downs which imply + * that we may be interrupting them (as we get here + * in the middle of most any lock down). + * Still we don't want to call until the table exists! + */ + if (kgdb_pid_init_done){ + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } + } + } + return NULL; +} +/* *INDENT-OFF* */ +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned long addr; +} breakinfo[4] = { {enabled:0}, + {enabled:0}, + {enabled:0}, + {enabled:0}}; +/* *INDENT-ON* */ +unsigned long hw_breakpoint_status; +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned long dr7; + + asm volatile ("movq %%db7, %0\n":"=r" (dr7) + :); + /* *INDENT-OFF* */ + do { + unsigned long addr0, addr1, addr2, addr3; + asm volatile ("movq %%db0, %0\n" + "movq %%db1, %1\n" + "movq %%db2, %2\n" + "movq %%db3, %3\n" + :"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3) + :); + } while (0); + /* *INDENT-ON* */ + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movq %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movq %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movq %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movq %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movq %0, %%db7\n"::"r" (dr7)); + } +} + +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + 
return -1;
+	}
+	breakinfo[breakno].enabled = 0;
+	return 0;
+}
+
+int
+set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned long addr)
+{
+	if (breakinfo[breakno].enabled) {
+		return -1;
+	}
+	breakinfo[breakno].enabled = 1;
+	breakinfo[breakno].type = type;
+	breakinfo[breakno].len = len;
+	breakinfo[breakno].addr = addr;
+	return 0;
+}
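set_hw_break() and remove_hw_break() only update the breakinfo[] table (note the addr parameter is widened to unsigned long above, since breakinfo.addr holds a full 64-bit address); correct_hw_break() is what actually loads the debug registers. A hypothetical caller (illustration, not from the patch) might plant a 4-byte write watchpoint like this:

    static int some_variable;           /* hypothetical watched object */

    static void plant_watchpoint(void)
    {
            /* type 1 = data write, len 3 = 4 bytes (the len field is size-1),
             * matching the DR7 encoding correct_hw_break() builds:
             *   dr7 |= (((len << 2) | type) << 16) << (breakno << 2);   */
            if (set_hw_break(0, 1, 3, (unsigned long) &some_variable) == 0)
                    correct_hw_break();  /* push breakinfo[] into %db0..%db7 */
    }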
+
+#ifdef CONFIG_SMP
+static int in_kgdb_console = 0;
+
+int
+in_kgdb(struct pt_regs *regs)
+{
+	unsigned long flags;
+	int cpu;
+	if (!kgdb_enabled)
+		return 0;
+	cpu = smp_processor_id();
+	in_kgdb_called = 1;
+	if (!spin_is_locked(&kgdb_spinlock)) {
+		if (in_kgdb_here_log[cpu] ||	/* we are holding this cpu */
+		    in_kgdb_console) {		/* or we are doing slow i/o */
+			return 1;
+		}
+		return 0;
+	}
+
+	/* As I see it the only reason not to let all cpus spin on
+	 * the same spin_lock is to allow selected ones to proceed.
+	 * This would be a good thing, so we leave it this way.
+	 * Maybe someday....  Done !
+
+	 * in_kgdb() is called from an NMI so we don't pretend
+	 * to have any resources, like printk() for example.
+	 */
+
+	local_irq_save(flags);	/* only local here, to avoid hanging */
+	/*
+	 * log arrival of this cpu
+	 * The NMI keeps on ticking.  Protect against recurring more
+	 * than once, and ignore the cpu that has the kgdb lock
+	 */
+	in_kgdb_entry_log[cpu]++;
+	in_kgdb_here_log[cpu] = regs;
+	if (cpu == spinlock_cpu || waiting_cpus[cpu].task)
+		goto exit_in_kgdb;
+
+	/*
+	 * For protection of the initialization of the spin locks, kgdb
+	 * locks the kgdb spinlock before it gets the wait locks set
+	 * up.  We wait here for the wait lock to be taken.  If the
+	 * kgdb lock goes away first??  Well, it could be a slow exit
+	 * sequence where the wait lock is removed prior to the kgdb lock,
+	 * so if kgdb gets unlocked, we just exit.
+	 */
+
+	while (spin_is_locked(&kgdb_spinlock) &&
+	       !spin_is_locked(waitlocks + cpu)) ;
+	if (!spin_is_locked(&kgdb_spinlock))
+		goto exit_in_kgdb;
+
+	waiting_cpus[cpu].task = current;
+	waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu);
+	waiting_cpus[cpu].regs = regs;
+
+	spin_unlock_wait(waitlocks + cpu);
+
+	/*
+	 * log departure of this cpu
+	 */
+	waiting_cpus[cpu].task = 0;
+	waiting_cpus[cpu].pid = 0;
+	waiting_cpus[cpu].regs = 0;
+	correct_hw_break();
+      exit_in_kgdb:
+	in_kgdb_here_log[cpu] = 0;
+	local_irq_restore(flags);
+	return 1;
+	/*
+	   spin_unlock(continuelocks + smp_processor_id());
+	 */
+}
+
+void
+smp__in_kgdb(struct pt_regs regs)
+{
+	ack_APIC_irq();
+	in_kgdb(&regs);
+}
+#else
+int
+in_kgdb(struct pt_regs *regs)
+{
+	return (kgdb_spinlock);
+}
+#endif
+
+void
+printexceptioninfo(int exceptionNo, int errorcode, char *buffer)
+{
+	unsigned long dr6;
+	int i;
+	switch (exceptionNo) {
+	case 1:		/* debug exception */
+		break;
+	case 3:		/* breakpoint */
+		sprintf(buffer, "Software breakpoint");
+		return;
+	default:
+		sprintf(buffer, "Details not available");
+		return;
+	}
+	asm volatile ("movq %%db6, %0\n":"=r" (dr6)
+		      :);
+	if (dr6 & 0x4000) {
+		sprintf(buffer, "Single step");
+		return;
+	}
+	for (i = 0; i < 4; ++i) {
+		if (dr6 & (1 << i)) {
+			sprintf(buffer, "Hardware breakpoint %d", i);
+			return;
+		}
+	}
+	sprintf(buffer, "Unknown trap");
+	return;
+}
+
+/*
+ * The ThreadExtraInfo query allows us to pass an arbitrary string
+ * for display with the "info threads" command.
+ */
+
+void
+print_extra_info(task_t *p, char *buf)
+{
+	if (!p) {
+		sprintf(buf, "Invalid thread");
+		return;
+	}
+	sprintf(buf, "0x%p %8d %4d  %c  %s",
+		(void *)p, p->parent->pid,
+		task_cpu(p),
+		(p->state == 0) ? (task_curr(p) ? 'R' : 'r') :
+		(p->state < 0) ? 'U' :
+		(p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
+		(p->state & TASK_STOPPED || p->ptrace & PT_PTRACED) ? 'T' :
+		(p->state & (TASK_ZOMBIE | TASK_DEAD)) ? 'Z' :
+		(p->state & TASK_INTERRUPTIBLE) ? 'S' : '?',
+		p->comm);
+}
+
+/*
+ * This function does all command processing for interfacing to gdb.
+ *
+ * NOTE:  The INT nn instruction leaves the state of the interrupt
+ *	  enable flag UNCHANGED.  That means that when this routine
+ *	  is entered via a breakpoint (INT 3) instruction from code
+ *	  that has interrupts enabled, then interrupts will STILL BE
+ *	  enabled when this routine is entered.  The first thing that
+ *	  we do here is disable interrupts so as to prevent recursive
+ *	  entries and bothersome serial interrupts while we are
+ *	  trying to run the serial port in polled mode.
+ *
+ * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so
+ * it is always necessary to do a restore_flags before returning
+ * so as to let go of that lock.
+ */
+int
+kgdb_handle_exception(int exceptionVector,
+		      int signo, int err_code, struct pt_regs *linux_regs)
+{
+	struct task_struct *usethread = NULL;
+	struct task_struct *thread_list_start = 0, *thread = NULL;
+	struct task_struct *p;
+	unsigned long addr, length;
+	unsigned long breakno, breaktype;
+	char *ptr;
+	unsigned long newPC;
+	threadref thref;
+	unsigned long threadid, tmpid;
+	int thread_min = PID_MAX + MAX_NO_CPUS;
+#ifdef old_thread_list
+	int maxthreads;
+#endif
+	int nothreads;
+	unsigned long flags;
+	unsigned long gdb_regs[NUMREGS];
+	unsigned long dr6;
+	IF_SMP(int entry_state = 0);	/* 0, ok, 1, no nmi, 2 sync failed */
+#define NO_NMI 1
+#define NO_SYNC 2
+#define regs (*linux_regs)
+	/*
+	 * If the entry is not from the kernel then return to the Linux
+	 * trap handler and let it process the interrupt normally.
+	 */
+	if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->cs)) {
+		printk("ignoring non-kernel exception\n");
+		print_regs(&regs);
+		return (0);
+	}
+	/*
+	 * If we're using eth mode, set the 'mode' in the netdevice.
+	 */
+
+	if (kgdboe)
+		netpoll_set_trap(1);
+
+	local_irq_save(flags);
+
+	/* Get kgdb spinlock */
+
+	KGDB_SPIN_LOCK(&kgdb_spinlock);
+	rdtscll(kgdb_info.entry_tsc);
+	/*
+	 * We depend on this spinlock and the NMI watch dog to control the
+	 * other cpus.  They will arrive at "in_kgdb()" as a result of the
+	 * NMI and will wait there for the following spin locks to be
+	 * released.
+	 */
+#ifdef CONFIG_SMP
+
+#if 0
+	if (cpu_callout_map & ~MAX_CPU_MASK) {
+		printk("kgdb : too many cpus, possibly not mapped"
+		       " in contiguous space, change MAX_NO_CPUS"
+		       " in kgdb_stub and make new kernel.\n"
+		       " cpu_callout_map is %lx\n", cpu_callout_map);
+		goto exit_just_unlock;
+	}
+#endif
+	if (spinlock_count == 1) {
+		int time, end_time, dum;
+		int i;
+		int cpu_logged_in[MAX_NO_CPUS] = {[0 ... MAX_NO_CPUS - 1] = (0)
+		};
+		if (remote_debug) {
+			printk("kgdb : cpu %d entry, syncing others\n",
+			       smp_processor_id());
+		}
+		for (i = 0; i < MAX_NO_CPUS; i++) {
+			/*
+			 * Use trylock as we may already hold the lock if
+			 * we are holding the cpu.  Net result is all
+			 * locked.
+			 */
+			spin_trylock(&waitlocks[i]);
+		}
+		for (i = 0; i < MAX_NO_CPUS; i++)
+			cpu_logged_in[i] = 0;
+		/*
+		 * Wait for their arrival.  We know the watch dog is active if
+		 * in_kgdb() has ever been called, as it is always called on a
+		 * watchdog tick.
+		 */
+		rdtsc(dum, time);
+		end_time = time + 2;	/* Note: we use the high order bits!
+					 */
+		i = 1;
+		if (num_online_cpus() > 1) {
+			int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()];
+			smp_send_nmi_allbutself();
+
+			while (i < num_online_cpus() && time != end_time) {
+				int j;
+				for (j = 0; j < MAX_NO_CPUS; j++) {
+					if (waiting_cpus[j].task &&
+					    waiting_cpus[j].task != NOCPU &&
+					    !cpu_logged_in[j]) {
+						i++;
+						cpu_logged_in[j] = 1;
+						if (remote_debug) {
+							printk
+							    ("kgdb : cpu %d arrived at kgdb\n",
+							     j);
+						}
+						break;
+					} else if (!waiting_cpus[j].task &&
+						   !cpu_online(j)) {
+						waiting_cpus[j].task = NOCPU;
+						cpu_logged_in[j] = 1;
+						waiting_cpus[j].hold = 1;
+						break;
+					}
+					if (!waiting_cpus[j].task &&
+					    in_kgdb_here_log[j]) {
+
+						int wait = 100000;
+						while (wait--) ;
+						if (!waiting_cpus[j].task &&
+						    in_kgdb_here_log[j]) {
+							printk
+							    ("kgdb : cpu %d stall"
+							     " in in_kgdb\n",
+							     j);
+							i++;
+							cpu_logged_in[j] = 1;
+							waiting_cpus[j].task =
+							    (struct task_struct
+							     *) 1;
+						}
+					}
+				}
+
+				if (in_kgdb_entry_log[smp_processor_id()] >
+				    (me_in_kgdb + 10)) {
+					break;
+				}
+
+				rdtsc(dum, time);
+			}
+			if (i < num_online_cpus()) {
+				printk
+				    ("kgdb : time out, proceeding without sync\n");
+#if 0
+				printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n",
+				       waiting_cpus[0].task != 0,
+				       waiting_cpus[1].task != 0);
+				printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n",
+				       cpu_logged_in[0], cpu_logged_in[1]);
+				printk
+				    ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n",
+				     in_kgdb_here_log[0] != 0,
+				     in_kgdb_here_log[1] != 0);
+#endif
+				entry_state = NO_SYNC;
+			} else {
+#if 0
+				int ent =
+				    in_kgdb_entry_log[smp_processor_id()] -
+				    me_in_kgdb;
+				printk("kgdb : sync after %d entries\n", ent);
+#endif
+			}
+		} else {
+			if (remote_debug) {
+				printk
+				    ("kgdb : %d cpus, but watchdog not active\n"
+				     "proceeding without locking down other cpus\n",
+				     (int)num_online_cpus());
+				entry_state = NO_NMI;
+			}
+		}
+	}
+#endif
+
+	if (remote_debug) {
+		unsigned long *lp = (unsigned long *) &linux_regs;
+
+		printk("handle_exception(exceptionVector=%d, "
+		       "signo=%d, err_code=%d, linux_regs=%p)\n",
+		       exceptionVector, signo, err_code, linux_regs);
+		if (debug_regs) {
+			print_regs(&regs);
+			printk("Stk: %8lx %8lx %8lx %8lx"
+			       " %8lx %8lx %8lx %8lx\n",
+			       lp[0], lp[1], lp[2], lp[3],
+			       lp[4], lp[5], lp[6], lp[7]);
+			printk("     %8lx %8lx %8lx %8lx"
+			       " %8lx %8lx %8lx %8lx\n",
+			       lp[8], lp[9], lp[10], lp[11],
+			       lp[12], lp[13], lp[14], lp[15]);
+			printk("     %8lx %8lx %8lx %8lx "
+			       "%8lx %8lx %8lx %8lx\n",
+			       lp[16], lp[17], lp[18], lp[19],
+			       lp[20], lp[21], lp[22], lp[23]);
+			printk("     %8lx %8lx %8lx %8lx "
+			       "%8lx %8lx %8lx %8lx\n",
+			       lp[24], lp[25], lp[26], lp[27],
+			       lp[28], lp[29], lp[30], lp[31]);
+		}
+	}
+
+	/* Disable hardware debugging while we are in kgdb */
+	/* Get the debug register status register */
+/* *INDENT-OFF* */
+	__asm__("movq %0,%%db7"
+		: /* no output */
+		:"r"(0UL));
+
+	asm volatile ("movq %%db6, %0\n"
+		      :"=r" (hw_breakpoint_status)
+		      :);
+
+#if 0
+/* *INDENT-ON* */
+	switch (exceptionVector) {
+	case 0:			/* divide error */
+	case 1:			/* debug exception */
+	case 2:			/* NMI */
+	case 3:			/* breakpoint */
+	case 4:			/* overflow */
+	case 5:			/* bounds check */
+	case 6:			/* invalid opcode */
+	case 7:			/* device not available */
+	case 8:			/* double fault (errcode) */
+	case 10:		/* invalid TSS (errcode) */
+	case 12:		/* stack fault (errcode) */
+	case 16:		/* floating point error */
+	case 17:		/* alignment check (errcode) */
+	default:		/* any undocumented */
+		break;
+	case 11:		/* segment not present (errcode) */
+	case 13:		/* general protection (errcode) */
+	case 14:		/* page fault (special errcode) */
+	case 19:		/* cache flush denied */
+		if (mem_err_expected) {
+			/*
+			 * This fault occurred because of the
+			 * get_char or set_char routines.  These
+			 * two routines use either eax or edx to
+			 * indirectly reference the location in
+			 * memory that they are working with.
+			 * For a page fault, when we return the
+			 * instruction will be retried, so we
+			 * have to make sure that these
+			 * registers point to valid memory.
+			 */
+			mem_err = 1;	/* set mem error flag */
+			mem_err_expected = 0;
+			mem_err_cnt++;	/* helps in debugging */
+			/* make valid address */
+			regs.eax = (long) &garbage_loc;
+			/* make valid address */
+			regs.edx = (long) &garbage_loc;
+			if (remote_debug)
+				printk("Return after memory error: "
+				       "mem_err_cnt=%d\n", mem_err_cnt);
+			if (debug_regs)
+				print_regs(&regs);
+			goto exit_kgdb;
+		}
+		break;
+	}
+#endif
+	if (remote_debug)
+		printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id());
+
+	gdb_i386vector = exceptionVector;
+	gdb_i386errcode = err_code;
+	kgdb_info.called_from = __builtin_return_address(0);
+#ifdef CONFIG_SMP
+	/*
+	 * OK, we can now communicate; let's tell gdb about the sync,
+	 * but only if we had a problem.
+	 */
+	switch (entry_state) {
+	case NO_NMI:
+		to_gdb("NMI not active, other cpus not stopped\n");
+		break;
+	case NO_SYNC:
+		to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n");
+	default:;
+	}
+
+#endif
+/*
+ * Set up the gdb function call area.
+ */
+	trap_cpu = smp_processor_id();
+	OLD_esp = NEW_esp = (unsigned long) (&linux_regs->rsp);
+
+	IF_SMP(once_again:)
+	    /* reply to host that an exception has occurred */
+	    remcomOutBuffer[0] = 'S';
+	remcomOutBuffer[1] = hexchars[signo >> 4];
+	remcomOutBuffer[2] = hexchars[signo % 16];
+	remcomOutBuffer[3] = 0;
+
+	putpacket(remcomOutBuffer);
+
+	while (1 == 1) {
+		error = 0;
+		remcomOutBuffer[0] = 0;
+		getpacket(remcomInBuffer);
+		switch (remcomInBuffer[0]) {
+		case '?':
+			remcomOutBuffer[0] = 'S';
+			remcomOutBuffer[1] = hexchars[signo >> 4];
+			remcomOutBuffer[2] = hexchars[signo % 16];
+			remcomOutBuffer[3] = 0;
+			break;
+		case 'd':
+			remote_debug = !(remote_debug);	/* toggle debug flag */
+			printk("Remote debug %s\n",
+			       remote_debug ? "on" : "off");
+			break;
+		case 'g':	/* return the value of the CPU registers */
+			get_gdb_regs(usethread, &regs, gdb_regs);
+			mem2hex((char *) gdb_regs,
+				remcomOutBuffer, NUMREGBYTES);
+			break;
+		case 'G':	/* set the value of the CPU registers - return OK */
+			hex2mem(&remcomInBuffer[1],
+				(char *) gdb_regs, NUMREGBYTES);
+			if (!usethread || usethread == current) {
+				gdb_regs_to_regs(gdb_regs, &regs);
+				strcpy(remcomOutBuffer, "OK");
+			} else {
+				strcpy(remcomOutBuffer, "E00");
+			}
+			break;
+
+		case 'P':{	/* set the value of a single CPU register -
+				   return OK */
+				/*
+				 * For some reason, gdb wants to talk about
+				 * pseudo registers (greater than 15).
+				 */
+				unsigned long regno;
+
+				ptr = &remcomInBuffer[1];
+				regs_to_gdb_regs(gdb_regs, &regs);
+				if ((!usethread || usethread == current) &&
+				    hexToLong(&ptr, &regno) &&
+				    *ptr++ == '=' && (regno >= 0)) {
+					if (regno >= NUMREGS)
+						break;
+					hex2mem(ptr, (char *) &gdb_regs[regno],
+						8);
+					gdb_regs_to_regs(gdb_regs, &regs);
+					strcpy(remcomOutBuffer, "OK");
+					break;
+				}
+				strcpy(remcomOutBuffer, "E01");
+				break;
+			}
+
+			/* mAA..AA,LLLL  Read LLLL bytes at address AA..AA */
+		case 'm':
+			/* TRY TO READ %x,%x.  IF SUCCEED, SET PTR = 0 */
+			ptr = &remcomInBuffer[1];
+			if (hexToLong(&ptr, &addr) &&
+			    (*(ptr++) == ',') && (hexToLong(&ptr, &length))) {
+				ptr = 0;
+				/*
+				 * hex doubles the byte count
+				 */
+				if (length > (BUFMAX / 2))
+					length = BUFMAX / 2;
+				if (mem2hex((char *) addr,
+					    remcomOutBuffer, length)) {
+					strcpy(remcomOutBuffer, "E03");
+					debug_error("memory fault\n", NULL);
+				}
+			}
+
+			if (ptr) {
+				strcpy(remcomOutBuffer, "E01");
+				debug_error
+				    ("malformed read memory command: %s\n",
+				     remcomInBuffer);
+			}
+			break;
+
+			/* MAA..AA,LLLL:
+			   Write LLLL bytes at address AA.AA return OK */
+		case 'M':
+			/* TRY TO READ '%x,%x:'.  IF SUCCEED, SET PTR = 0 */
+			ptr = &remcomInBuffer[1];
+			if (hexToLong(&ptr, &addr) &&
+			    (*(ptr++) == ',') &&
+			    (hexToLong(&ptr, &length)) && (*(ptr++) == ':')) {
+				if (hex2mem(ptr, (char *) addr, length)) {
+					strcpy(remcomOutBuffer, "E03");
+					debug_error("memory fault\n", NULL);
+				} else {
+					strcpy(remcomOutBuffer, "OK");
+				}
+
+				ptr = 0;
+			}
+			if (ptr) {
+				strcpy(remcomOutBuffer, "E02");
+				debug_error
+				    ("malformed write memory command: %s\n",
+				     remcomInBuffer);
+			}
+			break;
+		case 'S':
+			remcomInBuffer[0] = 's';
+		case 'C':
+			/* Csig;AA..AA where ;AA..AA is optional;
+			 * continue with signal.
+			 * Since signals are meaningless to us, delete that
+			 * part and then fall into the 'c' code.
+			 */
+			ptr = &remcomInBuffer[1];
+			length = 2;
+			while (*ptr && *ptr != ';') {
+				length++;
+				ptr++;
+			}
+			if (*ptr) {
+				do {
+					ptr++;
+					*(ptr - length++) = *ptr;
+				} while (*ptr);
+			} else {
+				remcomInBuffer[1] = 0;
+			}
+
+			/* cAA..AA  Continue at address AA..AA(optional) */
+			/* sAA..AA  Step one instruction from AA..AA(optional) */
+			/* D        detach, reply OK and then continue */
+		case 'c':
+		case 's':
+		case 'D':
+
+			/* try to read optional parameter,
+			   pc unchanged if no parm */
+			ptr = &remcomInBuffer[1];
+			if (hexToLong(&ptr, &addr)) {
+				if (remote_debug)
+					printk("Changing EIP to 0x%lx\n", addr);
+
+				regs.rip = addr;
+			}
+
+			newPC = regs.rip;
+
+			/* clear the trace bit */
+			regs.eflags &= 0xfffffeff;
+
+			/* set the trace bit if we're stepping */
+			if (remcomInBuffer[0] == 's')
+				regs.eflags |= 0x100;
+
+			/* detach is a friendly version of continue.  Note that
+			   debugging is still enabled (e.g. hit control C)
+			 */
+			if (remcomInBuffer[0] == 'D') {
+				strcpy(remcomOutBuffer, "OK");
+				putpacket(remcomOutBuffer);
+			}
+
+			if (remote_debug) {
+				printk("Resuming execution\n");
+				print_regs(&regs);
+			}
+			asm volatile ("movq %%db6, %0\n":"=r" (dr6)
+				      :);
+			if (!(dr6 & 0x4000)) {
+				for (breakno = 0; breakno < 4; ++breakno) {
+					if (dr6 & (1 << breakno) &&
+					    (breakinfo[breakno].type == 0)) {
+						/* Set restore flag */
+						regs.eflags |= 0x10000;
+						break;
+					}
+				}
+			}
+
+			if (kgdboe)
+				netpoll_set_trap(0);
+
+			correct_hw_break();
+			asm volatile ("movq %0, %%db6\n"::"r" (0UL));
+			goto exit_kgdb;
+
+			/* kill the program */
+		case 'k':	/* do nothing */
+			break;
+
+			/* query */
+		case 'q':
+			nothreads = 0;
+			switch (remcomInBuffer[1]) {
+			case 'f':
+				threadid = 1;
+				thread_list = 2;
+				thread_list_start = (usethread ?
: current); + case 's': + if (!cmp_str(&remcomInBuffer[2], + "ThreadInfo", 10)) + break; + + remcomOutBuffer[nothreads++] = 'm'; + for (; threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + nothreads += int_to_hex_v( + &remcomOutBuffer[ + nothreads], + threadid); + if (thread_min > threadid) + thread_min = threadid; + remcomOutBuffer[ + nothreads] = ','; + nothreads++; + if (nothreads > BUFMAX - 10) + break; + } + } + if (remcomOutBuffer[nothreads - 1] == 'm') { + remcomOutBuffer[nothreads - 1] = 'l'; + } else { + nothreads--; + } + remcomOutBuffer[nothreads] = 0; + break; + +#ifdef old_thread_list /* Old thread info request */ + case 'L': + /* List threads */ + thread_list = 2; + thread_list_start = (usethread ? : current); + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + do { + int buf_thread_limit = + (BUFMAX - 22) / BUF_THREAD_ID_SIZE; + if (maxthreads > buf_thread_limit) { + maxthreads = buf_thread_limit; + } + } while (0); + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + /* If start flag set start at 0. */ + if (remcomInBuffer[2] == '1') + threadid = 0; + else + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads && + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + if (thread_min > threadid) + thread_min = threadid; + } + } + + if (threadid == PID_MAX + MAX_NO_CPUS) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; +#endif + case 'C': + /* Current thread id */ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + if (!threadid) { + /* + * idle thread + */ + for (threadid = PID_MAX; + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + if (current == + idle_task(threadid - + PID_MAX)) + break; + } + } + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, + err_code, remcomOutBuffer); + break; + case 'T': + ptr = &remcomInBuffer[0]; + if (strncmp(ptr, "qThreadExtraInfo,", + strlen("qThreadExtraInfo,")) == 0) { + ptr += strlen("qThreadExtraInfo,"); + hexToLong(&ptr, &tmpid); + p = getthread(tmpid); + print_extra_info(p, lbuf); + mem2hex(lbuf, remcomOutBuffer, + strlen(lbuf)); + } + break; +#if 0 + case 'T':{ + char * nptr; + /* Thread extra info */ + if (!cmp_str(&remcomInBuffer[2], + "hreadExtraInfo,", 15)) { + break; + } + ptr = &remcomInBuffer[17]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + nptr = &thread->comm[0]; + length = 0; + ptr = &remcomOutBuffer[0]; + do { + length++; + ptr = pack_hex_byte(ptr, *nptr++); + } while (*nptr && length < 16); + /* + * would like that 16 to be the size of + * task_struct.comm but don't know the + * syntax.. 
+ */ + *ptr = 0; + } +#endif + } + break; + + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + case 'g': + ptr = &remcomInBuffer[2]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + /* + * Just in case I forget what this is all about, + * the "thread info" command to gdb causes it + * to ask for a thread list. It then switches + * to each thread and asks for the registers. + * For this (and only this) usage, we want to + * fudge the registers of tasks not on the run + * list (i.e. waiting) to show the routine that + * called schedule. Also, gdb, is a minimalist + * in that if the current thread is the last + * it will not re-read the info when done. + * This means that in this case we must show + * the real registers. So here is how we do it: + * Each entry we keep track of the min + * thread in the list (the last that gdb will) + * get info for. We also keep track of the + * starting thread. + * "thread_list" is cleared when switching back + * to the min thread if it is was current, or + * if it was not current, thread_list is set + * to 1. When the switch to current comes, + * if thread_list is 1, clear it, else do + * nothing. + */ + usethread = thread; + if ((thread_list == 1) && + (thread == thread_list_start)) { + thread_list = 0; + } + if (thread_list && (threadid == thread_min)) { + if (thread == thread_list_start) { + thread_list = 0; + } else { + thread_list = 1; + } + } + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + if (thread_min > threadid) + thread_min = threadid; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; + + case 'Y': /* set up a hardware breakpoint */ + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &breakno); + ptr++; + hexToLong(&ptr, &breaktype); + ptr++; + hexToLong(&ptr, &length); + ptr++; + hexToLong(&ptr, &addr); + if (set_hw_break(breakno & 0x3, + breaktype & 0x3, + length & 0x3, addr) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + case 'r': /* reboot */ + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + /*to_gdb("Rebooting\n"); */ + /* triplefault no return from here */ + { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt[0])); + BREAKPOINT; + } + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + } /* while(1==1) */ + /* + * reached by goto only. + */ + exit_kgdb: + /* + * Here is where we set up to trap a gdb function call. NEW_esp + * will be changed if we are trying to do this. We handle both + * adding and subtracting, thus allowing gdb to put grung on + * the stack which it removes later. 
+ */ + if (NEW_esp != OLD_esp) { + unsigned long *ptr = END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) + ptr -= (OLD_esp - NEW_esp) / sizeof (unsigned long); + *--ptr = linux_regs->eflags; + *--ptr = linux_regs->cs; + *--ptr = linux_regs->rip; + *--ptr = linux_regs->rcx; + *--ptr = linux_regs->rbx; + *--ptr = linux_regs->rax; + linux_regs->rcx = NEW_esp - (sizeof (unsigned long) * 6); + linux_regs->rbx = (unsigned long) END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) { + linux_regs->rip = (unsigned long) fn_call_stub; + } else { + linux_regs->rip = (unsigned long) fn_rtn_stub; + linux_regs->rax = NEW_esp; + } + linux_regs->eflags &= ~(IF_BIT | TF_BIT); + } +#ifdef CONFIG_SMP + /* + * Release gdb wait locks + * Sanity check time. Must have at least one cpu to run. Also single + * step must not be done if the current cpu is on hold. + */ + if (spinlock_count == 1) { + int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep; + int cpu_avail = 0; + int i; + + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!cpu_online(i)) + break; + if (!hold_cpu(i)) { + cpu_avail = 1; + } + } + /* + * Early in the bring up there will be NO cpus on line... + */ + if (!cpu_avail && !cpus_empty(cpu_online_map)) { + to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n"); + goto once_again; + } + if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) { + to_gdb + ("Current cpu must be unblocked to single step\n"); + goto once_again; + } + if (!(ss_hold)) { + int i; + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!hold_cpu(i)) { + spin_unlock(&waitlocks[i]); + } + } + } else { + spin_unlock(&waitlocks[smp_processor_id()]); + } + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + /* + * If this cpu is on hold, this is where we + * do it. Note, the NMI will pull us out of here, + * but will return as the above lock is not held. + * We will stay here till another cpu releases the lock for us. + */ + spin_unlock_wait(waitlocks + smp_processor_id()); + local_irq_restore(flags); + return (1); + } +#if 0 +exit_just_unlock: +#endif +#endif + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + local_irq_restore(flags); + return (1); +} + +#undef regs +static int kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) +{ + struct die_args *d = ptr; + + if (!kgdb_enabled || (cmd == DIE_DEBUG && user_mode(d->regs))) + return NOTIFY_DONE; + if (cmd == DIE_NMI_IPI) { + if (in_kgdb(d->regs)) + return NOTIFY_BAD; + } else if (kgdb_handle_exception(d->trapnr, d->signr, d->err, d->regs)) + return NOTIFY_BAD; /* skip */ + + return NOTIFY_DONE; +} + +static struct notifier_block kgdb_notifier = { + .notifier_call = kgdb_notify, + .priority = 0, +}; + +void set_debug_traps(void) +{ + static int initialized = 0; + + if (!initialized) { + initialized = 1; + notifier_chain_register(&die_chain, &kgdb_notifier); + } +} + +/* + * Provide the command line "gdb" initial break + */ +int __init kgdb_initial_break(char * str) +{ + if (*str == '\0'){ + breakpoint(); + return 1; + } + return 0; +} +__setup("gdb",kgdb_initial_break); + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. */ +/* But really, just use the BREAKPOINT macro. We will handle the int stuff + */ + +void breakpoint(void) +{ + + set_debug_traps(); + kgdb_enabled = 1; +#if 0 + /* + * These calls were not enough to allow breakpoint to be + * called before trap_init(). 
I moved the argument parsing
+	 * after trap_init() and it seems to work.
+	 */
+	set_intr_usr_gate(3, &int3);	/* disable ints on trap */
+	set_intr_gate(1, &debug);
+	set_intr_gate(14, &page_fault);
+#endif
+
+	BREAKPOINT;
+}
+
+#ifdef later
+/*
+ * possibly we should not go thru the traps.c code at all?  Someday.
+ */
+void
+do_kgdb_int3(struct pt_regs *regs, long error_code)
+{
+	kgdb_handle_exception(3, 5, error_code, regs);
+	return;
+}
+#endif
+#undef regs
+#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS
+asmlinkage void
+bad_sys_call_exit(int stuff)
+{
+	struct pt_regs *regs = (struct pt_regs *) &stuff;
+	printk("Sys call %d return with %x preempt_count\n",
+	       (int) regs->orig_eax, preempt_count());
+}
+#endif
+#ifdef CONFIG_STACK_OVERFLOW_TEST
+#include
+asmlinkage void
+stack_overflow(void)
+{
+#ifdef BREAKPOINT
+	BREAKPOINT;
+#else
+	printk("Kernel stack overflow, looping forever\n");
+#endif
+	while (1) {
+	}
+}
+#endif
+
+#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE)
+char gdbconbuf[BUFMAX];
+
+static void
+kgdb_gdb_message(const char *s, unsigned count)
+{
+	int i;
+	int wcount;
+	char *bufptr;
+	/*
+	 * This takes care of NMI while spinning out chars to gdb
+	 */
+	IF_SMP(in_kgdb_console = 1);
+	gdbconbuf[0] = 'O';
+	bufptr = gdbconbuf + 1;
+	while (count > 0) {
+		if ((count << 1) > (BUFMAX - 2)) {
+			wcount = (BUFMAX - 2) >> 1;
+		} else {
+			wcount = count;
+		}
+		count -= wcount;
+		for (i = 0; i < wcount; i++) {
+			bufptr = pack_hex_byte(bufptr, s[i]);
+		}
+		*bufptr = '\0';
+		s += wcount;
+
+		putpacket(gdbconbuf);
+
+	}
+	IF_SMP(in_kgdb_console = 0);
+}
+#endif
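kgdb_gdb_message() above wraps console output in the gdb remote protocol's 'O' (output) packet: a literal 'O' followed by the text as hex pairs, then framed by putpacket() as usual. A worked example (illustration only):

    /* The five bytes "Hello" become the packet payload "O48656c6c6f",
     * which putpacket() then frames and checksums:
     *
     *   $O48656c6c6f#<checksum>
     *
     * gdb decodes and prints the text on the host, then acks with '+'. */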
+#ifdef CONFIG_SMP
+static void
+to_gdb(const char *s)
+{
+	int count = 0;
+	while (s[count] && (count++ < BUFMAX)) ;
+	kgdb_gdb_message(s, count);
+}
+#endif
+#ifdef CONFIG_KGDB_CONSOLE
+#include
+#include
+#include
+#include
+
+void
+kgdb_console_write(struct console *co, const char *s, unsigned count)
+{
+
+	if (gdb_i386vector == -1) {
+		/*
+		 * We have not yet talked to gdb.  What to do...
+		 * Let's break; on continue we can do the write.
+		 * But first tell gdb what's up.  Uh, well, no can do,
+		 * as this IS the console.  Oh well...
+		 * We do need to wait or the messages will be lost.
+		 * Another option would be to tell the above code to
+		 * ignore this breakpoint and do an auto return,
+		 * but that might confuse gdb.  Also this happens
+		 * early enough in boot up that we don't have the traps
+		 * set up yet, so...
+		 */
+		breakpoint();
+	}
+	kgdb_gdb_message(s, count);
+}
+
+/*
+ * ------------------------------------------------------------
+ * Serial KGDB driver
+ * ------------------------------------------------------------
+ */
+
+static struct console kgdbcons = {
+	name:"kgdb",
+	write:kgdb_console_write,
+#ifdef CONFIG_KGDB_USER_CONSOLE
+	device:kgdb_console_device,
+#endif
+	flags:CON_PRINTBUFFER | CON_ENABLED,
+	index:-1,
+};
+
+/*
+ * The trick here is that this file gets linked before printk.o.
+ * That means we get to peer at the console info in the command
+ * line before it does.  If we are up, we register; otherwise,
+ * we do nothing.  By returning 0, we allow printk to look also.
+ */
+static int kgdb_console_enabled;
+
+int __init
+kgdb_console_init(char *str)
+{
+	if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) {
+		register_console(&kgdbcons);
+		kgdb_console_enabled = 1;
+	}
+	return 0;		/* let others look at the string */
+}
+
+__setup("console=", kgdb_console_init);
+
+#ifdef CONFIG_KGDB_USER_CONSOLE
+static kdev_t kgdb_console_device(struct console *c);
+/* This stuff sort of works, but it knocks out telnet devices.
+ * We are leaving it here in case we (or you) find time to figure
+ * it out better.
+ */
+
+/*
+ * We need a real char device as well for when the console is opened for user
+ * space activities.
+ */
+
+static int
+kgdb_consdev_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static ssize_t
+kgdb_consdev_write(struct file *file, const char *buf,
+		   size_t count, loff_t * ppos)
+{
+	int size, ret = 0;
+	static char kbuf[128];
+	static DECLARE_MUTEX(sem);
+
+	/* We are not reentrant... */
+	if (down_interruptible(&sem))
+		return -ERESTARTSYS;
+
+	while (count > 0) {
+		/* need to copy the data from user space */
+		size = count;
+		if (size > sizeof (kbuf))
+			size = sizeof (kbuf);
+		if (copy_from_user(kbuf, buf, size)) {
+			ret = -EFAULT;
+			break;
+		}
+		kgdb_console_write(&kgdbcons, kbuf, size);
+		count -= size;
+		ret += size;
+		buf += size;
+	}
+
+	up(&sem);
+
+	return ret;
+}
+
+struct file_operations kgdb_consdev_fops = {
+	open:kgdb_consdev_open,
+	write:kgdb_consdev_write
+};
+static kdev_t
+kgdb_console_device(struct console *c)
+{
+	return MKDEV(TTYAUX_MAJOR, 1);
+}
+
+/*
+ * This routine gets called from the serial stub in the i386/lib
+ * directory.  This is so it is done late in bring up (just before
+ * the console open).
+ */
+void
+kgdb_console_finit(void)
+{
+	if (kgdb_console_enabled) {
+		char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1));
+		char *cp = cptr;
+		while (*cptr && *cptr != '(')
+			cptr++;
+		*cptr = 0;
+		unregister_chrdev(TTYAUX_MAJOR, cp);
+		register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops);
+	}
+}
+#endif
+#endif
+#ifdef CONFIG_KGDB_TS
+#include		/* time stamp code */
+#include		/* in_interrupt */
+#ifdef CONFIG_KGDB_TS_64
+#define DATA_POINTS 64
+#endif
+#ifdef CONFIG_KGDB_TS_128
+#define DATA_POINTS 128
+#endif
+#ifdef CONFIG_KGDB_TS_256
+#define DATA_POINTS 256
+#endif
+#ifdef CONFIG_KGDB_TS_512
+#define DATA_POINTS 512
+#endif
+#ifdef CONFIG_KGDB_TS_1024
+#define DATA_POINTS 1024
+#endif
+#ifndef DATA_POINTS
+#define DATA_POINTS 128		/* must be a power of two */
+#endif
+#define INDEX_MASK (DATA_POINTS - 1)
+#if (INDEX_MASK & DATA_POINTS)
+#error "CONFIG_KGDB_TS_COUNT must be a power of 2"
+#endif
+struct kgdb_and_then_struct {
+#ifdef CONFIG_SMP
+	int on_cpu;
+#endif
+	struct task_struct *task;
+	long long at_time;
+	int from_ln;
+	char *in_src;
+	void *from;
+	int *with_shpf;
+	int data0;
+	int data1;
+};
+struct kgdb_and_then_struct2 {
+#ifdef CONFIG_SMP
+	int on_cpu;
+#endif
+	struct task_struct *task;
+	long long at_time;
+	int from_ln;
+	char *in_src;
+	void *from;
+	int *with_shpf;
+	struct task_struct *t1;
+	struct task_struct *t2;
+};
+struct kgdb_and_then_struct kgdb_data[DATA_POINTS];
+
+struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0];
+int kgdb_and_then_count;
+
+void
+kgdb_tstamp(int line, char *source, int data0, int data1)
+{
+	static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	spin_lock(&ts_spin);
+	rdtscll(kgdb_and_then->at_time);
+#ifdef CONFIG_SMP
+	kgdb_and_then->on_cpu = smp_processor_id();
+#endif
+	kgdb_and_then->task = current;
+	kgdb_and_then->from_ln = line;
+	kgdb_and_then->in_src = source;
+	kgdb_and_then->from = __builtin_return_address(0);
+	kgdb_and_then->with_shpf = (int *)(long)(((flags & IF_BIT) >> 9) |
+						 (preempt_count() << 8));
+	kgdb_and_then->data0 = data0;
+	kgdb_and_then->data1 = data1;
+	kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK];
+	spin_unlock(&ts_spin);
+	local_irq_restore(flags);
+#ifdef CONFIG_PREEMPT
+
+#endif
+	return;
+}
+#endif
+typedef int gdb_debug_hook(int exceptionVector,
+			   int signo, int err_code, struct pt_regs *linux_regs);
+gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception;	/* historical reasons... */
+
+static int kgdb_need_breakpoint[NR_CPUS];
+
+void kgdb_schedule_breakpoint(void)
+{
+	kgdb_need_breakpoint[smp_processor_id()] = 1;
+}
+
+void kgdb_process_breakpoint(void)
+{
+	/*
+	 * Handle a breakpoint queued from inside network driver code
+	 * to avoid reentrancy issues
+	 */
+	if (kgdb_need_breakpoint[smp_processor_id()]) {
+		kgdb_need_breakpoint[smp_processor_id()] = 0;
+		kgdb_enabled = 1;
+		BREAKPOINT;
+	}
+}
+
--- diff/arch/x86_64/lib/kgdb_serial.c	1970-01-01 01:00:00.000000000 +0100
+++ source/arch/x86_64/lib/kgdb_serial.c	2004-04-21 10:46:51.327779104 +0100
@@ -0,0 +1,490 @@
+/*
+ * Serial interface GDB stub
+ *
+ * Written (hacked together) by David Grothe (dave@gcom.com)
+ * Modified to allow invocation early in boot; see also
+ * kgdb.h for instructions by George Anzinger (george@mvista.com)
+ * Modified to handle debugging over ethernet by Robert Walsh
+ * and wangdi , based on
+ * code by San Mehat.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef CONFIG_KGDB_USER_CONSOLE
+extern void kgdb_console_finit(void);
+#endif
+#define PRNT_off
+#define TEST_EXISTANCE
+#ifdef PRNT
+#define dbprintk(s) printk s
+#else
+#define dbprintk(s)
+#endif
+#define TEST_INTERRUPT_off
+#ifdef TEST_INTERRUPT
+#define intprintk(s) printk s
+#else
+#define intprintk(s)
+#endif
+
+#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT)
+
+#define GDB_BUF_SIZE 512	/* power of 2, please */
+
+static char gdb_buf[GDB_BUF_SIZE];
+static int gdb_buf_in_inx;
+static atomic_t gdb_buf_in_cnt;
+static int gdb_buf_out_inx;
+
+struct async_struct *gdb_async_info;
+static int gdb_async_irq;
+
+#define outb_px(a,b) outb_p(b,a)
+
+static void program_uart(struct async_struct *info);
+static void write_char(struct async_struct *info, int chr);
+/*
+ * Get a byte from the hardware data buffer and return it
+ */
+static int
+read_data_bfr(struct async_struct *info)
+{
+	char it = inb_p(info->port + UART_LSR);
+
+	if (it & UART_LSR_DR)
+		return (inb_p(info->port + UART_RX));
+	/*
+	 * If we have a framing error assume somebody messed with
+	 * our uart.  Reprogram it and send '-' both ways...
+	 */
+	if (it & 0xc) {
+		program_uart(info);
+		write_char(info, '-');
+		return ('-');
+	}
+	return (-1);
+
+}				/* read_data_bfr */
+
+/*
+ * Get a char if available, return -1 if nothing available.
+ * Empty the receive buffer first, then look at the interface hardware.
+ *
+ * Locking here is a bit of a problem.  We MUST not lock out communication
+ * if we are trying to talk to gdb about a kgdb entry.  On the other hand
+ * we can lose chars in the console pass-thru if we don't lock.
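The receive queue set up above (gdb_buf with its in/out indices and an atomic count, drained by read_char() and filled by the interrupt handler further down) is a standard power-of-two ring: because GDB_BUF_SIZE is a power of two, wrapping an index is a single AND with GDB_BUF_SIZE - 1 rather than a modulo. A minimal stand-alone sketch of the same indexing scheme (names here are illustrative, not part of the patch):

#include <assert.h>
#include <stdio.h>

#define RING_SIZE 512			/* must be a power of two */
#define RING_MASK (RING_SIZE - 1)

static char ring[RING_SIZE];
static unsigned int in_inx, out_inx, count;

static void ring_put(char c)
{
	assert(count < RING_SIZE);	/* the interrupt handler tosses the oldest char instead */
	ring[in_inx++] = c;
	in_inx &= RING_MASK;		/* wrap with an AND, no divide */
	count++;
}

static int ring_get(void)
{
	int c;

	if (count == 0)
		return -1;		/* empty, as read_char() reports */
	c = ring[out_inx++];
	out_inx &= RING_MASK;
	count--;
	return c;
}

int main(void)
{
	int c;

	ring_put('g'); ring_put('d'); ring_put('b');
	while ((c = ring_get()) != -1)
		putchar(c);
	putchar('\n');
	return 0;
}

The kernel version keeps the element count in an atomic_t because the producer runs in interrupt context while the consumer may poll with interrupts enabled.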
It is also + * possible that we could hold the lock or be waiting for it when kgdb + * NEEDS to talk. Since kgdb locks down the world, it does not need locks. + * We do, of course have possible issues with interrupting a uart operation, + * but we will just depend on the uart status to help keep that straight. + + */ +static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +#endif + +static int +read_char(struct async_struct *info) +{ + int chr; + unsigned long flags; + local_irq_save(flags); +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_lock(&uart_interrupt_lock); + } +#endif + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + } else { + chr = read_data_bfr(info); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_unlock(&uart_interrupt_lock); + } +#endif + local_irq_restore(flags); + return (chr); +} + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(struct async_struct *info, int chr) +{ + while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ; + + outb_p(chr, info->port + UART_TX); + +} /* write_char */ + +/* + * Mostly we don't need a spinlock, but since the console goes + * thru here with interrutps on, well, we need to catch those + * chars. + */ +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine tty_getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via tty_putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static irqreturn_t +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct async_struct *info; + unsigned long flags; + + info = gdb_async_info; + if (!info || !info->tty || irq != gdb_async_irq) + return IRQ_NONE; + + local_irq_save(flags); + spin_lock(&uart_interrupt_lock); + do { + int chr = read_data_bfr(info); + intprintk(("Debug char on int: %x hex\n", chr)); + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + BREAKPOINT; + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { + /* buffer overflow tosses early char */ + read_char(info); + } + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI); + spin_unlock(&uart_interrupt_lock); + local_irq_restore(flags); + return IRQ_HANDLED; +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. 
+ */
+void
+gdb_null(void)
+{
+}				/* gdb_null */
+
+/* These structures are filled in with values defined in asm/kgdb_local.h
+ */
+static struct serial_state state = SB_STATE;
+static struct async_struct local_info = SB_INFO;
+static int ok_to_enable_ints = 0;
+static void kgdb_enable_ints_now(void);
+
+extern char *kgdb_version;
+/*
+ * Hook an IRQ for KGDB.
+ *
+ * This routine is called from tty_putDebugChar, below.
+ */
+static int ints_disabled = 1;
+int
+gdb_hook_interrupt(struct async_struct *info, int verb)
+{
+	struct serial_state *state = info->state;
+	unsigned long flags;
+	int port;
+#ifdef TEST_EXISTANCE
+	int scratch, scratch2;
+#endif
+
+	/* The above fails if memory management is not set up yet.
+	 * Rather than fail the setup, just keep track of the fact
+	 * and pick up the interrupt thing later.
+	 */
+	gdb_async_info = info;
+	port = gdb_async_info->port;
+	gdb_async_irq = state->irq;
+	if (verb) {
+		printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n",
+		       kgdb_version,
+		       port,
+		       gdb_async_irq, gdb_async_info->state->custom_divisor);
+	}
+	local_irq_save(flags);
+#ifdef TEST_EXISTANCE
+	/* Existence test */
+	/* Should not need all this, but just in case.... */
+
+	scratch = inb_p(port + UART_IER);
+	outb_px(port + UART_IER, 0);
+	outb_px(0xff, 0x080);
+	scratch2 = inb_p(port + UART_IER);
+	outb_px(port + UART_IER, scratch);
+	if (scratch2) {
+		printk
+		    ("gdb_hook_interrupt: Could not clear IER, not a UART!\n");
+		local_irq_restore(flags);
+		return 1;	/* We failed; there's nothing here */
+	}
+	scratch2 = inb_p(port + UART_LCR);
+	outb_px(port + UART_LCR, 0xBF);	/* set up for StarTech test */
+	outb_px(port + UART_EFR, 0);	/* EFR is the same as FCR */
+	outb_px(port + UART_LCR, 0);
+	outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO);
+	scratch = inb_p(port + UART_IIR) >> 6;
+	if (scratch == 1) {
+		printk("gdb_hook_interrupt: Undefined UART type!"
+		       " Not a UART! \n");
+		local_irq_restore(flags);
+		return 1;
+	} else {
+		dbprintk(("gdb_hook_interrupt: UART type "
+			  "is %d where 0=16450, 2=16550 3=16550A\n", scratch));
+	}
+	scratch = inb_p(port + UART_MCR);
+	outb_px(port + UART_MCR, UART_MCR_LOOP | scratch);
+	outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A);
+	scratch2 = inb_p(port + UART_MSR) & 0xF0;
+	outb_px(port + UART_MCR, scratch);
+	if (scratch2 != 0x90) {
+		printk("gdb_hook_interrupt: "
+		       "Loop back test failed!
Not a UART!\n"); + local_irq_restore(flags); + return scratch2 + 1000; /* force 0 to fail */ + } +#endif /* test existance */ + program_uart(info); + local_irq_restore(flags); + + return (0); + +} /* gdb_hook_interrupt */ + +static void +program_uart(struct async_struct *info) +{ + int port = info->port; + + (void) inb_p(port + UART_RX); + outb_px(port + UART_IER, 0); + + (void) inb_p(port + UART_RX); /* serial driver comments say */ + (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */ + (void) inb_p(port + UART_MSR); + outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB); + outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */ + outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */ + outb_px(port + UART_MCR, info->MCR); + + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */ + outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */ + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */ + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + return; +} + +/* + * tty_getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. In the + */ +int kgdb_in_isr = 0; +int kgdb_in_lsr = 0; +extern spinlock_t kgdb_spinlock; + +/* Caller takes needed protections */ + +int +tty_getDebugChar(void) +{ + volatile int chr, dum, time, end_time; + + dbprintk(("tty_getDebugChar(port %x): ", gdb_async_info->port)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + /* + * This trick says if we wait a very long time and get + * no char, return the -1 and let the upper level deal + * with it. + */ + rdtsc(dum, time); + end_time = time + 2; + while (((chr = read_char(gdb_async_info)) == -1) && + (end_time - time) > 0) { + rdtsc(dum, time); + }; + /* + * This covers our butts if some other code messes with + * our uart, hay, it happens :o) + */ + if (chr == -1) + program_uart(gdb_async_info); + + dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? chr : ' ')); + return (chr); + +} /* tty_getDebugChar */ + +static int count = 3; +static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; + +static int __init +kgdb_enable_ints(void) +{ + set_debug_traps(); + if (kgdboe) { + return 0; + } + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 1); + } + ok_to_enable_ints = 1; + kgdb_enable_ints_now(); +#ifdef CONFIG_KGDB_USER_CONSOLE + kgdb_console_finit(); +#endif + return 0; +} + +#ifdef CONFIG_SERIAL_8250 +void shutdown_for_kgdb(struct async_struct *gdb_async_info); +#endif + +#define kgdb_mem_init_done() (1) + +static void +kgdb_enable_ints_now(void) +{ + if (!spin_trylock(&one_at_atime)) + return; + if (!ints_disabled) + goto exit; + if (kgdb_mem_init_done() && + ints_disabled) { /* don't try till mem init */ +#ifdef CONFIG_SERIAL_8250 + /* + * The ifdef here allows the system to be configured + * without the serial driver. 
+ * Don't make it a module, however, it will steal the port + */ + shutdown_for_kgdb(gdb_async_info); +#endif + ints_disabled = request_irq(gdb_async_info->state->irq, + gdb_interrupt, + IRQ_T(gdb_async_info), + "KGDB-stub", NULL); + intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); + } + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + exit: + spin_unlock(&one_at_atime); +} + +/* + * tty_putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. Caller takes needed protections. + */ +void +tty_putDebugChar(int chr) +{ + dbprintk(("tty_putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n", + gdb_async_info->port, + chr, + chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + + write_char(gdb_async_info, chr); /* this routine will wait */ + count = (chr == '#') ? 0 : count + 1; + if ((count == 2)) { /* try to enable after */ + if (ints_disabled & ok_to_enable_ints) + kgdb_enable_ints_now(); /* try to enable after */ + + /* We do this a lot because, well we really want to get these + * interrupts. The serial driver will clear these bits when it + * initializes the chip. Every thing else it does is ok, + * but this. + */ + if (!ints_disabled) { + outb_px(gdb_async_info->port + UART_IER, + gdb_async_info->IER); + } + } + +} /* tty_putDebugChar */ + +/* + * This does nothing for the serial port, since it doesn't buffer. + */ + +void tty_flushDebugChar(void) +{ +} + +module_init(kgdb_enable_ints); --- diff/drivers/char/drm/drm_irq.h 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/char/drm/drm_irq.h 2004-04-21 10:46:51.392769224 +0100 @@ -0,0 +1,372 @@ +/** + * \file drm_irq.h + * IRQ support + * + * \author Rickard E. (Rik) Faith + * \author Gareth Hughes + */ + +/* + * Created: Fri Mar 19 14:30:16 1999 by faith@valinux.com + * + * Copyright 1999, 2000 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#define __NO_VERSION__ +#include "drmP.h" + +#include /* For task queue support */ + +#ifndef __HAVE_SHARED_IRQ +#define __HAVE_SHARED_IRQ 0 +#endif + +#if __HAVE_SHARED_IRQ +#define DRM_IRQ_TYPE SA_SHIRQ +#else +#define DRM_IRQ_TYPE 0 +#endif + +/** + * Get interrupt from bus id. + * + * \param inode device inode. + * \param filp file pointer. + * \param cmd command. + * \param arg user argument, pointing to a drm_irq_busid structure. + * \return zero on success or a negative number on failure. + * + * Finds the PCI device with the specified bus id and gets its IRQ number. + * This IOCTL is deprecated, and will now return EINVAL for any busid not equal + * to that of the device that this DRM instance attached to. + */ +int DRM(irq_by_busid)(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + drm_file_t *priv = filp->private_data; + drm_device_t *dev = priv->dev; + drm_irq_busid_t p; + + if (copy_from_user(&p, (drm_irq_busid_t *)arg, sizeof(p))) + return -EFAULT; + + if ((p.busnum >> 8) != dev->pci_domain || + (p.busnum & 0xff) != dev->pci_bus || + p.devnum != dev->pci_slot || + p.funcnum != dev->pci_func) + return -EINVAL; + + p.irq = dev->irq; + + DRM_DEBUG("%d:%d:%d => IRQ %d\n", + p.busnum, p.devnum, p.funcnum, p.irq); + if (copy_to_user((drm_irq_busid_t *)arg, &p, sizeof(p))) + return -EFAULT; + return 0; +} + +#if __HAVE_IRQ + +/** + * Install IRQ handler. + * + * \param dev DRM device. + * \param irq IRQ number. + * + * Initializes the IRQ related data, and setups drm_device::vbl_queue. Installs the handler, calling the driver + * \c DRM(driver_irq_preinstall)() and \c DRM(driver_irq_postinstall)() functions + * before and after the installation. + */ +int DRM(irq_install)( drm_device_t *dev ) +{ + int ret; + + if ( dev->irq == 0 ) + return -EINVAL; + + down( &dev->struct_sem ); + + /* Driver must have been initialized */ + if ( !dev->dev_private ) { + up( &dev->struct_sem ); + return -EINVAL; + } + + if ( dev->irq_enabled ) { + up( &dev->struct_sem ); + return -EBUSY; + } + dev->irq_enabled = 1; + up( &dev->struct_sem ); + + DRM_DEBUG( "%s: irq=%d\n", __FUNCTION__, dev->irq ); + +#if __HAVE_DMA + dev->dma->next_buffer = NULL; + dev->dma->next_queue = NULL; + dev->dma->this_buffer = NULL; +#endif + +#if __HAVE_IRQ_BH + INIT_WORK(&dev->work, DRM(irq_immediate_bh), dev); +#endif + +#if __HAVE_VBL_IRQ + init_waitqueue_head(&dev->vbl_queue); + + spin_lock_init( &dev->vbl_lock ); + + INIT_LIST_HEAD( &dev->vbl_sigs.head ); + + dev->vbl_pending = 0; +#endif + + /* Before installing handler */ + DRM(driver_irq_preinstall)(dev); + + /* Install handler */ + ret = request_irq( dev->irq, DRM(irq_handler), + DRM_IRQ_TYPE, dev->devname, dev ); + if ( ret < 0 ) { + down( &dev->struct_sem ); + dev->irq_enabled = 0; + up( &dev->struct_sem ); + return ret; + } + + /* After installing handler */ + DRM(driver_irq_postinstall)(dev); + + return 0; +} + +/** + * Uninstall the IRQ handler. + * + * \param dev DRM device. + * + * Calls the driver's \c DRM(driver_irq_uninstall)() function, and stops the irq. + */ +int DRM(irq_uninstall)( drm_device_t *dev ) +{ + int irq_enabled; + + down( &dev->struct_sem ); + irq_enabled = dev->irq_enabled; + dev->irq_enabled = 0; + up( &dev->struct_sem ); + + if ( !irq_enabled ) + return -EINVAL; + + DRM_DEBUG( "%s: irq=%d\n", __FUNCTION__, dev->irq ); + + DRM(driver_irq_uninstall)( dev ); + + free_irq( dev->irq, dev ); + + return 0; +} + +/** + * IRQ control ioctl. + * + * \param inode device inode. + * \param filp file pointer. 
+ * \param cmd command. + * \param arg user argument, pointing to a drm_control structure. + * \return zero on success or a negative number on failure. + * + * Calls irq_install() or irq_uninstall() according to \p arg. + */ +int DRM(control)( struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg ) +{ + drm_file_t *priv = filp->private_data; + drm_device_t *dev = priv->dev; + drm_control_t ctl; + + if ( copy_from_user( &ctl, (drm_control_t *)arg, sizeof(ctl) ) ) + return -EFAULT; + + switch ( ctl.func ) { + case DRM_INST_HANDLER: + if (dev->if_version < DRM_IF_VERSION(1, 2) && + ctl.irq != dev->irq) + return -EINVAL; + return DRM(irq_install)( dev ); + case DRM_UNINST_HANDLER: + return DRM(irq_uninstall)( dev ); + default: + return -EINVAL; + } +} + +#if __HAVE_VBL_IRQ + +/** + * Wait for VBLANK. + * + * \param inode device inode. + * \param filp file pointer. + * \param cmd command. + * \param data user argument, pointing to a drm_wait_vblank structure. + * \return zero on success or a negative number on failure. + * + * Verifies the IRQ is installed. + * + * If a signal is requested checks if this task has already scheduled the same signal + * for the same vblank sequence number - nothing to be done in + * that case. If the number of tasks waiting for the interrupt exceeds 100 the + * function fails. Otherwise adds a new entry to drm_device::vbl_sigs for this + * task. + * + * If a signal is not requested, then calls vblank_wait(). + */ +int DRM(wait_vblank)( DRM_IOCTL_ARGS ) +{ + drm_file_t *priv = filp->private_data; + drm_device_t *dev = priv->dev; + drm_wait_vblank_t vblwait; + struct timeval now; + int ret = 0; + unsigned int flags; + + if (!dev->irq) + return -EINVAL; + + DRM_COPY_FROM_USER_IOCTL( vblwait, (drm_wait_vblank_t *)data, + sizeof(vblwait) ); + + switch ( vblwait.request.type & ~_DRM_VBLANK_FLAGS_MASK ) { + case _DRM_VBLANK_RELATIVE: + vblwait.request.sequence += atomic_read( &dev->vbl_received ); + vblwait.request.type &= ~_DRM_VBLANK_RELATIVE; + case _DRM_VBLANK_ABSOLUTE: + break; + default: + return -EINVAL; + } + + flags = vblwait.request.type & _DRM_VBLANK_FLAGS_MASK; + + if ( flags & _DRM_VBLANK_SIGNAL ) { + unsigned long irqflags; + drm_vbl_sig_t *vbl_sig; + + vblwait.reply.sequence = atomic_read( &dev->vbl_received ); + + spin_lock_irqsave( &dev->vbl_lock, irqflags ); + + /* Check if this task has already scheduled the same signal + * for the same vblank sequence number; nothing to be done in + * that case + */ + list_for_each_entry( vbl_sig, &dev->vbl_sigs.head, head ) { + if (vbl_sig->sequence == vblwait.request.sequence + && vbl_sig->info.si_signo == vblwait.request.signal + && vbl_sig->task == current) + { + spin_unlock_irqrestore( &dev->vbl_lock, irqflags ); + goto done; + } + } + + if ( dev->vbl_pending >= 100 ) { + spin_unlock_irqrestore( &dev->vbl_lock, irqflags ); + return -EBUSY; + } + + dev->vbl_pending++; + + spin_unlock_irqrestore( &dev->vbl_lock, irqflags ); + + if ( !( vbl_sig = DRM_MALLOC( sizeof( drm_vbl_sig_t ) ) ) ) { + return -ENOMEM; + } + + memset( (void *)vbl_sig, 0, sizeof(*vbl_sig) ); + + vbl_sig->sequence = vblwait.request.sequence; + vbl_sig->info.si_signo = vblwait.request.signal; + vbl_sig->task = current; + + spin_lock_irqsave( &dev->vbl_lock, irqflags ); + + list_add_tail( (struct list_head *) vbl_sig, &dev->vbl_sigs.head ); + + spin_unlock_irqrestore( &dev->vbl_lock, irqflags ); + } else { + ret = DRM(vblank_wait)( dev, &vblwait.request.sequence ); + + do_gettimeofday( &now ); + vblwait.reply.tval_sec = 
now.tv_sec; + vblwait.reply.tval_usec = now.tv_usec; + } + +done: + DRM_COPY_TO_USER_IOCTL( (drm_wait_vblank_t *)data, vblwait, + sizeof(vblwait) ); + + return ret; +} + +/** + * Send the VBLANK signals. + * + * \param dev DRM device. + * + * Sends a signal for each task in drm_device::vbl_sigs and empties the list. + * + * If a signal is not requested, then calls vblank_wait(). + */ +void DRM(vbl_send_signals)( drm_device_t *dev ) +{ + struct list_head *list, *tmp; + drm_vbl_sig_t *vbl_sig; + unsigned int vbl_seq = atomic_read( &dev->vbl_received ); + unsigned long flags; + + spin_lock_irqsave( &dev->vbl_lock, flags ); + + list_for_each_safe( list, tmp, &dev->vbl_sigs.head ) { + vbl_sig = list_entry( list, drm_vbl_sig_t, head ); + if ( ( vbl_seq - vbl_sig->sequence ) <= (1<<23) ) { + vbl_sig->info.si_code = vbl_seq; + send_sig_info( vbl_sig->info.si_signo, &vbl_sig->info, vbl_sig->task ); + + list_del( list ); + + DRM_FREE( vbl_sig, sizeof(*vbl_sig) ); + + dev->vbl_pending--; + } + } + + spin_unlock_irqrestore( &dev->vbl_lock, flags ); +} + +#endif /* __HAVE_VBL_IRQ */ + +#endif /* __HAVE_IRQ */ --- diff/drivers/net/kgdb_eth.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/net/kgdb_eth.c 2004-04-21 10:46:51.521749616 +0100 @@ -0,0 +1,132 @@ +/* + * Network interface GDB stub + * + * Written by San Mehat (nettwerk@biodome.org) + * Based upon 'gdbserial' by David Grothe (dave@gcom.com) + * and Scott Foehner (sfoehner@engr.sgi.com) + * + * Twiddled for 2.6 by Robert Walsh + * and wangdi . + * + * Refactored for netpoll API by Matt Mackall + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define IN_BUF_SIZE 512 /* power of 2, please */ +#define OUT_BUF_SIZE 256 + +static char in_buf[IN_BUF_SIZE], out_buf[OUT_BUF_SIZE]; +static int in_head, in_tail, out_count; +static atomic_t in_count; +int kgdboe = 0; /* Default to tty mode */ + +extern void set_debug_traps(void); +extern void breakpoint(void); +static void rx_hook(struct netpoll *np, int port, char *msg, int len); + +static struct netpoll np = { + .name = "kgdboe", + .dev_name = "eth0", + .rx_hook = rx_hook, + .local_port = 6443, + .remote_port = 6442, + .remote_mac = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, +}; +static int configured; + +int eth_getDebugChar(void) +{ + int chr; + + while (atomic_read(&in_count) == 0) + netpoll_poll(&np); + + chr = in_buf[in_tail++]; + in_tail &= (IN_BUF_SIZE - 1); + atomic_dec(&in_count); + return chr; +} + +void eth_flushDebugChar(void) +{ + if(out_count && np.dev) { + netpoll_send_udp(&np, out_buf, out_count); + out_count = 0; + } +} + +void eth_putDebugChar(int chr) +{ + out_buf[out_count++] = chr; + if(out_count == OUT_BUF_SIZE) + eth_flushDebugChar(); +} + +static void rx_hook(struct netpoll *np, int port, char *msg, int len) +{ + int i; + + np->remote_port = port; + + /* Is this gdb trying to attach? 
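For orientation, GDB's remote protocol frames every packet as $<payload>#<checksum>, where the checksum is the payload bytes summed modulo 256, printed as two lowercase hex digits; the "$Hc-1#09" tested just below is the Hc-1 packet (set the thread for continue operations to "any"), which gdb sends when attaching. A stand-alone check of that framing (illustrative, not part of the patch):

#include <stdio.h>

/* sum the payload bytes modulo 256, per the GDB remote protocol */
static unsigned char gdb_csum(const char *payload)
{
	unsigned char sum = 0;

	while (*payload)
		sum += (unsigned char)*payload++;
	return sum;
}

int main(void)
{
	/* 'H'+'c'+'-'+'1' = 0x109; the low byte 0x09 gives "$Hc-1#09" */
	printf("$%s#%02x\n", "Hc-1", gdb_csum("Hc-1"));
	return 0;
}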
*/
+	if (!netpoll_trap() && len == 8 && !strncmp(msg, "$Hc-1#09", 8))
+		kgdb_schedule_breakpoint();
+
+	for (i = 0; i < len; i++) {
+		if (msg[i] == 3)
+			kgdb_schedule_breakpoint();
+
+		if (atomic_read(&in_count) >= IN_BUF_SIZE) {
+			/* buffer overflow, clear it */
+			in_head = in_tail = 0;
+			atomic_set(&in_count, 0);
+			break;
+		}
+		in_buf[in_head++] = msg[i];
+		in_head &= (IN_BUF_SIZE - 1);
+		atomic_inc(&in_count);
+	}
+}
+
+static int option_setup(char *opt)
+{
+	configured = !netpoll_parse_options(&np, opt);
+	return 0;
+}
+__setup("kgdboe=", option_setup);
+
+static int init_kgdboe(void)
+{
+#ifdef CONFIG_SMP
+	if (num_online_cpus() > CONFIG_NO_KGDB_CPUS) {
+		printk("kgdb: too many cpus. Cannot enable debugger with more than %d cpus\n", CONFIG_NO_KGDB_CPUS);
+		return -1;
+	}
+#endif
+
+	set_debug_traps();
+
+	if (!configured || netpoll_setup(&np))
+		return 1;
+
+	kgdboe = 1;
+	printk(KERN_INFO "kgdb: debugging over ethernet enabled\n");
+
+	return 0;
+}
+
+module_init(init_kgdboe);
--- diff/fs/cifs/fcntl.c	1970-01-01 01:00:00.000000000 +0100
+++ source/fs/cifs/fcntl.c	2004-04-21 10:46:51.676726056 +0100
@@ -0,0 +1,97 @@
+/*
+ *   fs/cifs/fcntl.c
+ *
+ *   vfs operations that deal with the file control API
+ *
+ *   Copyright (C) International Business Machines  Corp., 2003,2004
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include
+#include
+#include
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_unicode.h"
+#include "cifs_debug.h"
+
+int cifs_directory_notify(unsigned long arg, struct file * file)
+{
+	int xid;
+	int rc = -EINVAL;
+	struct cifs_sb_info *cifs_sb;
+	struct cifsTconInfo *pTcon;
+	char *full_path = NULL;
+
+	xid = GetXid();
+	cifs_sb = CIFS_SB(file->f_dentry->d_sb);
+	pTcon = cifs_sb->tcon;
+	full_path = build_path_from_dentry(file->f_dentry);
+	cFYI(1, ("cifs dir notify on file %s", full_path));
+	/* CIFSSMBNotify */
+	FreeXid(xid);
+	return rc;
+}
+
+
+long cifs_fcntl(int file_desc, unsigned int command, unsigned long arg,
+		struct file * file)
+{
+	/* A few file control functions need to be specially mapped.  So far
+	   only:
+	     F_NOTIFY (for directory change notification)
+	   and eventually:
+	     F_GETLEASE
+	     F_SETLEASE
+	   need to be mapped here (see the dnotify usage sketch just below).
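The local dnotify call that cifs_fcntl() lets run first can be driven from user space as follows — a hedged sketch with error handling omitted (F_NOTIFY and the DN_* flags are the standard Linux dnotify interface, nothing cifs-specific):

#define _GNU_SOURCE		/* for F_NOTIFY and DN_* */
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void on_change(int sig)
{
	/* the watched directory changed */
}

int main(void)
{
	int fd = open(".", O_RDONLY);

	signal(SIGIO, on_change);
	/* ask for create/modify events and keep the watch armed */
	fcntl(fd, F_NOTIFY, DN_CREATE | DN_MODIFY | DN_MULTISHOT);
	pause();		/* SIGIO fires on each change */
	return 0;
}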
The others either already are mapped downstream
+	   or do not need to go to the server (client-only side effects):
+	     F_DUPFD:
+	     F_GETFD:
+	     F_SETFD:
+	     F_GETFL:
+	     F_SETFL:
+	     F_GETLK:
+	     F_SETLK:
+	     F_SETLKW:
+	     F_GETOWN:
+	     F_SETOWN:
+	     F_GETSIG:
+	     F_SETSIG:
+	*/
+	long rc = 0;
+
+	cFYI(1, ("cifs_fcntl: command %d with arg %lx", command, arg));	/* BB removeme BB */
+
+	switch (command) {
+	case F_NOTIFY:
+		/* let the local call have a chance to fail first */
+		rc = generic_file_fcntl(file_desc, command, arg, file);
+		if (rc)
+			return rc;
+		else {
+			/* local call succeeded; try to do remote notify to
+			   pick up changes from other clients to server file */
+			cifs_directory_notify(arg, file);
+			/* BB add case to long and return rc from above */
+			return rc;
+		}
+		break;
+	default:
+		break;
+	}
+	return generic_file_fcntl(file_desc, command, arg, file);
+}
+
--- diff/include/asm-alpha/lockmeter.h	1970-01-01 01:00:00.000000000 +0100
+++ source/include/asm-alpha/lockmeter.h	2004-04-21 10:46:51.726718456 +0100
@@ -0,0 +1,84 @@
+/*
+ *  Written by John Hawkes (hawkes@sgi.com)
+ *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
+ *
+ *  Modified by Peter Rival (frival@zk3.dec.com)
+ */
+
+#ifndef _ALPHA_LOCKMETER_H
+#define _ALPHA_LOCKMETER_H
+
+#include
+#define CPU_CYCLE_FREQUENCY	hwrpb->cycle_freq
+
+#define get_cycles64()		get_cycles()
+
+#define THIS_CPU_NUMBER		smp_processor_id()
+
+#include
+
+#define SPINLOCK_MAGIC_INIT	/**/
+
+/*
+ * Macros to cache and retrieve an index value inside of a lock.
+ * These macros assume that there are less than 65536 simultaneous
+ * (read mode) holders of a rwlock.
+ * We also assume that the hash table has less than 32767 entries.
+ * The high order bit is used for write locking a rw_lock.
+ * Note: although these defines and macros are the same as what is being used
+ *       in include/asm-i386/lockmeter.h, they are present here to easily
+ *       allow an alternate Alpha implementation.
+ */
+/*
+ * instrumented spinlock structure -- never used to allocate storage
+ * only used in macros below to overlay a spinlock_t
+ */
+typedef struct inst_spinlock_s {
+	/* remember, Alpha is little endian */
+	unsigned short lock;
+	unsigned short index;
+} inst_spinlock_t;
+#define PUT_INDEX(lock_ptr,indexv)	((inst_spinlock_t *)(lock_ptr))->index = indexv
+#define GET_INDEX(lock_ptr)		((inst_spinlock_t *)(lock_ptr))->index
+
+/*
+ * macros to cache and retrieve an index value in a read/write lock
+ * as well as the cpu where a reader busy period started
+ * we use the 2nd word (the debug word) for this, so require the
+ * debug word to be present
+ */
+/*
+ * instrumented rwlock structure -- never used to allocate storage
+ * only used in macros below to overlay a rwlock_t
+ */
+typedef struct inst_rwlock_s {
+	volatile int lock;
+	unsigned short index;
+	unsigned short cpu;
+} inst_rwlock_t;
+#define PUT_RWINDEX(rwlock_ptr,indexv)	((inst_rwlock_t *)(rwlock_ptr))->index = indexv
+#define GET_RWINDEX(rwlock_ptr)		((inst_rwlock_t *)(rwlock_ptr))->index
+#define PUT_RW_CPU(rwlock_ptr,cpuv)	((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv
+#define GET_RW_CPU(rwlock_ptr)		((inst_rwlock_t *)(rwlock_ptr))->cpu
+
+/*
+ * return true if rwlock is write locked
+ * (note that other lock attempts can cause the lock value to be negative)
+ */
+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr)	(((inst_rwlock_t *)rwlock_ptr)->lock & 1)
+#define IABS(x)		((x) > 0 ?
(x) : -(x)) + +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) ((inst_rwlock_t *)rwlock_ptr)->lock; + /* readers subtract 2, so we have to: */ + /* - andnot off a possible writer (bit 0) */ + /* - get the absolute value */ + /* - divide by 2 (right shift by one) */ + /* to find the number of readers */ + if (tmp == 0) return(0); + else return(IABS(tmp & ~1)>>1); +} + +#endif /* _ALPHA_LOCKMETER_H */ --- diff/include/asm-alpha/setup.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/asm-alpha/setup.h 2004-04-21 10:46:51.731717696 +0100 @@ -0,0 +1,6 @@ +#ifndef __ALPHA_SETUP_H +#define __ALPHA_SETUP_H + +#define COMMAND_LINE_SIZE 256 + +#endif --- diff/include/asm-i386/kgdb.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/asm-i386/kgdb.h 2004-04-21 10:46:51.738716632 +0100 @@ -0,0 +1,69 @@ +#ifndef __KGDB +#define __KGDB + +/* + * This file should not include ANY others. This makes it usable + * most anywhere without the fear of include order or inclusion. + * Make it so! + * + * This file may be included all the time. It is only active if + * CONFIG_KGDB is defined, otherwise it stubs out all the macros + * and entry points. + */ +#if defined(CONFIG_KGDB) && !defined(__ASSEMBLY__) + +extern void breakpoint(void); +#define INIT_KGDB_INTS kgdb_enable_ints() + +#ifndef BREAKPOINT +#define BREAKPOINT asm(" int $3") +#endif + +extern void kgdb_schedule_breakpoint(void); +extern void kgdb_process_breakpoint(void); + +extern int kgdb_tty_hook(void); +extern int kgdb_eth_hook(void); +extern int kgdboe; + +/* + * GDB debug stub (or any debug stub) can point the 'linux_debug_hook' + * pointer to its routine and it will be entered as the first thing + * when a trap occurs. + * + * Return values are, at present, undefined. + * + * The debug hook routine does not necessarily return to its caller. + * It has the register image and thus may choose to resume execution + * anywhere it pleases. + */ +struct pt_regs; + +extern int kgdb_handle_exception(int trapno, + int signo, int err_code, struct pt_regs *regs); +extern int in_kgdb(struct pt_regs *regs); + +#ifdef CONFIG_KGDB_TS +void kgdb_tstamp(int line, char *source, int data0, int data1); +/* + * This is the time stamp function. The macro adds the source info and + * does a cast on the data to allow most any 32-bit value. + */ + +#define kgdb_ts(data0,data1) kgdb_tstamp(__LINE__,__FILE__,(int)data0,(int)data1) +#else +#define kgdb_ts(data0,data1) +#endif +#else /* CONFIG_KGDB && ! __ASSEMBLY__ ,stubs follow... 
*/ +#ifndef BREAKPOINT +#define BREAKPOINT +#endif +#define kgdb_ts(data0,data1) +#define in_kgdb +#define kgdb_handle_exception +#define breakpoint +#define INIT_KGDB_INTS +#define kgdb_process_breakpoint() do {} while(0) + +#endif +#endif /* __KGDB */ --- diff/include/asm-i386/kgdb_local.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/asm-i386/kgdb_local.h 2004-04-21 10:46:51.738716632 +0100 @@ -0,0 +1,102 @@ +#ifndef __KGDB_LOCAL +#define ___KGDB_LOCAL +#include +#include +#include +#include +#include +#include +#include +#include + +#define PORT 0x3f8 +#ifdef CONFIG_KGDB_PORT +#undef PORT +#define PORT CONFIG_KGDB_PORT +#endif +#define IRQ 4 +#ifdef CONFIG_KGDB_IRQ +#undef IRQ +#define IRQ CONFIG_KGDB_IRQ +#endif +#define SB_CLOCK 1843200 +#define SB_BASE (SB_CLOCK/16) +#define SB_BAUD9600 SB_BASE/9600 +#define SB_BAUD192 SB_BASE/19200 +#define SB_BAUD384 SB_BASE/38400 +#define SB_BAUD576 SB_BASE/57600 +#define SB_BAUD1152 SB_BASE/115200 +#ifdef CONFIG_KGDB_9600BAUD +#define SB_BAUD SB_BAUD9600 +#endif +#ifdef CONFIG_KGDB_19200BAUD +#define SB_BAUD SB_BAUD192 +#endif +#ifdef CONFIG_KGDB_38400BAUD +#define SB_BAUD SB_BAUD384 +#endif +#ifdef CONFIG_KGDB_57600BAUD +#define SB_BAUD SB_BAUD576 +#endif +#ifdef CONFIG_KGDB_115200BAUD +#define SB_BAUD SB_BAUD1152 +#endif +#ifndef SB_BAUD +#define SB_BAUD SB_BAUD1152 /* Start with this if not given */ +#endif + +#ifndef CONFIG_X86_TSC +#undef rdtsc +#define rdtsc(a,b) if (a++ > 10000){a = 0; b++;} +#undef rdtscll +#define rdtscll(s) s++ +#endif + +#ifdef _raw_read_unlock /* must use a name that is "define"ed, not an inline */ +#undef spin_lock +#undef spin_trylock +#undef spin_unlock +#define spin_lock _raw_spin_lock +#define spin_trylock _raw_spin_trylock +#define spin_unlock _raw_spin_unlock +#else +#endif +#undef spin_unlock_wait +#define spin_unlock_wait(x) do { cpu_relax(); barrier();} \ + while(spin_is_locked(x)) + +#define SB_IER 1 +#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS + +#define FLAGS 0 +#define SB_STATE { \ + magic: SSTATE_MAGIC, \ + baud_base: SB_BASE, \ + port: PORT, \ + irq: IRQ, \ + flags: FLAGS, \ + custom_divisor:SB_BAUD} +#define SB_INFO { \ + magic: SERIAL_MAGIC, \ + port: PORT,0,FLAGS, \ + state: &state, \ + tty: (struct tty_struct *)&state, \ + IER: SB_IER, \ + MCR: SB_MCR} +extern void putDebugChar(int); +/* RTAI support needs us to really stop/start interrupts */ + +#define kgdb_sti() __asm__ __volatile__("sti": : :"memory") +#define kgdb_cli() __asm__ __volatile__("cli": : :"memory") +#define kgdb_local_save_flags(x) __asm__ __volatile__(\ + "pushfl ; popl %0":"=g" (x): /* no input */) +#define kgdb_local_irq_restore(x) __asm__ __volatile__(\ + "pushl %0 ; popfl": \ + /* no output */ :"g" (x):"memory", "cc") +#define kgdb_local_irq_save(x) kgdb_local_save_flags(x); kgdb_cli() + +#ifdef CONFIG_SERIAL +extern void shutdown_for_kgdb(struct async_struct *info); +#endif +#define INIT_KDEBUG putDebugChar("+"); +#endif /* __KGDB_LOCAL */ --- diff/include/asm-i386/lockmeter.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/asm-i386/lockmeter.h 2004-04-21 10:46:51.739716480 +0100 @@ -0,0 +1,115 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks. 
+ * Added support for hold time statistics for read and write + * locks. + * Moved machine dependent code here from include/lockmeter.h. + * + */ + +#ifndef _I386_LOCKMETER_H +#define _I386_LOCKMETER_H + +#include +#include + +#include + +#ifdef __KERNEL__ +extern unsigned long cpu_khz; +#define CPU_CYCLE_FREQUENCY (cpu_khz * 1000) +#else +#define CPU_CYCLE_FREQUENCY 450000000 +#endif + +#define THIS_CPU_NUMBER smp_processor_id() + +/* + * macros to cache and retrieve an index value inside of a spin lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. Not normally a problem!! + * we also assume that the hash table has less than 65535 entries. + */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Intel is little endian */ + unsigned short lock; + unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + /* read and write lock attempts may cause the lock value to temporarily */ + /* be negative. Until it is >= 0 we know nothing (i. e. can't tell if */ + /* is -1 because it was write locked and somebody tried to read lock it */ + /* or if it is -1 because it was read locked and somebody tried to write*/ + /* lock it. ........................................................... */ + do { + tmp = (int) rwlock_ptr->lock; + } while (tmp < 0); + if (tmp == 0) return(0); + else return(RW_LOCK_BIAS-tmp); +} + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->lock <= 0) +#define IABS(x) ((x) > 0 ? 
(x) : -(x)) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((IABS((rwlock_ptr)->lock) % RW_LOCK_BIAS) != 0) + +/* this is a lot of typing just to get gcc to emit "rdtsc" */ +static inline long long get_cycles64 (void) +{ + union longlong_u { + long long intlong; + struct intint_s { + uint32_t eax; + uint32_t edx; + } intint; + } longlong; + + rdtsc(longlong.intint.eax,longlong.intint.edx); + return longlong.intlong; +} + +#endif /* _I386_LOCKMETER_H */ --- diff/include/asm-ia64/lockmeter.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/asm-ia64/lockmeter.h 2004-04-21 10:46:51.743715872 +0100 @@ -0,0 +1,72 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + */ + +#ifndef _IA64_LOCKMETER_H +#define _IA64_LOCKMETER_H + +#ifdef local_cpu_data +#define CPU_CYCLE_FREQUENCY local_cpu_data->itc_freq +#else +#define CPU_CYCLE_FREQUENCY my_cpu_data.itc_freq +#endif +#define get_cycles64() get_cycles() + +#define THIS_CPU_NUMBER smp_processor_id() + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. + */ +/* + * instrumented spinlock structure -- never used to allocate storage + * only used in macros below to overlay a spinlock_t + */ +typedef struct inst_spinlock_s { + /* remember, Intel is little endian */ + volatile unsigned short lock; + volatile unsigned short index; +} inst_spinlock_t; +#define PUT_INDEX(lock_ptr,indexv) ((inst_spinlock_t *)(lock_ptr))->index = indexv +#define GET_INDEX(lock_ptr) ((inst_spinlock_t *)(lock_ptr))->index + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int read_counter:31; + volatile int write_lock:1; + volatile unsigned short index; + volatile unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) ((rwlock_ptr)->read_counter) + +/* + * return true if rwlock is write locked + * (note that other lock attempts can cause the lock value to be negative) + */ +#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->write_lock) +#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->read_counter) + +#endif /* _IA64_LOCKMETER_H */ + --- diff/include/asm-ia64/setup.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/asm-ia64/setup.h 2004-04-21 10:46:51.744715720 +0100 @@ -0,0 +1,6 @@ +#ifndef __IA64_SETUP_H +#define __IA64_SETUP_H + +#define COMMAND_LINE_SIZE 512 + +#endif --- diff/include/asm-mips/lockmeter.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/asm-mips/lockmeter.h 2004-04-21 10:46:51.748715112 +0100 @@ -0,0 +1,126 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. 
+ * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * Ported to mips32 for Asita Technologies + * by D.J. Barrow ( dj.barrow@asitatechnologies.com ) + */ +#ifndef _ASM_LOCKMETER_H +#define _ASM_LOCKMETER_H + +/* do_gettimeoffset is a function pointer on mips */ +/* & it is not included by */ +#include +#include +#include + +#define SPINLOCK_MAGIC_INIT /* */ + +#define CPU_CYCLE_FREQUENCY get_cpu_cycle_frequency() + +#define THIS_CPU_NUMBER smp_processor_id() + +static uint32_t cpu_cycle_frequency = 0; + +static uint32_t get_cpu_cycle_frequency(void) +{ + /* a total hack, slow and invasive, but ... it works */ + int sec; + uint32_t start_cycles; + struct timeval tv; + + if (cpu_cycle_frequency == 0) { /* uninitialized */ + do_gettimeofday(&tv); + sec = tv.tv_sec; /* set up to catch the tv_sec rollover */ + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + sec = tv.tv_sec; /* rolled over to a new sec value */ + start_cycles = get_cycles(); + while (sec == tv.tv_sec) { do_gettimeofday(&tv); } + cpu_cycle_frequency = get_cycles() - start_cycles; + } + + return cpu_cycle_frequency; +} + +extern struct timeval xtime; + +static uint64_t get_cycles64(void) +{ + static uint64_t last_get_cycles64 = 0; + uint64_t ret; + unsigned long sec; + unsigned long usec, usec_offset; + +again: + sec = xtime.tv_sec; + usec = xtime.tv_usec; + usec_offset = do_gettimeoffset(); + if ((xtime.tv_sec != sec) || + (xtime.tv_usec != usec)|| + (usec_offset >= 20000)) + goto again; + + ret = ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency); + /* We can't do a normal 64 bit division on mips without libgcc.a */ + do_div(ret,1000000); + ret += ((uint64_t)sec * cpu_cycle_frequency); + + /* XXX why does time go backwards? do_gettimeoffset? general time adj? */ + if (ret <= last_get_cycles64) + ret = last_get_cycles64+1; + last_get_cycles64 = ret; + + return ret; +} + +/* + * macros to cache and retrieve an index value inside of a lock + * these macros assume that there are less than 65536 simultaneous + * (read mode) holders of a rwlock. + * we also assume that the hash table has less than 32767 entries. + * the high order bit is used for write locking a rw_lock + */ +#define INDEX_MASK 0x7FFF0000 +#define READERS_MASK 0x0000FFFF +#define INDEX_SHIFT 16 +#define PUT_INDEX(lockp,index) \ + lockp->lock = (((lockp->lock) & ~INDEX_MASK) | (index) << INDEX_SHIFT) +#define GET_INDEX(lockp) \ + (((lockp->lock) & INDEX_MASK) >> INDEX_SHIFT) + +/* + * macros to cache and retrieve an index value in a read/write lock + * as well as the cpu where a reader busy period started + * we use the 2nd word (the debug word) for this, so require the + * debug word to be present + */ +/* + * instrumented rwlock structure -- never used to allocate storage + * only used in macros below to overlay a rwlock_t + */ +typedef struct inst_rwlock_s { + volatile int lock; + unsigned short index; + unsigned short cpu; +} inst_rwlock_t; +#define PUT_RWINDEX(rwlock_ptr,indexv) ((inst_rwlock_t *)(rwlock_ptr))->index = indexv +#define GET_RWINDEX(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->index +#define PUT_RW_CPU(rwlock_ptr,cpuv) ((inst_rwlock_t *)(rwlock_ptr))->cpu = cpuv +#define GET_RW_CPU(rwlock_ptr) ((inst_rwlock_t *)(rwlock_ptr))->cpu + +/* + * return the number of readers for a rwlock_t + */ +#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) + +extern inline int rwlock_readers(rwlock_t *rwlock_ptr) +{ + int tmp = (int) rwlock_ptr->lock; + return (tmp >= 0) ? 
tmp : 0;
+}
+
+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr)	((rwlock_ptr)->lock < 0)
+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)	((rwlock_ptr)->lock > 0)
+
+#endif /* _ASM_LOCKMETER_H */
--- diff/include/asm-mips/setup.h	1970-01-01 01:00:00.000000000 +0100
+++ source/include/asm-mips/setup.h	2004-04-21 10:46:51.749714960 +0100
@@ -0,0 +1,8 @@
+#ifdef __KERNEL__
+#ifndef _MIPS_SETUP_H
+#define _MIPS_SETUP_H
+
+#define COMMAND_LINE_SIZE 256
+
+#endif /* _MIPS_SETUP_H */
+#endif /* __KERNEL__ */
--- diff/include/asm-sh/setup.h	1970-01-01 01:00:00.000000000 +0100
+++ source/include/asm-sh/setup.h	2004-04-21 10:46:51.756713896 +0100
@@ -0,0 +1,8 @@
+#ifdef __KERNEL__
+#ifndef _SH_SETUP_H
+#define _SH_SETUP_H
+
+#define COMMAND_LINE_SIZE 256
+
+#endif /* _SH_SETUP_H */
+#endif /* __KERNEL__ */
--- diff/include/asm-sparc64/lockmeter.h	1970-01-01 01:00:00.000000000 +0100
+++ source/include/asm-sparc64/lockmeter.h	2004-04-21 10:46:51.756713896 +0100
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2000 Anton Blanchard (anton@linuxcare.com)
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#ifndef _SPARC64_LOCKMETER_H
+#define _SPARC64_LOCKMETER_H
+
+#include
+#include
+#include
+#include
+
+/* Actually, this is not the CPU frequency but the system tick
+ * frequency, which is good enough for lock metering.
+ */
+#define CPU_CYCLE_FREQUENCY	(timer_tick_offset * HZ)
+#define THIS_CPU_NUMBER		smp_processor_id()
+
+#define PUT_INDEX(lock_ptr,indexv)	(lock_ptr)->index = (indexv)
+#define GET_INDEX(lock_ptr)		(lock_ptr)->index
+
+#define PUT_RWINDEX(rwlock_ptr,indexv)	(rwlock_ptr)->index = (indexv)
+#define GET_RWINDEX(rwlock_ptr)		(rwlock_ptr)->index
+#define PUT_RW_CPU(rwlock_ptr,cpuv)	(rwlock_ptr)->cpu = (cpuv)
+#define GET_RW_CPU(rwlock_ptr)		(rwlock_ptr)->cpu
+
+#define RWLOCK_READERS(rwlock_ptr)	rwlock_readers(rwlock_ptr)
+
+extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
+{
+	signed int tmp = rwlock_ptr->lock;
+
+	if (tmp > 0)
+		return tmp;
+	else
+		return 0;
+}
+
+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr)	((signed int)((rwlock_ptr)->lock) < 0)
+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)	((signed int)((rwlock_ptr)->lock) > 0)
+
+#define get_cycles64()	get_cycles()
+
+#endif /* _SPARC64_LOCKMETER_H */
--- diff/include/asm-um/setup.h	1970-01-01 01:00:00.000000000 +0100
+++ source/include/asm-um/setup.h	2004-04-21 10:46:51.760713288 +0100
@@ -0,0 +1,6 @@
+#ifndef SETUP_H_INCLUDED
+#define SETUP_H_INCLUDED
+
+#define COMMAND_LINE_SIZE 512
+
+#endif /* SETUP_H_INCLUDED */
--- diff/include/asm-v850/setup.h	1970-01-01 01:00:00.000000000 +0100
+++ source/include/asm-v850/setup.h	2004-04-21 10:46:51.760713288 +0100
@@ -0,0 +1,6 @@
+#ifndef _V850_SETUP_H
+#define _V850_SETUP_H
+
+#define COMMAND_LINE_SIZE 512
+
+#endif /* _V850_SETUP_H */
--- diff/include/asm-x86_64/kgdb.h	1970-01-01 01:00:00.000000000 +0100
+++ source/include/asm-x86_64/kgdb.h	2004-04-21 10:46:51.761713136 +0100
@@ -0,0 +1,71 @@
+#ifndef __KGDB
+#define __KGDB
+
+/*
+ * This file should not include ANY others.  This makes it usable
+ * most anywhere without the fear of include order or inclusion.
+ * Make it so!
+ *
+ * This file may be included all the time.  It is only active if
+ * CONFIG_KGDB is defined, otherwise it stubs out all the macros
+ * and entry points.
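As in the i386 header earlier in this patch, the point of the stub branch below is that call sites never need an #ifdef CONFIG_KGDB of their own. A hypothetical call site (illustrative only, not from the patch) that compiles cleanly either way:

#include <asm/kgdb.h>

void my_driver_poll(void)	/* hypothetical caller */
{
	kgdb_ts(0, 0);			/* timestamp if CONFIG_KGDB_TS, else nothing */
	kgdb_process_breakpoint();	/* enter the stub if a breakpoint was
					 * scheduled; an empty statement when
					 * KGDB is configured out */
}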
+ */ +#if defined(CONFIG_KGDB) && !defined(__ASSEMBLY__) + +extern void breakpoint(void); +#define INIT_KGDB_INTS kgdb_enable_ints() + +#ifndef BREAKPOINT +#define BREAKPOINT asm(" int $3") +#endif + +extern void kgdb_schedule_breakpoint(void); +extern void kgdb_process_breakpoint(void); + +extern int kgdb_tty_hook(void); +extern int kgdb_eth_hook(void); +extern int kgdboe; + +/* + * GDB debug stub (or any debug stub) can point the 'linux_debug_hook' + * pointer to its routine and it will be entered as the first thing + * when a trap occurs. + * + * Return values are, at present, undefined. + * + * The debug hook routine does not necessarily return to its caller. + * It has the register image and thus may choose to resume execution + * anywhere it pleases. + */ +struct pt_regs; + +extern int kgdb_handle_exception(int trapno, + int signo, int err_code, struct pt_regs *regs); +extern int in_kgdb(struct pt_regs *regs); + +extern void set_debug_traps(void); + +#ifdef CONFIG_KGDB_TS +void kgdb_tstamp(int line, char *source, int data0, int data1); +/* + * This is the time stamp function. The macro adds the source info and + * does a cast on the data to allow most any 32-bit value. + */ + +#define kgdb_ts(data0,data1) kgdb_tstamp(__LINE__,__FILE__,(int)data0,(int)data1) +#else +#define kgdb_ts(data0,data1) +#endif +#else /* CONFIG_KGDB && ! __ASSEMBLY__ ,stubs follow... */ +#ifndef BREAKPOINT +#define BREAKPOINT +#endif +#define kgdb_ts(data0,data1) +#define in_kgdb (0) +#define kgdb_handle_exception +#define breakpoint +#define INIT_KGDB_INTS +#define kgdb_process_breakpoint() do {} while(0) + +#endif +#endif /* __KGDB */ --- diff/include/asm-x86_64/kgdb_local.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/asm-x86_64/kgdb_local.h 2004-04-21 10:46:51.761713136 +0100 @@ -0,0 +1,102 @@ +#ifndef __KGDB_LOCAL +#define ___KGDB_LOCAL +#include +#include +#include +#include +#include +#include +#include +#include + +#define PORT 0x3f8 +#ifdef CONFIG_KGDB_PORT +#undef PORT +#define PORT CONFIG_KGDB_PORT +#endif +#define IRQ 4 +#ifdef CONFIG_KGDB_IRQ +#undef IRQ +#define IRQ CONFIG_KGDB_IRQ +#endif +#define SB_CLOCK 1843200 +#define SB_BASE (SB_CLOCK/16) +#define SB_BAUD9600 SB_BASE/9600 +#define SB_BAUD192 SB_BASE/19200 +#define SB_BAUD384 SB_BASE/38400 +#define SB_BAUD576 SB_BASE/57600 +#define SB_BAUD1152 SB_BASE/115200 +#ifdef CONFIG_KGDB_9600BAUD +#define SB_BAUD SB_BAUD9600 +#endif +#ifdef CONFIG_KGDB_19200BAUD +#define SB_BAUD SB_BAUD192 +#endif +#ifdef CONFIG_KGDB_38400BAUD +#define SB_BAUD SB_BAUD384 +#endif +#ifdef CONFIG_KGDB_57600BAUD +#define SB_BAUD SB_BAUD576 +#endif +#ifdef CONFIG_KGDB_115200BAUD +#define SB_BAUD SB_BAUD1152 +#endif +#ifndef SB_BAUD +#define SB_BAUD SB_BAUD1152 /* Start with this if not given */ +#endif + +#ifndef CONFIG_X86_TSC +#undef rdtsc +#define rdtsc(a,b) if (a++ > 10000){a = 0; b++;} +#undef rdtscll +#define rdtscll(s) s++ +#endif + +#ifdef _raw_read_unlock /* must use a name that is "define"ed, not an inline */ +#undef spin_lock +#undef spin_trylock +#undef spin_unlock +#define spin_lock _raw_spin_lock +#define spin_trylock _raw_spin_trylock +#define spin_unlock _raw_spin_unlock +#else +#endif +#undef spin_unlock_wait +#define spin_unlock_wait(x) do { cpu_relax(); barrier();} \ + while(spin_is_locked(x)) + +#define SB_IER 1 +#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS + +#define FLAGS 0 +#define SB_STATE { \ + magic: SSTATE_MAGIC, \ + baud_base: SB_BASE, \ + port: PORT, \ + irq: IRQ, \ + flags: FLAGS, \ + custom_divisor:SB_BAUD} +#define 
SB_INFO { \
+	magic: SERIAL_MAGIC, \
+	port: PORT,0,FLAGS, \
+	state: &state, \
+	tty: (struct tty_struct *)&state, \
+	IER: SB_IER, \
+	MCR: SB_MCR}
+extern void putDebugChar(int);
+/* RTAI support needs us to really stop/start interrupts */
+
+#define kgdb_sti() __asm__ __volatile__("sti": : :"memory")
+#define kgdb_cli() __asm__ __volatile__("cli": : :"memory")
+#define kgdb_local_save_flags(x) __asm__ __volatile__(\
+				   "pushfl ; popl %0":"=g" (x): /* no input */)
+#define kgdb_local_irq_restore(x) __asm__ __volatile__(\
+				   "pushl %0 ; popfl": \
+				   /* no output */ :"g" (x):"memory", "cc")
+#define kgdb_local_irq_save(x) kgdb_local_save_flags(x); kgdb_cli()
+
+#ifdef CONFIG_SERIAL
+extern void shutdown_for_kgdb(struct async_struct *info);
+#endif
+#define INIT_KDEBUG putDebugChar("+");
+#endif				/* __KGDB_LOCAL */
--- diff/include/linux/dwarf2-lang.h	1970-01-01 01:00:00.000000000 +0100
+++ source/include/linux/dwarf2-lang.h	2004-04-21 10:46:51.768712072 +0100
@@ -0,0 +1,132 @@
+#ifndef DWARF2_LANG
+#define DWARF2_LANG
+#include
+
+/*
+ * This is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2, or (at your option) any later
+ * version.
+ */
+/*
+ * This file defines macros that allow generation of DWARF debug records
+ * for asm files.  This file is platform independent.  Register numbers
+ * (which are about the only thing that is platform dependent) are to be
+ * supplied by a platform defined file.
+ */
+#define DWARF_preamble()	.section	.debug_frame,"",@progbits
+/*
+ * This macro starts a debug frame section.  The debug_frame describes
+ * where to find the registers that the enclosing function saved on
+ * entry.
+ *
+ * ORD is used by the label generator and should be the same as what is
+ * passed to CFI_postamble.
+ *
+ * pc		the pc register's gdb ordinal.
+ *
+ * code_align	this is the factor used to define locations or regions
+ * where the given definitions apply.  If you use labels to define these
+ * this should be 1.
+ *
+ * data_align	this is the factor used to define register offsets.  If
+ * you use struct offsets, this should be the size of the register in
+ * bytes or the negative of that.  This is how it is used: you will
+ * define a register as the reference register, say the stack pointer,
+ * then you will say where a register is located relative to this
+ * reference register's value, say 40 for register 3 (the gdb register
+ * number).  The <40> will be multiplied by to define the
+ * byte offset of the given register (3, in this example).  So if your
+ * <40> is the byte offset and the reference register points at the
+ * beginning, you would want 1 for the data_align.  If <40> was the 40th
+ * 4-byte element in that structure you would want 4.  And if your
+ * reference register points at the end of the structure you would want
+ * a negative data_align value (and you would have to do other math as
+ * well).
+ */
+
+#define CFI_preamble(ORD, pc, code_align, data_align)	\
+.section	.debug_frame,"",@progbits ;		\
+frame/**/_/**/ORD:					\
+	.long end/**/_/**/ORD-start/**/_/**/ORD;	\
+start/**/_/**/ORD:					\
+	.long	DW_CIE_ID;				\
+	.byte	DW_CIE_VERSION;				\
+	.byte 0	;					\
+	.uleb128 code_align;				\
+	.sleb128 data_align;				\
+	.byte pc;
+
+/*
+ * After the above macro and prior to the CFI_postamble, you need to
+ * define the initial state.  This starts with defining the reference
+ * register and, usually, the pc.
Here are some helper macros: + */ + +#define CFA_define_reference(reg, offset) \ + .byte DW_CFA_def_cfa; \ + .uleb128 reg; \ + .uleb128 (offset); + +#define CFA_define_offset(reg, offset) \ + .byte (DW_CFA_offset + reg); \ + .uleb128 (offset); + +#define CFI_postamble(ORD) \ + .align 4; \ +end/**/_/**/ORD: +/* + * So now your code pushes stuff on the stack, and you need a new location + * and the rules for what to do. This starts a running description of + * the call frame. You need to describe what changes with respect to + * the call registers as the location of the pc moves through the code. + * The following builds an FDE (frame description entry). Like the + * above, it has a preamble and a postamble. It also is tied to the CFI + * above. + * The first entry after the preamble must be the location in the code + * that the call frame is being described for. + */ +#define FDE_preamble(ORD, fde_no, initial_address, length) \ + .long FDE_end/**/_/**/fde_no-FDE_start/**/_/**/fde_no; \ +FDE_start/**/_/**/fde_no: \ + .long frame/**/_/**/ORD; \ + .long initial_address; \ + .long length; + +#define FDE_postamble(fde_no) \ + .align 4; \ +FDE_end/**/_/**/fde_no: +/* + * That done, you can now add registers, subtract registers, move the + * reference and even change the reference. You can also define a new + * area of code the info applies to. For discontinuous bits you should + * start a new FDE. You may have as many as you like. + */ + +/* + * To advance the location in the code by <bytes>: + */ + +#define FDE_advance(bytes) \ + .byte DW_CFA_advance_loc4; \ + .long bytes + + + +/* + * With the above you can define all the register locations. But + * suppose the reference register moves... This takes the new offset, + * NOT an increment. This is how esp is tracked if it is not saved. + */ + +#define CFA_define_cfa_offset(offset) \ + .byte DW_CFA_def_cfa_offset; \ + .uleb128 (offset); +/* + * Or suppose you want to use a different reference register... + */ +#define CFA_define_cfa_register(reg) \ + .byte DW_CFA_def_cfa_register; \ + .uleb128 reg; + +#endif --- diff/include/linux/dwarf2.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/linux/dwarf2.h 2004-04-21 10:46:51.767712224 +0100 @@ -0,0 +1,738 @@ +/* Declarations and definitions of codes relating to the DWARF2 symbolic + debugging information format. + Copyright (C) 1992, 1993, 1995, 1996, 1997, 1999, 2000, 2001, 2002 + Free Software Foundation, Inc. + + Written by Gary Funck (gary@intrepid.com). The Ada Joint Program + Office (AJPO), Florida State University and Silicon Graphics Inc. + provided support for this effort -- June 21, 1995. + + Derived from the DWARF 1 implementation written by Ron Guilmette + (rfg@netcom.com), November 1990. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2, or (at your option) any later + version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING. If not, write to the Free + Software Foundation, 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA.
*/ + +/* This file is derived from the DWARF specification (a public document) + Revision 2.0.0 (July 27, 1993) developed by the UNIX International + Programming Languages Special Interest Group (UI/PLSIG) and distributed + by UNIX International. Copies of this specification are available from + UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. + + This file also now contains definitions from the DWARF 3 specification. */ + +/* This file is shared between GCC and GDB, and should not contain + prototypes. */ + +#ifndef _ELF_DWARF2_H +#define _ELF_DWARF2_H + +/* Structure found in the .debug_line section. */ +#ifndef __ASSEMBLY__ +typedef struct +{ + unsigned char li_length [4]; + unsigned char li_version [2]; + unsigned char li_prologue_length [4]; + unsigned char li_min_insn_length [1]; + unsigned char li_default_is_stmt [1]; + unsigned char li_line_base [1]; + unsigned char li_line_range [1]; + unsigned char li_opcode_base [1]; +} +DWARF2_External_LineInfo; + +typedef struct +{ + unsigned long li_length; + unsigned short li_version; + unsigned int li_prologue_length; + unsigned char li_min_insn_length; + unsigned char li_default_is_stmt; + int li_line_base; + unsigned char li_line_range; + unsigned char li_opcode_base; +} +DWARF2_Internal_LineInfo; + +/* Structure found in .debug_pubnames section. */ +typedef struct +{ + unsigned char pn_length [4]; + unsigned char pn_version [2]; + unsigned char pn_offset [4]; + unsigned char pn_size [4]; +} +DWARF2_External_PubNames; + +typedef struct +{ + unsigned long pn_length; + unsigned short pn_version; + unsigned long pn_offset; + unsigned long pn_size; +} +DWARF2_Internal_PubNames; + +/* Structure found in .debug_info section. */ +typedef struct +{ + unsigned char cu_length [4]; + unsigned char cu_version [2]; + unsigned char cu_abbrev_offset [4]; + unsigned char cu_pointer_size [1]; +} +DWARF2_External_CompUnit; + +typedef struct +{ + unsigned long cu_length; + unsigned short cu_version; + unsigned long cu_abbrev_offset; + unsigned char cu_pointer_size; +} +DWARF2_Internal_CompUnit; + +typedef struct +{ + unsigned char ar_length [4]; + unsigned char ar_version [2]; + unsigned char ar_info_offset [4]; + unsigned char ar_pointer_size [1]; + unsigned char ar_segment_size [1]; +} +DWARF2_External_ARange; + +typedef struct +{ + unsigned long ar_length; + unsigned short ar_version; + unsigned long ar_info_offset; + unsigned char ar_pointer_size; + unsigned char ar_segment_size; +} +DWARF2_Internal_ARange; + +#define ENUM(name) enum name { +#define IF_NOT_ASM(a) a +#define COMMA , +#else +#define ENUM(name) +#define IF_NOT_ASM(a) +#define COMMA + +#endif + +/* Tag names and codes. 
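+ (Editorial note, not in the original GCC header: via the ENUM/COMMA/IF_NOT_ASM wrappers defined above, each list below expands in C to an ordinary enum, e.g. "enum dwarf_tag { DW_TAG_padding = 0x00, DW_TAG_array_type = 0x01, ... };", while under __ASSEMBLY__ it expands to nothing, so .S files see only the plain #define'd constants.)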
*/ +ENUM(dwarf_tag) + + DW_TAG_padding = 0x00 COMMA + DW_TAG_array_type = 0x01 COMMA + DW_TAG_class_type = 0x02 COMMA + DW_TAG_entry_point = 0x03 COMMA + DW_TAG_enumeration_type = 0x04 COMMA + DW_TAG_formal_parameter = 0x05 COMMA + DW_TAG_imported_declaration = 0x08 COMMA + DW_TAG_label = 0x0a COMMA + DW_TAG_lexical_block = 0x0b COMMA + DW_TAG_member = 0x0d COMMA + DW_TAG_pointer_type = 0x0f COMMA + DW_TAG_reference_type = 0x10 COMMA + DW_TAG_compile_unit = 0x11 COMMA + DW_TAG_string_type = 0x12 COMMA + DW_TAG_structure_type = 0x13 COMMA + DW_TAG_subroutine_type = 0x15 COMMA + DW_TAG_typedef = 0x16 COMMA + DW_TAG_union_type = 0x17 COMMA + DW_TAG_unspecified_parameters = 0x18 COMMA + DW_TAG_variant = 0x19 COMMA + DW_TAG_common_block = 0x1a COMMA + DW_TAG_common_inclusion = 0x1b COMMA + DW_TAG_inheritance = 0x1c COMMA + DW_TAG_inlined_subroutine = 0x1d COMMA + DW_TAG_module = 0x1e COMMA + DW_TAG_ptr_to_member_type = 0x1f COMMA + DW_TAG_set_type = 0x20 COMMA + DW_TAG_subrange_type = 0x21 COMMA + DW_TAG_with_stmt = 0x22 COMMA + DW_TAG_access_declaration = 0x23 COMMA + DW_TAG_base_type = 0x24 COMMA + DW_TAG_catch_block = 0x25 COMMA + DW_TAG_const_type = 0x26 COMMA + DW_TAG_constant = 0x27 COMMA + DW_TAG_enumerator = 0x28 COMMA + DW_TAG_file_type = 0x29 COMMA + DW_TAG_friend = 0x2a COMMA + DW_TAG_namelist = 0x2b COMMA + DW_TAG_namelist_item = 0x2c COMMA + DW_TAG_packed_type = 0x2d COMMA + DW_TAG_subprogram = 0x2e COMMA + DW_TAG_template_type_param = 0x2f COMMA + DW_TAG_template_value_param = 0x30 COMMA + DW_TAG_thrown_type = 0x31 COMMA + DW_TAG_try_block = 0x32 COMMA + DW_TAG_variant_part = 0x33 COMMA + DW_TAG_variable = 0x34 COMMA + DW_TAG_volatile_type = 0x35 COMMA + /* DWARF 3. */ + DW_TAG_dwarf_procedure = 0x36 COMMA + DW_TAG_restrict_type = 0x37 COMMA + DW_TAG_interface_type = 0x38 COMMA + DW_TAG_namespace = 0x39 COMMA + DW_TAG_imported_module = 0x3a COMMA + DW_TAG_unspecified_type = 0x3b COMMA + DW_TAG_partial_unit = 0x3c COMMA + DW_TAG_imported_unit = 0x3d COMMA + /* SGI/MIPS Extensions. */ + DW_TAG_MIPS_loop = 0x4081 COMMA + /* GNU extensions. */ + DW_TAG_format_label = 0x4101 COMMA /* For FORTRAN 77 and Fortran 90. */ + DW_TAG_function_template = 0x4102 COMMA /* For C++. */ + DW_TAG_class_template = 0x4103 COMMA /* For C++. */ + DW_TAG_GNU_BINCL = 0x4104 COMMA + DW_TAG_GNU_EINCL = 0x4105 COMMA + /* Extensions for UPC. See: http://upc.gwu.edu/~upc. */ + DW_TAG_upc_shared_type = 0x8765 COMMA + DW_TAG_upc_strict_type = 0x8766 COMMA + DW_TAG_upc_relaxed_type = 0x8767 +IF_NOT_ASM(};) + +#define DW_TAG_lo_user 0x4080 +#define DW_TAG_hi_user 0xffff + +/* Flag that tells whether entry has a child or not. */ +#define DW_children_no 0 +#define DW_children_yes 1 + +/* Form names and codes. */ +ENUM(dwarf_form) + + DW_FORM_addr = 0x01 COMMA + DW_FORM_block2 = 0x03 COMMA + DW_FORM_block4 = 0x04 COMMA + DW_FORM_data2 = 0x05 COMMA + DW_FORM_data4 = 0x06 COMMA + DW_FORM_data8 = 0x07 COMMA + DW_FORM_string = 0x08 COMMA + DW_FORM_block = 0x09 COMMA + DW_FORM_block1 = 0x0a COMMA + DW_FORM_data1 = 0x0b COMMA + DW_FORM_flag = 0x0c COMMA + DW_FORM_sdata = 0x0d COMMA + DW_FORM_strp = 0x0e COMMA + DW_FORM_udata = 0x0f COMMA + DW_FORM_ref_addr = 0x10 COMMA + DW_FORM_ref1 = 0x11 COMMA + DW_FORM_ref2 = 0x12 COMMA + DW_FORM_ref4 = 0x13 COMMA + DW_FORM_ref8 = 0x14 COMMA + DW_FORM_ref_udata = 0x15 COMMA + DW_FORM_indirect = 0x16 +IF_NOT_ASM(};) + +/* Attribute names and codes. 
*/ + +ENUM(dwarf_attribute) + + DW_AT_sibling = 0x01 COMMA + DW_AT_location = 0x02 COMMA + DW_AT_name = 0x03 COMMA + DW_AT_ordering = 0x09 COMMA + DW_AT_subscr_data = 0x0a COMMA + DW_AT_byte_size = 0x0b COMMA + DW_AT_bit_offset = 0x0c COMMA + DW_AT_bit_size = 0x0d COMMA + DW_AT_element_list = 0x0f COMMA + DW_AT_stmt_list = 0x10 COMMA + DW_AT_low_pc = 0x11 COMMA + DW_AT_high_pc = 0x12 COMMA + DW_AT_language = 0x13 COMMA + DW_AT_member = 0x14 COMMA + DW_AT_discr = 0x15 COMMA + DW_AT_discr_value = 0x16 COMMA + DW_AT_visibility = 0x17 COMMA + DW_AT_import = 0x18 COMMA + DW_AT_string_length = 0x19 COMMA + DW_AT_common_reference = 0x1a COMMA + DW_AT_comp_dir = 0x1b COMMA + DW_AT_const_value = 0x1c COMMA + DW_AT_containing_type = 0x1d COMMA + DW_AT_default_value = 0x1e COMMA + DW_AT_inline = 0x20 COMMA + DW_AT_is_optional = 0x21 COMMA + DW_AT_lower_bound = 0x22 COMMA + DW_AT_producer = 0x25 COMMA + DW_AT_prototyped = 0x27 COMMA + DW_AT_return_addr = 0x2a COMMA + DW_AT_start_scope = 0x2c COMMA + DW_AT_stride_size = 0x2e COMMA + DW_AT_upper_bound = 0x2f COMMA + DW_AT_abstract_origin = 0x31 COMMA + DW_AT_accessibility = 0x32 COMMA + DW_AT_address_class = 0x33 COMMA + DW_AT_artificial = 0x34 COMMA + DW_AT_base_types = 0x35 COMMA + DW_AT_calling_convention = 0x36 COMMA + DW_AT_count = 0x37 COMMA + DW_AT_data_member_location = 0x38 COMMA + DW_AT_decl_column = 0x39 COMMA + DW_AT_decl_file = 0x3a COMMA + DW_AT_decl_line = 0x3b COMMA + DW_AT_declaration = 0x3c COMMA + DW_AT_discr_list = 0x3d COMMA + DW_AT_encoding = 0x3e COMMA + DW_AT_external = 0x3f COMMA + DW_AT_frame_base = 0x40 COMMA + DW_AT_friend = 0x41 COMMA + DW_AT_identifier_case = 0x42 COMMA + DW_AT_macro_info = 0x43 COMMA + DW_AT_namelist_items = 0x44 COMMA + DW_AT_priority = 0x45 COMMA + DW_AT_segment = 0x46 COMMA + DW_AT_specification = 0x47 COMMA + DW_AT_static_link = 0x48 COMMA + DW_AT_type = 0x49 COMMA + DW_AT_use_location = 0x4a COMMA + DW_AT_variable_parameter = 0x4b COMMA + DW_AT_virtuality = 0x4c COMMA + DW_AT_vtable_elem_location = 0x4d COMMA + /* DWARF 3 values. */ + DW_AT_allocated = 0x4e COMMA + DW_AT_associated = 0x4f COMMA + DW_AT_data_location = 0x50 COMMA + DW_AT_stride = 0x51 COMMA + DW_AT_entry_pc = 0x52 COMMA + DW_AT_use_UTF8 = 0x53 COMMA + DW_AT_extension = 0x54 COMMA + DW_AT_ranges = 0x55 COMMA + DW_AT_trampoline = 0x56 COMMA + DW_AT_call_column = 0x57 COMMA + DW_AT_call_file = 0x58 COMMA + DW_AT_call_line = 0x59 COMMA + /* SGI/MIPS extensions. */ + DW_AT_MIPS_fde = 0x2001 COMMA + DW_AT_MIPS_loop_begin = 0x2002 COMMA + DW_AT_MIPS_tail_loop_begin = 0x2003 COMMA + DW_AT_MIPS_epilog_begin = 0x2004 COMMA + DW_AT_MIPS_loop_unroll_factor = 0x2005 COMMA + DW_AT_MIPS_software_pipeline_depth = 0x2006 COMMA + DW_AT_MIPS_linkage_name = 0x2007 COMMA + DW_AT_MIPS_stride = 0x2008 COMMA + DW_AT_MIPS_abstract_name = 0x2009 COMMA + DW_AT_MIPS_clone_origin = 0x200a COMMA + DW_AT_MIPS_has_inlines = 0x200b COMMA + /* GNU extensions. */ + DW_AT_sf_names = 0x2101 COMMA + DW_AT_src_info = 0x2102 COMMA + DW_AT_mac_info = 0x2103 COMMA + DW_AT_src_coords = 0x2104 COMMA + DW_AT_body_begin = 0x2105 COMMA + DW_AT_body_end = 0x2106 COMMA + DW_AT_GNU_vector = 0x2107 COMMA + /* VMS extensions. */ + DW_AT_VMS_rtnbeg_pd_address = 0x2201 COMMA + /* UPC extension. */ + DW_AT_upc_threads_scaled = 0x3210 +IF_NOT_ASM(};) + +#define DW_AT_lo_user 0x2000 /* Implementation-defined range start. */ +#define DW_AT_hi_user 0x3ff0 /* Implementation-defined range end. */ + +/* Location atom names and codes. 
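+ (Editorial note, not in the original header: these opcodes form DWARF's small stack machine for computing locations; e.g. the two-byte expression DW_OP_breg5 <sleb128 8> pushes the value of register 5 plus 8, and DW_OP_deref then replaces the top of the stack with the word it addresses.)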
*/ +ENUM(dwarf_location_atom) + + DW_OP_addr = 0x03 COMMA + DW_OP_deref = 0x06 COMMA + DW_OP_const1u = 0x08 COMMA + DW_OP_const1s = 0x09 COMMA + DW_OP_const2u = 0x0a COMMA + DW_OP_const2s = 0x0b COMMA + DW_OP_const4u = 0x0c COMMA + DW_OP_const4s = 0x0d COMMA + DW_OP_const8u = 0x0e COMMA + DW_OP_const8s = 0x0f COMMA + DW_OP_constu = 0x10 COMMA + DW_OP_consts = 0x11 COMMA + DW_OP_dup = 0x12 COMMA + DW_OP_drop = 0x13 COMMA + DW_OP_over = 0x14 COMMA + DW_OP_pick = 0x15 COMMA + DW_OP_swap = 0x16 COMMA + DW_OP_rot = 0x17 COMMA + DW_OP_xderef = 0x18 COMMA + DW_OP_abs = 0x19 COMMA + DW_OP_and = 0x1a COMMA + DW_OP_div = 0x1b COMMA + DW_OP_minus = 0x1c COMMA + DW_OP_mod = 0x1d COMMA + DW_OP_mul = 0x1e COMMA + DW_OP_neg = 0x1f COMMA + DW_OP_not = 0x20 COMMA + DW_OP_or = 0x21 COMMA + DW_OP_plus = 0x22 COMMA + DW_OP_plus_uconst = 0x23 COMMA + DW_OP_shl = 0x24 COMMA + DW_OP_shr = 0x25 COMMA + DW_OP_shra = 0x26 COMMA + DW_OP_xor = 0x27 COMMA + DW_OP_bra = 0x28 COMMA + DW_OP_eq = 0x29 COMMA + DW_OP_ge = 0x2a COMMA + DW_OP_gt = 0x2b COMMA + DW_OP_le = 0x2c COMMA + DW_OP_lt = 0x2d COMMA + DW_OP_ne = 0x2e COMMA + DW_OP_skip = 0x2f COMMA + DW_OP_lit0 = 0x30 COMMA + DW_OP_lit1 = 0x31 COMMA + DW_OP_lit2 = 0x32 COMMA + DW_OP_lit3 = 0x33 COMMA + DW_OP_lit4 = 0x34 COMMA + DW_OP_lit5 = 0x35 COMMA + DW_OP_lit6 = 0x36 COMMA + DW_OP_lit7 = 0x37 COMMA + DW_OP_lit8 = 0x38 COMMA + DW_OP_lit9 = 0x39 COMMA + DW_OP_lit10 = 0x3a COMMA + DW_OP_lit11 = 0x3b COMMA + DW_OP_lit12 = 0x3c COMMA + DW_OP_lit13 = 0x3d COMMA + DW_OP_lit14 = 0x3e COMMA + DW_OP_lit15 = 0x3f COMMA + DW_OP_lit16 = 0x40 COMMA + DW_OP_lit17 = 0x41 COMMA + DW_OP_lit18 = 0x42 COMMA + DW_OP_lit19 = 0x43 COMMA + DW_OP_lit20 = 0x44 COMMA + DW_OP_lit21 = 0x45 COMMA + DW_OP_lit22 = 0x46 COMMA + DW_OP_lit23 = 0x47 COMMA + DW_OP_lit24 = 0x48 COMMA + DW_OP_lit25 = 0x49 COMMA + DW_OP_lit26 = 0x4a COMMA + DW_OP_lit27 = 0x4b COMMA + DW_OP_lit28 = 0x4c COMMA + DW_OP_lit29 = 0x4d COMMA + DW_OP_lit30 = 0x4e COMMA + DW_OP_lit31 = 0x4f COMMA + DW_OP_reg0 = 0x50 COMMA + DW_OP_reg1 = 0x51 COMMA + DW_OP_reg2 = 0x52 COMMA + DW_OP_reg3 = 0x53 COMMA + DW_OP_reg4 = 0x54 COMMA + DW_OP_reg5 = 0x55 COMMA + DW_OP_reg6 = 0x56 COMMA + DW_OP_reg7 = 0x57 COMMA + DW_OP_reg8 = 0x58 COMMA + DW_OP_reg9 = 0x59 COMMA + DW_OP_reg10 = 0x5a COMMA + DW_OP_reg11 = 0x5b COMMA + DW_OP_reg12 = 0x5c COMMA + DW_OP_reg13 = 0x5d COMMA + DW_OP_reg14 = 0x5e COMMA + DW_OP_reg15 = 0x5f COMMA + DW_OP_reg16 = 0x60 COMMA + DW_OP_reg17 = 0x61 COMMA + DW_OP_reg18 = 0x62 COMMA + DW_OP_reg19 = 0x63 COMMA + DW_OP_reg20 = 0x64 COMMA + DW_OP_reg21 = 0x65 COMMA + DW_OP_reg22 = 0x66 COMMA + DW_OP_reg23 = 0x67 COMMA + DW_OP_reg24 = 0x68 COMMA + DW_OP_reg25 = 0x69 COMMA + DW_OP_reg26 = 0x6a COMMA + DW_OP_reg27 = 0x6b COMMA + DW_OP_reg28 = 0x6c COMMA + DW_OP_reg29 = 0x6d COMMA + DW_OP_reg30 = 0x6e COMMA + DW_OP_reg31 = 0x6f COMMA + DW_OP_breg0 = 0x70 COMMA + DW_OP_breg1 = 0x71 COMMA + DW_OP_breg2 = 0x72 COMMA + DW_OP_breg3 = 0x73 COMMA + DW_OP_breg4 = 0x74 COMMA + DW_OP_breg5 = 0x75 COMMA + DW_OP_breg6 = 0x76 COMMA + DW_OP_breg7 = 0x77 COMMA + DW_OP_breg8 = 0x78 COMMA + DW_OP_breg9 = 0x79 COMMA + DW_OP_breg10 = 0x7a COMMA + DW_OP_breg11 = 0x7b COMMA + DW_OP_breg12 = 0x7c COMMA + DW_OP_breg13 = 0x7d COMMA + DW_OP_breg14 = 0x7e COMMA + DW_OP_breg15 = 0x7f COMMA + DW_OP_breg16 = 0x80 COMMA + DW_OP_breg17 = 0x81 COMMA + DW_OP_breg18 = 0x82 COMMA + DW_OP_breg19 = 0x83 COMMA + DW_OP_breg20 = 0x84 COMMA + DW_OP_breg21 = 0x85 COMMA + DW_OP_breg22 = 0x86 COMMA + DW_OP_breg23 = 0x87 COMMA + DW_OP_breg24 = 0x88 COMMA + DW_OP_breg25 = 
0x89 COMMA + DW_OP_breg26 = 0x8a COMMA + DW_OP_breg27 = 0x8b COMMA + DW_OP_breg28 = 0x8c COMMA + DW_OP_breg29 = 0x8d COMMA + DW_OP_breg30 = 0x8e COMMA + DW_OP_breg31 = 0x8f COMMA + DW_OP_regx = 0x90 COMMA + DW_OP_fbreg = 0x91 COMMA + DW_OP_bregx = 0x92 COMMA + DW_OP_piece = 0x93 COMMA + DW_OP_deref_size = 0x94 COMMA + DW_OP_xderef_size = 0x95 COMMA + DW_OP_nop = 0x96 COMMA + /* DWARF 3 extensions. */ + DW_OP_push_object_address = 0x97 COMMA + DW_OP_call2 = 0x98 COMMA + DW_OP_call4 = 0x99 COMMA + DW_OP_call_ref = 0x9a COMMA + /* GNU extensions. */ + DW_OP_GNU_push_tls_address = 0xe0 +IF_NOT_ASM(};) + +#define DW_OP_lo_user 0xe0 /* Implementation-defined range start. */ +#define DW_OP_hi_user 0xff /* Implementation-defined range end. */ + +/* Type encodings. */ +ENUM(dwarf_type) + + DW_ATE_void = 0x0 COMMA + DW_ATE_address = 0x1 COMMA + DW_ATE_boolean = 0x2 COMMA + DW_ATE_complex_float = 0x3 COMMA + DW_ATE_float = 0x4 COMMA + DW_ATE_signed = 0x5 COMMA + DW_ATE_signed_char = 0x6 COMMA + DW_ATE_unsigned = 0x7 COMMA + DW_ATE_unsigned_char = 0x8 COMMA + /* DWARF 3. */ + DW_ATE_imaginary_float = 0x9 +IF_NOT_ASM(};) + +#define DW_ATE_lo_user 0x80 +#define DW_ATE_hi_user 0xff + +/* Array ordering names and codes. */ +ENUM(dwarf_array_dim_ordering) + + DW_ORD_row_major = 0 COMMA + DW_ORD_col_major = 1 +IF_NOT_ASM(};) + +/* Access attribute. */ +ENUM(dwarf_access_attribute) + + DW_ACCESS_public = 1 COMMA + DW_ACCESS_protected = 2 COMMA + DW_ACCESS_private = 3 +IF_NOT_ASM(};) + +/* Visibility. */ +ENUM(dwarf_visibility_attribute) + + DW_VIS_local = 1 COMMA + DW_VIS_exported = 2 COMMA + DW_VIS_qualified = 3 +IF_NOT_ASM(};) + +/* Virtuality. */ +ENUM(dwarf_virtuality_attribute) + + DW_VIRTUALITY_none = 0 COMMA + DW_VIRTUALITY_virtual = 1 COMMA + DW_VIRTUALITY_pure_virtual = 2 +IF_NOT_ASM(};) + +/* Case sensitivity. */ +ENUM(dwarf_id_case) + + DW_ID_case_sensitive = 0 COMMA + DW_ID_up_case = 1 COMMA + DW_ID_down_case = 2 COMMA + DW_ID_case_insensitive = 3 +IF_NOT_ASM(};) + +/* Calling convention. */ +ENUM(dwarf_calling_convention) + + DW_CC_normal = 0x1 COMMA + DW_CC_program = 0x2 COMMA + DW_CC_nocall = 0x3 +IF_NOT_ASM(};) + +#define DW_CC_lo_user 0x40 +#define DW_CC_hi_user 0xff + +/* Inline attribute. */ +ENUM(dwarf_inline_attribute) + + DW_INL_not_inlined = 0 COMMA + DW_INL_inlined = 1 COMMA + DW_INL_declared_not_inlined = 2 COMMA + DW_INL_declared_inlined = 3 +IF_NOT_ASM(};) + +/* Discriminant lists. */ +ENUM(dwarf_discrim_list) + + DW_DSC_label = 0 COMMA + DW_DSC_range = 1 +IF_NOT_ASM(};) + +/* Line number opcodes. */ +ENUM(dwarf_line_number_ops) + + DW_LNS_extended_op = 0 COMMA + DW_LNS_copy = 1 COMMA + DW_LNS_advance_pc = 2 COMMA + DW_LNS_advance_line = 3 COMMA + DW_LNS_set_file = 4 COMMA + DW_LNS_set_column = 5 COMMA + DW_LNS_negate_stmt = 6 COMMA + DW_LNS_set_basic_block = 7 COMMA + DW_LNS_const_add_pc = 8 COMMA + DW_LNS_fixed_advance_pc = 9 COMMA + /* DWARF 3. */ + DW_LNS_set_prologue_end = 10 COMMA + DW_LNS_set_epilogue_begin = 11 COMMA + DW_LNS_set_isa = 12 +IF_NOT_ASM(};) + +/* Line number extended opcodes. */ +ENUM(dwarf_line_number_x_ops) + + DW_LNE_end_sequence = 1 COMMA + DW_LNE_set_address = 2 COMMA + DW_LNE_define_file = 3 +IF_NOT_ASM(};) + +/* Call frame information. 
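+ (Editorial note: these are the opcodes emitted by the helper macros in the new include/linux/dwarf2-lang.h above -- CFA_define_reference() emits DW_CFA_def_cfa with two uleb128 operands, CFA_define_offset() emits DW_CFA_offset + reg, and FDE_advance() emits DW_CFA_advance_loc4 followed by a 4-byte delta.)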
*/ +ENUM(dwarf_call_frame_info) + + DW_CFA_advance_loc = 0x40 COMMA + DW_CFA_offset = 0x80 COMMA + DW_CFA_restore = 0xc0 COMMA + DW_CFA_nop = 0x00 COMMA + DW_CFA_set_loc = 0x01 COMMA + DW_CFA_advance_loc1 = 0x02 COMMA + DW_CFA_advance_loc2 = 0x03 COMMA + DW_CFA_advance_loc4 = 0x04 COMMA + DW_CFA_offset_extended = 0x05 COMMA + DW_CFA_restore_extended = 0x06 COMMA + DW_CFA_undefined = 0x07 COMMA + DW_CFA_same_value = 0x08 COMMA + DW_CFA_register = 0x09 COMMA + DW_CFA_remember_state = 0x0a COMMA + DW_CFA_restore_state = 0x0b COMMA + DW_CFA_def_cfa = 0x0c COMMA + DW_CFA_def_cfa_register = 0x0d COMMA + DW_CFA_def_cfa_offset = 0x0e COMMA + + /* DWARF 3. */ + DW_CFA_def_cfa_expression = 0x0f COMMA + DW_CFA_expression = 0x10 COMMA + DW_CFA_offset_extended_sf = 0x11 COMMA + DW_CFA_def_cfa_sf = 0x12 COMMA + DW_CFA_def_cfa_offset_sf = 0x13 COMMA + + /* SGI/MIPS specific. */ + DW_CFA_MIPS_advance_loc8 = 0x1d COMMA + + /* GNU extensions. */ + DW_CFA_GNU_window_save = 0x2d COMMA + DW_CFA_GNU_args_size = 0x2e COMMA + DW_CFA_GNU_negative_offset_extended = 0x2f +IF_NOT_ASM(};) + +#define DW_CIE_ID 0xffffffff +#define DW_CIE_VERSION 1 + +#define DW_CFA_extended 0 +#define DW_CFA_lo_user 0x1c +#define DW_CFA_hi_user 0x3f + +#define DW_CHILDREN_no 0x00 +#define DW_CHILDREN_yes 0x01 + +#define DW_ADDR_none 0 + +/* Source language names and codes. */ +ENUM(dwarf_source_language) + + DW_LANG_C89 = 0x0001 COMMA + DW_LANG_C = 0x0002 COMMA + DW_LANG_Ada83 = 0x0003 COMMA + DW_LANG_C_plus_plus = 0x0004 COMMA + DW_LANG_Cobol74 = 0x0005 COMMA + DW_LANG_Cobol85 = 0x0006 COMMA + DW_LANG_Fortran77 = 0x0007 COMMA + DW_LANG_Fortran90 = 0x0008 COMMA + DW_LANG_Pascal83 = 0x0009 COMMA + DW_LANG_Modula2 = 0x000a COMMA + DW_LANG_Java = 0x000b COMMA + /* DWARF 3. */ + DW_LANG_C99 = 0x000c COMMA + DW_LANG_Ada95 = 0x000d COMMA + DW_LANG_Fortran95 = 0x000e COMMA + /* MIPS. */ + DW_LANG_Mips_Assembler = 0x8001 COMMA + /* UPC. */ + DW_LANG_Upc = 0x8765 +IF_NOT_ASM(};) + +#define DW_LANG_lo_user 0x8000 /* Implementation-defined range start. */ +#define DW_LANG_hi_user 0xffff /* Implementation-defined range end. */ + +/* Names and codes for macro information. */ +ENUM(dwarf_macinfo_record_type) + + DW_MACINFO_define = 1 COMMA + DW_MACINFO_undef = 2 COMMA + DW_MACINFO_start_file = 3 COMMA + DW_MACINFO_end_file = 4 COMMA + DW_MACINFO_vendor_ext = 255 +IF_NOT_ASM(};) + +/* @@@ For use with GNU frame unwind information. */ + +#define DW_EH_PE_absptr 0x00 +#define DW_EH_PE_omit 0xff + +#define DW_EH_PE_uleb128 0x01 +#define DW_EH_PE_udata2 0x02 +#define DW_EH_PE_udata4 0x03 +#define DW_EH_PE_udata8 0x04 +#define DW_EH_PE_sleb128 0x09 +#define DW_EH_PE_sdata2 0x0A +#define DW_EH_PE_sdata4 0x0B +#define DW_EH_PE_sdata8 0x0C +#define DW_EH_PE_signed 0x08 + +#define DW_EH_PE_pcrel 0x10 +#define DW_EH_PE_textrel 0x20 +#define DW_EH_PE_datarel 0x30 +#define DW_EH_PE_funcrel 0x40 +#define DW_EH_PE_aligned 0x50 + +#define DW_EH_PE_indirect 0x80 + +#endif /* _ELF_DWARF2_H */ --- diff/include/linux/lockmeter.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/linux/lockmeter.h 2004-04-21 10:46:51.774711160 +0100 @@ -0,0 +1,320 @@ +/* + * Copyright (C) 1999-2002 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.h by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) Feb-Apr 2000 + * Changes Copyright (C) 2000 IBM, Inc.
+ * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. + * Moved machine dependent code to include/asm/lockmeter.h. + * + */ + +#ifndef _LINUX_LOCKMETER_H +#define _LINUX_LOCKMETER_H + + +/*--------------------------------------------------- + * architecture-independent lockmeter.h + *-------------------------------------------------*/ + +/* + * raybry -- version 2: added efficient hold time statistics + * requires lstat recompile, so flagged as new version + * raybry -- version 3: added global reader lock data + * hawkes -- version 4: removed some unnecessary fields to simplify mips64 port + */ +#define LSTAT_VERSION 5 + +int lstat_update(void*, void*, int); +int lstat_update_time(void*, void*, int, uint32_t); + +/* + * Currently, the mips64 and sparc64 kernels talk to a 32-bit lockstat, so we + * need to force compatibility in the inter-communication data structure. + */ + +#if defined(CONFIG_MIPS32_COMPAT) +#define TIME_T uint32_t +#elif defined(CONFIG_SPARC) || defined(CONFIG_SPARC64) +#define TIME_T uint64_t +#else +#define TIME_T time_t +#endif + +#if defined(__KERNEL__) || (!defined(CONFIG_MIPS32_COMPAT) && !defined(CONFIG_SPARC) && !defined(CONFIG_SPARC64)) || (_MIPS_SZLONG==32) +#define POINTER void * +#else +#define POINTER int64_t +#endif + +/* + * Values for the "action" parameter passed to lstat_update. + * ZZZ - do we want a try-success status here??? + */ +#define LSTAT_ACT_NO_WAIT 0 +#define LSTAT_ACT_SPIN 1 +#define LSTAT_ACT_REJECT 2 +#define LSTAT_ACT_WW_SPIN 3 +#define LSTAT_ACT_SLEPT 4 /* UNUSED */ + +#define LSTAT_ACT_MAX_VALUES 4 /* NOTE: Increase to 5 if using ACT_SLEPT */ + +/* + * Special values for the low 2 bits of an RA passed to + * lstat_update. + */ +/* we use these values to figure out what kind of lock data */ +/* is stored in the statistics table entry at index ....... */ +#define LSTAT_RA_SPIN 0 /* spin lock data */ +#define LSTAT_RA_READ 1 /* read lock statistics */ +#define LSTAT_RA_SEMA 2 /* RESERVED */ +#define LSTAT_RA_WRITE 3 /* write lock statistics*/ + +#define LSTAT_RA(n) \ + ((void*)( ((unsigned long)__builtin_return_address(0) & ~3) | n) ) + +/* + * Constants used for lock addresses in the lstat_directory + * to indicate special values of the lock address. + */ +#define LSTAT_MULTI_LOCK_ADDRESS NULL + +/* + * Maximum size of the lockstats tables. Increase this value + * if it's not big enough. (Nothing bad happens if it's not + * big enough, although some locks will not be monitored.) + * We record overflows of this quantity in lstat_control.dir_overflow. + * + * Note: The max value here must fit into the field set + * and obtained by the macros PUT_INDEX() and GET_INDEX(). + * This value depends on how many bits are available in the + * lock word in the particular machine implementation we are on. + */ +#define LSTAT_MAX_STAT_INDEX 2000 + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This defines an entry in the lockstat directory. It contains + * information about a lock being monitored. + * A directory entry only contains the lock identification - + * counts on usage of the lock are kept elsewhere in a per-cpu + * data structure to minimize cache line pinging.
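+ * (Editorial illustration: those per-cpu counts live in + * lstat_control.counts[], so the spin-wait count for directory entry i on + * cpu c ends up in (*lstat_control.counts[c])[i].count[LSTAT_ACT_SPIN], as + * updated by lstat_update() in kernel/lockmeter.c.)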
+ */ +typedef struct { + POINTER caller_ra; /* RA of code that set lock */ + POINTER lock_ptr; /* lock address */ + ushort next_stat_index; /* Used to link multiple locks that have the same hash table value */ +} lstat_directory_entry_t; + +/* + * A multi-dimensional array used to contain counts for lock accesses. + * The array is 3-dimensional: + * - CPU number. Keep from thrashing cache lines between CPUs + * - Directory entry index. Identifies the lock + * - Action. Indicates what kind of contention occurred on an + * access to the lock. + * + * The index of an entry in the directory is the same as the 2nd index + * of the entry in the counts array. + */ +/* + * This table contains data for spin_locks, write locks, and read locks. + * Not all data is used for all cases. In particular, the hold time + * information is not stored here for read locks since that is a global + * (e.g., cannot be separated out by return address) quantity. + * See the lstat_read_lock_counts_t structure for the global read lock + * hold time. + */ +typedef struct { + uint64_t cum_wait_ticks; /* sum of wait times */ + /* for write locks, sum of time a */ + /* writer is waiting for a reader */ + int64_t cum_hold_ticks; /* cumulative sum of holds */ + /* not used for read mode locks */ + /* must be signed. ............... */ + uint32_t max_wait_ticks; /* max waiting time */ + uint32_t max_hold_ticks; /* max holding time */ + uint64_t cum_wait_ww_ticks; /* sum times writer waits on writer*/ + uint32_t max_wait_ww_ticks; /* max wait time writer vs writer */ + /* prev 2 only used for write locks*/ + uint32_t acquire_time; /* time lock acquired this CPU */ + uint32_t count[LSTAT_ACT_MAX_VALUES]; +} lstat_lock_counts_t; + +typedef lstat_lock_counts_t lstat_cpu_counts_t[LSTAT_MAX_STAT_INDEX]; + +/* + * User request to: + * - turn statistic collection on/off, or to reset + */ +#define LSTAT_OFF 0 +#define LSTAT_ON 1 +#define LSTAT_RESET 2 +#define LSTAT_RELEASE 3 + +#define LSTAT_MAX_READ_LOCK_INDEX 1000 +typedef struct { + POINTER lock_ptr; /* address of lock for output stats */ + uint32_t read_lock_count; + int64_t cum_hold_ticks; /* sum of read lock hold times over */ + /* all callers. ....................*/ + uint32_t write_index; /* last write lock hash table index */ + uint32_t busy_periods; /* count of busy periods ended this */ + uint64_t start_busy; /* time this busy period started. ..*/ + uint64_t busy_ticks; /* sum of busy periods this lock. ..*/ + uint64_t max_busy; /* longest busy period for this lock*/ + uint32_t max_readers; /* maximum number of readers ...... */ +#ifdef USER_MODE_TESTING + rwlock_t entry_lock; /* lock for this read lock entry... */ + /* avoid having more than one rdr at*/ + /* needed for user space testing... */ + /* not needed for kernel 'cause it */ + /* is non-preemptive. ............. */ +#endif +} lstat_read_lock_counts_t; +typedef lstat_read_lock_counts_t lstat_read_lock_cpu_counts_t[LSTAT_MAX_READ_LOCK_INDEX]; + +#if defined(__KERNEL__) || defined(USER_MODE_TESTING) + +#ifndef USER_MODE_TESTING +#include +#else +#include "asm_newlockmeter.h" +#endif + +/* + * Size and mask for the hash table into the directory. + */ +#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ +#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) + +#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) + +/* + * This version eliminates the per processor lock stack. What we do is + * store the index of the lock hash structure in unused bits in the lock + * itself.
Then on unlock we can find the statistics record without doing + * any additional hash or lock stack lookup. This works for spin_locks. + * Hold time reporting is now basically as cheap as wait time reporting, + * so we ignore the difference between LSTAT_ON_HOLD and LSTAT_ON_WAIT + * as in version 1.1.* of lockmeter. + * + * For rw_locks, we store the index of a global reader stats structure in + * the lock, and the writer index is stored in the latter structure. + * For read mode locks we hash at the time of the lock to find an entry + * in the directory for reader wait time and the like. + * At unlock time for read mode locks, we update just the global structure + * so we don't need to know the reader directory index value at unlock time. + * + */ + +/* + * Protocol to change lstat_control.state + * This is complicated because we don't want the cum_hold_time for + * a rw_lock to be decremented in _read_lock_ without making sure it + * is incremented in _read_unlock_ and vice versa. So here is the + * way we change the state of lstat_control.state: + * I. To Turn Statistics On + * After allocating storage, set lstat_control.state non-zero. + * This works because we don't start updating statistics for in-use + * locks until the reader lock count goes to zero. + * II. To Turn Statistics Off: + * (0) Disable interrupts on this CPU + * (1) Seize the lstat_control.directory_lock + * (2) Obtain the current value of lstat_control.next_free_read_lock_index + * (3) Store a zero in lstat_control.state. + * (4) Release the lstat_control.directory_lock + * (5) For each lock in the read lock list up to the saved value + * (well, -1) of the next_free_read_lock_index, do the following: + * (a) Check validity of the stored lock address + * by making sure that the word at the saved addr + * has an index that matches this entry. If not + * valid, then skip this entry. + * (b) If there is a write lock already set on this lock, + * skip to (d) below. + * (c) Set a non-metered write lock on the lock + * (d) set the cached INDEX in the lock to zero + * (e) Release the non-metered write lock. + * (6) Re-enable interrupts + * + * These rules ensure that a read lock will not have its statistics + * partially updated even though the global lock recording state has + * changed. See put_lockmeter_info() for implementation. + * + * The reason for (b) is that there may be write locks set on the + * syscall path to put_lockmeter_info() from user space. If we do + * not do this check, then we can deadlock. A similar problem would + * occur if the lock was read locked by the current CPU. At the + * moment this does not appear to happen. + */ + +/* + * Main control structure for lockstat. Used to turn statistics on/off + * and to maintain directory info. + */ +typedef struct { + int state; + spinlock_t control_lock; /* used to serialize turning statistics on/off */ + spinlock_t directory_lock; /* to serialize adding entries to the directory */ + volatile int next_free_dir_index; /* next free entry in the directory */ + /* FIXME not all of these fields are used / needed ..............
*/ + /* the following fields represent data since */ + /* first "lstat on" or most recent "lstat reset" */ + TIME_T first_started_time; /* time when measurement first enabled */ + TIME_T started_time; /* time when measurement last started */ + TIME_T ending_time; /* time when measurement last disabled */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when measurement last disabled */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i.e. number of times we did lstat on; lstat off */ + lstat_directory_entry_t *dir; /* directory */ + int dir_overflow; /* count of times ran out of space in directory */ + int rwlock_overflow; /* count of times we couldn't allocate a rw block*/ + ushort *hashtab; /* hash table for quick dir scans */ + lstat_cpu_counts_t *counts[NR_CPUS]; /* Array of pointers to per-cpu stats */ + int next_free_read_lock_index; /* next rwlock reader (global) stats block */ + lstat_read_lock_cpu_counts_t *read_lock_counts[NR_CPUS]; /* per cpu read lock stats */ +} lstat_control_t; + +#endif /* defined(__KERNEL__) || defined(USER_MODE_TESTING) */ + +typedef struct { + short lstat_version; /* version of the data */ + short state; /* the current state is returned */ + int maxcpus; /* Number of cpus present */ + int next_free_dir_index; /* index of the next free directory entry */ + TIME_T first_started_time; /* when measurement enabled for first time */ + TIME_T started_time; /* time in secs since 1970 when stats last turned on */ + TIME_T ending_time; /* time in secs since 1970 when stats last turned off */ + uint32_t cycleval; /* cycles per second */ +#ifdef notyet + void *kernel_magic_addr; /* address of kernel_magic */ + void *kernel_end_addr; /* contents of kernel magic (points to "end") */ +#endif + int next_free_read_lock_index; /* index of next (global) read lock stats struct */ + uint64_t started_cycles64; /* cycles when measurement last started */ + uint64_t ending_cycles64; /* cycles when stats last turned off */ + uint64_t enabled_cycles64; /* total cycles with measurement enabled */ + int intervals; /* number of measurement intervals recorded */ + /* i.e. number of times we did lstat on; lstat off*/ + int dir_overflow; /* number of times we wanted more space in directory */ + int rwlock_overflow; /* # of times we wanted more space in read_lock_counts */ + struct new_utsname uts; /* info about machine where stats are measured */ + /* -T option of lockstat allows data to be */ + /* moved to another machine. ................. */ +} lstat_user_request_t; + +#endif /* _LINUX_LOCKMETER_H */ --- diff/include/linux/mempolicy.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/linux/mempolicy.h 2004-04-21 10:46:51.774711160 +0100 @@ -0,0 +1,221 @@ +#ifndef _LINUX_MEMPOLICY_H +#define _LINUX_MEMPOLICY_H 1 + +#include + +/* + * NUMA memory policies for Linux.
+ * Copyright 2003,2004 Andi Kleen SuSE Labs + */ + +/* Policies */ +#define MPOL_DEFAULT 0 +#define MPOL_PREFERRED 1 +#define MPOL_BIND 2 +#define MPOL_INTERLEAVE 3 + +#define MPOL_MAX MPOL_INTERLEAVE + +/* Flags for get_mem_policy */ +#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */ +#define MPOL_F_ADDR (1<<1) /* look up vma using address */ + +/* Flags for mbind */ +#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +struct vm_area_struct; + +#ifdef CONFIG_NUMA + +/* + * Describe a memory policy. + * + * A mempolicy can be either associated with a process or with a VMA. + * For VMA related allocations the VMA policy is preferred, otherwise + * the process policy is used. Interrupts ignore the memory policy + * of the current process. + * + * Locking policy for interleave: + * In process context there is no locking because only the process accesses + * its own state. All vma manipulation is somewhat protected by a down_read on + * mmap_sem. For allocating in the interleave policy the page_table_lock + * must also be acquired to protect il_next. + * + * Freeing policy: + * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd. + * All other policies don't have any external state. mpol_free() handles this. + * + * Copying policy objects: + * For MPOL_BIND the zonelist must always be duplicated. mpol_copy() does this. + */ +struct mempolicy { + atomic_t refcnt; + short policy; /* See MPOL_* above */ + union { + struct zonelist *zonelist; /* bind */ + short preferred_node; /* preferred */ + DECLARE_BITMAP(nodes, MAX_NUMNODES); /* interleave */ + /* undefined for default */ + } v; +}; + +/* A NULL mempolicy pointer is a synonym for &default_policy. */ +extern struct mempolicy default_policy; + +/* + * Support for managing mempolicy data objects (clone, copy, destroy). + * The default fast path of a NULL MPOL_DEFAULT policy is always inlined. + */ + +extern void __mpol_free(struct mempolicy *pol); +static inline void mpol_free(struct mempolicy *pol) +{ + if (pol) + __mpol_free(pol); +} + +extern struct mempolicy *__mpol_copy(struct mempolicy *pol); +static inline struct mempolicy *mpol_copy(struct mempolicy *pol) +{ + if (pol) + pol = __mpol_copy(pol); + return pol; +} + +#define vma_policy(vma) ((vma)->vm_policy) +#define vma_set_policy(vma, pol) ((vma)->vm_policy = (pol)) + +static inline void mpol_get(struct mempolicy *pol) +{ + if (pol) + atomic_inc(&pol->refcnt); +} + +extern int __mpol_equal(struct mempolicy *a, struct mempolicy *b); +static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b) +{ + if (a == b) + return 1; + return __mpol_equal(a, b); +} +#define vma_mpol_equal(a,b) mpol_equal(vma_policy(a), vma_policy(b)) + +/* Could later add inheritance of the process policy here. */ + +#define mpol_set_vma_default(vma) ((vma)->vm_policy = NULL) + +/* + * Hugetlb policy. i386 hugetlb so far works with node numbers + * instead of zone lists, so give it special interfaces for now. + */ +extern int mpol_first_node(struct vm_area_struct *vma, unsigned long addr); +extern int mpol_node_valid(int nid, struct vm_area_struct *vma, + unsigned long addr); + +/* + * Tree of shared policies for a shared memory region. + * Maintain the policies in a pseudo mm that contains vmas. The vmas + * carry the policy.
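+ * (Editorial sketch of the intended use, not part of the original comment: + * a shared segment records a policy for a range with + * mpol_set_shared_policy() and the allocator later retrieves it with + * mpol_shared_policy_lookup().)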
As a special twist the pseudo mm is indexed in pages, not + * bytes, so that we can work with shared memory segments bigger than + * unsigned long. + */ + +struct sp_node { + struct rb_node nd; + unsigned long start, end; + struct mempolicy *policy; +}; + +struct shared_policy { + struct rb_root root; + struct semaphore sem; +}; + +static inline void mpol_shared_policy_init(struct shared_policy *info) +{ + info->root = RB_ROOT; + init_MUTEX(&info->sem); +} + +int mpol_set_shared_policy(struct shared_policy *info, + struct vm_area_struct *vma, + struct mempolicy *new); +void mpol_free_shared_policy(struct shared_policy *p); +struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, + unsigned long idx); + +#else + +struct mempolicy {}; + +static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b) +{ + return 1; +} +#define vma_mpol_equal(a,b) 1 + +#define mpol_set_vma_default(vma) do {} while(0) + +static inline void mpol_free(struct mempolicy *p) +{ +} + +static inline void mpol_get(struct mempolicy *pol) +{ +} + +static inline struct mempolicy *mpol_copy(struct mempolicy *old) +{ + return NULL; +} + +static inline int mpol_first_node(struct vm_area_struct *vma, unsigned long a) +{ + return numa_node_id(); +} + +static inline int +mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long a) +{ + return 1; +} + +struct shared_policy {}; + +static inline int mpol_set_shared_policy(struct shared_policy *info, + struct vm_area_struct *vma, + struct mempolicy *new) +{ + return -EINVAL; +} + +static inline void mpol_shared_policy_init(struct shared_policy *info) +{ +} + +static inline void mpol_free_shared_policy(struct shared_policy *p) +{ +} + +static inline struct mempolicy * +mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +{ + return NULL; +} + +#define vma_policy(vma) NULL +#define vma_set_policy(vma, pol) do {} while(0) + +#endif /* CONFIG_NUMA */ +#endif /* __KERNEL__ */ + +#endif --- diff/kernel/lockmeter.c 1970-01-01 01:00:00.000000000 +0100 +++ source/kernel/lockmeter.c 2004-04-21 10:46:51.790708728 +0100 @@ -0,0 +1,1178 @@ +/* + * Copyright (C) 1999,2000 Silicon Graphics, Inc. + * + * Written by John Hawkes (hawkes@sgi.com) + * Based on klstat.c by Jack Steiner (steiner@sgi.com) + * + * Modified by Ray Bryant (raybry@us.ibm.com) + * Changes Copyright (C) 2000 IBM, Inc. + * Added save of index in spinlock_t to improve efficiency + * of "hold" time reporting for spinlocks + * Added support for hold time statistics for read and write + * locks. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define ASSERT(cond) +#define bzero(loc,size) memset(loc,0,size) + +/*<---------------------------------------------------*/ +/* lockmeter.c */ +/*>---------------------------------------------------*/ + +static lstat_control_t lstat_control __cacheline_aligned = + { LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, + 19 * 0, NR_CPUS * 0, 0, NR_CPUS * 0 }; + +static ushort lstat_make_dir_entry(void *, void *); + +/* + * lstat_lookup + * + * Given a RA, locate the directory entry for the lock. 
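+ * (Editorial note: the caller_ra seen here arrives pre-tagged by the + * LSTAT_RA() macro in include/linux/lockmeter.h, which stores the lock type + * -- LSTAT_RA_SPIN, _READ or _WRITE -- in the low two bits of + * __builtin_return_address(0).)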
+ */ +static ushort +lstat_lookup(void *lock_ptr, void *caller_ra) +{ + ushort index; + lstat_directory_entry_t *dirp; + + dirp = lstat_control.dir; + + index = lstat_control.hashtab[DIRHASH(caller_ra)]; + while (dirp[index].caller_ra != caller_ra) { + if (index == 0) { + return lstat_make_dir_entry(lock_ptr, caller_ra); + } + index = dirp[index].next_stat_index; + } + + if (dirp[index].lock_ptr != NULL && dirp[index].lock_ptr != lock_ptr) { + dirp[index].lock_ptr = NULL; + } + + return index; +} + +/* + * lstat_make_dir_entry + * Called to add a new lock to the lock directory. + */ +static ushort +lstat_make_dir_entry(void *lock_ptr, void *caller_ra) +{ + lstat_directory_entry_t *dirp; + ushort index, hindex; + unsigned long flags; + + /* lock the table without recursively reentering this metering code */ + local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); + + hindex = DIRHASH(caller_ra); + index = lstat_control.hashtab[hindex]; + dirp = lstat_control.dir; + while (index && dirp[index].caller_ra != caller_ra) + index = dirp[index].next_stat_index; + + if (index == 0) { + if (lstat_control.next_free_dir_index < LSTAT_MAX_STAT_INDEX) { + index = lstat_control.next_free_dir_index++; + lstat_control.dir[index].caller_ra = caller_ra; + lstat_control.dir[index].lock_ptr = lock_ptr; + lstat_control.dir[index].next_stat_index = + lstat_control.hashtab[hindex]; + lstat_control.hashtab[hindex] = index; + } else { + lstat_control.dir_overflow++; + } + } + _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags); + return index; +} + +int +lstat_update(void *lock_ptr, void *caller_ra, int action) +{ + int index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) + return 0; + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return index; +} + +int +lstat_update_time(void *lock_ptr, void *caller_ra, int action, uint32_t ticks) +{ + ushort index; + int cpu; + + ASSERT(action < LSTAT_ACT_MAX_VALUES); + + if (lstat_control.state == LSTAT_OFF) + return 0; + + index = lstat_lookup(lock_ptr, caller_ra); + cpu = THIS_CPU_NUMBER; + (*lstat_control.counts[cpu])[index].count[action]++; + (*lstat_control.counts[cpu])[index].cum_wait_ticks += (uint64_t) ticks; + if ((*lstat_control.counts[cpu])[index].max_wait_ticks < ticks) + (*lstat_control.counts[cpu])[index].max_wait_ticks = ticks; + + (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); + + return index; +} + +void +_metered_spin_lock(spinlock_t * lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + _raw_spin_lock(lock_ptr); /* do the real lock */ + PUT_INDEX(lock_ptr, 0); /* clean index in case lockmetering */ + /* gets turned on before unlock */ + } else { + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + int index; + + if (_raw_spin_trylock(lock_ptr)) { + index = lstat_update(lock_ptr, this_pc, + LSTAT_ACT_NO_WAIT); + } else { + uint32_t start_cycles = get_cycles(); + _raw_spin_lock(lock_ptr); /* do the real lock */ + index = lstat_update_time(lock_ptr, this_pc, + LSTAT_ACT_SPIN, get_cycles() - start_cycles); + } + /* save the index in the lock itself for use in spin unlock */ + PUT_INDEX(lock_ptr, index); + } +} + +int +_metered_spin_trylock(spinlock_t * lock_ptr) +{ + if (lstat_control.state == LSTAT_OFF) { + return _raw_spin_trylock(lock_ptr); + } else { + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); + 
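+ /* Editorial comment: try the lock exactly once -- on success record + LSTAT_ACT_NO_WAIT and cache the directory index in the lock word for + the unlock path; on failure just count a reject. */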
+ if ((retval = _raw_spin_trylock(lock_ptr))) { + int index = lstat_update(lock_ptr, this_pc, + LSTAT_ACT_NO_WAIT); + /* + * save the index in the lock itself for use in spin + * unlock + */ + PUT_INDEX(lock_ptr, index); + } else { + lstat_update(lock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; + } +} + +void +_metered_spin_unlock(spinlock_t * lock_ptr) +{ + int index = -1; + + if (lstat_control.state != LSTAT_OFF) { + index = GET_INDEX(lock_ptr); + /* + * If statistics were turned off when we set the lock, + * then the index can be zero. If that is the case, + * then collect no stats on this call. + */ + if (index > 0) { + uint32_t hold_time; + int cpu = THIS_CPU_NUMBER; + hold_time = get_cycles() - + (*lstat_control.counts[cpu])[index].acquire_time; + (*lstat_control.counts[cpu])[index].cum_hold_ticks += + (uint64_t) hold_time; + if ((*lstat_control.counts[cpu])[index].max_hold_ticks < + hold_time) + (*lstat_control.counts[cpu])[index]. + max_hold_ticks = hold_time; + } + } + + /* make sure we don't have a stale index value saved */ + PUT_INDEX(lock_ptr, 0); + _raw_spin_unlock(lock_ptr); /* do the real unlock */ +} + +/* + * allocate the next global read lock structure and store its index + * in the rwlock at "lock_ptr". + */ +uint32_t +alloc_rwlock_struct(rwlock_t * rwlock_ptr) +{ + int index; + unsigned long flags; + int cpu = THIS_CPU_NUMBER; + + /* If we've already overflowed, then do a quick exit */ + if (lstat_control.next_free_read_lock_index > + LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + return 0; + } + + local_irq_save(flags); + _raw_spin_lock(&lstat_control.directory_lock); + + /* It is possible this changed while we were waiting for the directory_lock */ + if (lstat_control.state == LSTAT_OFF) { + index = 0; + goto unlock; + } + + /* It is possible someone else got here first and set the index */ + if ((index = GET_RWINDEX(rwlock_ptr)) == 0) { + /* + * we can't turn on read stats for this lock while there are + * readers (this would mess up the running hold time sum at + * unlock time) + */ + if (RWLOCK_READERS(rwlock_ptr) != 0) { + index = 0; + goto unlock; + } + + /* + * if stats are turned on after being off, we may need to + * return an old index from when the statistics were on last + * time. + */ + for (index = 1; index < lstat_control.next_free_read_lock_index; + index++) + if ((*lstat_control.read_lock_counts[cpu])[index]. 
+ lock_ptr == rwlock_ptr) + goto put_index_and_unlock; + + /* allocate the next global read lock structure */ + if (lstat_control.next_free_read_lock_index >= + LSTAT_MAX_READ_LOCK_INDEX) { + lstat_control.rwlock_overflow++; + index = 0; + goto unlock; + } + index = lstat_control.next_free_read_lock_index++; + + /* + * initialize the global read stats data structure for each + * cpu + */ + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + (*lstat_control.read_lock_counts[cpu])[index].lock_ptr = + rwlock_ptr; + } +put_index_and_unlock: + /* store the index for the read lock structure into the lock */ + PUT_RWINDEX(rwlock_ptr, index); + } + +unlock: + _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags); + return index; +} + +void +_metered_read_lock(rwlock_t * rwlock_ptr) +{ + void *this_pc; + uint32_t start_cycles; + int index; + int cpu; + unsigned long flags; + int readers_before, readers_after; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_READ); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index == 0) + index = alloc_rwlock_struct(rwlock_ptr); + + readers_before = RWLOCK_READERS(rwlock_ptr); + if (_raw_read_trylock(rwlock_ptr)) { + /* + * We have decremented the lock to count a new reader, + * and have confirmed that no writer has it locked. + */ + /* update statistics if enabled */ + if (index > 0) { + local_irq_save(flags); + lstat_update((void *) rwlock_ptr, this_pc, + LSTAT_ACT_NO_WAIT); + /* preserve value of TSC so cum_hold_ticks and start_busy use same value */ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index]. + cum_hold_ticks -= cycles64; + + /* record time and cpu of start of busy period */ + /* this is not perfect (some race conditions are possible) */ + if (readers_before == 0) { + (*lstat_control.read_lock_counts[cpu])[index]. + start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after = RWLOCK_READERS(rwlock_ptr); + if (readers_after > + (*lstat_control.read_lock_counts[cpu])[index]. + max_readers) + (*lstat_control.read_lock_counts[cpu])[index]. + max_readers = readers_after; + local_irq_restore(flags); + } + + return; + } + /* If we get here, then we could not quickly grab the read lock */ + + start_cycles = get_cycles(); /* start counting the wait time */ + + /* Now spin until read_lock is successful */ + _raw_read_lock(rwlock_ptr); + + lstat_update_time((void *) rwlock_ptr, this_pc, LSTAT_ACT_SPIN, + get_cycles() - start_cycles); + + /* update statistics if they are enabled for this lock */ + if (index > 0) { + local_irq_save(flags); + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= + cycles64; + + /* this is not perfect (some race conditions are possible) */ + if (readers_before == 0) { + (*lstat_control.read_lock_counts[cpu])[index]. + start_busy = cycles64; + PUT_RW_CPU(rwlock_ptr, cpu); + } + readers_after = RWLOCK_READERS(rwlock_ptr); + if (readers_after > + (*lstat_control.read_lock_counts[cpu])[index].max_readers) + (*lstat_control.read_lock_counts[cpu])[index]. 
+ max_readers = readers_after; + local_irq_restore(flags); + } +} + +void +_metered_read_unlock(rwlock_t * rwlock_ptr) +{ + int index; + int cpu; + unsigned long flags; + uint64_t busy_length; + uint64_t cycles64; + + if (lstat_control.state == LSTAT_OFF) { + _raw_read_unlock(rwlock_ptr); + return; + } + + index = GET_RWINDEX(rwlock_ptr); + cpu = THIS_CPU_NUMBER; + + if (index > 0) { + local_irq_save(flags); + /* + * preserve value of TSC so cum_hold_ticks and busy_ticks are + * consistent. + */ + cycles64 = get_cycles64(); + (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks += + cycles64; + (*lstat_control.read_lock_counts[cpu])[index].read_lock_count++; + + /* + * once again, this is not perfect (some race conditions are + * possible) + */ + if (RWLOCK_READERS(rwlock_ptr) == 1) { + int cpu1 = GET_RW_CPU(rwlock_ptr); + uint64_t last_start_busy = + (*lstat_control.read_lock_counts[cpu1])[index]. + start_busy; + (*lstat_control.read_lock_counts[cpu])[index]. + busy_periods++; + if (cycles64 > last_start_busy) { + busy_length = cycles64 - last_start_busy; + (*lstat_control.read_lock_counts[cpu])[index]. + busy_ticks += busy_length; + if (busy_length > + (*lstat_control. + read_lock_counts[cpu])[index]. + max_busy) + (*lstat_control. + read_lock_counts[cpu])[index]. + max_busy = busy_length; + } + } + local_irq_restore(flags); + } + _raw_read_unlock(rwlock_ptr); +} + +void +_metered_write_lock(rwlock_t * rwlock_ptr) +{ + uint32_t start_cycles; + void *this_pc; + uint32_t spin_ticks = 0; /* in anticipation of a potential wait */ + int index; + int write_index = 0; + int cpu; + enum { + writer_writer_conflict, + writer_reader_conflict + } why_wait = writer_writer_conflict; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_lock(rwlock_ptr); + /* clean index in case lockmetering turns on before an unlock */ + PUT_RWINDEX(rwlock_ptr, 0); + return; + } + + this_pc = LSTAT_RA(LSTAT_RA_WRITE); + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* allocate the global stats entry for this lock, if needed */ + if (index == 0) { + index = alloc_rwlock_struct(rwlock_ptr); + } + + if (_raw_write_trylock(rwlock_ptr)) { + /* We acquired the lock on the first try */ + write_index = lstat_update((void *) rwlock_ptr, this_pc, + LSTAT_ACT_NO_WAIT); + /* save the write_index for use in unlock if stats enabled */ + if (index > 0) + (*lstat_control.read_lock_counts[cpu])[index]. + write_index = write_index; + return; + } + + /* If we get here, then we could not quickly grab the write lock */ + start_cycles = get_cycles(); /* start counting the wait time */ + + why_wait = RWLOCK_READERS(rwlock_ptr) ? + writer_reader_conflict : writer_writer_conflict; + + /* Now set the lock and wait for conflicts to disappear */ + _raw_write_lock(rwlock_ptr); + + spin_ticks = get_cycles() - start_cycles; + + /* update stats -- if enabled */ + if (index > 0 && spin_ticks) { + if (why_wait == writer_reader_conflict) { + /* waited due to a reader holding the lock */ + write_index = lstat_update_time((void *)rwlock_ptr, + this_pc, LSTAT_ACT_SPIN, spin_ticks); + } else { + /* + * waited due to another writer holding the lock + */ + write_index = lstat_update_time((void *)rwlock_ptr, + this_pc, LSTAT_ACT_WW_SPIN, spin_ticks); + (*lstat_control.counts[cpu])[write_index]. + cum_wait_ww_ticks += spin_ticks; + if (spin_ticks > + (*lstat_control.counts[cpu])[write_index]. + max_wait_ww_ticks) { + (*lstat_control.counts[cpu])[write_index]. 
+ max_wait_ww_ticks = spin_ticks; + } + } + + /* save the directory index for use on write_unlock */ + (*lstat_control.read_lock_counts[cpu])[index]. + write_index = write_index; + } +} + +void +_metered_write_unlock(rwlock_t * rwlock_ptr) +{ + int index; + int cpu; + int write_index; + uint32_t hold_time; + + if (lstat_control.state == LSTAT_OFF) { + _raw_write_unlock(rwlock_ptr); + return; + } + + cpu = THIS_CPU_NUMBER; + index = GET_RWINDEX(rwlock_ptr); + + /* update statistics if stats enabled for this lock */ + if (index > 0) { + write_index = + (*lstat_control.read_lock_counts[cpu])[index].write_index; + + hold_time = get_cycles() - + (*lstat_control.counts[cpu])[write_index].acquire_time; + (*lstat_control.counts[cpu])[write_index].cum_hold_ticks += + (uint64_t) hold_time; + if ((*lstat_control.counts[cpu])[write_index].max_hold_ticks < + hold_time) + (*lstat_control.counts[cpu])[write_index]. + max_hold_ticks = hold_time; + } + _raw_write_unlock(rwlock_ptr); +} + +int +_metered_write_trylock(rwlock_t * rwlock_ptr) +{ + int retval; + void *this_pc = LSTAT_RA(LSTAT_RA_WRITE); + + if ((retval = _raw_write_trylock(rwlock_ptr))) { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT); + } else { + lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_REJECT); + } + + return retval; +} + +static void +init_control_space(void) +{ + /* Set all control space pointers to null and indices to "empty" */ + int cpu; + + /* + * Access CPU_CYCLE_FREQUENCY at the outset, which in some + * architectures may trigger a runtime calculation that uses a + * spinlock. Let's do this before lockmetering is turned on. + */ + if (CPU_CYCLE_FREQUENCY == 0) + BUG(); + + lstat_control.hashtab = NULL; + lstat_control.dir = NULL; + for (cpu = 0; cpu < NR_CPUS; cpu++) { + lstat_control.counts[cpu] = NULL; + lstat_control.read_lock_counts[cpu] = NULL; + } +} + +static int +reset_lstat_data(void) +{ + int cpu, flags; + + flags = 0; + lstat_control.next_free_dir_index = 1; /* 0 is for overflows */ + lstat_control.next_free_read_lock_index = 1; + lstat_control.dir_overflow = 0; + lstat_control.rwlock_overflow = 0; + + lstat_control.started_cycles64 = 0; + lstat_control.ending_cycles64 = 0; + lstat_control.enabled_cycles64 = 0; + lstat_control.first_started_time = 0; + lstat_control.started_time = 0; + lstat_control.ending_time = 0; + lstat_control.intervals = 0; + + /* + * paranoia -- in case someone does a "lockstat reset" before + * "lockstat on" + */ + if (lstat_control.hashtab) { + bzero(lstat_control.hashtab, + LSTAT_HASH_TABLE_SIZE * sizeof (short)); + bzero(lstat_control.dir, LSTAT_MAX_STAT_INDEX * + sizeof (lstat_directory_entry_t)); + + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + bzero(lstat_control.counts[cpu], + sizeof (lstat_cpu_counts_t)); + bzero(lstat_control.read_lock_counts[cpu], + sizeof (lstat_read_lock_cpu_counts_t)); + } + } +#ifdef NOTDEF + _raw_spin_unlock(&lstat_control.directory_lock); + local_irq_restore(flags); +#endif + return 1; +} + +static void +release_control_space(void) +{ + /* + * Called either (1) when allocation of kmem fails, + * or (2) when the user writes LSTAT_RELEASE to /proc/lockmeter. + * Assume that all pointers have been initialized to zero, + * i.e., nonzero pointers are valid addresses.
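+ * (Editorial note: the kfree()/vfree() calls below mirror how each block is + * allocated elsewhere in this file -- the small hashtab and the per-cpu + * read_lock_counts blocks come from kmalloc(), while the large dir and + * counts arrays come from vmalloc().)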
+	 */
+	int cpu;
+
+	if (lstat_control.hashtab) {
+		kfree(lstat_control.hashtab);
+		lstat_control.hashtab = NULL;
+	}
+
+	if (lstat_control.dir) {
+		vfree(lstat_control.dir);
+		lstat_control.dir = NULL;
+	}
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (lstat_control.counts[cpu]) {
+			vfree(lstat_control.counts[cpu]);
+			lstat_control.counts[cpu] = NULL;
+		}
+		if (lstat_control.read_lock_counts[cpu]) {
+			kfree(lstat_control.read_lock_counts[cpu]);
+			lstat_control.read_lock_counts[cpu] = NULL;
+		}
+	}
+}
+
+int
+get_lockmeter_info_size(void)
+{
+	return sizeof (lstat_user_request_t)
+		+ num_online_cpus() * sizeof (lstat_cpu_counts_t)
+		+ num_online_cpus() * sizeof (lstat_read_lock_cpu_counts_t)
+		+ (LSTAT_MAX_STAT_INDEX * sizeof (lstat_directory_entry_t));
+}
+
+ssize_t
+get_lockmeter_info(char *buffer, size_t max_len, loff_t * last_index)
+{
+	lstat_user_request_t req;
+	struct timeval tv;
+	ssize_t next_ret_bcount;
+	ssize_t actual_ret_bcount = 0;
+	int cpu;
+
+	*last_index = 0;	/* a one-shot read */
+
+	req.lstat_version = LSTAT_VERSION;
+	req.state = lstat_control.state;
+	req.maxcpus = num_online_cpus();
+	req.cycleval = CPU_CYCLE_FREQUENCY;
+#ifdef notyet
+	req.kernel_magic_addr = (void *) &_etext;
+	req.kernel_end_addr = (void *) &_etext;
+#endif
+	req.uts = system_utsname;
+	req.intervals = lstat_control.intervals;
+
+	req.first_started_time = lstat_control.first_started_time;
+	req.started_time = lstat_control.started_time;
+	req.started_cycles64 = lstat_control.started_cycles64;
+
+	req.next_free_dir_index = lstat_control.next_free_dir_index;
+	req.next_free_read_lock_index = lstat_control.next_free_read_lock_index;
+	req.dir_overflow = lstat_control.dir_overflow;
+	req.rwlock_overflow = lstat_control.rwlock_overflow;
+
+	if (lstat_control.state == LSTAT_OFF) {
+		if (req.intervals == 0) {
+			/* measurement is off and no valid data present */
+			next_ret_bcount = sizeof (lstat_user_request_t);
+			req.enabled_cycles64 = 0;
+
+			if ((actual_ret_bcount + next_ret_bcount) > max_len)
+				return actual_ret_bcount;
+
+			copy_to_user(buffer, (void *) &req, next_ret_bcount);
+			actual_ret_bcount += next_ret_bcount;
+			return actual_ret_bcount;
+		} else {
+			/*
+			 * measurement is off but valid data is present;
+			 * fetch time info from lstat_control
+			 */
+			req.ending_time = lstat_control.ending_time;
+			req.ending_cycles64 = lstat_control.ending_cycles64;
+			req.enabled_cycles64 = lstat_control.enabled_cycles64;
+		}
+	} else {
+		/*
+		 * this must be a read while data is active -- use current
+		 * time, etc
+		 */
+		do_gettimeofday(&tv);
+		req.ending_time = tv.tv_sec;
+		req.ending_cycles64 = get_cycles64();
+		req.enabled_cycles64 = req.ending_cycles64 -
+			req.started_cycles64 + lstat_control.enabled_cycles64;
+	}
+
+	next_ret_bcount = sizeof (lstat_user_request_t);
+	if ((actual_ret_bcount + next_ret_bcount) > max_len)
+		return actual_ret_bcount;
+
+	copy_to_user(buffer, (void *) &req, next_ret_bcount);
+	actual_ret_bcount += next_ret_bcount;
+
+	if (!lstat_control.counts[0])	/* not initialized? */
+		return actual_ret_bcount;
+
+	next_ret_bcount = sizeof (lstat_cpu_counts_t);
+	for (cpu = 0; cpu < num_online_cpus(); cpu++) {
+		if ((actual_ret_bcount + next_ret_bcount) > max_len)
+			return actual_ret_bcount;	/* leave early */
+		copy_to_user(buffer + actual_ret_bcount,
+				lstat_control.counts[cpu], next_ret_bcount);
+		actual_ret_bcount += next_ret_bcount;
+	}
+
+	next_ret_bcount = LSTAT_MAX_STAT_INDEX *
+			sizeof (lstat_directory_entry_t);
+	if (((actual_ret_bcount + next_ret_bcount) > max_len)
+			|| !lstat_control.dir)
+		return actual_ret_bcount;	/* leave early */
+
+	copy_to_user(buffer + actual_ret_bcount, lstat_control.dir,
+			next_ret_bcount);
+	actual_ret_bcount += next_ret_bcount;
+
+	next_ret_bcount = sizeof (lstat_read_lock_cpu_counts_t);
+	for (cpu = 0; cpu < num_online_cpus(); cpu++) {
+		if (actual_ret_bcount + next_ret_bcount > max_len)
+			return actual_ret_bcount;
+		copy_to_user(buffer + actual_ret_bcount,
+				lstat_control.read_lock_counts[cpu],
+				next_ret_bcount);
+		actual_ret_bcount += next_ret_bcount;
+	}
+
+	return actual_ret_bcount;
+}
+
+/*
+ * Writing to the /proc lockmeter node enables or disables metering,
+ * based upon the first byte of the "written" data.
+ * The following values are defined:
+ * LSTAT_ON: 1st call: allocates storage, initializes and turns on
+ *           measurement; subsequent calls just turn on measurement
+ * LSTAT_OFF: turns off measurement
+ * LSTAT_RESET: resets statistics
+ * LSTAT_RELEASE: releases statistics storage
+ *
+ * This allows one to accumulate statistics over several lockstat runs:
+ *
+ * lockstat on
+ * lockstat off
+ * ...repeat above as desired...
+ * lockstat get
+ * ...now start a new set of measurements...
+ * lockstat reset
+ * lockstat on
+ * ...
+ *
+ * (An illustrative user-space sketch of this protocol appears further
+ * below.)
+ */
+ssize_t
+put_lockmeter_info(const char *buffer, size_t len)
+{
+	int error = 0;
+	int dirsize, countsize, read_lock_countsize, hashsize;
+	int cpu;
+	char put_char;
+	int i, read_lock_blocks;
+	unsigned long flags;
+	rwlock_t *lock_ptr;
+	struct timeval tv;
+
+	if (len <= 0)
+		return -EINVAL;
+
+	_raw_spin_lock(&lstat_control.control_lock);
+
+	get_user(put_char, buffer);
+	switch (put_char) {
+
+	case LSTAT_OFF:
+		if (lstat_control.state != LSTAT_OFF) {
+			/*
+			 * To avoid seeing read lock hold times in an
+			 * inconsistent state, we have to follow this
+			 * protocol to turn off statistics
+			 */
+			local_irq_save(flags);
+			/*
+			 * getting this lock will stop any read lock block
+			 * allocations
+			 */
+			_raw_spin_lock(&lstat_control.directory_lock);
+			/*
+			 * keep any more read lock blocks from being
+			 * allocated
+			 */
+			lstat_control.state = LSTAT_OFF;
+			/* record how many read lock blocks there are */
+			read_lock_blocks =
+				lstat_control.next_free_read_lock_index;
+			_raw_spin_unlock(&lstat_control.directory_lock);
+			/* now go through the list of read locks */
+			cpu = THIS_CPU_NUMBER;
+			for (i = 1; i < read_lock_blocks; i++) {
+				lock_ptr =
+					(*lstat_control.read_lock_counts[cpu])[i].
+					lock_ptr;
+				/* is this saved lock address still valid? */
+				if (GET_RWINDEX(lock_ptr) == i) {
+					/*
+					 * lock address appears to still be
+					 * valid.  Because we only hold one
+					 * lock at a time, this can't cause
+					 * a deadlock unless this is a lock
+					 * held as part of the current system
+					 * call path.  At the moment there
+					 * are no READ mode locks held to get
+					 * here from user space, so we solve
+					 * this by skipping locks held in
+					 * write mode.
+					 */
+					if (RWLOCK_IS_WRITE_LOCKED(lock_ptr)) {
+						PUT_RWINDEX(lock_ptr, 0);
+						continue;
+					}
+					/*
+					 * now we know there are no read
+					 * holders of this lock!
stop + * statistics collection for this + * lock + */ + _raw_write_lock(lock_ptr); + PUT_RWINDEX(lock_ptr, 0); + _raw_write_unlock(lock_ptr); + } + /* + * it may still be possible for the hold time + * sum to be negative e.g. if a lock is + * reallocated while "busy" we will have to fix + * this up in the data reduction program. + */ + } + local_irq_restore(flags); + lstat_control.intervals++; + lstat_control.ending_cycles64 = get_cycles64(); + lstat_control.enabled_cycles64 += + lstat_control.ending_cycles64 - + lstat_control.started_cycles64; + do_gettimeofday(&tv); + lstat_control.ending_time = tv.tv_sec; + /* + * don't deallocate the structures -- we may do a + * lockstat on to add to the data that is already + * there. Use LSTAT_RELEASE to release storage + */ + } else { + error = -EBUSY; /* already OFF */ + } + break; + + case LSTAT_ON: + if (lstat_control.state == LSTAT_OFF) { +#ifdef DEBUG_LOCKMETER + printk("put_lockmeter_info(cpu=%d): LSTAT_ON\n", + THIS_CPU_NUMBER); +#endif + lstat_control.next_free_dir_index = 1; /* 0 is for overflows */ + + dirsize = LSTAT_MAX_STAT_INDEX * + sizeof (lstat_directory_entry_t); + hashsize = + (1 + LSTAT_HASH_TABLE_SIZE) * sizeof (ushort); + countsize = sizeof (lstat_cpu_counts_t); + read_lock_countsize = + sizeof (lstat_read_lock_cpu_counts_t); +#ifdef DEBUG_LOCKMETER + printk(" dirsize:%d", dirsize); + printk(" hashsize:%d", hashsize); + printk(" countsize:%d", countsize); + printk(" read_lock_countsize:%d\n", + read_lock_countsize); +#endif +#ifdef DEBUG_LOCKMETER + { + int secs; + unsigned long cycles; + uint64_t cycles64; + + do_gettimeofday(&tv); + secs = tv.tv_sec; + do { + do_gettimeofday(&tv); + } while (secs == tv.tv_sec); + cycles = get_cycles(); + cycles64 = get_cycles64(); + secs = tv.tv_sec; + do { + do_gettimeofday(&tv); + } while (secs == tv.tv_sec); + cycles = get_cycles() - cycles; + cycles64 = get_cycles64() - cycles; + printk("lockmeter: cycleFrequency:%d " + "cycles:%d cycles64:%d\n", + CPU_CYCLE_FREQUENCY, cycles, cycles64); + } +#endif + + /* + * if this is the first call, allocate storage and + * initialize + */ + if (!lstat_control.hashtab) { + + spin_lock_init(&lstat_control.directory_lock); + + /* guarantee all pointers at zero */ + init_control_space(); + + lstat_control.hashtab = + kmalloc(hashsize, GFP_KERNEL); + if (!lstat_control.hashtab) { + error = -ENOSPC; +#ifdef DEBUG_LOCKMETER + printk("!!error kmalloc of hashtab\n"); +#endif + } + lstat_control.dir = vmalloc(dirsize); + if (!lstat_control.dir) { + error = -ENOSPC; +#ifdef DEBUG_LOCKMETER + printk("!!error kmalloc of dir\n"); +#endif + } + + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + lstat_control.counts[cpu] = + vmalloc(countsize); + if (!lstat_control.counts[cpu]) { + error = -ENOSPC; +#ifdef DEBUG_LOCKMETER + printk("!!error vmalloc of " + "counts[%d]\n", cpu); +#endif + } + lstat_control.read_lock_counts[cpu] = + (lstat_read_lock_cpu_counts_t *) + kmalloc(read_lock_countsize, + GFP_KERNEL); + if (!lstat_control. 
+ read_lock_counts[cpu]) { + error = -ENOSPC; +#ifdef DEBUG_LOCKMETER + printk("!!error kmalloc of " + "read_lock_counts[%d]\n", + cpu); +#endif + } + } + } + + if (error) { + /* + * One or more kmalloc failures -- free + * everything + */ + release_control_space(); + } else { + + if (!reset_lstat_data()) { + error = -EINVAL; + break; + }; + + /* + * record starting and ending times and the + * like + */ + if (lstat_control.intervals == 0) { + do_gettimeofday(&tv); + lstat_control.first_started_time = + tv.tv_sec; + } + lstat_control.started_cycles64 = get_cycles64(); + do_gettimeofday(&tv); + lstat_control.started_time = tv.tv_sec; + + lstat_control.state = LSTAT_ON; + } + } else { + error = -EBUSY; /* already ON */ + } + break; + + case LSTAT_RESET: + if (lstat_control.state == LSTAT_OFF) { + if (!reset_lstat_data()) + error = -EINVAL; + } else { + error = -EBUSY; /* still on; can't reset */ + } + break; + + case LSTAT_RELEASE: + if (lstat_control.state == LSTAT_OFF) { + release_control_space(); + lstat_control.intervals = 0; + lstat_control.enabled_cycles64 = 0; + } else { + error = -EBUSY; + } + break; + + default: + error = -EINVAL; + } /* switch */ + + _raw_spin_unlock(&lstat_control.control_lock); + return error ? error : len; +} + +#ifdef USER_MODE_TESTING +/* following used for user mode testing */ +void +lockmeter_init() +{ + int dirsize, hashsize, countsize, read_lock_countsize, cpu; + + printf("lstat_control is at %x size=%d\n", &lstat_control, + sizeof (lstat_control)); + printf("sizeof(spinlock_t)=%d\n", sizeof (spinlock_t)); + lstat_control.state = LSTAT_ON; + + lstat_control.directory_lock = SPIN_LOCK_UNLOCKED; + lstat_control.next_free_dir_index = 1; /* 0 is for overflows */ + lstat_control.next_free_read_lock_index = 1; + + dirsize = LSTAT_MAX_STAT_INDEX * sizeof (lstat_directory_entry_t); + hashsize = (1 + LSTAT_HASH_TABLE_SIZE) * sizeof (ushort); + countsize = sizeof (lstat_cpu_counts_t); + read_lock_countsize = sizeof (lstat_read_lock_cpu_counts_t); + + lstat_control.hashtab = (ushort *) malloc(hashsize); + + if (lstat_control.hashtab == 0) { + printf("malloc failure for at line %d in lockmeter.c\n", + __LINE__); + exit(0); + } + + lstat_control.dir = (lstat_directory_entry_t *) malloc(dirsize); + + if (lstat_control.dir == 0) { + printf("malloc failure for at line %d in lockmeter.c\n", cpu, + __LINE__); + exit(0); + } + + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + int j, k; + j = (int) (lstat_control.counts[cpu] = + (lstat_cpu_counts_t *) malloc(countsize)); + k = (int) (lstat_control.read_lock_counts[cpu] = + (lstat_read_lock_cpu_counts_t *) + malloc(read_lock_countsize)); + if (j * k == 0) { + printf("malloc failure for cpu=%d at line %d in " + "lockmeter.c\n", cpu, __LINE__); + exit(0); + } + } + + memset(lstat_control.hashtab, 0, hashsize); + memset(lstat_control.dir, 0, dirsize); + + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + memset(lstat_control.counts[cpu], 0, countsize); + memset(lstat_control.read_lock_counts[cpu], 0, + read_lock_countsize); + } +} + +asm(" \ +.align 4 \ +.globl __write_lock_failed \ +__write_lock_failed: \ + " LOCK "addl $" RW_LOCK_BIAS_STR ",(%eax) \ +1: cmpl $" RW_LOCK_BIAS_STR ",(%eax) \ + jne 1b \ +\ + " LOCK "subl $" RW_LOCK_BIAS_STR ",(%eax) \ + jnz __write_lock_failed \ + ret \ +\ +\ +.align 4 \ +.globl __read_lock_failed \ +__read_lock_failed: \ + lock ; incl (%eax) \ +1: cmpl $1,(%eax) \ + js 1b \ +\ + lock ; decl (%eax) \ + js __read_lock_failed \ + ret \ +"); +#endif + +EXPORT_SYMBOL(_metered_spin_lock); 
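+/*
+ * Illustrative sketch only, not part of the lockmeter code itself: the
+ * user-space side of the /proc protocol implemented by
+ * put_lockmeter_info() and get_lockmeter_info() above.  The LSTAT_*
+ * command values and the /proc/lockmeter node name are assumed to come
+ * from the lockmeter header elsewhere in this patch set:
+ *
+ *	char cmd = LSTAT_ON;
+ *	int fd = open("/proc/lockmeter", O_RDWR);
+ *
+ *	write(fd, &cmd, 1);	-- first ON allocates storage and starts
+ *	... run the workload being measured ...
+ *	cmd = LSTAT_OFF;
+ *	write(fd, &cmd, 1);	-- stop metering; the data is kept
+ *	read(fd, buf, size);	-- request header, then per-cpu counts,
+ *				   directory and read-lock counts
+ *	cmd = LSTAT_RELEASE;
+ *	write(fd, &cmd, 1);	-- free the statistics storage
+ */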
+EXPORT_SYMBOL(_metered_spin_unlock);
+EXPORT_SYMBOL(_metered_spin_trylock);
+EXPORT_SYMBOL(_metered_read_lock);
+EXPORT_SYMBOL(_metered_read_unlock);
+EXPORT_SYMBOL(_metered_write_lock);
+EXPORT_SYMBOL(_metered_write_unlock);
--- diff/mm/mempolicy.c	1970-01-01 01:00:00.000000000 +0100
+++ source/mm/mempolicy.c	2004-04-21 10:46:51.809705840 +0100
@@ -0,0 +1,1018 @@
+/*
+ * Simple NUMA memory policy for the Linux kernel.
+ *
+ * Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, version 2.
+ *
+ * NUMA policy allows the user to give hints about which node(s) memory
+ * should be allocated on.
+ *
+ * Support four policies per VMA and per process:
+ *
+ * The VMA policy has priority over the process policy for a page fault.
+ *
+ * interleave	Allocate memory interleaved over a set of nodes,
+ *		with normal fallback if it fails.
+ *		For VMA based allocations this interleaves based on the
+ *		offset into the backing object or offset into the mapping
+ *		for anonymous memory. For process policy a process counter
+ *		is used.
+ * bind		Only allocate memory on a specific set of nodes,
+ *		no fallback.
+ * preferred	Try a specific node first before normal fallback.
+ *		As a special case node -1 here means do the allocation
+ *		on the local CPU. This is normally identical to default,
+ *		but useful to set in a VMA when you have a non default
+ *		process policy.
+ * default	Allocate on the local node first, or when on a VMA
+ *		use the process policy. This is what Linux always did
+ *		in a NUMA aware kernel and still does by, ahem, default.
+ *
+ * The process policy is applied for most non interrupt memory allocations
+ * in that process' context. Interrupts ignore the policies and always
+ * try to allocate on the local CPU. The VMA policy is only applied for
+ * memory allocations for a VMA in the VM.
+ *
+ * Currently there are a few corner cases in swapping where the policy
+ * is not applied, but the majority should be handled. When process policy
+ * is used it is not remembered over swap outs/swap ins.
+ *
+ * Only the highest zone in the zone hierarchy gets policied. Allocations
+ * requesting a lower zone just use default policy. This implies that
+ * on systems with highmem, kernel lowmem allocations don't get policied.
+ * Same with GFP_DMA allocations.
+ *
+ * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
+ * all users and remembered even when nobody has memory mapped.
+ * (A user-space sketch of the policy system calls appears at the end of
+ * this patch.)
+ */
+
+/* Notebook:
+   fix mmap readahead to honour policy and enable policy for any page cache
+   object
+   statistics for bigpages
+   global policy for page cache? currently it uses process policy. Requires
+   first item above.
+   handle mremap for shared memory (currently ignored for the policy)
+   grows down?
+   make bind policy root only? It can trigger oom much faster and the
+   kernel is not always grateful with that.
+   could replace all the switch()es with a mempolicy_ops structure.
+*/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static kmem_cache_t *policy_cache;
+static kmem_cache_t *sn_cache;
+
+#define PDprintk(fmt...)
+
+/* Highest zone. A specific allocation for a zone below that is not
+   policied.
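+   (policy_zone is raised as MPOL_BIND zonelists are built; see
+   bind_zonelist() below.)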
*/ +static int policy_zone; + +static struct mempolicy default_policy = { + .refcnt = ATOMIC_INIT(1), /* never free it */ + .policy = MPOL_DEFAULT, +}; + +/* Check if all specified nodes are online */ +static int nodes_online(unsigned long *nodes) +{ + DECLARE_BITMAP(offline, MAX_NUMNODES); + + bitmap_copy(offline, node_online_map, MAX_NUMNODES); + if (bitmap_empty(offline, MAX_NUMNODES)) + set_bit(0, offline); + bitmap_complement(offline, MAX_NUMNODES); + bitmap_and(offline, offline, nodes, MAX_NUMNODES); + if (!bitmap_empty(offline, MAX_NUMNODES)) + return -EINVAL; + return 0; +} + +/* Do sanity checking on a policy */ +static int mpol_check_policy(int mode, unsigned long *nodes) +{ + int empty = bitmap_empty(nodes, MAX_NUMNODES); + + switch (mode) { + case MPOL_DEFAULT: + if (!empty) + return -EINVAL; + break; + case MPOL_BIND: + case MPOL_INTERLEAVE: + /* Preferred will only use the first bit, but allow + more for now. */ + if (empty) + return -EINVAL; + break; + } + return nodes_online(nodes); +} + +/* Copy a node mask from user space. */ +static int get_nodes(unsigned long *nodes, unsigned long *nmask, + unsigned long maxnode, int mode) +{ + unsigned long k; + unsigned long nlongs; + unsigned long endmask; + + --maxnode; + nlongs = BITS_TO_LONGS(maxnode); + if ((maxnode % BITS_PER_LONG) == 0) + endmask = ~0UL; + else + endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; + + /* When the user specified more nodes than supported just check + if the non supported part is all zero. */ + if (nmask && nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { + for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { + unsigned long t; + if (get_user(t, nmask + k)) + return -EFAULT; + if (k == nlongs - 1) { + if (t & endmask) + return -EINVAL; + } else if (t) + return -EINVAL; + } + nlongs = BITS_TO_LONGS(MAX_NUMNODES); + endmask = ~0UL; + } + + bitmap_zero(nodes, MAX_NUMNODES); + if (nmask && copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long))) + return -EFAULT; + nodes[nlongs-1] &= endmask; + return mpol_check_policy(mode, nodes); +} + +/* Generate a custom zonelist for the BIND policy. 
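+   The resulting list visits each node in the mask, highest zone first
+   within a node, and is terminated by a NULL zone pointer.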
*/ +static struct zonelist *bind_zonelist(unsigned long *nodes) +{ + struct zonelist *zl; + int num, max, nd; + + max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); + zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); + if (!zl) + return NULL; + num = 0; + for (nd = find_first_bit(nodes, MAX_NUMNODES); + nd < MAX_NUMNODES; + nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) { + int k; + for (k = MAX_NR_ZONES-1; k >= 0; k--) { + struct zone *z = &NODE_DATA(nd)->node_zones[k]; + if (!z->present_pages) + continue; + zl->zones[num++] = z; + if (k > policy_zone) + policy_zone = k; + } + } + BUG_ON(num >= max); + zl->zones[num] = NULL; + return zl; +} + +/* Create a new policy */ +static struct mempolicy *mpol_new(int mode, unsigned long *nodes) +{ + struct mempolicy *policy; + + PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); + if (mode == MPOL_DEFAULT) + return NULL; + policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); + if (!policy) + return ERR_PTR(-ENOMEM); + atomic_set(&policy->refcnt, 1); + switch (mode) { + case MPOL_INTERLEAVE: + bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); + break; + case MPOL_PREFERRED: + policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); + if (policy->v.preferred_node >= MAX_NUMNODES) + policy->v.preferred_node = -1; + break; + case MPOL_BIND: + policy->v.zonelist = bind_zonelist(nodes); + if (policy->v.zonelist == NULL) { + kmem_cache_free(policy_cache, policy); + return ERR_PTR(-ENOMEM); + } + break; + } + policy->policy = mode; + return policy; +} + +/* Ensure all existing pages follow the policy. */ +static int +verify_pages(unsigned long addr, unsigned long end, unsigned long *nodes) +{ + while (addr < end) { + struct page *p; + pte_t *pte; + pmd_t *pmd; + pgd_t *pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + addr = (addr + PGDIR_SIZE) & PGDIR_MASK; + continue; + } + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd)) { + addr = (addr + PMD_SIZE) & PMD_MASK; + continue; + } + p = NULL; + pte = pte_offset_map(pmd, addr); + if (pte_present(*pte)) + p = pte_page(*pte); + pte_unmap(pte); + if (p) { + unsigned nid = page_to_nid(p); + if (!test_bit(nid, nodes)) + return -EIO; + } + addr += PAGE_SIZE; + } + return 0; +} + +/* Step 1: check the range */ +static struct vm_area_struct * +check_range(struct mm_struct *mm, unsigned long start, unsigned long end, + unsigned long *nodes, unsigned long flags) +{ + int err; + struct vm_area_struct *first, *vma, *prev; + + first = find_vma(mm, start); + if (!first) + return ERR_PTR(-EFAULT); + prev = NULL; + for (vma = first; vma->vm_start < end; vma = vma->vm_next) { + if (!vma->vm_next && vma->vm_end < end) + return ERR_PTR(-EFAULT); + if (prev && prev->vm_end < vma->vm_start) + return ERR_PTR(-EFAULT); + if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { + err = verify_pages(vma->vm_start, vma->vm_end, nodes); + if (err) { + first = ERR_PTR(err); + break; + } + } + prev = vma; + } + return first; +} + +/* Apply policy to a single VMA */ +static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) +{ + int err = 0; + struct mempolicy *old = vma->vm_policy; + + PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, + vma->vm_ops ? 
vma->vm_ops->set_policy : NULL); + + if (vma->vm_file) + spin_lock(&vma->vm_file->f_mapping->i_shared_lock); + if (vma->vm_ops && vma->vm_ops->set_policy) + err = vma->vm_ops->set_policy(vma, new); + if (!err) { + mpol_get(new); + vma->vm_policy = new; + mpol_free(old); + } + if (vma->vm_file) + spin_unlock(&vma->vm_file->f_mapping->i_shared_lock); + return err; +} + +/* Step 2: apply policy to a range and do splits. */ +static int mbind_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct mempolicy *new) +{ + struct vm_area_struct *next; + int err; + + err = 0; + for (; vma->vm_start < end; vma = next) { + next = vma->vm_next; + if (vma->vm_start < start) + err = split_vma(vma->vm_mm, vma, start, 1); + if (!err && vma->vm_end > end) + err = split_vma(vma->vm_mm, vma, end, 0); + if (!err) + err = policy_vma(vma, new); + if (err) + break; + } + return err; +} + +/* Change policy for a memory range */ +asmlinkage long sys_mbind(unsigned long start, unsigned long len, + unsigned long mode, + unsigned long *nmask, unsigned long maxnode, + unsigned flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct mempolicy *new; + unsigned long end; + DECLARE_BITMAP(nodes, MAX_NUMNODES); + int err; + + if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) + return -EINVAL; + if (start & ~PAGE_MASK) + return -EINVAL; + if (mode == MPOL_DEFAULT) + flags &= ~MPOL_MF_STRICT; + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + if (end < start) + return -EINVAL; + if (end == start) + return 0; + + err = get_nodes(nodes, nmask, maxnode, mode); + if (err) + return err; + + new = mpol_new(mode, nodes); + if (IS_ERR(new)) + return PTR_ERR(new); + + PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + mode,nodes[0]); + + down_write(&mm->mmap_sem); + vma = check_range(mm, start, end, nodes, flags); + err = PTR_ERR(vma); + if (!IS_ERR(vma)) + err = mbind_range(vma, start, end, new); + up_write(&mm->mmap_sem); + mpol_free(new); + return err; +} + +/* Set the process memory policy */ +asmlinkage long sys_set_mempolicy(int mode, unsigned long *nmask, + unsigned long maxnode) +{ + int err; + struct mempolicy *new; + DECLARE_BITMAP(nodes, MAX_NUMNODES); + + if (mode > MPOL_MAX) + return -EINVAL; + err = get_nodes(nodes, nmask, maxnode, mode); + if (err) + return err; + new = mpol_new(mode, nodes); + if (IS_ERR(new)) + return PTR_ERR(new); + mpol_free(current->mempolicy); + current->mempolicy = new; + if (new && new->policy == MPOL_INTERLEAVE) + current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); + return 0; +} + +/* Fill a zone bitmap for a policy */ +static void get_zonemask(struct mempolicy *p, unsigned long *nodes) +{ + int i; + + bitmap_zero(nodes, MAX_NUMNODES); + switch (p->policy) { + case MPOL_BIND: + for (i = 0; p->v.zonelist->zones[i]; i++) + __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); + break; + case MPOL_DEFAULT: + break; + case MPOL_INTERLEAVE: + bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); + break; + case MPOL_PREFERRED: + /* or use current node instead of online map? 
*/ + if (p->v.preferred_node < 0) + bitmap_copy(nodes, node_online_map, MAX_NUMNODES); + else + __set_bit(p->v.preferred_node, nodes); + break; + default: + BUG(); + } +} + +static int lookup_node(struct mm_struct *mm, unsigned long addr) +{ + struct page *p; + int err; + + err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); + if (err >= 0) { + err = page_zone(p)->zone_pgdat->node_id; + put_page(p); + } + return err; +} + +/* Copy a kernel node mask to user space */ +static int copy_nodes_to_user(unsigned long *user_mask, unsigned long maxnode, + unsigned long *nodes) +{ + unsigned long copy = ALIGN(maxnode-1, 64) / 8; + + if (copy > sizeof(nodes)) { + if (copy > PAGE_SIZE) + return -EINVAL; + if (clear_user((char*)user_mask + sizeof(nodes), + copy - sizeof(nodes))) + return -EFAULT; + copy = sizeof(nodes); + } + return copy_to_user(user_mask, nodes, copy) ? -EFAULT : 0; +} + +/* Retrieve NUMA policy */ +asmlinkage long sys_get_mempolicy(int *policy, + unsigned long *nmask, unsigned long maxnode, + unsigned long addr, unsigned long flags) +{ + int err, pval; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct mempolicy *pol = current->mempolicy; + + if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) + return -EINVAL; + if (nmask != NULL && maxnode < numnodes) + return -EINVAL; + if (flags & MPOL_F_ADDR) { + down_read(&mm->mmap_sem); + vma = find_vma_intersection(mm, addr, addr+1); + if (!vma) { + up_read(&mm->mmap_sem); + return -EFAULT; + } + if (vma->vm_ops && vma->vm_ops->get_policy) + pol = vma->vm_ops->get_policy(vma, addr); + else + pol = vma->vm_policy; + } else if (addr) + return -EINVAL; + + if (!pol) + pol = &default_policy; + + if (flags & MPOL_F_NODE) { + if (flags & MPOL_F_ADDR) { + err = lookup_node(mm, addr); + if (err < 0) + goto out; + pval = err; + } else if (pol == current->mempolicy && + pol->policy == MPOL_INTERLEAVE) { + pval = current->il_next; + } else { + err = -EINVAL; + goto out; + } + } else + pval = pol->policy; + + err = -EFAULT; + if (policy && put_user(pval, policy)) + goto out; + + err = 0; + if (nmask) { + DECLARE_BITMAP(nodes, MAX_NUMNODES); + get_zonemask(pol, nodes); + err = copy_nodes_to_user(nmask, maxnode, nodes); + } + + out: + if (vma) + up_read(¤t->mm->mmap_sem); + return err; +} + +#ifdef CONFIG_COMPAT +/* The other functions are compatible */ +asmlinkage long compat_get_mempolicy(int *policy, + unsigned *nmask, unsigned maxnode, + unsigned addr, unsigned flags) +{ + long err; + unsigned long *nm = NULL; + if (nmask) + nm = compat_alloc_user_space(ALIGN(maxnode-1, 64) / 8); + err = sys_get_mempolicy(policy, nm, maxnode, addr, flags); + if (!err && copy_in_user(nmask, nm, ALIGN(maxnode-1, 32)/8)) + err = -EFAULT; + return err; +} +#endif + +/* Return effective policy for a VMA */ +static struct mempolicy * +get_vma_policy(struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = current->mempolicy; + + if (vma) { + if (vma->vm_ops && vma->vm_ops->get_policy) + pol = vma->vm_ops->get_policy(vma, addr); + else if (vma->vm_policy && + vma->vm_policy->policy != MPOL_DEFAULT) + pol = vma->vm_policy; + } + if (!pol) + pol = &default_policy; + return pol; +} + +/* Return a zonelist representing a mempolicy */ +static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy) +{ + int nd; + + switch (policy->policy) { + case MPOL_PREFERRED: + nd = policy->v.preferred_node; + if (nd < 0) + nd = numa_node_id(); + break; + case MPOL_BIND: + /* Lower zones don't get a 
policy applied */ + if (gfp >= policy_zone) + return policy->v.zonelist; + /*FALL THROUGH*/ + case MPOL_INTERLEAVE: /* should not happen */ + case MPOL_DEFAULT: + nd = numa_node_id(); + break; + default: + nd = 0; + BUG(); + } + return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK); +} + +/* Do dynamic interleaving for a process */ +static unsigned interleave_nodes(struct mempolicy *policy) +{ + unsigned nid, next; + struct task_struct *me = current; + + nid = me->il_next; + BUG_ON(nid >= MAX_NUMNODES); + next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid); + if (next >= MAX_NUMNODES) + next = find_first_bit(policy->v.nodes, MAX_NUMNODES); + me->il_next = next; + return nid; +} + +/* Do static interleaving for a VMA with known offset. */ +static unsigned offset_il_node(struct mempolicy *pol, + struct vm_area_struct *vma, unsigned long off) +{ + unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); + unsigned target = (unsigned)off % nnodes; + int c; + int nid = -1; + + c = 0; + do { + nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); + c++; + } while (c <= target); + BUG_ON(nid >= MAX_NUMNODES); + BUG_ON(!test_bit(nid, pol->v.nodes)); + return nid; +} + +/* Allocate a page in interleaved policy. + Own path because it needs to do special accounting. */ +static struct page *alloc_page_interleave(unsigned gfp, unsigned nid) +{ + struct zonelist *zl; + struct page *page; + + BUG_ON(!test_bit(nid, node_online_map)); + zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); + page = __alloc_pages(gfp, 0, zl); + if (page && page_zone(page) == zl->zones[0]) { + zl->zones[0]->pageset[get_cpu()].interleave_hit++; + put_cpu(); + } + return page; +} + +/** + * alloc_page_vma - Allocate a page for a VMA. + * + * @gfp: + * %GFP_USER user allocation. + * %GFP_KERNEL kernel allocations, + * %GFP_HIGHMEM highmem/user allocations, + * %GFP_FS allocation should not call back into a file system. + * %GFP_ATOMIC don't sleep. + * + * @vma: Pointer to VMA or NULL if not available. + * @addr: Virtual Address of the allocation. Must be inside the VMA. + * + * This function allocates a page from the kernel page pool and applies + * a NUMA policy associated with the VMA or the current process. + * When VMA is not NULL caller must hold down_read on the mmap_sem of the + * mm_struct of the VMA to prevent it from going away. Should be used for + * all allocations for pages that will be mapped into + * user space. Returns NULL when no page can be allocated. + * + * Should be called with the mm_sem of the vma hold. + */ +struct page * +alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(vma, addr); + + if (unlikely(pol->policy == MPOL_INTERLEAVE)) { + unsigned nid; + if (vma) { + unsigned long off; + BUG_ON(addr >= vma->vm_end); + BUG_ON(addr < vma->vm_start); + off = vma->vm_pgoff; + off += (addr - vma->vm_start) >> PAGE_SHIFT; + nid = offset_il_node(pol, vma, off); + } else { + /* fall back to process interleaving */ + nid = interleave_nodes(pol); + } + return alloc_page_interleave(gfp, nid); + } + return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); +} + +/** + * alloc_pages_current - Allocate pages. + * + * @gfp: + * %GFP_USER user allocation, + * %GFP_KERNEL kernel allocation, + * %GFP_HIGHMEM highmem allocation, + * %GFP_FS don't call back into a file system. + * %GFP_ATOMIC don't sleep. + * @order: Power of two of allocation size in pages. 0 is a single page. + * + * Allocate a page from the kernel page pool. 
When not in + * interrupt context and apply the current process NUMA policy. + * Returns NULL when no page can be allocated. + */ +struct page *alloc_pages_current(unsigned gfp, unsigned order) +{ + struct mempolicy *pol = current->mempolicy; + + if (!pol || in_interrupt()) + pol = &default_policy; + if (pol->policy == MPOL_INTERLEAVE && order == 0) + return alloc_page_interleave(gfp, interleave_nodes(pol)); + return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); +} +EXPORT_SYMBOL(alloc_pages_current); + +/* Slow path of a mempolicy copy */ +struct mempolicy *__mpol_copy(struct mempolicy *old) +{ + struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + + if (!new) + return ERR_PTR(-ENOMEM); + *new = *old; + atomic_set(&new->refcnt, 1); + if (new->policy == MPOL_BIND) { + int sz = ksize(old->v.zonelist); + new->v.zonelist = kmalloc(sz, SLAB_KERNEL); + if (!new->v.zonelist) { + kmem_cache_free(policy_cache, new); + return ERR_PTR(-ENOMEM); + } + memcpy(new->v.zonelist, old->v.zonelist, sz); + } + return new; +} + +/* Slow path of a mempolicy comparison */ +int __mpol_equal(struct mempolicy *a, struct mempolicy *b) +{ + if (!a || !b) + return 0; + if (a->policy != b->policy) + return 0; + switch (a->policy) { + case MPOL_DEFAULT: + return 1; + case MPOL_INTERLEAVE: + return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); + case MPOL_PREFERRED: + return a->v.preferred_node == b->v.preferred_node; + case MPOL_BIND: { + int i; + for (i = 0; a->v.zonelist->zones[i]; i++) + if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i]) + return 0; + return b->v.zonelist->zones[i] == NULL; + } + default: + BUG(); + return 0; + } +} + +/* Slow path of a mpol destructor. */ +extern void __mpol_free(struct mempolicy *p) +{ + if (!atomic_dec_and_test(&p->refcnt)) + return; + if (p->policy == MPOL_BIND) + kfree(p->v.zonelist); + p->policy = MPOL_DEFAULT; + kmem_cache_free(policy_cache, p); +} + +/* + * Hugetlb policy. Same as above, just works with node numbers instead of + * zonelists. + */ + +/* Find first node suitable for an allocation */ +int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(vma, addr); + + switch (pol->policy) { + case MPOL_DEFAULT: + return numa_node_id(); + case MPOL_BIND: + return pol->v.zonelist->zones[0]->zone_pgdat->node_id; + case MPOL_INTERLEAVE: + return interleave_nodes(pol); + case MPOL_PREFERRED: + return pol->v.preferred_node >= 0 ? + pol->v.preferred_node : numa_node_id(); + } + BUG(); + return 0; +} + +/* Find secondary valid nodes for an allocation */ +int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(vma, addr); + + switch (pol->policy) { + case MPOL_PREFERRED: + case MPOL_DEFAULT: + case MPOL_INTERLEAVE: + return 1; + case MPOL_BIND: { + struct zone **z; + for (z = pol->v.zonelist->zones; *z; z++) + if ((*z)->zone_pgdat->node_id == nid) + return 1; + return 0; + } + default: + BUG(); + return 0; + } +} + +/* + * Shared memory backing store policy support. + * + * Remember policies even when nobody has shared memory mapped. + * The policies are kept in Red-Black tree linked from the inode. + * They are protected by the sp->sem semaphore, which should be held + * for any accesses to the tree. 
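+ *
+ * For example, installing a policy for pages [2,6) into a tree that
+ * already holds ranges [0,3) and [5,8) truncates those nodes to [0,2)
+ * and [6,8) and links the new node in between; see
+ * shared_policy_replace() below.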
+ */ + +/* lookup first element intersecting start-end */ +/* Caller holds sp->sem */ +static struct sp_node * +sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) +{ + struct rb_node *n = sp->root.rb_node; + + while (n) { + struct sp_node *p = rb_entry(n, struct sp_node, nd); + if (start >= p->end) { + n = n->rb_right; + } else if (end < p->start) { + n = n->rb_left; + } else { + break; + } + } + if (!n) + return NULL; + for (;;) { + struct sp_node *w = NULL; + struct rb_node *prev = rb_prev(n); + if (!prev) + break; + w = rb_entry(prev, struct sp_node, nd); + if (w->end <= start) + break; + n = prev; + } + return rb_entry(n, struct sp_node, nd); +} + +/* Insert a new shared policy into the list. */ +/* Caller holds sp->sem */ +static void sp_insert(struct shared_policy *sp, struct sp_node *new) +{ + struct rb_node **p = &sp->root.rb_node; + struct rb_node *parent = NULL; + struct sp_node *nd; + + while (*p) { + parent = *p; + nd = rb_entry(parent, struct sp_node, nd); + if (new->start < nd->start) + p = &(*p)->rb_left; + else if (new->end > nd->end) + p = &(*p)->rb_right; + else + BUG(); + } + rb_link_node(&new->nd, parent, p); + rb_insert_color(&new->nd, &sp->root); + PDprintk("inserting %lx-%lx: %d\n", new->start, new->end, + new->policy ? new->policy->policy : 0); +} + +/* Find shared policy intersecting idx */ +struct mempolicy * +mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +{ + struct mempolicy *pol = NULL; + struct sp_node *sn; + + down(&sp->sem); + sn = sp_lookup(sp, idx, idx+1); + if (sn) { + mpol_get(sn->policy); + pol = sn->policy; + } + up(&sp->sem); + return pol; +} + +static void sp_delete(struct shared_policy *sp, struct sp_node *n) +{ + PDprintk("deleting %lx-l%x\n", n->start, n->end); + rb_erase(&n->nd, &sp->root); + mpol_free(n->policy); + kmem_cache_free(sn_cache, n); +} + +struct sp_node * +sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) +{ + struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); + + if (!n) + return NULL; + n->start = start; + n->end = end; + mpol_get(pol); + n->policy = pol; + return n; +} + +/* Replace a policy range. */ +static int shared_policy_replace(struct shared_policy *sp, unsigned long start, + unsigned long end, struct sp_node *new) +{ + struct sp_node *n, *new2; + + down(&sp->sem); + n = sp_lookup(sp, start, end); + /* Take care of old policies in the same range. */ + while (n && n->start < end) { + struct rb_node *next = rb_next(&n->nd); + if (n->start >= start) { + if (n->end <= end) + sp_delete(sp, n); + else + n->start = end; + } else { + /* Old policy spanning whole new range. */ + if (n->end > end) { + new2 = sp_alloc(end, n->end, n->policy); + if (!new2) { + up(&sp->sem); + return -ENOMEM; + } + n->end = end; + sp_insert(sp, new2); + } + /* Old crossing beginning, but not end (easy) */ + if (n->start < start && n->end > start) + n->end = start; + } + if (!next) + break; + n = rb_entry(next, struct sp_node, nd); + } + if (new) + sp_insert(sp, new); + up(&sp->sem); + return 0; +} + +int mpol_set_shared_policy(struct shared_policy *info, + struct vm_area_struct *vma, struct mempolicy *npol) +{ + int err; + struct sp_node *new = NULL; + unsigned long sz = vma_pages(vma); + + PDprintk("set_shared_policy %lx sz %lu %d %lx\n", + vma->vm_pgoff, + sz, npol? npol->policy : -1, + npol ? 
npol->v.nodes[0] : -1); + + if (npol) { + new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); + if (!new) + return -ENOMEM; + } + err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); + if (err && new) + kmem_cache_free(sn_cache, new); + return err; +} + +/* Free a backing policy store on inode delete. */ +void mpol_free_shared_policy(struct shared_policy *p) +{ + struct sp_node *n; + struct rb_node *next; + + down(&p->sem); + next = rb_first(&p->root); + while (next) { + n = rb_entry(next, struct sp_node, nd); + next = rb_next(&n->nd); + rb_erase(&n->nd, &p->root); + mpol_free(n->policy); + kmem_cache_free(sn_cache, n); + } + up(&p->sem); +} + +static __init int numa_policy_init(void) +{ + policy_cache = kmem_cache_create("numa_policy", + sizeof(struct mempolicy), + 0, SLAB_PANIC, NULL, NULL); + + sn_cache = kmem_cache_create("shared_policy_node", + sizeof(struct sp_node), + 0, SLAB_PANIC, NULL, NULL); + return 0; +} +module_init(numa_policy_init); --- diff/drivers/char/watchdog/amd7xx_tco.c 2004-04-05 12:57:07.000000000 +0100 +++ source/drivers/char/watchdog/amd7xx_tco.c 1970-01-01 01:00:00.000000000 +0100 @@ -1,374 +0,0 @@ -/* - * AMD 766/768 TCO Timer Driver - * (c) Copyright 2002 Zwane Mwaikambo - * All Rights Reserved. - * - * Parts from; - * Hardware driver for the AMD 768 Random Number Generator (RNG) - * (c) Copyright 2001 Red Hat Inc - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * The author(s) of this software shall not be held liable for damages - * of any nature resulting due to the use of this software. This - * software is provided AS-IS with no warranties. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define AMDTCO_MODULE_VER "build 20021116" -#define AMDTCO_MODULE_NAME "amd7xx_tco" -#define PFX AMDTCO_MODULE_NAME ": " - -#define MAX_TIMEOUT 38 /* max of 38 seconds, although the system will only - * reset itself after the second timeout */ - -/* pmbase registers */ -#define TCO_RELOAD_REG 0x40 /* bits 0-5 are current count, 6-7 are reserved */ -#define TCO_INITVAL_REG 0x41 /* bits 0-5 are value to load, 6-7 are reserved */ -#define TCO_TIMEOUT_MASK 0x3f -#define TCO_STATUS1_REG 0x44 -#define TCO_STATUS2_REG 0x46 -#define NDTO_STS2 (1 << 1) /* we're interested in the second timeout */ -#define BOOT_STS (1 << 2) /* will be set if NDTO_STS2 was set before reboot */ -#define TCO_CTRL1_REG 0x48 -#define TCO_HALT (1 << 11) -#define NO_REBOOT (1 << 10) /* in DevB:3x48 */ - -static char banner[] __initdata = KERN_INFO PFX AMDTCO_MODULE_VER "\n"; -static int timeout = MAX_TIMEOUT; -static u32 pmbase; /* PMxx I/O base */ -static struct pci_dev *dev; -static struct semaphore open_sem; -static spinlock_t amdtco_lock; /* only for device access */ -static char expect_close; - -module_param(timeout, int, 0); -MODULE_PARM_DESC(timeout, "range is 0-38 seconds, default is 38"); - -#ifdef CONFIG_WATCHDOG_NOWAYOUT -static int nowayout = 1; -#else -static int nowayout = 0; -#endif - -module_param(nowayout, int, 0); -MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)"); - -static inline u8 seconds_to_ticks(int seconds) -{ - /* the internal timer is stored as ticks which decrement - * every 0.6 seconds */ - return (seconds * 10) / 6; -} - -static inline 
int ticks_to_seconds(u8 ticks) -{ - return (ticks * 6) / 10; -} - -static inline int amdtco_status(void) -{ - u16 reg; - int status = 0; - - reg = inb(pmbase+TCO_CTRL1_REG); - if ((reg & TCO_HALT) == 0) - status |= WDIOF_KEEPALIVEPING; - - reg = inb(pmbase+TCO_STATUS2_REG); - if (reg & BOOT_STS) - status |= WDIOF_CARDRESET; - - return status; -} - -static inline void amdtco_ping(void) -{ - outb(1, pmbase+TCO_RELOAD_REG); -} - -static inline int amdtco_gettimeout(void) -{ - u8 reg = inb(pmbase+TCO_RELOAD_REG) & TCO_TIMEOUT_MASK; - return ticks_to_seconds(reg); -} - -static inline void amdtco_settimeout(unsigned int timeout) -{ - u8 reg = seconds_to_ticks(timeout) & TCO_TIMEOUT_MASK; - outb(reg, pmbase+TCO_INITVAL_REG); -} - -static inline void amdtco_global_enable(void) -{ - u16 reg; - - spin_lock(&amdtco_lock); - - /* clear NO_REBOOT on DevB:3x48 p97 */ - pci_read_config_word(dev, 0x48, ®); - reg &= ~NO_REBOOT; - pci_write_config_word(dev, 0x48, reg); - - spin_unlock(&amdtco_lock); -} - -static inline void amdtco_enable(void) -{ - u16 reg; - - spin_lock(&amdtco_lock); - reg = inw(pmbase+TCO_CTRL1_REG); - reg &= ~TCO_HALT; - outw(reg, pmbase+TCO_CTRL1_REG); - spin_unlock(&amdtco_lock); -} - -static inline void amdtco_disable(void) -{ - u16 reg; - - spin_lock(&amdtco_lock); - reg = inw(pmbase+TCO_CTRL1_REG); - reg |= TCO_HALT; - outw(reg, pmbase+TCO_CTRL1_REG); - spin_unlock(&amdtco_lock); -} - -static int amdtco_fop_open(struct inode *inode, struct file *file) -{ - if (down_trylock(&open_sem)) - return -EBUSY; - - if (timeout > MAX_TIMEOUT) - timeout = MAX_TIMEOUT; - - amdtco_disable(); - amdtco_settimeout(timeout); - amdtco_global_enable(); - amdtco_enable(); - amdtco_ping(); - printk(KERN_INFO PFX "Watchdog enabled, timeout = %ds of %ds\n", - amdtco_gettimeout(), timeout); - - return 0; -} - - -static int amdtco_fop_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) -{ - int new_timeout; - int tmp; - - static struct watchdog_info ident = { - .options = WDIOF_SETTIMEOUT | WDIOF_CARDRESET, - .identity = "AMD 766/768", - }; - - switch (cmd) { - default: - return -ENOIOCTLCMD; - - case WDIOC_GETSUPPORT: - if (copy_to_user((struct watchdog_info *)arg, &ident, sizeof ident)) - return -EFAULT; - return 0; - - case WDIOC_GETSTATUS: - return put_user(amdtco_status(), (int *)arg); - - case WDIOC_KEEPALIVE: - amdtco_ping(); - return 0; - - case WDIOC_SETTIMEOUT: - if (get_user(new_timeout, (int *)arg)) - return -EFAULT; - - if (new_timeout < 0) - return -EINVAL; - - if (new_timeout > MAX_TIMEOUT) - new_timeout = MAX_TIMEOUT; - - timeout = new_timeout; - amdtco_settimeout(timeout); - /* fall through and return the new timeout */ - - case WDIOC_GETTIMEOUT: - return put_user(amdtco_gettimeout(), (int *)arg); - - case WDIOC_SETOPTIONS: - if (copy_from_user(&tmp, (int *)arg, sizeof tmp)) - return -EFAULT; - - if (tmp & WDIOS_DISABLECARD) - amdtco_disable(); - - if (tmp & WDIOS_ENABLECARD) - amdtco_enable(); - - return 0; - } -} - - -static int amdtco_fop_release(struct inode *inode, struct file *file) -{ - if (expect_close == 42) { - amdtco_disable(); - printk(KERN_INFO PFX "Watchdog disabled\n"); - } else { - amdtco_ping(); - printk(KERN_CRIT PFX "Unexpected close!, timeout in %d seconds\n", timeout); - } - - expect_close = 0; - up(&open_sem); - return 0; -} - - -static ssize_t amdtco_fop_write(struct file *file, const char *data, size_t len, loff_t *ppos) -{ - if (ppos != &file->f_pos) - return -ESPIPE; - - if (len) { - if (!nowayout) { - size_t i; - char c; - 
expect_close = 0; - - for (i = 0; i != len; i++) { - if (get_user(c, data + i)) - return -EFAULT; - - if (c == 'V') - expect_close = 42; - } - } - amdtco_ping(); - } - - return len; -} - - -static int amdtco_notify_sys(struct notifier_block *this, unsigned long code, void *unused) -{ - if (code == SYS_DOWN || code == SYS_HALT) - amdtco_disable(); - - return NOTIFY_DONE; -} - - -static struct notifier_block amdtco_notifier = -{ - .notifier_call = amdtco_notify_sys, -}; - -static struct file_operations amdtco_fops = -{ - .owner = THIS_MODULE, - .write = amdtco_fop_write, - .ioctl = amdtco_fop_ioctl, - .open = amdtco_fop_open, - .release = amdtco_fop_release, -}; - -static struct miscdevice amdtco_miscdev = -{ - .minor = WATCHDOG_MINOR, - .name = "watchdog", - .fops = &amdtco_fops, -}; - -static struct pci_device_id amdtco_pci_tbl[] = { - /* AMD 766 PCI_IDs here */ - { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_OPUS_7443, PCI_ANY_ID, PCI_ANY_ID, }, - { 0, }, -}; - -MODULE_DEVICE_TABLE (pci, amdtco_pci_tbl); - -static int __init amdtco_init(void) -{ - int ret; - - sema_init(&open_sem, 1); - spin_lock_init(&amdtco_lock); - - dev = NULL; - while ((dev = pci_find_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - if (pci_match_device (amdtco_pci_tbl, dev) != NULL) - goto found_one; - } - - return -ENODEV; - -found_one: - - if ((ret = register_reboot_notifier(&amdtco_notifier))) { - printk(KERN_ERR PFX "Unable to register reboot notifier err = %d\n", ret); - goto out_clean; - } - - if ((ret = misc_register(&amdtco_miscdev))) { - printk(KERN_ERR PFX "Unable to register miscdev on minor %d\n", WATCHDOG_MINOR); - goto out_unreg_reboot; - } - - pci_read_config_dword(dev, 0x58, &pmbase); - pmbase &= 0x0000FF00; - - if (pmbase == 0) { - printk (KERN_ERR PFX "power management base not set\n"); - ret = -EIO; - goto out_unreg_misc; - } - - /* ret = 0; */ - printk(banner); - goto out_clean; - -out_unreg_misc: - misc_deregister(&amdtco_miscdev); -out_unreg_reboot: - unregister_reboot_notifier(&amdtco_notifier); -out_clean: - return ret; -} - -static void __exit amdtco_exit(void) -{ - misc_deregister(&amdtco_miscdev); - unregister_reboot_notifier(&amdtco_notifier); -} - -module_init(amdtco_init); -module_exit(amdtco_exit); - -MODULE_AUTHOR("Zwane Mwaikambo "); -MODULE_DESCRIPTION("AMD 766/768 TCO Timer Driver"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR); --- diff/include/asm-alpha/rmap.h 2002-10-16 04:28:23.000000000 +0100 +++ source/include/asm-alpha/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _ALPHA_RMAP_H -#define _ALPHA_RMAP_H - -/* nothing to see, move along */ -#include - -#endif --- diff/include/asm-arm/rmap.h 2002-10-16 04:27:09.000000000 +0100 +++ source/include/asm-arm/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,6 +0,0 @@ -#ifndef _ARM_RMAP_H -#define _ARM_RMAP_H - -#include - -#endif /* _ARM_RMAP_H */ --- diff/include/asm-arm26/rmap.h 2003-06-30 10:07:24.000000000 +0100 +++ source/include/asm-arm26/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,66 +0,0 @@ -#ifndef _ARM_RMAP_H -#define _ARM_RMAP_H - -/* - * linux/include/asm-arm26/proc-armv/rmap.h - * - * Architecture dependant parts of the reverse mapping code, - * - * ARM is different since hardware page tables are smaller than - * the page size and Linux uses a "duplicate" one with extra info. - * For rmap this means that the first 2 kB of a page are the hardware - * page tables and the last 2 kB are the software page tables. 
- */ - -static inline void pgtable_add_rmap(struct page *page, struct mm_struct * mm, unsigned long address) -{ - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page *page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = virt_to_page(ptep); - return (struct mm_struct *)page->mapping; -} - -/* The page table takes half of the page */ -#define PTE_MASK ((PAGE_SIZE / 2) - 1) - -static inline unsigned long ptep_to_address(pte_t * ptep) -{ - struct page * page = virt_to_page(ptep); - unsigned long low_bits; - - low_bits = ((unsigned long)ptep & PTE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; -} - -//FIXME!!! IS these correct? -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - return (pte_addr_t)ptep; -} - -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - return (pte_t *)pte_paddr; -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - return; -} - - -//#include - -#endif /* _ARM_RMAP_H */ --- diff/include/asm-cris/rmap.h 2002-10-16 04:28:24.000000000 +0100 +++ source/include/asm-cris/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _CRIS_RMAP_H -#define _CRIS_RMAP_H - -/* nothing to see, move along :) */ -#include - -#endif --- diff/include/asm-generic/rmap.h 2004-04-21 10:45:35.849253584 +0100 +++ source/include/asm-generic/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,91 +0,0 @@ -#ifndef _GENERIC_RMAP_H -#define _GENERIC_RMAP_H -/* - * linux/include/asm-generic/rmap.h - * - * Architecture dependent parts of the reverse mapping code, - * this version should work for most architectures with a - * 'normal' page table layout. - * - * We use the struct page of the page table page to find out - * the process and full address of a page table entry: - * - page->mapping points to the process' mm_struct - * - page->index has the high bits of the address - * - the lower bits of the address are calculated from the - * offset of the page table entry within the page table page - * - * For CONFIG_HIGHPTE, we need to represent the address of a pte in a - * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE - * bits and is then ORed with the byte offset of the pte within its page. - * - * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for - * the offset. - * - * For CONFIG_HIGHMEM64G, the pte_addr_t is 64 bits. 52 for the pfn, 12 for - * the offset. - */ -#include - -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) -{ -#ifdef BROKEN_PPC_PTE_ALLOC_ONE - /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... 
;( */ - extern int mem_init_done; - - if (!mem_init_done) - return; -#endif - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page * page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - return (struct mm_struct *) page->mapping; -} - -static inline unsigned long ptep_to_address(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - unsigned long low_bits; - low_bits = ((unsigned long)ptep & (PTRS_PER_PTE*sizeof(pte_t) - 1)) - * (PAGE_SIZE/sizeof(pte_t)); - return page->index + low_bits; -} - -#ifdef CONFIG_HIGHPTE -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - pte_addr_t paddr; - paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT; - return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK); -} -#else -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - return (pte_addr_t)ptep; -} -#endif - -#ifndef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - return (pte_t *)pte_paddr; -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - return; -} -#endif - -#endif /* _GENERIC_RMAP_H */ --- diff/include/asm-i386/rmap.h 2002-10-16 04:28:25.000000000 +0100 +++ source/include/asm-i386/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,21 +0,0 @@ -#ifndef _I386_RMAP_H -#define _I386_RMAP_H - -/* nothing to see, move along */ -#include - -#ifdef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); - unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK; - return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off); -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - kunmap_atomic(pte, KM_PTE2); -} -#endif - -#endif --- diff/include/asm-ia64/rmap.h 2002-10-16 04:27:56.000000000 +0100 +++ source/include/asm-ia64/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _ASM_IA64_RMAP_H -#define _ASM_IA64_RMAP_H - -/* nothing to see, move along */ -#include - -#endif /* _ASM_IA64_RMAP_H */ --- diff/include/asm-m68k/rmap.h 2002-10-16 04:29:05.000000000 +0100 +++ source/include/asm-m68k/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _M68K_RMAP_H -#define _M68K_RMAP_H - -/* nothing to see, move along */ -#include - -#endif --- diff/include/asm-m68knommu/rmap.h 2002-11-11 11:09:43.000000000 +0000 +++ source/include/asm-m68knommu/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,2 +0,0 @@ -/* Do not need anything here */ - --- diff/include/asm-mips/rmap.h 2003-07-08 09:55:19.000000000 +0100 +++ source/include/asm-mips/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef __ASM_RMAP_H -#define __ASM_RMAP_H - -/* nothing to see, move along */ -#include - -#endif /* __ASM_RMAP_H */ --- diff/include/asm-parisc/rmap.h 2002-10-16 04:29:05.000000000 +0100 +++ source/include/asm-parisc/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _PARISC_RMAP_H -#define _PARISC_RMAP_H - -/* nothing to see, move along */ -#include - -#endif --- diff/include/asm-ppc/rmap.h 2002-10-16 04:29:01.000000000 +0100 +++ source/include/asm-ppc/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,9 +0,0 @@ -#ifndef _PPC_RMAP_H -#define _PPC_RMAP_H - -/* PPC calls pte_alloc() before mem_map[] is setup ... 
*/ -#define BROKEN_PPC_PTE_ALLOC_ONE - -#include - -#endif --- diff/include/asm-ppc64/rmap.h 2002-10-16 04:29:07.000000000 +0100 +++ source/include/asm-ppc64/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,9 +0,0 @@ -#ifndef _PPC64_RMAP_H -#define _PPC64_RMAP_H - -/* PPC64 calls pte_alloc() before mem_map[] is setup ... */ -#define BROKEN_PPC_PTE_ALLOC_ONE - -#include - -#endif --- diff/include/asm-s390/rmap.h 2002-10-16 04:28:24.000000000 +0100 +++ source/include/asm-s390/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _S390_RMAP_H -#define _S390_RMAP_H - -/* nothing to see, move along */ -#include - -#endif --- diff/include/asm-sh/rmap.h 2002-10-16 04:29:02.000000000 +0100 +++ source/include/asm-sh/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _SH_RMAP_H -#define _SH_RMAP_H - -/* nothing to see, move along */ -#include - -#endif --- diff/include/asm-sparc/rmap.h 2002-10-16 04:29:04.000000000 +0100 +++ source/include/asm-sparc/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _SPARC_RMAP_H -#define _SPARC_RMAP_H - -/* nothing to see, move along */ -#include - -#endif --- diff/include/asm-sparc64/rmap.h 2002-10-16 04:27:53.000000000 +0100 +++ source/include/asm-sparc64/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _SPARC64_RMAP_H -#define _SPARC64_RMAP_H - -/* nothing to see, move along */ -#include - -#endif --- diff/include/asm-um/rmap.h 2002-10-16 04:28:25.000000000 +0100 +++ source/include/asm-um/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,6 +0,0 @@ -#ifndef __UM_RMAP_H -#define __UM_RMAP_H - -#include "asm/arch/rmap.h" - -#endif --- diff/include/asm-v850/rmap.h 2002-11-11 11:09:43.000000000 +0000 +++ source/include/asm-v850/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1 +0,0 @@ -/* Do not need anything here */ --- diff/include/asm-x86_64/rmap.h 2002-10-16 04:28:34.000000000 +0100 +++ source/include/asm-x86_64/rmap.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -#ifndef _X8664_RMAP_H -#define _X8664_RMAP_H - -/* nothing to see, move along */ -#include - -#endif
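For reference, a minimal user-space sketch of the NUMA policy system calls added in mm/mempolicy.c above. The MPOL_* mode value and the set_mempolicy syscall number are assumptions taken from elsewhere in this patch set (the i386 number is shown), not from the hunks in this file:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#define MPOL_INTERLEAVE		3	/* assumed mode value */
	#define __NR_set_mempolicy	276	/* assumed i386 syscall number */

	int main(void)
	{
		/* interleave this process' future allocations across nodes 0-1 */
		unsigned long nodemask = (1UL << 0) | (1UL << 1);

		/*
		 * sys_set_mempolicy() -> get_nodes() decrements maxnode and
		 * then keeps bits 0..maxnode-1 of the mask, so passing 3
		 * covers node bits 0 and 1.
		 */
		if (syscall(__NR_set_mempolicy, MPOL_INTERLEAVE, &nodemask, 3UL) < 0)
			perror("set_mempolicy");
		return 0;
	}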