diff -u --recursive --new-file 2.4.1/Documentation/rclock.txt v2.4.1-rc/Documentation/rclock.txt --- 2.4.1/Documentation/rclock.txt Thu Jan 1 05:30:00 1970 +++ v2.4.1-rc/Documentation/rclock.txt Fri Feb 23 14:19:13 2001 @@ -0,0 +1,36 @@ +This document describes the rclock kernel patch version 0.01. + +To use read-copy update, install the patch on your 2.4.1 kernel +and enable CONFIG_RCLOCK during configuration. + + +While most of the read-copy update code is arch independent, a portion +of it is arch dependent (user trap counter and SMP local timer). +This implementation is only for i386 architecture. + +While the read-copy callback maintenance code and the design is solid +the linux implementation is somewhat lame. It needs to be cleaned +up with more optimization for less interference by the quiescent +state detection mechanism. + +Things that are known to work - + +1. Basic callback mechanism +2. The kmem_deferred_free interface +3. Basic quiescent state detection mechanism + + +Things that are unknown - + +1. Reliability of the CPU stall detection mechanism +2. rclock stuff under heavy load. This will be tested when our old + ptx test harness is ready to run on linux +3. Did I cover all the usertraps ? +4. How much interference the quiescent state detection mechanism has ? + +These are all being investigated and will be enhanced in a later release. + +For any problems, contact dipankar@sequent.com. 
+The detailed documentation is available at +http://lse.sourceforge.net/locking/rclock.html + diff -u --recursive --new-file 2.4.1/arch/i386/config.in v2.4.1-rc/arch/i386/config.in --- 2.4.1/arch/i386/config.in Tue Jan 9 02:57:56 2001 +++ v2.4.1-rc/arch/i386/config.in Tue Feb 20 16:46:33 2001 @@ -154,6 +154,7 @@ bool 'Math emulation' CONFIG_MATH_EMULATION bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR +bool 'Read-Copy-Update lock support' CONFIG_RCLOCK bool 'Symmetric multi-processing support' CONFIG_SMP if [ "$CONFIG_SMP" != "y" ]; then bool 'APIC and IO-APIC support on uniprocessors' CONFIG_X86_UP_IOAPIC diff -u --recursive --new-file 2.4.1/arch/i386/kernel/apic.c v2.4.1-rc/arch/i386/kernel/apic.c --- 2.4.1/arch/i386/kernel/apic.c Wed Dec 6 02:13:48 2000 +++ v2.4.1-rc/arch/i386/kernel/apic.c Tue Feb 20 16:46:33 2001 @@ -22,6 +22,11 @@ #include #include #include +#ifdef CONFIG_RCLOCK +#include +#include +#endif + #include #include @@ -654,6 +659,10 @@ { int user = user_mode(regs); int cpu = smp_processor_id(); +#ifdef CONFIG_RCLOCK + int cpunum = cpu_number_map(cpu); +#endif + /* * The profiling function is SMP safe. 
(nothing can mess @@ -663,6 +672,17 @@ */ if (!user) x86_do_profile(regs->eip); + +#ifdef CONFIG_RCLOCK + if (((RC_PLOCAL_rclockcurlist(cpunum) != NULL) && + RC_GEN_LT(RC_PLOCAL_rclockgen(cpunum), rc_ctrlblk.curgen)) || + (RC_PLOCAL_rclockcurlist(cpunum) == NULL && + RC_PLOCAL_rclocknxtlist(cpunum) != NULL) || + test_bit(cpunum, &rc_ctrlblk.needctxtmask) || + ((jiffies - rc_ctrlblk.clock) > RCLOCK_STALL_WARN)) + rc_chk_callbacks(user || (current == init_tasks[cpunum])); +#endif + if (--prof_counter[cpu] <= 0) { /* diff -u --recursive --new-file 2.4.1/arch/i386/kernel/entry.S v2.4.1-rc/arch/i386/kernel/entry.S --- 2.4.1/arch/i386/kernel/entry.S Thu Nov 9 06:39:50 2000 +++ v2.4.1-rc/arch/i386/kernel/entry.S Tue Feb 20 16:46:34 2001 @@ -196,6 +196,13 @@ pushl %eax # save orig_eax SAVE_ALL GET_CURRENT(%ebx) +#ifdef CONFIG_RCLOCK + pushl %eax + pushl processor(%ebx) + call SYMBOL_NAME(rc_inc_syscall_count) + addl $4,%esp + popl %eax +#endif cmpl $(NR_syscalls),%eax jae badsys testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS diff -u --recursive --new-file 2.4.1/arch/i386/kernel/traps.c v2.4.1-rc/arch/i386/kernel/traps.c --- 2.4.1/arch/i386/kernel/traps.c Tue Jan 16 06:24:20 2001 +++ v2.4.1-rc/arch/i386/kernel/traps.c Tue Feb 20 16:46:34 2001 @@ -49,6 +49,10 @@ #include +#ifdef CONFIG_RCLOCK +#include +#endif + asmlinkage int system_call(void); asmlinkage void lcall7(void); asmlinkage void lcall27(void); @@ -237,6 +241,11 @@ if (!(regs->xcs & 3)) goto kernel_trap; +#ifdef CONFIG_RCLOCK + /* TBD : figure out vm86 traps */ + RC_PLOCAL_usertrap(cpu_number_map(smp_processor_id()))++; +#endif + trap_signal: { struct task_struct *tsk = current; tsk->thread.error_code = error_code; @@ -319,6 +328,11 @@ if (!(regs->xcs & 3)) goto gp_in_kernel; +#ifdef CONFIG_RCLOCK + /* TBD : figure out vm86 traps */ + RC_PLOCAL_usertrap(cpu_number_map(smp_processor_id()))++; +#endif + current->thread.error_code = error_code; current->thread.trap_no = 13; force_sig(SIGSEGV, current); @@ -525,6 +539,11 
@@ if (regs->eflags & VM_MASK) goto debug_vm86; +#ifdef CONFIG_RCLOCK + if (regs->xcs & 3) + RC_PLOCAL_usertrap(cpu_number_map(smp_processor_id()))++; +#endif + /* Save debug status register where ptrace can see it */ tsk->thread.debugreg[6] = condition; @@ -637,6 +656,10 @@ asmlinkage void do_coprocessor_error(struct pt_regs * regs, long error_code) { +#ifdef CONFIG_RCLOCK + if (regs->xcs & 3) + RC_PLOCAL_usertrap(cpu_number_map(smp_processor_id()))++; +#endif ignore_irq13 = 1; math_error((void *)regs->eip); } @@ -692,6 +715,10 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs * regs, long error_code) { +#ifdef CONFIG_RCLOCK + if (regs->xcs & 3) + RC_PLOCAL_usertrap(cpu_number_map(smp_processor_id()))++; +#endif if (cpu_has_xmm) { /* Handle SIMD FPU exceptions on PIII+ processors. */ ignore_irq13 = 1; diff -u --recursive --new-file 2.4.1/include/linux/kmemdef.h v2.4.1-rc/include/linux/kmemdef.h --- 2.4.1/include/linux/kmemdef.h Thu Jan 1 05:30:00 1970 +++ v2.4.1-rc/include/linux/kmemdef.h Fri Feb 23 13:05:25 2001 @@ -0,0 +1,72 @@ +/* + * Support for deferred freeing of memory using Read-Copy Update + * mechanism. + * + * Copyright (c) International Business Machines Corp., 2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * + * Author: Dipankar Sarma + * (Based on a Dynix/ptx implementation by + * Paul Mckenney ) + * + * + * + * For detailed explanation of Read-Copy Update mechanism see - + * http://lse.sourceforge.net/locking/rclock.html + * + */ + +#ifndef _LINUX_KMEMDEF_H +#define _LINUX_KMEMDEF_H + +#include + +/* Definition for item on batching lists. */ +struct kmem_defer_item { + struct kmem_defer_item *kdi_next;/* Pointer to next item. */ + void *kdi_ptr; /* Pointer to block to free. */ + int (*kdi_func)(void *ptr); + /* Pointer to free-prep function. */ +}; + +struct kmem_defer { + char kmemd_quiescent;/* =1 indicates that kmemd_callback */ + /* is available for use. No mutex is */ + /* needed for per-CPU use, since a */ + /* CPU clears only its bit if it was */ + /* set, and a callback sets the */ + /* specified bit only if it was clear.*/ + struct kmem_defer_item *kmemd_head;/* Head of list of blocks. */ + struct kmem_defer_item **kmemd_tail;/* Tail of list of blocks. */ + rc_callback_t *kmemd_callback;/* Pointer to callback element. */ +}; + +union kmem_defer_u { + struct kmem_defer kmemdef; + char pad[((sizeof(struct kmem_defer) / SMP_CACHE_BYTES) + 1) + * SMP_CACHE_BYTES]; +}; + + + + +extern void kmem_deferred_free(void *ptr, int (*func)(void *ptr), + struct kmem_defer_item *kp); +extern void kmemd_init(void); + + +#endif /* _LINUX_KMEMDEF_H */ diff -u --recursive --new-file 2.4.1/include/linux/rclock.h v2.4.1-rc/include/linux/rclock.h --- 2.4.1/include/linux/rclock.h Thu Jan 1 05:30:00 1970 +++ v2.4.1-rc/include/linux/rclock.h Fri Feb 23 13:05:13 2001 @@ -0,0 +1,310 @@ +/* + * Read-Copy Update mechanism for mutual exclusion + * + * Copyright (c) International Business Machines Corp., 2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * + * Author: Dipankar Sarma + * (Based on a Dynix/ptx implementation by + * Paul Mckenney ) + * + * + * For detailed explanation of Read-Copy Update mechanism see - + * http://lse.sourceforge.net/locking/rclock.html + * + */ + +#ifndef __LINUX_RCLOCK_H +#define __LINUX_RCLOCK_H + +#include +#include +#include + + + +typedef struct rc_callback rc_callback_t; +typedef int (*rc_callback_func_t)(rc_callback_t *rc, + void *arg1, + void *arg2); + + +/* Generation counter type. */ +typedef long rc_gen_t; + +/* + * Callback structure for use with rc_callback(). + * Users must use the rc_alloc_callback() and rc_free_callback() functions + * to allocate these. + * The rc_set_func() function may be used to modify an existing + * rc_callback struct. + */ +struct rc_callback { + rc_callback_t *next; + rc_callback_func_t callback; + void *arg1; + void *arg2; + char flags; /* flags, see below. */ +}; + +#define RCC_REGISTERED 0x01 /* Callback in use. */ +#define RCC_DEFER 0x02 /* Handler may defer. */ + +/* Control variables for rclock callback mechanism. */ + +typedef struct rc_ctrlblk { + + /* Control variables. */ + + spinlock_t mutex; /* Guard this struct and lists. */ + rc_gen_t curgen; /* Current generation number. */ + rc_gen_t maxgen; /* Max requested generation number. */ + unsigned long olmsk; /* Mask of CPUs currently online. */ + unsigned long needctxtmask; /* CPUs that need to switch in order*/ + /* for current callback to proceed. */ + + /* + * Timeout checking fields. 
This is used to detect CPUs going + * away for extended periods of time, either due to being in a + * tight loop in the kernel or due to hardware failure. + * The clock and clockbatch fields are initialized to + * the value of the jiffies when a new + * rclock batch starts. + * The rc_chk_callbacks() function will periodically complain + * if too much time is being consumed by the current batch. + */ + + unsigned long clock; /* Clocks since batch started */ + /* or since last warning. */ + unsigned long clockbatch; /* Clocks since batch started. */ + + /* Statistics. */ + + atomic_t nreg; /* Number of callbacks registered. */ + atomic_t nchk; /* Number checks from hardclock. */ + atomic_t nprc; /* Number of callbacks processed. */ + atomic_t ntogbl; /* Number of flushes to global list. */ + atomic_t nprcgbl; /* Number of callbacks processed from */ + /* global list. */ + int stalledsz; /* Record size of stalled array. */ + long stallwarns; /* Number of rclock stall warnings. */ + + /* + * Place to hold which CPUs are and are not stalled when preparing + * to print stall-warning messages. This field is not protected + * by mutex while the message is being composed, but this is + * only a problem if the CPU composing the message is itself + * stalled for more than RCLOCK_STALL_WARN jiffies. + */ + + char *stalled; /* CPUs stalled. */ + + /* Flag to indicate tlbtest suite underway. */ + + long under_test; + + /* Variable to collect fine-grained rclock performance measurements. */ + atomic_t clock_ticks; + +} rc_ctrlblk_t; + +extern rc_ctrlblk_t rc_ctrlblk; + + + /* Definition for stall limits. */ + +#define RCLOCK_MAX_STALL 150 /* 1.5s to first warn/panic. */ +#define RCLOCK_STALL_WARN 1000 /* 10s between warns. */ +#define RCLOCK_STALL_GRACE 3 /* 30ms grace for stalled CPUS*/ + +#define RC_CALLBACK_DONE 0 +#define RC_CALLBACK_DEFER 1 + +/* + * RC_GEN_XX(rc_gen_t a, rc_gen_t b) + * + * Returns true if generation counter ``a'' compares as specified to ``b''. 
+ * This comparison allows for the fact that the counters wrap. + */ +#define RC_GEN_EQ(a, b) ((a) - (b) == 0) +#define RC_GEN_GE(a, b) ((a) - (b) >= 0) +#define RC_GEN_GT(a, b) ((a) - (b) > 0) +#define RC_GEN_LE(a, b) ((a) - (b) <= 0) +#define RC_GEN_LT(a, b) ((a) - (b) < 0) +#define RC_GEN_NE(a, b) ((a) - (b) != 0) + +#define NRC_RDLOG_REC 64 /* Size of the RC_RD log */ + +/* + * Data structures used to trace RC_RDPROTECT() and RC_RDUNPROTECT() + * calls. + */ +typedef struct rc_rdlog { + int rdl_nlogrecs; /* #of RC_RD records since boot */ + /* (this counter wraps) */ + char *rdl_eip[NRC_RDLOG_REC]; + /* RC_RD log records */ +} rc_rdlog_t; + + +typedef struct rc_plocal { + /* + * Per-cpu counters maintained to indicate quiescent state + * transition. + */ + long cswtchctr; + unsigned long syscall; + unsigned long usertrap; + unsigned long syncpoint; + + /* + * Per-CPU variables used by the write side of the read-copy lock + * mechanism. See kernel/rclock.c. + */ + long rclockcswtchctr; /* value of cswtchctr at beginning */ + /* of last rclock flush interval. */ + unsigned long rclocksyscall; /* value of per-cpu syscall at */ + /* beginning of last rclock flush */ + /* interval. */ + unsigned long rclockusertrap; /* value of per-cpu usertrap at */ + /* beginning of last rclock flush */ + /* interval. */ + unsigned long rclocksyncpoint; /* value of per-cpu syncpoint at */ + /* beginning of last rclock flush */ + /* interval. */ + rc_gen_t rclockgen; /* Generation for curlist. */ + rc_callback_t *rclocknxtlist; /* Callbacks in next round ... */ + rc_callback_t **rclocknxttail; /* ... and tail pointer. */ + rc_callback_t *rclockcurlist; /* Callbacks in current round ... */ + rc_callback_t **rclockcurtail; /* ... and tail pointer. */ + rc_callback_t *rclockintrlist; /* Callbacks waiting for rrupt ... */ + rc_callback_t **rclockintrtail; /* ... and tail pointer. 
*/ + struct tasklet_struct rclocktasklet; +#ifdef DEBUG + int mnesting; /* Mutex nesting for that cpu */ +#endif +} rc_plocal_t; + +union rc_plocal_union { + rc_plocal_t rc_plocal; + char __pad [((sizeof(rc_plocal_t)/SMP_CACHE_BYTES) + 1) * + SMP_CACHE_BYTES]; +}; + +extern union rc_plocal_union rc_plocal_data[NR_CPUS]; + +#define RC_PLOCAL(cpu) ((rc_plocal_t *)&rc_plocal_data[(cpu)]) + +#define RC_PLOCAL_cswtchctr(cpu) ((RC_PLOCAL((cpu)))->cswtchctr) +#define RC_PLOCAL_usertrap(cpu) ((RC_PLOCAL((cpu)))->usertrap) +#define RC_PLOCAL_syscall(cpu) ((RC_PLOCAL((cpu)))->syscall) +#define RC_PLOCAL_syncpoint(cpu) ((RC_PLOCAL((cpu)))->syncpoint) + +#define RC_PLOCAL_rclockgen(cpu) ((RC_PLOCAL((cpu)))->rclockgen) +#define RC_PLOCAL_rclocknxtlist(cpu) ((RC_PLOCAL((cpu)))->rclocknxtlist) +#define RC_PLOCAL_rclockcurlist(cpu) ((RC_PLOCAL((cpu)))->rclockcurlist) +#define RC_PLOCAL_rclockintrlist(cpu) ((RC_PLOCAL((cpu)))->rclockintrlist) + +#define RC_PLOCAL_rclocktasklet(cpu) ((RC_PLOCAL((cpu)))->rclocktasklet) + +#ifdef DEBUG +#define RC_PLOCAL_mnesting(cpu) ((RC_PLOCAL((cpu)))->mnesting) +#endif + +#define CSWTCHCTR_INVALID 0 + +/* + * RC_MEMSYNC() + * + * Force all cacheable writes by the calling processor to complete + * to the point of being known to the system memory consistency model. + * Note well that this primitive does NOT guarantee that the writes have made + * it out to physical memory--it only guarantees that all pending cacheable + * writes by the calling processor are visible to all other processors in the + * system. + * + * This primitive is required for systems (such as the 80486, Pentium, + * PentiumPro, etc.) which do not support strict sequential consistency. 
+ * On such systems, cacheable reads and writes by the calling processor + * that do not conflict with the execution stream of the calling processor, + * may be executed out of order, which could potentially cause data elements + * to be linked into data structures before they are completely initialized, + * even though the source code specifies all initialization operations before + * the link-in operations. + * + * Placing an ``RC_MEMSYNC()'' invocation between the initialization and + * link-in operations will ensure that the initialization is completed + * before the link-in starts. + */ +#define RC_MEMSYNC() mb() + +/* + * Some CPUs (such as the DEC Alpha) have no cheap implementation of + * RC_MEMSYNC(). In some cases on these machines, it is simplest to + * insert special instructions on both the read side and the write side. + * (There are other approaches, see the web page.) + * The RC_RDMEMSYNC() macro is to be used on the read side between the + * time the pointer is picked up and the time that it is dereferenced. + * + * This macro compiles to nothing on machines (such as x86 machines) + * that have strong memory-fence operations. 
+ * TBD: This needs to be done in an arch independent way + */ +#define RC_RDMEMSYNC() + +#ifdef DEBUG +#define RC_RDPROTECT rc_rdprotect +#define RC_RDUNPROTECT rc_rdunprotect + +#define RC_ASSERT_DEBUG(x,y) { if (!(x)) panic(y);} + +extern void rc_rdprotect(void); +extern void rc_rdunprotect(void); + +#else /* DEBUG */ +#define RC_RDPROTECT() +#define RC_RDUNPROTECT() +#define RC_ASSERT_DEBUG(x,y) +#endif /* DEBUG */ + + +extern rc_callback_t *rc_alloc_callback(rc_callback_func_t func, + void *arg1, + void *arg2, + int flag); +extern void rc_free_callback(rc_callback_t *rp); +extern void rc_get_func_args(rc_callback_t *rp, + rc_callback_func_t *func, + void **arg1, + void **arg2); +extern void rc_set_func(rc_callback_t *rp, + rc_callback_func_t func, + void *arg1, + void *arg2); +extern void rc_set_func_args(rc_callback_t *rp, + rc_callback_func_t *func, + void **arg1, + void **arg2); +extern void rc_chk_callbacks(int not_in_kernel); +extern void rc_callback(rc_callback_t *rc); +extern int rc_callback_wrapper(rc_callback_t *rc, void *arg1, void *arg2); +extern void rc_init(void); +extern void rc_plocal_init(void); +extern void rc_intr(unsigned long); + + +#endif /* __LINUX_RCLOCK_H */ diff -u --recursive --new-file 2.4.1/init/main.c v2.4.1-rc/init/main.c --- 2.4.1/init/main.c Thu Jan 4 10:15:26 2001 +++ v2.4.1-rc/init/main.c Tue Feb 20 16:46:34 2001 @@ -27,6 +27,11 @@ #include #include #include +#ifdef CONFIG_RCLOCK +#include +#include +#endif + #include #include @@ -601,6 +606,15 @@ * make syscalls (and thus be locked). */ smp_init(); +#ifdef CONFIG_RCLOCK + /* + * RCLOCK subsystem is initialized right after SMP initialization. + * This allows RC to use online cpu maps and other SMP info. 
+ */ + rc_init(); + kmemd_init(); +#endif /* CONFIG_RCLOCK */ + kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); unlock_kernel(); current->need_resched = 1; diff -u --recursive --new-file 2.4.1/kernel/Makefile v2.4.1-rc/kernel/Makefile --- 2.4.1/kernel/Makefile Sat Dec 30 03:37:24 2000 +++ v2.4.1-rc/kernel/Makefile Tue Feb 20 16:46:34 2001 @@ -19,6 +19,8 @@ obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += ksyms.o obj-$(CONFIG_PM) += pm.o +obj-$(CONFIG_RCLOCK) += rclock.o kmemdef.o + ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff -u --recursive --new-file 2.4.1/kernel/kmemdef.c v2.4.1-rc/kernel/kmemdef.c --- 2.4.1/kernel/kmemdef.c Thu Jan 1 05:30:00 1970 +++ v2.4.1-rc/kernel/kmemdef.c Fri Feb 23 13:05:06 2001 @@ -0,0 +1,245 @@ +/* + * Support for deferred freeing of memory using Read-Copy Update + * mechanism. + * + * Copyright (c) International Business Machines Corp., 2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * + * Author: Dipankar Sarma + * (Based on a Dynix/ptx implementation by + * Paul Mckenney ) + * + * + * For detailed explanation of Read-Copy Update mechanism see - + * http://lse.sourceforge.net/locking/rclock.html + * + */ + +/* + * Handles ``deferred'' kfree. 
The block to be freed is queued until + * it can be guaranteed that no CPU could be still holding a reference to + * it. The caller of kmem_deferred_free is responsible for ensuring that + * there is no path by which subsequent CPUs might gain a new reference + * to the block. + * + * The kmem_deferred_free function can be used to handle deletions from + * a data structure that is guarded by a read-copy lock. + * + * The rclock mechanism is used to determine that no CPU holds a reference + * to the block; this file illustrates how to batch requests to the rclock + * subsystem. + */ + +#include +#include +#include +#include +#include +#include + +static union kmem_defer_u kmem_defer_percpu[NR_CPUS] __cacheline_aligned; + +#define KMEM_DEFER_PERCPU(cpu) ((struct kmem_defer *)&kmem_defer_percpu[(cpu)]) + +static void kmemd_register_percpu(void); + + +/* + * Free up a chain of struct kmem_defer_item's. + */ +static void kmemd_callback_freeup(void *arg1) +{ + struct kmem_defer_item *kp = arg1; + struct kmem_defer_item *kp1; + + /* Free up the list of blocks. */ + while (kp != NULL) { + int cbfreed = 0; + + kp1 = kp->kdi_next; + + /* + * Invoke caller-supplied function to free up any structures + * subordinate to this one. + */ + if (kp->kdi_func != NULL) { + cbfreed = (*kp->kdi_func)(kp->kdi_ptr); + } + + /* Free up any remnants that need freeing. */ + if (!cbfreed) { + kfree(kp->kdi_ptr); + } + + kp = kp1; + } +} + + +/* + * Handles a per-CPU callback. Re-issues callback if the local list + * is not empty. + */ +static int kmemd_callback_percpu(rc_callback_t *rc, void *arg1, void *arg2) +{ + int cpu = smp_processor_id(); + /* Free up the blocks. */ + + kmemd_callback_freeup(arg1); + + /* + * If not on the home CPU, just say that the callback is quiescent. + */ + if ((int)arg2 != cpu) { + /* + * No sync required for kmemd_quiscent of the home + * cpu since a stale kmemd_quiescent would only + * delay registration of that per-cpu batch. 
+ */ + KMEM_DEFER_PERCPU((int)arg2)->kmemd_quiescent = 1; + return (RC_CALLBACK_DONE); + } + + /* + * Disable interrupts to prevent races with device interrupts. + */ + cli(); + if (KMEM_DEFER_PERCPU(cpu)->kmemd_head == NULL) { + + /* Nothing in the list, so just say quiescent and leave. */ + + KMEM_DEFER_PERCPU(cpu)->kmemd_quiescent = 1; + sti(); + return (RC_CALLBACK_DONE); + } + + /* + * List not empty, so gather up the list and NULL it out. + * Note that this function re-enables interrupts. + */ + kmemd_register_percpu(); + return (RC_CALLBACK_DONE); +} + + +/* + * Initialize and allocate data structures. + */ +void kmemd_init() +{ + int i; + + /* Set up the per-CPU lists and allocate one callback per CPU. */ + + /* Initialize per-CPU list of kmad entries. */ + for (i = 0; i < NR_CPUS; i++) { + KMEM_DEFER_PERCPU(i)->kmemd_quiescent = 1; + KMEM_DEFER_PERCPU(i)->kmemd_head = NULL; + KMEM_DEFER_PERCPU(i)->kmemd_tail = + &KMEM_DEFER_PERCPU(i)->kmemd_head; + KMEM_DEFER_PERCPU(i)->kmemd_callback = + rc_alloc_callback(kmemd_callback_percpu, + NULL, NULL, GFP_KERNEL); + /* + * The callback starts out as kmemd_callback_percpu() with NULL + * arguments; kmemd_register_percpu() fills in the real + * arguments (batched list and home CPU) via rc_set_func() + * each time a batch is registered on that CPU. + * NOTE(review): the rc_alloc_callback() result is not checked. + */ + } +} + + + +/* + * Register the contents of the per-CPU deferral list with the rclock + * mechanism. Caller must have interrupts disabled (cli); they are enabled + * prior to return. + */ +static void kmemd_register_percpu() +{ + int cpu = smp_processor_id(); + struct kmem_defer_item *kp; + + /* Mark the callback unavailable, and grab the list. */ + + KMEM_DEFER_PERCPU(cpu)->kmemd_quiescent = 0; + kp = KMEM_DEFER_PERCPU(cpu)->kmemd_head; + KMEM_DEFER_PERCPU(cpu)->kmemd_head = NULL; + KMEM_DEFER_PERCPU(cpu)->kmemd_tail = + &KMEM_DEFER_PERCPU(cpu)->kmemd_head; + + /* Re-enable interrupts and register the callback. 
*/ + + sti(); + rc_set_func(KMEM_DEFER_PERCPU(cpu)->kmemd_callback, + kmemd_callback_percpu, + kp, + (void *)cpu); + rc_callback(KMEM_DEFER_PERCPU(cpu)->kmemd_callback); + return; +} + + +/* + * Register a block of memory to be kfree()ed after the rclock + * mechanism launders it. The memory must have been obtained via + * kmalloc(), and chip (cli/sti) interrupts must be enabled on + * entry. + * + * The caller passes in a pointer to the block, just + * as for kfree, but must also pass in a pointer to a struct kmem_defer_item + * (normally reference by a pointer in the structure). + * The ``func'' parameter (if non-NULL) will be invoked just before + * the block is freed. This function allows the caller to provide code + * to free up more complex structures. If this function returns 1, + * the function is assumed to have also freed up the structure itself. + */ +void kmem_deferred_free(void *ptr, int (*func)(void *), + struct kmem_defer_item *kp) +{ + int cpu = smp_processor_id(); + + /* Initialize the block. */ + + kp->kdi_next = NULL; + kp->kdi_ptr = ptr; + kp->kdi_func = func; + + /* + * Do it locally. Disable interrupts in order + * to prevent an interrupt routine from playing with the + * same list at the same time. We don't have to worry + * about other CPUs in our private entry. + */ + cli(); + + /* Add to the list. */ + *KMEM_DEFER_PERCPU(cpu)->kmemd_tail = kp; + KMEM_DEFER_PERCPU(cpu)->kmemd_tail = &(kp->kdi_next); + + /* If the callback is available, fire it off. 
*/ + if (KMEM_DEFER_PERCPU(cpu)->kmemd_quiescent) { + kmemd_register_percpu(); + return; + } + + sti(); + return; +} + diff -u --recursive --new-file 2.4.1/kernel/ksyms.c v2.4.1-rc/kernel/ksyms.c --- 2.4.1/kernel/ksyms.c Mon Jan 29 05:41:20 2001 +++ v2.4.1-rc/kernel/ksyms.c Tue Feb 20 16:46:34 2001 @@ -45,6 +45,10 @@ #include #include #include +#ifdef CONFIG_RCLOCK +#include +#include +#endif #if defined(CONFIG_PROC_FS) #include @@ -538,3 +542,14 @@ EXPORT_SYMBOL(tasklist_lock); EXPORT_SYMBOL(pidhash); + +#ifdef CONFIG_RCLOCK +EXPORT_SYMBOL(rc_alloc_callback); +EXPORT_SYMBOL(rc_free_callback); +EXPORT_SYMBOL(rc_callback); +EXPORT_SYMBOL(rc_set_func); +EXPORT_SYMBOL(rc_set_func_args); +EXPORT_SYMBOL(rc_get_func_args); +EXPORT_SYMBOL(rc_callback_wrapper); +EXPORT_SYMBOL(kmem_deferred_free); +#endif diff -u --recursive --new-file 2.4.1/kernel/rclock.c v2.4.1-rc/kernel/rclock.c --- 2.4.1/kernel/rclock.c Thu Jan 1 05:30:00 1970 +++ v2.4.1-rc/kernel/rclock.c Fri Feb 23 13:06:10 2001 @@ -0,0 +1,1011 @@ +/* + * Read-Copy Update mechanism for mutual exclusion + * + * Copyright (c) International Business Machines Corp., 2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Author: Dipankar Sarma + * (Based on a Dynix/ptx implementation by + * Paul Mckenney ) + * + * + * For detailed explanation of Read-Copy Update mechanism see - + * http://lse.sourceforge.net/locking/rclock.html + * + */ + + +/* + * Control how CPU stalls are handled. + */ +#ifdef DEBUG /* Make stalls panic only in debug kernel. */ +#define RCLOCK_STALL_PANIC /* Make stalls panic. */ +#endif /* DEBUG */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Definition for rclock control block. */ +rc_ctrlblk_t rc_ctrlblk = + { SPIN_LOCK_UNLOCKED, + 1, 1, /* Generations*/ + 0, 0, /* Masks. */ + 0, 0, /* Time chk. */ + ATOMIC_INIT(0), ATOMIC_INIT(0), + ATOMIC_INIT(0), ATOMIC_INIT(0), + ATOMIC_INIT(0), /* Stats. */ + 0, 0, /* stall info */ + NULL, /* stall list. */ + 0, /* under test */ + ATOMIC_INIT(0), /* clock ticks */ + }; + + +union rc_plocal_union rc_plocal_data[NR_CPUS] __cacheline_aligned; + +/* + * Set time limit for kernel panics on CPU stalls. Set to zero initially + * (meaning don't panic), but debug kernels initialize this in rc_init(). + * The initialization is done there to avoid having architecture-dependent + * code in this file. + */ +unsigned long rclock_stall_panic_interval = 0; + +/* Suppress CPU stall checking. Useful for testing. */ +int rclock_ignore_stalls = +#ifdef DEBUG + 0; +#else /* !DEBUG */ + 1; +#endif /* !DEBUG */ + +#define RC_LAST_CPU(cpumask, cpu) (cpumask != 0 && \ + !((cpumask) & ~(0x1 << (cpu)))) + +/* + * Put parameters into variables to allow easier patching. + * RCLOCK_STALL_WARN remains hard-coded to avoid slowing up + * the local timer handler + */ +unsigned long rclock_max_stall = RCLOCK_MAX_STALL; + +/* + * Snapshot the syncpoint counters. 
+ */ + +#define RC_SNAPSHOT_SYNC(plocal) \ +{ \ + (plocal)->rclockcswtchctr = (plocal)->cswtchctr; \ + (plocal)->rclocksyscall = (plocal)->syscall; \ + (plocal)->rclockusertrap = (plocal)->usertrap; \ + (plocal)->rclocksyncpoint = (plocal)->syncpoint; \ +} + +/* + * Return TRUE if one of the syncpoint counters has changed since the + * last snapshot, or if we are not in the kernel. Either thing indicates + * that we have passed through a syncpoint since the last snapshot. + */ + +#define RC_CHK_SYNC(plocal) \ + (not_in_kernel || \ + ((plocal)->cswtchctr != (plocal)->rclockcswtchctr) || \ + ((plocal)->syscall != (plocal)->rclocksyscall) || \ + ((plocal)->usertrap != (plocal)->rclockusertrap) || \ + ((plocal)->syncpoint != (plocal)->rclocksyncpoint)) + +static void rc_chk_stalls(int not_in_kernel, int clocks); +static void rc_cleanup(void); +static void rc_reg_gen(rc_gen_t newgen); + +/* + * The caller must have interrupts blocked. + */ +static void rc_adv_callbacks(void) +{ + rc_plocal_t *rcpl = RC_PLOCAL(cpu_number_map(smp_processor_id())); + + /* Count entry. */ + atomic_inc(&rc_ctrlblk.nchk); + + + /* + * Check to see if there are any callbacks in the per-CPU current list, + * and, if so, if the global generation number has moved past the + * per-CPU generation number (indicating that they may now be + * processed). + */ + if ((rcpl->rclockcurlist != NULL) && + RC_GEN_GT(rc_ctrlblk.curgen, rcpl->rclockgen)) { + + register int wasempty = (rcpl->rclockintrlist == NULL); + + /* + * Promote the callbacks from current to interrupt lists. + * Note that the explicit check for NULL is required because + * the default initialization of rclockintrtail is to NULL. + */ + + if (rcpl->rclockintrlist == NULL) { + rcpl->rclockintrlist = rcpl->rclockcurlist; + } else { + *rcpl->rclockintrtail = rcpl->rclockcurlist; + } + rcpl->rclockintrtail = rcpl->rclockcurtail; + rcpl->rclockcurlist = NULL; + rcpl->rclockcurtail = &rcpl->rclockcurlist; + + /* + * We have some. 
Schedule the per-cpu tasklet if the + * interrupt list was not initially empty (in which case + * per-cpu finished callback processing tasklet is + * already scheduled). + */ + if (wasempty) { + tasklet_schedule(&rcpl->rclocktasklet); + } + } + + /* + * Check to see if there are any callbacks in the per-CPU next + * list, and if so, if they can be moved into the per-CPU current list. + */ + if ((rcpl->rclocknxtlist != NULL) && (rcpl->rclockcurlist == NULL)) { + + /* Move the lists. */ + rcpl->rclockcurlist = rcpl->rclocknxtlist; + rcpl->rclockcurtail = rcpl->rclocknxttail; + rcpl->rclocknxtlist = NULL; + rcpl->rclocknxttail = &rcpl->rclocknxtlist; + + /* + * Get the list's generation number. + * Also register the new generation number with + * the global mechanism. + */ + spin_lock(&rc_ctrlblk.mutex); + rcpl->rclockgen = rc_ctrlblk.curgen + 1; +#if 0 /* A different way of interpreting the gen numbers */ + if (rc_ctrlblk.needctxtmask != 0 || + RC_GEN_LT(rc_ctrlblk.curgen, rc_ctrlblk.maxgen)) { + rcpl->rclockgen = rc_ctrlblk.curgen + 1; + } else { + /* + * If there is no current batch + */ + rcpl->rclockgen = rc_ctrlblk.curgen; + } +#endif + rc_reg_gen(rcpl->rclockgen); + spin_unlock(&rc_ctrlblk.mutex); + } +} + + +/* + * + * Returns a pointer to a new callback, or NULL if not enough + * memory available and sleeping is not allowed by "flag". + * kmalloc() flags can be passed directly here. + */ +rc_callback_t *rc_alloc_callback(rc_callback_func_t func, void *arg1, + void *arg2, int flag) +{ + rc_callback_t *rp = NULL; + + rp = (rc_callback_t *)kmalloc((size_t)sizeof(*rp), flag); + if (rp == NULL) { + return (NULL); + } + rp->flags = 0; + rp->callback = func; + rp->arg1 = arg1; + rp->arg2 = arg2; + return (rp); +} + + +/* + * Register a new rclock callback cell. This cell will be invoked as soon + * as all CPUs have performed a context switch or been seen in the + * idle loop or in a user process. 
+ *
+ * Note that it does not make too much sense to call this before user processes
+ * are running.
+ *
+ * The caller's interrupt state is saved and restored around the list
+ * update, so this function does not force interrupts on when it returns.
+ * Callers that run with interrupts enabled see no change in behaviour.
+ */
+
+void rc_callback(rc_callback_t *rc)
+{
+#ifdef RCLOCK_PROF
+	int starttime = (int)get_cycles();
+#endif /* RCLOCK_PROF */
+	rc_plocal_t *rcpl = RC_PLOCAL(cpu_number_map(smp_processor_id()));
+	unsigned long flags;
+
+	rc->flags |= RCC_REGISTERED;
+
+	/* Count the new callback. */
+
+	atomic_inc(&rc_ctrlblk.nreg);
+
+	/* Prepare it for insertion into the list. */
+
+	rc->next = NULL;
+
+	/*
+	 * Since online/offline of cpus happen only at boot and
+	 * shutdown time in linux, as long as the boot/shutdown code
+	 * doesn't use rclock, we need not bother about checking
+	 * to see if the current cpu is online. If linux does
+	 * support onlining/offlining of cpus then, this has
+	 * to be revisited.
+	 */
+	/*
+	 * Disable interrupts to avoid races with calls to this function
+	 * from interrupt routines and to avoid races with the
+	 * rc_chk_callbacks() processing called from local timer interrupt.
+	 * A cli will disable the local timer interrupt.  The previous
+	 * interrupt state is saved first so that it can be put back
+	 * exactly as the caller had it.
+	 */
+	save_flags(flags);
+	cli();
+
+	/*
+	 * Add the callback to the per-CPU next-list.  Note that we
+	 * must explicitly check for the head being NULL, since the
+	 * tail is (by default) initialized to NULL at startup.
+	 */
+	if (rcpl->rclocknxtlist == NULL) {
+		rcpl->rclocknxtlist = rc;
+	} else {
+		*rcpl->rclocknxttail = rc;
+	}
+	rcpl->rclocknxttail = &(rc->next);
+
+	/* Put the interrupt state back the way the caller had it. */
+	restore_flags(flags);
+
+	/*
+	 * Now, it is possible that the generation-advancement
+	 * mechanism is currently quiescent.  If so, the next
+	 * local timer interrupt
+	 * from this CPU will do the necessary jumpstarting.
+	 */
+
+#ifdef RCLOCK_PROF
+
+	/* Fine-grained measurement of the cost of rclock callbacks. */
+	starttime = (int)get_cycles() - starttime;
+	atomic_add(starttime, &rc_ctrlblk.clock_ticks);
+#endif /* RCLOCK_PROF */
+
+}
+
+
+/*
+ * Wrapper for callback functions that do not know what to do with a
+ * rc_callback as their first parameter.  Note that this function does
+ * -not- free up the callback, thus allowing the caller to keep one
+ * permanently allocated callback if he so chooses.
+ *
+ * arg1 is the function to invoke, cast to void *; arg2 is the single
+ * argument handed to it.  Always returns RC_CALLBACK_DONE.
+ *
+ * An example might be up(), used as follows:
+ *
+ *	(Do something resulting in a pointer p that
+ *	 references something that must be freed once
+ *	 we know all CPUs are done looking at it.)
+ *
+ *	rp = rc_alloc_callback(rc_callback_wrapper,
+ *			       (void *)up,
+ *			       (void *)&my_sema,
+ *			       GFP_KERNEL);
+ *	rc_callback(rp);
+ *	down(&my_sema);
+ *
+ * Note that the semaphore cannot be v'ed before being p'ed because the
+ * callback will not be invoked until the current CPU is relinquished.
+ * However, this guarantee cannot be made if there is a blocking operation
+ * between the rc_callback() and the down().
+ */
+int rc_callback_wrapper(rc_callback_t *rc, void *arg1, void *arg2)
+{
+	void (*func)(void *arg) = (void (*)(void *))arg1;
+
+	(*func)(arg2);
+
+	return RC_CALLBACK_DONE;
+}
+
+
+/*
+ * Per-tick rclock work for the current CPU: advance the local callback
+ * lists, and check this CPU in for the current generation if it has
+ * passed through a quiescent state.  not_in_kernel is non-zero when the
+ * interrupted context itself counts as a quiescent state.
+ *
+ * The caller must disable interrupts and must have verified that there is
+ * a rclock callback list to be handled and that this CPU has its
+ * bit set.
+ */
+void rc_chk_callbacks(int not_in_kernel)
+{
+	int btime;
+	int need_stall_chk = 0;
+	int cpu = cpu_number_map(smp_processor_id());
+	rc_plocal_t *rcpl = RC_PLOCAL(cpu);
+
+	/* Advance the plocal callbacks, if appropriate. */
+	rc_adv_callbacks();
+
+	/*
+	 * Check to see if the current batch is taking too long.
+	 * We don't want to actually print the message at this point, because
+	 * it is entirely possible that we are the last CPU, and we just
+	 * recently passed through a quiescent state.  If this is the case,
+	 * we do not want to issue a false alarm!
+	 */
+	btime = jiffies - rc_ctrlblk.clock;
+	if (btime > RCLOCK_STALL_WARN) {
+		need_stall_chk = 1;
+	}
+
+	/*
+	 * If we are not participating (or if we have already finished
+	 * participating) in the current generation, leave.  We needed to
+	 * execute the above code in order to avoid leaving callbacks
+	 * stranded on local lists when a long callback-free interval occurs.
+	 */
+	if (!test_bit(cpu, &rc_ctrlblk.needctxtmask)) {
+		goto chk_stalls;
+	}
+
+	/*
+	 * If the rclockcswtchctr is not set up, fix it and the other
+	 * quiescent-state counters so that we will detect the next
+	 * quiescent state on this CPU.
+	 * However, if we interrupted a user process,
+	 * we are already in a quiescent state and need not wait.
+	 * Note that this code relies on the zeroing of the plocal area
+	 * to initialize the quiescent-state counters.
+	 */
+	if (!not_in_kernel && (rcpl->rclockcswtchctr == CSWTCHCTR_INVALID)) {
+		RC_SNAPSHOT_SYNC(rcpl);
+		goto chk_stalls;
+	}
+
+	/*
+	 * If this CPU has not passed through a synchronization point
+	 * since the last snapshot, just return, since we cannot yet
+	 * allow the current batch of callbacks to proceed.
+	 */
+	if (!RC_CHK_SYNC(rcpl)) {
+		goto chk_stalls;
+	}
+
+	spin_lock(&rc_ctrlblk.mutex);
+
+	rc_cleanup();			/* Drops mutex. */
+
+chk_stalls:
+	/*
+	 * Every exit path ends here: the stall check is deferred until
+	 * after this CPU has had its chance to check in, so that we do
+	 * not raise a false alarm about ourselves.
+	 */
+	if (need_stall_chk) {
+		rc_chk_stalls(not_in_kernel, btime);
+	}
+}
+
+
+/*
+ * Check to see if a CPU has been stalling us for too long, if so,
+ * print a nasty message or panic.  Interrupts are expected to
+ * be disabled.
+ *
+ * not_in_kernel is consumed by RC_CHK_SYNC(); batchclock is the number
+ * of jiffies since rc_ctrlblk.clock was last reset, as computed by the
+ * caller.  Takes and releases rc_ctrlblk.mutex.
+ */
+static void rc_chk_stalls(int not_in_kernel, int batchclock)
+{
+	int curpos;
+	int guilty = 0;
+	int i;
+	int force_panic = 0;
+	int stalled = 0;
+	unsigned long clocks;
+	int cpu = cpu_number_map(smp_processor_id());
+	rc_plocal_t *rcpl;
+
+
+	spin_lock(&rc_ctrlblk.mutex);
+
+	/*
+	 * Recheck under the gate to prevent duplicate warning
+	 * messages due to race.  This time, also check whether
+	 * some CPUs have not yet checked in.  The problem is that
+	 * CPUs that have not been involved with any rclock activity
+	 * for some time will have very old generation numbers.
+	 * They will therefore come here every RCLOCK_STALL_WARN
+	 * interval.  We could make local timer check directly that
+	 * there is nothing to do, but it is better to sink the
+	 * overhead down here where it will be invoked only once
+	 * per RCLOCK_STALL_WARN (many seconds) rather than 100
+	 * times per second in local timer.  In either case, reset
+	 * the clock so that we won't check again for awhile.
+	 */
+
+	/* Calculate jiffies elapsed since the batch started. */
+
+	clocks = jiffies - rc_ctrlblk.clockbatch;
+
+	if ((clocks > RCLOCK_STALL_WARN) && (rc_ctrlblk.needctxtmask != 0)) {
+
+		/*
+		 * It is time for another warning and there is
+		 * something to warn about.
+		 */
+
+		/*
+		 * Count the warning whether or not we actually print
+		 * something.  Just in case the customer shuts this
+		 * warning off when they shouldn't have...
+		 */
+		rc_ctrlblk.stallwarns++;
+		stalled = 1;
+		rcpl = RC_PLOCAL(cpu);
+
+		/* Accumulate mask of stalled CPUs. */
+
+		curpos = 0;
+		rc_ctrlblk.stalled[0] = '\0';
+		for (i = 0; i < smp_num_cpus; i++) {
+			if (test_bit(i, &rc_ctrlblk.needctxtmask)) {
+				curpos +=
+				    sprintf(&(rc_ctrlblk.stalled[curpos]),
+					    " %d", i);
+				if ((cpu == i) && !RC_CHK_SYNC(rcpl)) {
+					/*
+					 * This CPU is still holding
+					 * things up...
+					 */
+					guilty = 1;
+				}
+			}
+		}
+		if (!guilty &&
+		    (clocks < RCLOCK_STALL_WARN + RCLOCK_STALL_GRACE)) {
+
+			/*
+			 * Someone else is stalling, give them a chance
+			 * to 'fess up.  We do this to prevent a race
+			 * where the guilty party is done stalling by
+			 * the time the NMI gets to him.  This does not
+			 * eliminate this race, but does restrict it
+			 * to cases where the staller is running at
+			 * SPLHI for many tens of milliseconds.
+			 */
+			stalled = 0;
+			rc_ctrlblk.stalled[0] = '\0';
+			rc_ctrlblk.stallwarns--;
+		}
+	} else if (clocks > RCLOCK_STALL_WARN) {
+
+		/*
+		 * If the rclock subsystem is idle, prevent coming here
+		 * for a RCLOCK_STALL_WARN interval (or until rclock has
+		 * something to do).
+		 */
+		rc_ctrlblk.clock = jiffies;
+	}
+
+	if (!stalled) {
+
+		/* Nothing to do, just drop the gate and leave quietly. */
+
+		spin_unlock(&rc_ctrlblk.mutex);
+		return;
+	}
+
+	/*
+	 * Readjust the time to prevent drowning the console in a flood
+	 * of stall-warning messages.
+	 */
+	rc_ctrlblk.clock = jiffies;
+
+	/*
+	 * Print warning if panic interval zero or not yet exceeded;
+	 * panic otherwise unless running rclock torture tests.
+	 *
+	 * Do this outside of gate to make crash dumps more likely.
+	 *
+	 * Subsequent experience may require further changes in
+	 * panic strategy.  ;-)
+	 */
+	if ((rclock_stall_panic_interval != 0) &&
+	    (batchclock > rclock_stall_panic_interval)) {
+		force_panic = rc_ctrlblk.under_test ? 0 : 1;
+	} else {
+		force_panic = 0;
+	}
+
+	if (guilty) {
+
+#if 0
+		/* TBD: arch independent way to dump stack trace */
+		char *retadrs[20];
+		int retadrcnt = sizeof(retadrs) / sizeof(retadrs[0]);
+		int retadrspace;
+
+		/*
+		 * If we are one of the guilty parties, accumulate
+		 * our stack trace (return addresses only).
+		 */
+
+		curpos += sprintf(&(rc_ctrlblk.stalled[curpos]),
+				  " Retadrs:");
+		retadrspace = (rc_ctrlblk.stalledsz - 1 - curpos) /
+			      (3 + 2 * sizeof(char *));
+		if (retadrspace < retadrcnt) {
+			retadrcnt = retadrspace;
+		}
+		retadrcnt = getretadrs(retadrs, retadrcnt);
+		for (i = 0; i < retadrcnt; i++) {
+			curpos += sprintf(&(rc_ctrlblk.stalled[curpos]),
+					  " 0x%x",
+					  retadrs[i]);
+		}
+#endif
+	}
+
+	/*
+	 * If the rclock_ignore_stalls global variable is set,
+	 * don't clutter the screen.  We have updated the stall
+	 * count so they can see it from crash (and recorded a
+	 * mini-stack-trace in rc_ctrlblk.stalled), so just leave.
+	 */
+	if (rclock_ignore_stalls) {
+		spin_unlock(&rc_ctrlblk.mutex);
+		return;
+	}
+
+	spin_unlock(&rc_ctrlblk.mutex);
+
+	/*
+	 * batchclock is measured in jiffies, so it has to be scaled by
+	 * 1000 / HZ (not by 1000 alone) to be reported in milliseconds.
+	 */
+	if (force_panic) {
+		panic("RC: CPUs stalled for %d ms: %s",
+		      batchclock * 1000 / HZ, rc_ctrlblk.stalled);
+		/*
+		 *+ A software or hardware error occurred in
+		 *+ the indicated CPUs.  This condition
+		 *+ can be caused by hardware failure
+		 *+ of the indicated CPU or by an in-kernel
+		 *+ infinite loop.
+		 *+ Corrective action:  contact service.
+		 *+ These messages may be suppressed by patching
+		 *+ the rclock_ignore_stalls kernel variable
+		 *+ to 1.  Panics may be suppressed by patching
+		 *+ the rclock_stall_panic_interval to 0.
+		 */
+
+	} else {
+		/* Terminate the line so later console output starts fresh. */
+		printk(KERN_WARNING "RC: CPUs stalled for %d ms: %s\n",
+		       batchclock * 1000 / HZ, rc_ctrlblk.stalled);
+	}
+}
+
+
+/*
+ * Does cleanup actions if we have ended a rclock generation.
+ * Assumes mutex held on entry, releases it before returning.
+ *
+ * This function is a no-op if our bit is not set -- in this case
+ * some other CPU will/has done the work.
+ *
+ * Cleanup actions are clearing our bit, invalidating our quiescent-state
+ * counters, and doing end-of-generation work if all CPUs have checked
+ * in (if all bits are now clear).
+ *
+ * End-of-generation work is advancing callbacks to the next slot,
+ * sending off the soft interrupt to process any that are now ready,
+ * and registering the next generation of interest (the next numerical
+ * one if some callbacks were in the ``next'' slot or the current maximum
+ * otherwise).  Note that a major purpose of this registering is to
+ * start up the next generation if there is interest in it.
+ */
+static void rc_cleanup(void)
+{
+	int self = cpu_number_map(smp_processor_id());
+	rc_plocal_t *plocal = RC_PLOCAL(self);
+
+#if defined(NOTDEF) && defined(DEBUG)
+	/* Debug hook back into tlbtest driver. */
+	tlbtest_rc_not_in_kernel(0);
+#endif /* NOTDEF && DEBUG */
+
+	/* Our bit is already clear: nothing to do but drop the mutex. */
+	if (!test_bit(self, &rc_ctrlblk.needctxtmask))
+		goto unlock;
+
+	/*
+	 * Check in for this generation.  Our context-switch snapshot is
+	 * invalidated whether or not we turn out to be the last CPU.
+	 */
+	clear_bit(self, &rc_ctrlblk.needctxtmask);
+	plocal->rclockcswtchctr = CSWTCHCTR_INVALID;
+
+	/* Other CPUs have yet to check in; the last one advances. */
+	if (rc_ctrlblk.needctxtmask != 0)
+		goto unlock;
+
+	/* We are last: say that we are done with current generation. */
+	rc_ctrlblk.curgen++;
+
+	/* Reset the clock to stop spurious calls to rc_chk_stalls(). */
+	rc_ctrlblk.clock = jiffies;
+
+	rc_reg_gen(rc_ctrlblk.maxgen);
+
+	spin_unlock(&rc_ctrlblk.mutex);
+
+	/* With the mutex dropped, advance the cpu-local lists. */
+	rc_adv_callbacks();
+	return;
+
+unlock:
+	spin_unlock(&rc_ctrlblk.mutex);
+}
+
+
+/*
+ * Hands the specified callback cell to kfree().
+ */
+void rc_free_callback(rc_callback_t *rp)
+{
+	kfree(rp);
+}
+
+
+/*
+ * Gets any or all of function and arguments for the specified
+ * callback.  If one of the pointers is NULL, the corresponding
+ * item is not returned.  Otherwise, the item is copied to the
+ * location referenced by the pointer.
+ */
+void rc_get_func_args(rc_callback_t *rp, rc_callback_func_t *func,
+		      void **arg1, void **arg2)
+{
+	if (func != NULL) {
+		*func = rp->callback;
+	}
+	if (arg1 != NULL) {
+		*arg1 = rp->arg1;
+	}
+	if (arg2 != NULL) {
+		*arg2 = rp->arg2;
+	}
+}
+
+
+/*
+ * Initializes rclock mechanism.  Assumed to be called early.
+ * Note that rclockcswtchctr and friends are implicitly
+ * initialized due to the choice of ``0'' for CSWTCHCTR_INVALID.
+ *
+ * Panics if the stall-report buffer cannot be allocated, because
+ * rc_chk_stalls() writes into that buffer without checking it.
+ */
+
+void rc_init(void)
+{
+	int i;
+
+#ifdef RCLOCK_STALL_PANIC
+	rclock_stall_panic_interval = RCLOCK_MAX_STALL;
+#endif /* RCLOCK_STALL_PANIC */
+
+	rc_plocal_init();
+
+	rc_ctrlblk.olmsk = cpu_online_map;
+
+	/*
+	 * The control variables and lists are statically initialized,
+	 * with the following exceptions where allocation is required.
+	 * Allow space in stalled for all the CPU numbers and for
+	 * 10-15 return addresses.
+	 */
+	rc_ctrlblk.stalledsz = smp_num_cpus * 4 + 160;
+	rc_ctrlblk.stalled = (char *)kmalloc(rc_ctrlblk.stalledsz,
+					     GFP_ATOMIC);
+	if (rc_ctrlblk.stalled == NULL) {
+		panic("rc_init: cannot allocate stall-report buffer");
+	}
+	/* Start with an empty report so the buffer is always a string. */
+	rc_ctrlblk.stalled[0] = '\0';
+
+	for (i = 0; i < smp_num_cpus; i++) {
+		tasklet_init(&RC_PLOCAL_rclocktasklet(i), rc_intr, 0UL);
+	}
+}
+
+/*
+ * Initialize the per-processor areas that maintain various
+ * information used by RC subsystem.  The whole rc_plocal_data
+ * array is cleared to zero.
+ */
+void rc_plocal_init(void)
+{
+	memset(&rc_plocal_data[0], 0, sizeof(rc_plocal_data));
+}
+
+/*
+ * Handle SWINT_RCLOCK soft interrupt and dispatch callbacks.
+ *
+ * Runs as the per-CPU tasklet installed by rc_init(); the "data"
+ * argument is not used.  Each callback on this CPU's interrupt list is
+ * unlinked and invoked in turn, and is re-registered through
+ * rc_callback() when it asks for deferral and has RCC_DEFER set.
+ */
+void rc_intr(unsigned long data)
+{
+	rc_callback_t *rp;
+	int retval;
+	char tmp_flags;
+	rc_plocal_t *rcpl = RC_PLOCAL(cpu_number_map(smp_processor_id()));
+
+#ifdef RCLOCK_PROF
+	int starttime = (int)get_cycles();
+#endif /* RCLOCK_PROF */
+
+	/* Invoke the per-CPU callouts. */
+	while (rcpl->rclockintrlist != NULL) {
+
+		/*
+		 * Mask interrupts, pull first element from list.  Since
+		 * nested interrupts can only add to (never delete from)
+		 * this list, we need not recheck for NULL pointer under
+		 * the mask.
+		 */
+
+		rp = rcpl->rclockintrlist;
+		rp->flags &= ~RCC_REGISTERED;
+		cli();
+		rcpl->rclockintrlist = rp->next;
+		if (rcpl->rclockintrlist == NULL) {
+			rcpl->rclockintrtail = &rcpl->rclockintrlist;
+		}
+		sti();
+
+		/*
+		 * Sample the flags before the callout runs; presumably
+		 * the handler is allowed to free or reuse the cell.
+		 */
+		tmp_flags = rp->flags;
+
+		/* Invoke callout. */
+
+		retval = (*(rp->callback))(rp, rp->arg1, rp->arg2);
+
+		/* Check for deferral. */
+		if ((tmp_flags & RCC_DEFER) &&
+		    (retval == RC_CALLBACK_DEFER)) {
+
+			/*
+			 * Crude but effective deferral.
+			 * Later defer from here and add back to list,
+			 * and also count deferrals.  @@@
+			 */
+			rc_callback(rp);
+		}
+
+		/* Count it. */
+
+		atomic_inc(&rc_ctrlblk.nprc);
+	}
+
+
+#ifdef RCLOCK_PROF
+
+	/*
+	 * Measure callback time.  Note that this includes the time
+	 * spent actually running the callback function.  This is appropriate
+	 * for performance testing of the rclock system, but other cases
+	 * may need the time executing the callbacks subtracted out.
+	 */
+
+	starttime = (int)get_cycles() - starttime;
+	atomic_add(starttime, &rc_ctrlblk.clock_ticks);
+#endif /* RCLOCK_PROF */
+}
+
+
+/*
+ * Register a new generation, and start it up if there is currently no
+ * active generation and the generation to be registered has not already
+ * occurred.
+ *
+ * The caller must not immediately follow a call to rc_reg_gen() with a call
+ * to rc_cleanup() -unless- his CPU is offlining (in which case rc_cleanup()
+ * is a no-op).  Violating this rule will result in callbacks being processed
+ * too soon.  The normal way to avoid this when a call to rc_cleanup() must
+ * follow one to rc_reg_gen() is to skip the call to rc_cleanup() if the
+ * value of the per-CPU copy of cswtchctr is invalid.
+ *
+ * Caller must hold the rc_ctrlblk lock.
+ */
+static void rc_reg_gen(rc_gen_t newgen)
+{
+	/*
+	 * If the maximum generation number seen so far
+	 * is less than ours, record the new maximum.
+	 */
+	if (RC_GEN_LT(rc_ctrlblk.maxgen, newgen)) {
+
+		/* Record the new generation. */
+		rc_ctrlblk.maxgen = newgen;
+	}
+
+	/*
+	 * Check to see if there is a currently active generation or if
+	 * no one cares about the next generation.
+	 */
+
+	if (RC_GEN_LT(rc_ctrlblk.maxgen, rc_ctrlblk.curgen) ||
+	    (rc_ctrlblk.needctxtmask != 0)) {
+
+		/*
+		 * The last CPU to check in will take care of moving to
+		 * the next generation.  If no one cares about the next
+		 * generation, we will start it up as soon as a new
+		 * callback hits a per-CPU current list or the global
+		 * next list.
+		 */
+		return;
+	}
+
+	/*
+	 * Set up the mask to show that all processors need to pass through a
+	 * quiescent state.  This qualifies the selected processors to enter
+	 * rc_chk_callbacks, where they will set up for the new generation.
+	 *
+	 * Also set up timeout fields.  Backdate clock to get proper
+	 * initial interval of rclock_max_stall jiffies despite only
+	 * checking for full period of RCLOCK_STALL_WARN jiffies.
+	 *
+	 * Note that the generation has been updated first, and is checked
+	 * outside of a lock.  The race with the contents of
+	 * rc_ctrlblk.needctxtmask is avoided because no one looks
+	 * at rc_ctrlblk.needctxtmask without holding the lock.
+	 */
+	rc_ctrlblk.needctxtmask = rc_ctrlblk.olmsk;
+	rc_ctrlblk.clockbatch = jiffies;
+	rc_ctrlblk.clock = rc_ctrlblk.clockbatch +
+			   rclock_max_stall -
+			   RCLOCK_STALL_WARN;
+}
+
+
+/*
+ * Sets the value of the deferral bit to the desired state.  If zero (default)
+ * the return value of the handler will be ignored and always treated as if
+ * RC_CALLBACK_DONE had been returned.  If the bit is set, the return value
+ * is handled as expected -- handlers that return RC_CALLBACK_DEFER are
+ * reinvoked at a later time.
+ */
+void rc_set_defer(rc_callback_t *rp, int defer)
+{
+
+	if (defer) {
+		rp->flags |= RCC_DEFER;
+	} else {
+		rp->flags &= ~RCC_DEFER;
+	}
+}
+
+
+/*
+ * Sets the function and both arguments of the specified callback.
+ */
+void rc_set_func(rc_callback_t *rp, rc_callback_func_t func, void *arg1,
+		 void *arg2)
+{
+	rp->callback = func;
+	rp->arg1 = arg1;
+	rp->arg2 = arg2;
+}
+
+
+/*
+ * Sets any or all of function and arguments for the specified
+ * callback.  If one of the pointers is NULL, the corresponding
+ * item is not changed.  Otherwise, the item is updated with the
+ * value that the pointer points to.
+ */
+void rc_set_func_args(rc_callback_t *rp, rc_callback_func_t *func, void **arg1,
+		      void **arg2)
+{
+	if (func != NULL) {
+		rp->callback = *func;
+	}
+	if (arg1 != NULL) {
+		rp->arg1 = *arg1;
+	}
+	if (arg2 != NULL) {
+		rp->arg2 = *arg2;
+	}
+}
+
+/*
+ * Increment the per-cpu syscall count.  cpuid is translated with
+ * cpu_number_map() before the per-CPU area is indexed.
+ */
+void rc_inc_syscall_count(int cpuid)
+{
+	RC_PLOCAL_syscall(cpu_number_map(cpuid))++;
+}
+
+
+#ifdef DEBUG
+/*
+ * rc_rdprotect()
+ *
+ * Begin a section of code that manipulates data structures protected by
+ * the read-copy lock mechanism.  The section is terminated by a
+ * rc_rdunprotect() invocation.  rc_rdprotect's may be nested.  It is
+ * illegal to block while under the influence of rc_rdprotect.
+ *
+ * In kernels (such as PTX) that do not support kernel preemption,
+ * rc_rdprotect() serves only documentation and debug purposes.
+ */
+void
+rc_rdprotect(void)
+{
+	int index;
+	int cpu = cpu_number_map(smp_processor_id());
+
+	/*
+	 * Increase the nesting level.
+	 */
+	RC_PLOCAL_mnesting(cpu)++;
+
+	/*
+	 * Record the EIP of the caller into the plocal structure
+	 * to help in debugging nesting level mess ups.  The record
+	 * counter is masked with NRC_RDLOG_REC - 1 to pick the slot,
+	 * which assumes NRC_RDLOG_REC is a power of two -- TODO confirm.
+	 */
+	index = l.pl_rc_rdlog.rc_rdl_nlogrecs++ & (NRC_RDLOG_REC - 1);
+	l.pl_rc_rdlog.rc_rdl_eip[index] = __builtin_return_address(0);
+
+	/*
+	 * Having an non-positive nesting level after bumping
+	 * is a major error.
+	 */
+	RC_ASSERT_DEBUG(RC_PLOCAL_mnesting(cpu) > 0,
+			"rc_rdprotect: non-positive nesting level");
+}
+
+/*
+ * rc_rdunprotect(p)
+ *
+ * End a section of code that manipulates data protected by the read-copy
+ * mechanism.
+ *
+ * Lowers the per-CPU nesting level, logs the caller's return address,
+ * and asserts that the nesting level has not gone negative.
+ */
+void
+rc_rdunprotect(void)
+{
+	int index;
+	int cpu = cpu_number_map(smp_processor_id());
+
+	/*
+	 * Reduce the nesting level.
+	 */
+	RC_PLOCAL_mnesting(cpu)--;
+
+	/*
+	 * Record the EIP of the caller into the plocal structure
+	 * to help in debugging nesting level mess ups.  The record
+	 * counter is masked with NRC_RDLOG_REC - 1 to pick the slot,
+	 * which assumes NRC_RDLOG_REC is a power of two -- TODO confirm.
+	 */
+	index = l.pl_rc_rdlog.rc_rdl_nlogrecs++ & (NRC_RDLOG_REC - 1);
+	l.pl_rc_rdlog.rc_rdl_eip[index] = __builtin_return_address(0);
+
+	/*
+	 * Having a negative nesting level after decrementing
+	 * is a major error.
+	 */
+	RC_ASSERT_DEBUG(RC_PLOCAL_mnesting(cpu) >= 0,
+			"rc_rdunprotect: negative nesting level");
+}
+#endif /* DEBUG */
diff -u --recursive --new-file 2.4.1/kernel/sched.c v2.4.1-rc/kernel/sched.c --- 2.4.1/kernel/sched.c Tue Jan 16 02:38:15 2001 +++ v2.4.1-rc/kernel/sched.c Tue Feb 20 16:46:34 2001 @@ -26,6 +26,10 @@ #include #include +#ifdef CONFIG_RCLOCK +#include +#endif + #include #include @@ -640,6 +644,18 @@ mmdrop(oldmm); } } + +#ifdef CONFIG_RCLOCK + /* + * Increment the per-processor context switch count + * We would need to block interrupts if this code can + * execute in interrupt context + */ + if (++RC_PLOCAL_cswtchctr(cpu_number_map(this_cpu)) == + CSWTCHCTR_INVALID) + RC_PLOCAL_cswtchctr(cpu_number_map(this_cpu)) = + CSWTCHCTR_INVALID + 1; +#endif /* * This just switches the register state and the diff -u --recursive --new-file 2.4.1/kernel/timer.c v2.4.1-rc/kernel/timer.c --- 2.4.1/kernel/timer.c Sun Dec 10 23:23:19 2000 +++ v2.4.1-rc/kernel/timer.c Tue Feb 20 16:46:34 2001 @@ -23,6 +23,11 @@ #include #include +#ifdef CONFIG_RCLOCK +#include +#include +#endif + #include /* @@ -673,8 +678,24 @@ void do_timer(struct pt_regs *regs) { +#ifdef CONFIG_RCLOCK + int cpu = cpu_number_map(smp_processor_id()); +#endif + (*(unsigned long *)&jiffies)++; #ifndef CONFIG_SMP + +#ifdef CONFIG_RCLOCK + if (((RC_PLOCAL_rclockcurlist(cpu) != NULL) && + RC_GEN_LT(RC_PLOCAL_rclockgen(cpu), rc_ctrlblk.curgen)) || + (RC_PLOCAL_rclockcurlist(cpu) == NULL && 
RC_PLOCAL_rclocknxtlist(cpu) != NULL) || + test_bit(cpu, &rc_ctrlblk.needctxtmask) || + ((jiffies - rc_ctrlblk.clock) > RCLOCK_STALL_WARN)) + rc_chk_callbacks(user_mode(regs) || + (current == init_tasks[cpu])); +#endif + /* SMP process accounting uses the local APIC timer */ update_process_times(user_mode(regs));