From: jbaron@redhat.com <jbaron@redhat.com> Date: Fri, 22 Aug 2008 14:04:49 -0400 Subject: [misc] markers and tracepoints: markers Message-id: 1219428298-7519-6-git-send-email-jbaron@redhat.com O-Subject: [rhel5.3 patch 05/14] markers and tracepoints - markers Bugzilla: 329821 bz# 329821 diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 0e155cc..e254231 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -68,6 +68,7 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ *(.data) TRACEPOINTS_DATA + MARKERS_DATA CONSTRUCTORS } :data diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 4349de0..f62c8d2 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -207,7 +207,7 @@ SECTIONS data : { } :data .data : AT(ADDR(.data) - LOAD_OFFSET) - { *(.data) TRACEPOINTS_DATA *(.data1) *(.gnu.linkonce.d*) CONSTRUCTORS } + { *(.data) MARKERS_DATA TRACEPOINTS_DATA *(.data1) *(.gnu.linkonce.d*) CONSTRUCTORS } . = ALIGN(16); /* gp must be 16-byte aligned for exc. table */ .got : AT(ADDR(.got) - LOAD_OFFSET) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 6024347..9032b85 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -174,6 +174,7 @@ SECTIONS .data : { *(.data) + MARKERS_DATA TRACEPOINTS_DATA *(.sdata) *(.got.plt) *(.got) @@ -181,6 +182,7 @@ SECTIONS #else .data : { *(.data) + MARKERS_DATA TRACEPOINTS_DATA *(.data.rel* .toc1) *(.branch_lt) diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 0971eed..5630caf 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -46,6 +46,7 @@ SECTIONS .data : { /* Data */ *(.data) + MARKERS_DATA TRACEPOINTS_DATA CONSTRUCTORS } diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 56cab2a..cf41f55 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -68,6 +68,7 @@ SECTIONS /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) + MARKERS_DATA TRACEPOINTS_DATA CONSTRUCTORS } :data diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 94ee54f..7a56cb2 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -16,6 +16,7 @@ *(.rodata) *(.rodata.*) \ *(__vermagic) /* Kernel version magic */ \ *(__tracepoints_strings)/* Tracepoints: strings */ \ + *(__markers_strings) /* Markers: strings */ \ } \ \ .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \ @@ -158,6 +159,12 @@ *(.kprobes.text) \ VMLINUX_SYMBOL(__kprobes_text_end) = .; +#define MARKERS_DATA \ + ALIGN_FUNCTION(); \ + VMLINUX_SYMBOL(__start___markers) = .; \ + *(__markers) \ + VMLINUX_SYMBOL(__stop___markers) = .; + #define TRACEPOINTS_DATA \ ALIGN_FUNCTION(); \ VMLINUX_SYMBOL(__start___tracepoints) = .; \ diff --git a/include/linux/marker.h b/include/linux/marker.h new file mode 100644 index 0000000..a7ac360 --- /dev/null +++ b/include/linux/marker.h @@ -0,0 +1,142 @@ +#ifndef _LINUX_MARKER_H +#define _LINUX_MARKER_H + +/* + * Code markup for dynamic and static tracing. + * + * See Documentation/marker.txt. + * + * (C) Copyright 2006 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * + * This file is released under the GPLv2. + * See the file COPYING for more details. + */ + +#include <linux/types.h> + +struct module; +struct marker; + +/** + * marker_probe_func - Type of a marker probe function + * @probe_private: probe private data + * @call_private: call site private data + * @fmt: format string + * @args: variable argument list pointer. Use a pointer to overcome C's + * inability to pass this around as a pointer in a portable manner in + * the callee otherwise. + * + * Type of marker probe functions. They receive the mdata and need to parse the + * format string to recover the variable argument list. + */ +typedef void marker_probe_func(void *probe_private, void *call_private, + const char *fmt, va_list *args); + +struct marker_probe_closure { + marker_probe_func *func; /* Callback */ + void *probe_private; /* Private probe data */ +}; + +struct marker { + const char *name; /* Marker name */ + const char *format; /* Marker format string, describing the + * variable argument list. + */ + char state; /* Marker state. */ + char ptype; /* probe type : 0 : single, 1 : multi */ + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); + struct marker_probe_closure single; + struct marker_probe_closure *multi; +} __attribute__((aligned(8))); + +#ifdef CONFIG_MARKERS + +/* + * Note : the empty asm volatile with read constraint is used here instead of a + * "used" attribute to fix a gcc 4.1.x bug. + * Make sure the alignment of the structure in the __markers section will + * not add unwanted padding between the beginning of the section and the + * structure. Force alignment to the same alignment as the section start. + */ +#define __trace_mark(name, call_private, format, args...) \ + do { \ + static const char __mstrtab_name_##name[] \ + __attribute__((section("__markers_strings"))) \ + = #name; \ + static const char __mstrtab_format_##name[] \ + __attribute__((section("__markers_strings"))) \ + = format; \ + static struct marker __mark_##name \ + __attribute__((section("__markers"), aligned(8))) = \ + { __mstrtab_name_##name, __mstrtab_format_##name, \ + 0, 0, marker_probe_cb, \ + { __mark_empty_function, NULL}, NULL }; \ + asm volatile("" : : "i" (&__mark_##name)); \ + __mark_check_format(format, ## args); \ + if (unlikely(__mark_##name.state)) { \ + (*__mark_##name.call) \ + (&__mark_##name, call_private, ## args);\ + } \ + } while (0) + +extern void marker_update_probe_range(struct marker *begin, + struct marker *end); +#else /* !CONFIG_MARKERS */ +#define __trace_mark(name, call_data, format, args...) \ + __mark_check_format(format, ## args) +static inline void marker_update_probe_range(struct marker *begin, + struct marker *end) +{ } +#endif /* CONFIG_MARKERS */ + +/** + * trace_mark - Marker + * @name: marker name, not quoted. + * @format: format string + * @args...: variable argument list + * + * Places a marker. + */ +#define trace_mark(name, format, args...) \ + __trace_mark(name, NULL, format, ## args) + +/** + * MARK_NOARGS - Format string for a marker with no argument. + */ +#define MARK_NOARGS " " + +/* To be used for string format validity checking with gcc */ +static inline void __attribute__((format(printf,1,2))) __mark_check_format(const char *fmt, ...) +{ +} + +extern marker_probe_func __mark_empty_function; + +extern void marker_probe_cb(const struct marker *mdata, + void *call_private, ...); +extern void marker_probe_cb_noarg(const struct marker *mdata, + void *call_private, ...); + +/* + * Connect a probe to a marker. + * private data pointer must be a valid allocated memory address, or NULL. + */ +extern int marker_probe_register(const char *name, const char *format, + marker_probe_func *probe, void *probe_private); + +/* + * Returns the private data given to marker_probe_register. + */ +extern int marker_probe_unregister(const char *name, + marker_probe_func *probe, void *probe_private); +/* + * Unregister a marker by providing the registered private data. + */ +extern int marker_probe_unregister_private_data(marker_probe_func *probe, + void *probe_private); + +extern void *marker_get_private_data(const char *name, marker_probe_func *probe, + int num); + +#endif diff --git a/include/linux/module.h b/include/linux/module.h index 78e649f..14f1add 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -18,6 +18,7 @@ #include <linux/kobject.h> #include <linux/moduleparam.h> #include <linux/tracepoint.h> +#include <linux/marker.h> #include <asm/local.h> #include <asm/module.h> @@ -358,6 +359,10 @@ struct module struct tracepoint *tracepoints; unsigned int num_tracepoints; #endif +#ifdef CONFIG_MARKERS + struct marker *markers; + unsigned int num_markers; +#endif }; /* FIXME: It'd be nice to isolate modules during init, too, so they @@ -485,6 +490,8 @@ void module_remove_driver(struct device_driver *); extern void module_update_tracepoints(void); extern int module_get_iter_tracepoints(struct tracepoint_iter *iter); +extern void module_update_markers(void); + #else /* !CONFIG_MODULES... */ #define EXPORT_SYMBOL(sym) #define EXPORT_SYMBOL_GPL(sym) @@ -602,6 +609,10 @@ static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter) return 0; } +static inline void module_update_markers(void) +{ +} + #endif /* CONFIG_MODULES */ #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x) diff --git a/init/Kconfig b/init/Kconfig index da1e0c1..f039fb3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -416,6 +416,12 @@ config TRACEPOINTS Place an empty function call at each tracepoint site. Can be dynamically changed for a probe function. +config MARKERS + bool "Activate markers" + help + Place an empty function call at each marker site. Can be + dynamically changed for a probe function. + endmenu # General setup config RT_MUTEXES diff --git a/kernel/Makefile b/kernel/Makefile index 5a70634..f1c92c2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_TASKSTATS) += taskstats.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_UTRACE) += utrace.o obj-$(CONFIG_PTRACE) += ptrace.o +obj-$(CONFIG_MARKERS) += marker.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/marker.c b/kernel/marker.c new file mode 100644 index 0000000..7154e5f --- /dev/null +++ b/kernel/marker.c @@ -0,0 +1,858 @@ +/* + * Copyright (C) 2007 Mathieu Desnoyers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/types.h> +#include <linux/jhash.h> +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/marker.h> +#include <linux/err.h> + +extern struct marker __start___markers[]; +extern struct marker __stop___markers[]; + +/* Set to 1 to enable marker debug output */ +static const int marker_debug; + +/* + * markers_mutex nests inside module_mutex. Markers mutex protects the builtin + * and module markers and the hash table. + */ +static DEFINE_MUTEX(markers_mutex); + +/* + * Marker hash table, containing the active markers. + * Protected by module_mutex. + */ +#define MARKER_HASH_BITS 6 +#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) + +/* + * Note about RCU : + * It is used to make sure every handler has finished using its private data + * between two consecutive operation (add or remove) on a given marker. It is + * also used to delay the free of multiple probes array until a quiescent state + * is reached. + * marker entries modifications are protected by the markers_mutex. + */ +struct marker_entry { + struct hlist_node hlist; + char *format; + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); + struct marker_probe_closure single; + struct marker_probe_closure *multi; + int refcount; /* Number of times armed. 0 if disarmed. */ + struct rcu_head rcu; + void *oldptr; + unsigned char rcu_pending:1; + unsigned char ptype:1; + char name[0]; /* Contains name'\0'format'\0' */ +}; + +static struct hlist_head marker_table[MARKER_TABLE_SIZE]; + +/** + * __mark_empty_function - Empty probe callback + * @probe_private: probe private data + * @call_private: call site private data + * @fmt: format string + * @...: variable argument list + * + * Empty callback provided as a probe to the markers. By providing this to a + * disabled marker, we make sure the execution flow is always valid even + * though the function pointer change and the marker enabling are two distinct + * operations that modifies the execution flow of preemptible code. + */ +void __mark_empty_function(void *probe_private, void *call_private, + const char *fmt, va_list *args) +{ +} +EXPORT_SYMBOL_GPL(__mark_empty_function); + +/* + * marker_probe_cb Callback that prepares the variable argument list for probes. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @...: Variable argument list. + * + * Since we do not use "typical" pointer based RCU in the 1 argument case, we + * need to put a full smp_rmb() in this branch. This is why we do not use + * rcu_dereference() for the pointer read. + */ +void marker_probe_cb(const struct marker *mdata, void *call_private, ...) +{ + va_list args; + char ptype; + + /* + * rcu_read_lock does two things : disabling preemption to make sure the + * teardown of the callbacks can be done correctly when they are in + * modules and they insure RCU read coherency. + */ + rcu_read_lock(); + ptype = mdata->ptype; + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. */ + smp_rmb(); + func = mdata->single.func; + /* Must read the ptr before private data. They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + va_start(args, call_private); + func(mdata->single.probe_private, call_private, mdata->format, + &args); + va_end(args); + } else { + struct marker_probe_closure *multi; + int i; + /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. + */ + smp_read_barrier_depends(); + for (i = 0; multi[i].func; i++) { + va_start(args, call_private); + multi[i].func(multi[i].probe_private, call_private, + mdata->format, &args); + va_end(args); + } + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(marker_probe_cb); + +/* + * marker_probe_cb Callback that does not prepare the variable argument list. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @...: Variable argument list. + * + * Should be connected to markers "MARK_NOARGS". + */ +void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) +{ + va_list args; /* not initialized */ + char ptype; + + preempt_disable(); + ptype = mdata->ptype; + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. */ + smp_rmb(); + func = mdata->single.func; + /* Must read the ptr before private data. They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + func(mdata->single.probe_private, call_private, mdata->format, + &args); + } else { + struct marker_probe_closure *multi; + int i; + /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. + */ + smp_read_barrier_depends(); + for (i = 0; multi[i].func; i++) + multi[i].func(multi[i].probe_private, call_private, + mdata->format, &args); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); + +static void free_old_closure(struct rcu_head *head) +{ + struct marker_entry *entry = container_of(head, + struct marker_entry, rcu); + kfree(entry->oldptr); + /* Make sure we free the data before setting the pending flag to 0 */ + smp_wmb(); + entry->rcu_pending = 0; +} + +static void debug_print_probes(struct marker_entry *entry) +{ + int i; + + if (!marker_debug) + return; + + if (!entry->ptype) { + printk(KERN_DEBUG "Single probe : %p %p\n", + entry->single.func, + entry->single.probe_private); + } else { + for (i = 0; entry->multi[i].func; i++) + printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, + entry->multi[i].func, + entry->multi[i].probe_private); + } +} + +static struct marker_probe_closure * +marker_entry_add_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0; + struct marker_probe_closure *old, *new; + + WARN_ON(!probe); + + debug_print_probes(entry); + old = entry->multi; + if (!entry->ptype) { + if (entry->single.func == probe && + entry->single.probe_private == probe_private) + return ERR_PTR(-EBUSY); + if (entry->single.func == __mark_empty_function) { + /* 0 -> 1 probes */ + entry->single.func = probe; + entry->single.probe_private = probe_private; + entry->refcount = 1; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* 1 -> 2 probes */ + nr_probes = 1; + old = NULL; + } + } else { + /* (N -> N+1), (N != 0, 1) probes */ + for (nr_probes = 0; old[nr_probes].func; nr_probes++) + if (old[nr_probes].func == probe + && old[nr_probes].probe_private + == probe_private) + return ERR_PTR(-EBUSY); + } + /* + 2 : one for new probe, one for NULL func */ + new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure), + GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + if (!old) + new[0] = entry->single; + else + memcpy(new, old, + nr_probes * sizeof(struct marker_probe_closure)); + new[nr_probes].func = probe; + new[nr_probes].probe_private = probe_private; + entry->refcount = nr_probes + 1; + entry->multi = new; + entry->ptype = 1; + debug_print_probes(entry); + return old; +} + +static struct marker_probe_closure * +marker_entry_remove_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0, nr_del = 0, i; + struct marker_probe_closure *old, *new; + + old = entry->multi; + + debug_print_probes(entry); + if (!entry->ptype) { + /* 0 -> N is an error */ + WARN_ON(entry->single.func == __mark_empty_function); + /* 1 -> 0 probes */ + WARN_ON(probe && entry->single.func != probe); + WARN_ON(entry->single.probe_private != probe_private); + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* (N -> M), (N > 1, M >= 0) probes */ + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + if ((!probe || old[nr_probes].func == probe) + && old[nr_probes].probe_private + == probe_private) + nr_del++; + } + } + + if (nr_probes - nr_del == 0) { + /* N -> 0, (N > 1) */ + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + } else if (nr_probes - nr_del == 1) { + /* N -> 1, (N > 1) */ + for (i = 0; old[i].func; i++) + if ((probe && old[i].func != probe) || + old[i].probe_private != probe_private) + entry->single = old[i]; + entry->refcount = 1; + entry->ptype = 0; + } else { + int j = 0; + /* N -> M, (N > 1, M > 1) */ + /* + 1 for NULL */ + new = kzalloc((nr_probes - nr_del + 1) + * sizeof(struct marker_probe_closure), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + for (i = 0; old[i].func; i++) + if ((probe && old[i].func != probe) || + old[i].probe_private != probe_private) + new[j++] = old[i]; + entry->refcount = nr_probes - nr_del; + entry->ptype = 1; + entry->multi = new; + } + debug_print_probes(entry); + return old; +} + +/* + * Get marker if the marker is present in the marker hash table. + * Must be called with markers_mutex held. + * Returns NULL if not present. + */ +static struct marker_entry *get_marker(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + u32 hash = jhash(name, strlen(name), 0); + + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) + return e; + } + return NULL; +} + +/* + * Add the marker to the marker hash table. Must be called with markers_mutex + * held. + */ +static struct marker_entry *add_marker(const char *name, const char *format) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + size_t name_len = strlen(name) + 1; + size_t format_len = 0; + u32 hash = jhash(name, name_len-1, 0); + + if (format) + format_len = strlen(format) + 1; + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + printk(KERN_NOTICE + "Marker %s busy\n", name); + return ERR_PTR(-EBUSY); /* Already there */ + } + } + /* + * Using kmalloc here to allocate a variable length element. Could + * cause some memory fragmentation if overused. + */ + e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, + GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + memcpy(&e->name[0], name, name_len); + if (format) { + e->format = &e->name[name_len]; + memcpy(e->format, format, format_len); + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; + trace_mark(core_marker_format, "name %s format %s", + e->name, e->format); + } else { + e->format = NULL; + e->call = marker_probe_cb; + } + e->single.func = __mark_empty_function; + e->single.probe_private = NULL; + e->multi = NULL; + e->ptype = 0; + e->refcount = 0; + e->rcu_pending = 0; + hlist_add_head(&e->hlist, head); + return e; +} + +/* + * Remove the marker from the marker hash table. Must be called with mutex_lock + * held. + */ +static int remove_marker(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + int found = 0; + size_t len = strlen(name) + 1; + u32 hash = jhash(name, len-1, 0); + + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + found = 1; + break; + } + } + if (!found) + return -ENOENT; + if (e->single.func != __mark_empty_function) + return -EBUSY; + hlist_del(&e->hlist); + /* Make sure the call_rcu has been executed */ + if (e->rcu_pending) + rcu_barrier(); + kfree(e); + return 0; +} + +/* + * Set the mark_entry format to the format found in the element. + */ +static int marker_set_format(struct marker_entry **entry, const char *format) +{ + struct marker_entry *e; + size_t name_len = strlen((*entry)->name) + 1; + size_t format_len = strlen(format) + 1; + + + e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, + GFP_KERNEL); + if (!e) + return -ENOMEM; + memcpy(&e->name[0], (*entry)->name, name_len); + e->format = &e->name[name_len]; + memcpy(e->format, format, format_len); + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; + e->single = (*entry)->single; + e->multi = (*entry)->multi; + e->ptype = (*entry)->ptype; + e->refcount = (*entry)->refcount; + e->rcu_pending = 0; + hlist_add_before(&e->hlist, &(*entry)->hlist); + hlist_del(&(*entry)->hlist); + /* Make sure the call_rcu has been executed */ + if ((*entry)->rcu_pending) + rcu_barrier(); + kfree(*entry); + *entry = e; + trace_mark(core_marker_format, "name %s format %s", + e->name, e->format); + return 0; +} + +/* + * Sets the probe callback corresponding to one marker. + */ +static int set_marker(struct marker_entry **entry, struct marker *elem, + int active) +{ + int ret; + WARN_ON(strcmp((*entry)->name, elem->name) != 0); + + if ((*entry)->format) { + if (strcmp((*entry)->format, elem->format) != 0) { + printk(KERN_NOTICE + "Format mismatch for probe %s " + "(%s), marker (%s)\n", + (*entry)->name, + (*entry)->format, + elem->format); + return -EPERM; + } + } else { + ret = marker_set_format(entry, elem->format); + if (ret) + return ret; + } + + /* + * probe_cb setup (statically known) is done here. It is + * asynchronous with the rest of execution, therefore we only + * pass from a "safe" callback (with argument) to an "unsafe" + * callback (does not set arguments). + */ + elem->call = (*entry)->call; + /* + * Sanity check : + * We only update the single probe private data when the ptr is + * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) + */ + WARN_ON(elem->single.func != __mark_empty_function + && elem->single.probe_private + != (*entry)->single.probe_private && + !elem->ptype); + elem->single.probe_private = (*entry)->single.probe_private; + /* + * Make sure the private data is valid when we update the + * single probe ptr. + */ + smp_wmb(); + elem->single.func = (*entry)->single.func; + /* + * We also make sure that the new probe callbacks array is consistent + * before setting a pointer to it. + */ + rcu_assign_pointer(elem->multi, (*entry)->multi); + /* + * Update the function or multi probe array pointer before setting the + * ptype. + */ + smp_wmb(); + elem->ptype = (*entry)->ptype; + elem->state = active; + + return 0; +} + +/* + * Disable a marker and its probe callback. + * Note: only after a synchronize_sched() issued after setting elem->call to the + * empty function insures that the original callback is not used anymore. This + * insured by preemption disabling around the call site. + */ +static void disable_marker(struct marker *elem) +{ + /* leave "call" as is. It is known statically. */ + elem->state = 0; + elem->single.func = __mark_empty_function; + /* Update the function before setting the ptype */ + smp_wmb(); + elem->ptype = 0; /* single probe */ + /* + * Leave the private data and id there, because removal is racy and + * should be done only after a synchronize_sched(). These are never used + * until the next initialization anyway. + */ +} + +/** + * marker_update_probe_range - Update a probe range + * @begin: beginning of the range + * @end: end of the range + * + * Updates the probe callback corresponding to a range of markers. + */ +void marker_update_probe_range(struct marker *begin, + struct marker *end) +{ + struct marker *iter; + struct marker_entry *mark_entry; + + mutex_lock(&markers_mutex); + for (iter = begin; iter < end; iter++) { + mark_entry = get_marker(iter->name); + if (mark_entry) { + set_marker(&mark_entry, iter, + !!mark_entry->refcount); + /* + * ignore error, continue + */ + } else { + disable_marker(iter); + } + } + mutex_unlock(&markers_mutex); +} + +/* + * Update probes, removing the faulty probes. + * Issues a synchronize_sched() when no reference to the module passed + * as parameter is found in the probes so the probe module can be + * safely unloaded from now on. + * + * Internal callback only changed before the first probe is connected to it. + * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 + * transitions. All other transitions will leave the old private data valid. + * This makes the non-atomicity of the callback/private data updates valid. + * + * "special case" updates : + * 0 -> 1 callback + * 1 -> 0 callback + * 1 -> 2 callbacks + * 2 -> 1 callbacks + * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. + * Site effect : marker_set_format may delete the marker entry (creating a + * replacement). + */ +static void marker_update_probes(void) +{ + /* Core kernel markers */ + marker_update_probe_range(__start___markers, __stop___markers); + /* Markers in modules. */ + module_update_markers(); +} + +/** + * marker_probe_register - Connect a probe to a marker + * @name: marker name + * @format: format string + * @probe: probe handler + * @probe_private: probe private data + * + * private data must be a valid allocated memory address, or NULL. + * Returns 0 if ok, error value on error. + * The probe address must at least be aligned on the architecture pointer size. + */ +int marker_probe_register(const char *name, const char *format, + marker_probe_func *probe, void *probe_private) +{ + struct marker_entry *entry; + int ret = 0; + struct marker_probe_closure *old; + + mutex_lock(&markers_mutex); + entry = get_marker(name); + if (!entry) { + entry = add_marker(name, format); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto end; + } + } + /* + * If we detect that a call_rcu is pending for this marker, + * make sure it's executed now. + */ + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_add_probe(entry, probe, probe_private); + if (IS_ERR(old)) { + ret = PTR_ERR(old); + goto end; + } + mutex_unlock(&markers_mutex); + marker_update_probes(); /* may update entry */ + mutex_lock(&markers_mutex); + entry = get_marker(name); + WARN_ON(!entry); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu(&entry->rcu, free_old_closure); +end: + mutex_unlock(&markers_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(marker_probe_register); + +/** + * marker_probe_unregister - Disconnect a probe from a marker + * @name: marker name + * @probe: probe function pointer + * @probe_private: probe private data + * + * Returns the private data given to marker_probe_register, or an ERR_PTR(). + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. + */ +int marker_probe_unregister(const char *name, + marker_probe_func *probe, void *probe_private) +{ + struct marker_entry *entry; + struct marker_probe_closure *old; + int ret = -ENOENT; + + mutex_lock(&markers_mutex); + entry = get_marker(name); + if (!entry) + goto end; + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_remove_probe(entry, probe, probe_private); + mutex_unlock(&markers_mutex); + marker_update_probes(); /* may update entry */ + mutex_lock(&markers_mutex); + entry = get_marker(name); + if (!entry) + goto end; + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu(&entry->rcu, free_old_closure); + remove_marker(name); /* Ignore busy error message */ + ret = 0; +end: + mutex_unlock(&markers_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(marker_probe_unregister); + +static struct marker_entry * +get_marker_from_private_data(marker_probe_func *probe, void *probe_private) +{ + struct marker_entry *entry; + unsigned int i; + struct hlist_head *head; + struct hlist_node *node; + + for (i = 0; i < MARKER_TABLE_SIZE; i++) { + head = &marker_table[i]; + hlist_for_each_entry(entry, node, head, hlist) { + if (!entry->ptype) { + if (entry->single.func == probe + && entry->single.probe_private + == probe_private) + return entry; + } else { + struct marker_probe_closure *closure; + closure = entry->multi; + for (i = 0; closure[i].func; i++) { + if (closure[i].func == probe && + closure[i].probe_private + == probe_private) + return entry; + } + } + } + } + return NULL; +} + +/** + * marker_probe_unregister_private_data - Disconnect a probe from a marker + * @probe: probe function + * @probe_private: probe private data + * + * Unregister a probe by providing the registered private data. + * Only removes the first marker found in hash table. + * Return 0 on success or error value. + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. + */ +int marker_probe_unregister_private_data(marker_probe_func *probe, + void *probe_private) +{ + struct marker_entry *entry; + int ret = 0; + struct marker_probe_closure *old; + + mutex_lock(&markers_mutex); + entry = get_marker_from_private_data(probe, probe_private); + if (!entry) { + ret = -ENOENT; + goto end; + } + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_remove_probe(entry, NULL, probe_private); + mutex_unlock(&markers_mutex); + marker_update_probes(); /* may update entry */ + mutex_lock(&markers_mutex); + entry = get_marker_from_private_data(probe, probe_private); + WARN_ON(!entry); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu(&entry->rcu, free_old_closure); + remove_marker(entry->name); /* Ignore busy error message */ +end: + mutex_unlock(&markers_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); + +/** + * marker_get_private_data - Get a marker's probe private data + * @name: marker name + * @probe: probe to match + * @num: get the nth matching probe's private data + * + * Returns the nth private data pointer (starting from 0) matching, or an + * ERR_PTR. + * Returns the private data pointer, or an ERR_PTR. + * The private data pointer should _only_ be dereferenced if the caller is the + * owner of the data, or its content could vanish. This is mostly used to + * confirm that a caller is the owner of a registered probe. + */ +void *marker_get_private_data(const char *name, marker_probe_func *probe, + int num) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + size_t name_len = strlen(name) + 1; + u32 hash = jhash(name, name_len-1, 0); + int i; + + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + if (!e->ptype) { + if (num == 0 && e->single.func == probe) + return e->single.probe_private; + else + break; + } else { + struct marker_probe_closure *closure; + int match = 0; + closure = e->multi; + for (i = 0; closure[i].func; i++) { + if (closure[i].func != probe) + continue; + if (match++ == num) + return closure[i].probe_private; + } + } + } + } + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL_GPL(marker_get_private_data); diff --git a/kernel/module.c b/kernel/module.c index 3fd5d43..6cf428e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1529,6 +1529,8 @@ static struct module *load_module(void __user *umod, unsigned int unusedgplcrcindex; unsigned int tracepointsindex; unsigned int tracepointsstringsindex; + unsigned int markersindex; + unsigned int markersstringsindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -1636,6 +1638,10 @@ static struct module *load_module(void __user *umod, tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints"); tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints_strings"); + markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); + markersstringsindex = find_sec(hdr, sechdrs, secstrings, + "__markers_strings"); + #ifdef ARCH_UNWIND_SECTION_NAME unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); #endif @@ -1809,7 +1815,11 @@ static struct module *load_module(void __user *umod, mod->num_tracepoints = sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints); #endif - +#ifdef CONFIG_MARKERS + mod->markers = (void *)sechdrs[markersindex].sh_addr; + mod->num_markers = + sechdrs[markersindex].sh_size / sizeof(*mod->markers); +#endif /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -1854,7 +1864,10 @@ static struct module *load_module(void __user *umod, tracepoint_update_probe_range(mod->tracepoints, mod->tracepoints + mod->num_tracepoints); #endif - +#ifdef CONFIG_MARKERS + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); +#endif err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -1968,7 +1981,6 @@ sys_init_module(void __user *umod, mutex_unlock(&module_mutex); return PTR_ERR(mod); } - /* Now sew it into the lists. They won't access us, since strong_try_module_get() will fail. */ stop_machine_run(__link_module, mod, NR_CPUS); @@ -2393,3 +2405,16 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter) return found; } #endif + +#ifdef CONFIG_MARKERS +void module_update_markers(void) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + mutex_unlock(&module_mutex); +} +#endif