Here's the latest relayfs patch, incorporating the previous round of suggestions. Thanks to everyone who sent comments. Here's a list of the major changes:
- replaced remap_page_range() and reserved bits with a nopage() handler. - buffers are now allocated/freed as part of inode alloc/destroy. - got rid of the automatic creation/teardown of directory hierarchies, and exported create/remove dir functions instead. - replaced the fileop callback with explicit map/unmap callbacks - these were the only fileops currently being used by anything, so I got rid of the other cases. - relay_write() and __relay_write() no longer return a value - it doesn't really make sense for clients to check a return value in the fast path anyway. - added a buf_full() callback to notify users that a buffer has become full and therefore events might be lost. - got rid of the 'helper' macros, which weren't helping much. - various other bits of code and comment cleanup.
Also, there was some question as to whether or not the memcpy in relay_write() was being inlined properly - I looked at the generated assembly code, and it seems to be, but I'll be taking a closer look later.
int subbuf_start(buf, subbuf, prev_subbuf_idx); int deliver(buffer, subbuf, subbuf_idx); void buf_mapped(buf, filp); void buf_unmapped(buf, filp); void buf_full(buf);
As before, I've tested this code on a single proc machine using a hacked version of the kprobes network packet tracing module, which can be found here:
diff -urpN -X dontdiff linux-2.6.10/fs/Kconfig linux-2.6.10-cur/fs/Kconfig --- linux-2.6.10/fs/Kconfig 2004-12-24 15:34:58.000000000 -0600 +++ linux-2.6.10-cur/fs/Kconfig 2005-01-31 21:49:27.000000000 -0600 @@ -972,6 +972,18 @@ config RAMFS To compile this as a module, choose M here: the module will be called ramfs.
+config RELAYFS_FS + tristate "Relayfs file system support" + ---help--- + Relayfs is a high-speed data relay filesystem designed to provide + an efficient mechanism for tools and facilities to relay large + amounts of data from kernel space to user space. + + To compile this code as a module, choose M here: the module will be + called relayfs. + + If unsure, say N. + endmenu
menu "Miscellaneous filesystems" diff -urpN -X dontdiff linux-2.6.10/fs/Makefile linux-2.6.10-cur/fs/Makefile --- linux-2.6.10/fs/Makefile 2004-12-24 15:34:58.000000000 -0600 +++ linux-2.6.10-cur/fs/Makefile 2005-01-31 21:49:27.000000000 -0600 @@ -94,3 +94,4 @@ obj-$(CONFIG_AFS_FS) += afs/ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ +obj-$(CONFIG_RELAYFS_FS) += relayfs/ diff -urpN -X dontdiff linux-2.6.10/fs/relayfs/Makefile linux-2.6.10-cur/fs/relayfs/Makefile --- linux-2.6.10/fs/relayfs/Makefile 1969-12-31 18:00:00.000000000 -0600 +++ linux-2.6.10-cur/fs/relayfs/Makefile 2005-02-03 06:34:11.000000000 -0600 @@ -0,0 +1,4 @@ +obj-$(CONFIG_RELAYFS_FS) += relayfs.o + +relayfs-y := relay.o buffers.o inode.o + diff -urpN -X dontdiff linux-2.6.10/fs/relayfs/buffers.c linux-2.6.10-cur/fs/relayfs/buffers.c --- linux-2.6.10/fs/relayfs/buffers.c 1969-12-31 18:00:00.000000000 -0600 +++ linux-2.6.10-cur/fs/relayfs/buffers.c 2005-02-03 08:16:39.000000000 -0600 @@ -0,0 +1,119 @@ +/* + * RelayFS buffer management code. + * + * Copyright (C) 2002, 2003 - Tom Zanussi (zanu...@us.ibm.com), IBM Corp + * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (ka...@opersys.com) + * + * This file is released under the GPL. + */ + +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include "buffers.h" + +/** + * alloc_page_array - alloc array to hold pages, but not pages + * @size: the total size of the memory represented by the page array + * @page_count: the number of pages the array can hold + * + * Returns a pointer to the page array if successful, NULL otherwise. + */ +static inline struct page **alloc_page_array(int size, int *page_count) +{ + int n_pages; + struct page **page_array; + + size = PAGE_ALIGN(size); + n_pages = size >> PAGE_SHIFT; + + page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); + if (!page_array) + return NULL; + *page_count = n_pages; + + return page_array; +} + +/** + * depopulate_page_array - free and unreserve all pages in the array + * @page_array: pointer to the page array + * @page_count: number of pages to free + */ +static void depopulate_page_array(struct page **page_array, int page_count) +{ + int i; + + for (i = 0; i < page_count; i++) + __free_page(page_array[i]); +} + +/** + * populate_page_array - allocate and reserve pages + * @page_array: pointer to the page array + * @page_count: number of pages to allocate + * + * Returns 0 if successful, negative otherwise. + */ +static int populate_page_array(struct page **page_array, int page_count) +{ + int i; + + for (i = 0; i < page_count; i++) { + page_array[i] = alloc_page(GFP_KERNEL); + if (unlikely(!page_array[i])) { + depopulate_page_array(page_array, i); + return -ENOMEM; + } + } + return 0; +} + +/** + * relay_alloc_rchan_buf - allocate a channel buffer + * @size: total size of the buffer + * @page_array: receives a pointer to the buffer's page array + * @page_count: receives the number of pages allocated + * + * Returns a pointer to the resulting buffer, NULL if unsuccessful + */ +void *relay_alloc_rchan_buf(unsigned long size, struct page ***page_array, + int *page_count) +{ + void *mem; + + *page_array = alloc_page_array(size, page_count); + if (!*page_array) + return NULL; + + if (populate_page_array(*page_array, *page_count)) { + kfree(*page_array); + *page_array = NULL; + return NULL; + } + + mem = vmap(*page_array, *page_count, GFP_KERNEL, PAGE_KERNEL); + if (!mem) { + depopulate_page_array(*page_array, *page_count); + kfree(*page_array); + *page_array = NULL; + return NULL; + } + memset(mem, 0, size); + + return mem; +} + +/** + * relay_free_rchan_buf - free a channel buffer + * @buf: pointer to the buffer to free + * @page_array: pointer to the buffer's page array + * @page_count: number of pages in page array + */ +void relay_free_rchan_buf(void *buf, struct page **page_array, + int page_count) +{ + vunmap(buf); + depopulate_page_array(page_array, page_count); + kfree(page_array); +} diff -urpN -X dontdiff linux-2.6.10/fs/relayfs/buffers.h linux-2.6.10-cur/fs/relayfs/buffers.h --- linux-2.6.10/fs/relayfs/buffers.h 1969-12-31 18:00:00.000000000 -0600 +++ linux-2.6.10-cur/fs/relayfs/buffers.h 2005-01-31 21:49:27.000000000 -0600 @@ -0,0 +1,14 @@ +#ifndef _BUFFERS_H +#define _BUFFERS_H + +/* This inspired by rtai/shmem */ +#define FIX_SIZE(x) (((x) - 1) & PAGE_MASK) + PAGE_SIZE + +extern void *relay_alloc_rchan_buf(unsigned long size, + struct page ***page_array, + int *page_count); +extern void relay_free_rchan_buf(void *buf, + struct page **page_array, + int page_count); + +#endif/* _BUFFERS_H */ diff -urpN -X dontdiff linux-2.6.10/fs/relayfs/inode.c linux-2.6.10-cur/fs/relayfs/inode.c --- linux-2.6.10/fs/relayfs/inode.c 1969-12-31 18:00:00.000000000 -0600 +++ linux-2.6.10-cur/fs/relayfs/inode.c 2005-02-06 00:22:18.000000000 -0600 @@ -0,0 +1,447 @@ +/* + * VFS-related code for RelayFS, a high-speed data relay filesystem. + * + * Copyright (C) 2003 - Tom Zanussi <zanu...@us.ibm.com>, IBM Corp + * Copyright (C) 2003 - Karim Yaghmour <ka...@opersys.com> + * + * Based on ramfs, Copyright (C) 2002 - Linus Torvalds + * + * This file is released under the GPL. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/pagemap.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/backing-dev.h> +#include <linux/namei.h> +#include <linux/poll.h> +#include <linux/relayfs_fs.h> +#include "relay.h" + +#define RELAYFS_MAGIC 0x26F82121 + +static struct vfsmount * relayfs_mount; +static int relayfs_mount_count; +static kmem_cache_t * relayfs_inode_cachep; + +static struct backing_dev_info relayfs_backing_dev_info = { + .ra_pages = 0, /* No readahead */ + .memory_backed = 1, /* Does not contribute to dirty memory */ +}; + +static struct inode *relayfs_get_inode(struct super_block *sb, int mode, + struct rchan *chan) +{ + struct rchan_buf *buf = NULL; + struct inode *inode; + + if (S_ISREG(mode)) { + BUG_ON(!chan); + buf = rchan_create_buf(chan); + if (!buf) + return NULL; + } + + inode = new_inode(sb); + if (!inode) { + rchan_destroy_buf(buf); + return NULL; + } + + inode->i_mode = mode; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_mapping->backing_dev_info = &relayfs_backing_dev_info; +
...
-- Maneesh Soni Linux Technology Center, IBM India Software Labs, Bangalore, India email: mane...@in.ibm.com Phone: 91-80-25044990 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
> +static int populate_page_array(struct page **page_array, int page_count) > +{ > + int i; > + > + for (i = 0; i < page_count; i++) { > + page_array[i] = alloc_page(GFP_KERNEL); > + if (unlikely(!page_array[i])) { > + depopulate_page_array(page_array, i); > + return -ENOMEM; > + } > + } > + return 0; > +} > + > +/** > + * relay_alloc_rchan_buf - allocate a channel buffer > + * @size: total size of the buffer > + * @page_array: receives a pointer to the buffer's page array > + * @page_count: receives the number of pages allocated > + * > + * Returns a pointer to the resulting buffer, NULL if unsuccessful > + */ > +void *relay_alloc_rchan_buf(unsigned long size, struct page ***page_array, > + int *page_count) > +{ > + void *mem; > + > + *page_array = alloc_page_array(size, page_count); > + if (!*page_array) > + return NULL; > + > + if (populate_page_array(*page_array, *page_count)) { > + kfree(*page_array); > + *page_array = NULL; > + return NULL; > + }
[snip]
Please consider inlining alloc_page_array() and populate_page_array() into relay_alloc_rchan_buf() as they're only used once. You'd get rid of passing page_count as a pointer this way. If inlining is unacceptable, please at least move the n_pages calculation to relay_alloc_rchan_buf() to make the API more sane.
I think relay_alloc_rchan_buf also would benefit from goto-styled error handling.
I wrote: >> Please consider inlining alloc_page_array() and populate_page_array() >> into relay_alloc_rchan_buf() as they're only used once. You'd get rid >> of passing page_count as a pointer this way. If inlining is >> unacceptable, please at least move the n_pages calculation to >> relay_alloc_rchan_buf() to make the API more sane. Andi Kleen writes: > Modern gcc (3.3-hammer with unit-at-a-time or 3.4) will do that anyways on > its own.
I know that but I am commenting on readability. The pointer passing is due to silly interface (alloc_page_array() calculating number of pages on its own).
> Please consider inlining alloc_page_array() and populate_page_array() > into relay_alloc_rchan_buf() as they're only used once. You'd get rid > of passing page_count as a pointer this way. If inlining is > unacceptable, please at least move the n_pages calculation to > relay_alloc_rchan_buf() to make the API more sane.
Modern gcc (3.3-hammer with unit-at-a-time or 3.4) will do that anyways on its own.