diff options
| -rw-r--r-- | target/linux/generic/patches-2.6.39/100-overlayfs_v11.patch (renamed from target/linux/generic/patches-2.6.39/100-overlayfs.patch) | 3492 | ||||
| -rw-r--r-- | target/linux/generic/patches-3.0/100-overlayfs_v11.patch (renamed from target/linux/generic/patches-3.0/100-overlayfs_v10.patch) | 683 | 
2 files changed, 2307 insertions, 1868 deletions
diff --git a/target/linux/generic/patches-2.6.39/100-overlayfs.patch b/target/linux/generic/patches-2.6.39/100-overlayfs_v11.patch index 92bbe3829..83d74e95e 100644 --- a/target/linux/generic/patches-2.6.39/100-overlayfs.patch +++ b/target/linux/generic/patches-2.6.39/100-overlayfs_v11.patch @@ -1,21 +1,283 @@ ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -1594,6 +1594,7 @@ struct inode_operations { - 	void (*truncate_range)(struct inode *, loff_t, loff_t); - 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, - 		      u64 len); -+	struct file *(*open)(struct dentry *, int flags, const struct cred *); - } ____cacheline_aligned; +--- /dev/null ++++ b/Documentation/filesystems/overlayfs.txt +@@ -0,0 +1,199 @@ ++Written by: Neil Brown <neilb@suse.de> ++ ++Overlay Filesystem ++================== ++ ++This document describes a prototype for a new approach to providing ++overlay-filesystem functionality in Linux (sometimes referred to as ++union-filesystems).  An overlay-filesystem tries to present a ++filesystem which is the result over overlaying one filesystem on top ++of the other. ++ ++The result will inevitably fail to look exactly like a normal ++filesystem for various technical reasons.  The expectation is that ++many use cases will be able to ignore these differences. ++ ++This approach is 'hybrid' because the objects that appear in the ++filesystem do not all appear to belong to that filesystem.  In many ++cases an object accessed in the union will be indistinguishable ++from accessing the corresponding object from the original filesystem. ++This is most obvious from the 'st_dev' field returned by stat(2). ++ ++While directories will report an st_dev from the overlay-filesystem, ++all non-directory objects will report an st_dev from the lower or ++upper filesystem that is providing the object.  
Similarly st_ino will ++only be unique when combined with st_dev, and both of these can change ++over the lifetime of a non-directory object.  Many applications and ++tools ignore these values and will not be affected. ++ ++Upper and Lower ++--------------- ++ ++An overlay filesystem combines two filesystems - an 'upper' filesystem ++and a 'lower' filesystem.  When a name exists in both filesystems, the ++object in the 'upper' filesystem is visible while the object in the ++'lower' filesystem is either hidden or, in the case of directories, ++merged with the 'upper' object. ++ ++It would be more correct to refer to an upper and lower 'directory ++tree' rather than 'filesystem' as it is quite possible for both ++directory trees to be in the same filesystem and there is no ++requirement that the root of a filesystem be given for either upper or ++lower. ++ ++The lower filesystem can be any filesystem supported by Linux and does ++not need to be writable.  The lower filesystem can even be another ++overlayfs.  The upper filesystem will normally be writable and if it ++is it must support the creation of trusted.* extended attributes, and ++must provide valid d_type in readdir responses, at least for symbolic ++links - so NFS is not suitable. ++ ++A read-only overlay of two read-only filesystems may use any ++filesystem type. ++ ++Directories ++----------- ++ ++Overlaying mainly involves directories.  If a given name appears in both ++upper and lower filesystems and refers to a non-directory in either, ++then the lower object is hidden - the name refers only to the upper ++object. ++ ++Where both upper and lower objects are directories, a merged directory ++is formed. 
++ ++At mount time, the two directories given as mount options are combined ++into a merged directory: ++ ++  mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper /overlay ++ ++Then whenever a lookup is requested in such a merged directory, the ++lookup is performed in each actual directory and the combined result ++is cached in the dentry belonging to the overlay filesystem.  If both ++actual lookups find directories, both are stored and a merged ++directory is created, otherwise only one is stored: the upper if it ++exists, else the lower. ++ ++Only the lists of names from directories are merged.  Other content ++such as metadata and extended attributes are reported for the upper ++directory only.  These attributes of the lower directory are hidden. ++ ++whiteouts and opaque directories ++-------------------------------- ++ ++In order to support rm and rmdir without changing the lower ++filesystem, an overlay filesystem needs to record in the upper filesystem ++that files have been removed.  This is done using whiteouts and opaque ++directories (non-directories are always opaque). ++ ++The overlay filesystem uses extended attributes with a ++"trusted.overlay."  prefix to record these details. ++ ++A whiteout is created as a symbolic link with target ++"(overlay-whiteout)" and with xattr "trusted.overlay.whiteout" set to "y". ++When a whiteout is found in the upper level of a merged directory, any ++matching name in the lower level is ignored, and the whiteout itself ++is also hidden. ++ ++A directory is made opaque by setting the xattr "trusted.overlay.opaque" ++to "y".  Where the upper filesystem contains an opaque directory, any ++directory in the lower filesystem with the same name is ignored. ++ ++readdir ++------- ++ ++When a 'readdir' request is made on a merged directory, the upper and ++lower directories are each read and the name lists merged in the ++obvious way (upper is read first, then lower - entries that already ++exist are not re-added). 
 This merged name list is cached in the ++'struct file' and so remains as long as the file is kept open.  If the ++directory is opened and read by two processes at the same time, they ++will each have separate caches.  A seekdir to the start of the ++directory (offset 0) followed by a readdir will cause the cache to be ++discarded and rebuilt. ++ ++This means that changes to the merged directory do not appear while a ++directory is being read.  This is unlikely to be noticed by many ++programs. ++ ++seek offsets are assigned sequentially when the directories are read. ++Thus if ++  - read part of a directory ++  - remember an offset, and close the directory ++  - re-open the directory some time later ++  - seek to the remembered offset ++ ++there may be little correlation between the old and new locations in ++the list of filenames, particularly if anything has changed in the ++directory. ++ ++Readdir on directories that are not merged is simply handled by the ++underlying directory (upper or lower). ++ ++ ++Non-directories ++--------------- ++ ++Objects that are not directories (files, symlinks, device-special ++files etc.) are presented either from the upper or lower filesystem as ++appropriate.  When a file in the lower filesystem is accessed in a way ++that requires write-access, such as opening for write access, changing ++some metadata etc., the file is first copied from the lower filesystem ++to the upper filesystem (copy_up).  Note that creating a hard-link ++also requires copy_up, though of course creation of a symlink does ++not. ++ ++The copy_up may turn out to be unnecessary, for example if the file is ++opened for read-write but the data is not modified. ++ ++The copy_up process first makes sure that the containing directory ++exists in the upper filesystem - creating it and any parents as ++necessary.  It then creates the object with the same metadata (owner, ++mode, mtime, symlink-target etc.) 
and then if the object is a file, the ++data is copied from the lower to the upper filesystem.  Finally any ++extended attributes are copied up. ++ ++Once the copy_up is complete, the overlay filesystem simply ++provides direct access to the newly created file in the upper ++filesystem - future operations on the file are barely noticed by the ++overlay filesystem (though an operation on the name of the file such as ++rename or unlink will of course be noticed and handled). ++ ++ ++Non-standard behavior ++--------------------- ++ ++The copy_up operation essentially creates a new, identical file and ++moves it over to the old name.  The new file may be on a different ++filesystem, so both st_dev and st_ino of the file may change. ++ ++Any open files referring to this inode will access the old data and ++metadata.  Similarly any file locks obtained before copy_up will not ++apply to the copied up file. ++ ++On a file opened with O_RDONLY, fchmod(2), fchown(2), futimesat(2) ++and fsetxattr(2) will fail with EROFS. ++ ++If a file with multiple hard links is copied up, then this will ++"break" the link.  Changes will not be propagated to other names ++referring to the same inode. ++ ++Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory ++object in overlayfs will not contain valid absolute paths, only ++relative paths leading up to the filesystem's root.  This will be ++fixed in the future. ++ ++Some operations are not atomic, for example a crash during copy_up or ++rename will leave the filesystem in an inconsistent state.  This will ++be addressed in the future. ++ ++Changes to underlying filesystems ++--------------------------------- ++ ++Offline changes, when the overlay is not mounted, are allowed to either ++the upper or the lower trees. ++ ++Changes to the underlying filesystems while part of a mounted overlay ++filesystem are not allowed.  
If the underlying filesystem is changed, ++the behavior of the overlay is undefined, though it will not result in ++a crash or deadlock. +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -4689,6 +4689,13 @@ F:	drivers/scsi/osd/ + F:	include/scsi/osd_* + F:	fs/exofs/ - struct seq_file; -@@ -1988,6 +1989,7 @@ extern long do_sys_open(int dfd, const c - extern struct file *filp_open(const char *, int, int); - extern struct file *file_open_root(struct dentry *, struct vfsmount *, - 				   const char *, int); -+extern struct file *vfs_open(struct path *, int flags, const struct cred *); - extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, - 				 const struct cred *); - extern int filp_close(struct file *, fl_owner_t id); ++OVERLAYFS FILESYSTEM ++M:	Miklos Szeredi <miklos@szeredi.hu> ++L:	linux-fsdevel@vger.kernel.org ++S:	Supported ++F:	fs/overlayfs/* ++F:	Documentation/filesystems/overlayfs.txt ++ + P54 WIRELESS DRIVER + M:	Christian Lamparter <chunkeey@googlemail.com> + L:	linux-wireless@vger.kernel.org +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -63,6 +63,7 @@ source "fs/quota/Kconfig" +  + source "fs/autofs4/Kconfig" + source "fs/fuse/Kconfig" ++source "fs/overlayfs/Kconfig" +  + config CUSE + 	tristate "Character device in Userspace support" +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -105,6 +105,7 @@ obj-$(CONFIG_QNX4FS_FS)		+= qnx4/ + obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/ + obj-$(CONFIG_ADFS_FS)		+= adfs/ + obj-$(CONFIG_FUSE_FS)		+= fuse/ ++obj-$(CONFIG_OVERLAYFS_FS)	+= overlayfs/ + obj-$(CONFIG_UDF_FS)		+= udf/ + obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/ + obj-$(CONFIG_OMFS_FS)		+= omfs/ +--- a/fs/ecryptfs/main.c ++++ b/fs/ecryptfs/main.c +@@ -594,6 +594,13 @@ static struct dentry *ecryptfs_mount(str + 	s->s_maxbytes = path.dentry->d_sb->s_maxbytes; + 	s->s_blocksize = path.dentry->d_sb->s_blocksize; + 	s->s_magic = ECRYPTFS_SUPER_MAGIC; ++	s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; ++ ++	rc = -EINVAL; ++	if (s->s_stack_depth > 
FILESYSTEM_MAX_STACK_DEPTH) { ++		printk(KERN_ERR "eCryptfs: maximum fs stacking depth exceeded\n"); ++		goto out_free; ++	} +  + 	inode = ecryptfs_get_inode(path.dentry->d_inode, s); + 	rc = PTR_ERR(inode); +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -1494,6 +1494,23 @@ void drop_collected_mounts(struct vfsmou + 	release_mounts(&umount_list); + } +  ++struct vfsmount *clone_private_mount(struct path *path) ++{ ++	struct vfsmount *mnt; ++ ++	if (IS_MNT_UNBINDABLE(path->mnt)) ++		return ERR_PTR(-EINVAL); ++ ++	down_read(&namespace_sem); ++	mnt = clone_mnt(path->mnt, path->dentry, CL_PRIVATE); ++	up_read(&namespace_sem); ++	if (!mnt) ++		return ERR_PTR(-ENOMEM); ++ ++	return mnt; ++} ++EXPORT_SYMBOL_GPL(clone_private_mount); ++ + int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, + 		   struct vfsmount *root) + {  --- a/fs/open.c  +++ b/fs/open.c  @@ -666,8 +666,7 @@ static inline int __get_file_write_acces @@ -172,903 +434,46 @@   static void __put_unused_fd(struct files_struct *files, unsigned int fd)   { ---- a/fs/splice.c -+++ b/fs/splice.c -@@ -1296,6 +1296,7 @@ long do_splice_direct(struct file *in, l -  - 	return ret; - } -+EXPORT_SYMBOL(do_splice_direct); -  - static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, - 			       struct pipe_inode_info *opipe, ---- a/fs/namespace.c -+++ b/fs/namespace.c -@@ -1494,6 +1494,23 @@ void drop_collected_mounts(struct vfsmou - 	release_mounts(&umount_list); - } -  -+struct vfsmount *clone_private_mount(struct path *path) -+{ -+	struct vfsmount *mnt; -+ -+	if (IS_MNT_UNBINDABLE(path->mnt)) -+		return ERR_PTR(-EINVAL); -+ -+	down_read(&namespace_sem); -+	mnt = clone_mnt(path->mnt, path->dentry, CL_PRIVATE); -+	up_read(&namespace_sem); -+	if (!mnt) -+		return ERR_PTR(-ENOMEM); -+ -+	return mnt; -+} -+EXPORT_SYMBOL_GPL(clone_private_mount); +--- /dev/null ++++ b/fs/overlayfs/Kconfig +@@ -0,0 +1,4 @@ ++config OVERLAYFS_FS ++	tristate "Overlay filesystem support" ++	help ++	  Add support for overlay 
filesystem. +--- /dev/null ++++ b/fs/overlayfs/Makefile +@@ -0,0 +1,7 @@ ++# ++# Makefile for the overlay filesystem. ++#  + - int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, - 		   struct vfsmount *root) - { ---- a/include/linux/mount.h -+++ b/include/linux/mount.h -@@ -100,6 +100,9 @@ extern void mnt_pin(struct vfsmount *mnt - extern void mnt_unpin(struct vfsmount *mnt); - extern int __mnt_is_readonly(struct vfsmount *mnt); -  -+struct path; -+extern struct vfsmount *clone_private_mount(struct path *path); ++obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o  + - extern struct vfsmount *do_kern_mount(const char *fstype, int flags, - 				      const char *name, void *data); -  ++overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o  --- /dev/null -+++ b/fs/overlayfs/overlayfs.c -@@ -0,0 +1,2414 @@ ++++ b/fs/overlayfs/copy_up.c +@@ -0,0 +1,383 @@ ++/* ++ * ++ * Copyright (C) 2011 Novell Inc. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published by ++ * the Free Software Foundation. 
++ */ ++  +#include <linux/fs.h> -+#include <linux/namei.h> -+#include <linux/sched.h> -+#include <linux/fs_struct.h> ++#include <linux/slab.h>  +#include <linux/file.h> ++#include <linux/splice.h>  +#include <linux/xattr.h>  +#include <linux/security.h> -+#include <linux/device_cgroup.h> -+#include <linux/mount.h> -+#include <linux/splice.h> -+#include <linux/slab.h> -+#include <linux/parser.h> -+#include <linux/module.h>  +#include <linux/uaccess.h> -+#include <linux/rbtree.h> -+ -+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); -+MODULE_DESCRIPTION("Overlay filesystem"); -+MODULE_LICENSE("GPL"); ++#include "overlayfs.h"  +  +#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)  + -+struct ovl_fs { -+	struct vfsmount *upper_mnt; -+	struct vfsmount *lower_mnt; -+}; -+ -+struct ovl_entry { -+	struct dentry *__upperdentry; -+	struct dentry *lowerdentry; -+	union { -+		struct { -+			u64 version; -+			bool opaque; -+		}; -+		struct rcu_head rcu; -+	}; -+}; -+ -+static const char *ovl_whiteout_xattr = "trusted.overlay.whiteout"; -+static const char *ovl_opaque_xattr = "trusted.overlay.opaque"; -+static const char *ovl_whiteout_symlink = "(overlay-whiteout)"; -+ -+enum ovl_path_type { -+	OVL_PATH_UPPER, -+	OVL_PATH_MERGE, -+	OVL_PATH_LOWER, -+}; -+ -+static enum ovl_path_type ovl_path_type(struct dentry *dentry) -+{ -+	struct ovl_entry *oe = dentry->d_fsdata; -+ -+	if (oe->__upperdentry) { -+		if (oe->lowerdentry && S_ISDIR(dentry->d_inode->i_mode)) -+			return OVL_PATH_MERGE; -+		else -+			return OVL_PATH_UPPER; -+	} else { -+		return OVL_PATH_LOWER; -+	} -+} -+ -+static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) -+{ -+	struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry); -+	smp_read_barrier_depends(); -+	return upperdentry; -+} -+ -+static void ovl_path_upper(struct dentry *dentry, struct path *path) -+{ -+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info; -+	struct ovl_entry *oe = dentry->d_fsdata; -+ -+	path->mnt = ofs->upper_mnt; -+	path->dentry 
= ovl_upperdentry_dereference(oe); -+} -+ -+static void ovl_path_lower(struct dentry *dentry, struct path *path) -+{ -+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info; -+	struct ovl_entry *oe = dentry->d_fsdata; -+ -+	path->mnt = ofs->lower_mnt; -+	path->dentry = oe->lowerdentry; -+} -+ -+static enum ovl_path_type ovl_path_real(struct dentry *dentry, -+					struct path *path) -+{ -+ -+	enum ovl_path_type type = ovl_path_type(dentry); -+ -+	if (type == OVL_PATH_LOWER) -+		ovl_path_lower(dentry, path); -+	else -+		ovl_path_upper(dentry, path); -+ -+	return type; -+} -+ -+static struct dentry *ovl_dentry_upper(struct dentry *dentry) -+{ -+	struct ovl_entry *oe = dentry->d_fsdata; -+ -+	return ovl_upperdentry_dereference(oe); -+} -+ -+static struct dentry *ovl_dentry_lower(struct dentry *dentry) -+{ -+	struct ovl_entry *oe = dentry->d_fsdata; -+ -+	return oe->lowerdentry; -+} -+ -+static struct dentry *ovl_dentry_real(struct dentry *dentry) -+{ -+	struct ovl_entry *oe = dentry->d_fsdata; -+	struct dentry *realdentry; -+ -+	realdentry = ovl_upperdentry_dereference(oe); -+	if (!realdentry) -+		realdentry = oe->lowerdentry; -+ -+	return realdentry; -+} -+ -+static bool ovl_dentry_is_opaque(struct dentry *dentry) -+{ -+	struct ovl_entry *oe = dentry->d_fsdata; -+	return oe->opaque; -+} -+ -+static void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque) -+{ -+	struct ovl_entry *oe = dentry->d_fsdata; -+	oe->opaque = opaque; -+} -+ -+static void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) -+{ -+	struct ovl_entry *oe = dentry->d_fsdata; -+ -+	WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); -+	WARN_ON(oe->__upperdentry); -+	smp_wmb(); -+	oe->__upperdentry = upperdentry; -+} -+ -+static void ovl_dentry_version_inc(struct dentry *dentry) -+{ -+	struct ovl_entry *oe = dentry->d_fsdata; -+ -+	WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); -+	oe->version++; -+} -+ -+static u64 ovl_dentry_version_get(struct dentry *dentry) 
-+{ -+	struct ovl_entry *oe = dentry->d_fsdata; -+ -+	WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); -+	return oe->version; -+} -+ -+static bool ovl_is_whiteout(struct dentry *dentry) -+{ -+	int res; -+	char val; -+ -+	if (!dentry) -+		return false; -+	if (!dentry->d_inode) -+		return false; -+	if (!S_ISLNK(dentry->d_inode->i_mode)) -+		return false; -+ -+	res = vfs_getxattr(dentry, ovl_whiteout_xattr, &val, 1); -+	if (res == 1 && val == 'y') -+		return true; -+ -+	return false; -+} -+ -+static bool ovl_is_opaquedir(struct dentry *dentry) -+{ -+	int res; -+	char val; -+ -+	if (!S_ISDIR(dentry->d_inode->i_mode)) -+		return false; -+ -+	res = vfs_getxattr(dentry, ovl_opaque_xattr, &val, 1); -+	if (res == 1 && val == 'y') -+		return true; -+ -+	return false; -+} -+ -+struct ovl_cache_entry { -+	const char *name; -+	unsigned int len; -+	unsigned int type; -+	u64 ino; -+	bool is_whiteout; -+	struct list_head l_node; -+	struct rb_node node; -+}; -+ -+struct ovl_readdir_data { -+	struct rb_root *root; -+	struct list_head *list; -+	struct list_head *middle; -+	struct dentry *dir; -+	int count; -+	int err; -+}; -+ -+struct ovl_dir_file { -+	bool is_real; -+	bool is_cached; -+	struct list_head cursor; -+	u64 cache_version; -+	struct list_head cache; -+	struct file *realfile; -+}; -+ -+static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) -+{ -+	return container_of(n, struct ovl_cache_entry, node); -+} -+ -+static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, -+						    const char *name, int len) -+{ -+	struct rb_node *node = root->rb_node; -+	int cmp; -+ -+	while (node) { -+		struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); -+ -+		cmp = strncmp(name, p->name, len); -+		if (cmp > 0) -+			node = p->node.rb_right; -+		else if (cmp < 0 || len < p->len) -+			node = p->node.rb_left; -+		else -+			return p; -+	} -+ -+	return NULL; -+} -+ -+static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, -+		
				   u64 ino, unsigned int d_type) -+{ -+	struct ovl_cache_entry *p; -+ -+	p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL); -+	if (p) { -+		char *name_copy = (char *) (p + 1); -+		memcpy(name_copy, name, len); -+		name_copy[len] = '\0'; -+		p->name = name_copy; -+		p->len = len; -+		p->type = d_type; -+		p->ino = ino; -+		p->is_whiteout = false; -+	} -+ -+	return p; -+} -+ -+static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, -+				  const char *name, int len, u64 ino, -+				  unsigned int d_type) -+{ -+	struct rb_node **newp = &rdd->root->rb_node; -+	struct rb_node *parent = NULL; -+	struct ovl_cache_entry *p; -+ -+	while (*newp) { -+		int cmp; -+		struct ovl_cache_entry *tmp; -+ -+		parent = *newp; -+		tmp = ovl_cache_entry_from_node(*newp); -+		cmp = strncmp(name, tmp->name, len); -+		if (cmp > 0) -+			newp = &tmp->node.rb_right; -+		else if (cmp < 0 || len < tmp->len) -+			newp = &tmp->node.rb_left; -+		else -+			return 0; -+	} -+ -+	p = ovl_cache_entry_new(name, len, ino, d_type); -+	if (p == NULL) -+		return -ENOMEM; -+ -+	list_add_tail(&p->l_node, rdd->list); -+	rb_link_node(&p->node, parent, newp); -+	rb_insert_color(&p->node, rdd->root); -+ -+	return 0; -+} -+ -+static int ovl_fill_lower(void *buf, const char *name, int namelen, -+			    loff_t offset, u64 ino, unsigned int d_type) -+{ -+	struct ovl_readdir_data *rdd = buf; -+	struct ovl_cache_entry *p; -+ -+	rdd->count++; -+	p = ovl_cache_entry_find(rdd->root, name, namelen); -+	if (p) { -+		list_move_tail(&p->l_node, rdd->middle); -+	} else { -+		p = ovl_cache_entry_new(name, namelen, ino, d_type); -+		if (p == NULL) -+			rdd->err = -ENOMEM; -+		else -+			list_add_tail(&p->l_node, rdd->middle); -+	} -+ -+	return rdd->err; -+} -+ -+static void ovl_cache_free(struct list_head *list) -+{ -+	struct ovl_cache_entry *p; -+	struct ovl_cache_entry *n; -+ -+	list_for_each_entry_safe(p, n, list, l_node) -+		kfree(p); -+ -+	INIT_LIST_HEAD(list); -+} -+ -+static int ovl_fill_upper(void *buf, const char 
*name, int namelen, -+			  loff_t offset, u64 ino, unsigned int d_type) -+{ -+	struct ovl_readdir_data *rdd = buf; -+ -+	rdd->count++; -+	return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); -+} -+ -+static int ovl_dir_read(struct path *realpath, struct ovl_readdir_data *rdd, -+			  filldir_t filler) -+{ -+	struct file *realfile; -+	int err; -+ -+	realfile = vfs_open(realpath, O_RDONLY | O_DIRECTORY, current_cred()); -+	if (IS_ERR(realfile)) -+		return PTR_ERR(realfile); -+ -+	do { -+		rdd->count = 0; -+		rdd->err = 0; -+		err = vfs_readdir(realfile, filler, rdd); -+		if (err >= 0) -+			err = rdd->err; -+	} while (!err && rdd->count); -+	fput(realfile); -+ -+	return 0; -+} -+ -+static void ovl_dir_reset(struct file *file) -+{ -+	struct ovl_dir_file *od = file->private_data; -+	enum ovl_path_type type = ovl_path_type(file->f_path.dentry); -+ -+	if (ovl_dentry_version_get(file->f_path.dentry) != od->cache_version) { -+		list_del_init(&od->cursor); -+		ovl_cache_free(&od->cache); -+		od->is_cached = false; -+	} -+	WARN_ON(!od->is_real && type != OVL_PATH_MERGE); -+	if (od->is_real && type == OVL_PATH_MERGE) { -+		fput(od->realfile); -+		od->realfile = NULL; -+		od->is_real = false; -+	} -+} -+ -+static int ovl_dir_mark_whiteouts(struct ovl_readdir_data *rdd) -+{ -+	struct ovl_cache_entry *p; -+	struct dentry *dentry; -+	const struct cred *old_cred; -+	struct cred *override_cred; -+ -+	override_cred = prepare_creds(); -+	if (!override_cred) { -+		ovl_cache_free(rdd->list); -+		return -ENOMEM; -+	} -+ -+	/* -+	 * CAP_SYS_ADMIN for getxattr -+	 * CAP_DAC_OVERRIDE for lookup -+	 */ -+	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); -+	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); -+	old_cred = override_creds(override_cred); -+ -+	mutex_lock(&rdd->dir->d_inode->i_mutex); -+	list_for_each_entry(p, rdd->list, l_node) { -+		if (p->type != DT_LNK) -+			continue; -+ -+		dentry = lookup_one_len(p->name, rdd->dir, p->len); -+		if (IS_ERR(dentry)) 
-+			continue; -+ -+		p->is_whiteout = ovl_is_whiteout(dentry); -+		dput(dentry); -+	} -+	mutex_unlock(&rdd->dir->d_inode->i_mutex); -+ -+	revert_creds(old_cred); -+	put_cred(override_cred); -+ -+	return 0; -+} -+ -+static int ovl_dir_read_merged(struct path *upperpath, struct path *lowerpath, -+			       struct ovl_readdir_data *rdd) -+{ -+	int err; -+	struct rb_root root = RB_ROOT; -+	struct list_head middle; -+ -+	rdd->root = &root; -+	if (upperpath->dentry) { -+		rdd->dir = upperpath->dentry; -+		err = ovl_dir_read(upperpath, rdd, ovl_fill_upper); -+		if (err) -+			goto out; -+ -+		err = ovl_dir_mark_whiteouts(rdd); -+		if (err) -+			goto out; -+	} -+	/* -+	 * Insert lowerpath entries before upperpath ones, this allows -+	 * offsets to be reasonably constant -+	 */ -+	list_add(&middle, rdd->list); -+	rdd->middle = &middle; -+	err = ovl_dir_read(lowerpath, rdd, ovl_fill_lower); -+	list_del(&middle); -+out: -+	rdd->root = NULL; -+ -+	return err; -+} -+ -+static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) -+{ -+	struct list_head *l; -+	loff_t off; -+ -+	l = od->cache.next; -+	for (off = 0; off < pos; off++) { -+		if (l == &od->cache) -+			break; -+		l = l->next; -+	} -+	list_move_tail(&od->cursor, l); -+} -+ -+static int ovl_readdir(struct file *file, void *buf, filldir_t filler) -+{ -+	struct ovl_dir_file *od = file->private_data; -+	int res; -+ -+	if (!file->f_pos) -+		ovl_dir_reset(file); -+ -+	if (od->is_real) { -+		res = vfs_readdir(od->realfile, filler, buf); -+		file->f_pos = od->realfile->f_pos; -+ -+		return res; -+	} -+ -+	if (!od->is_cached) { -+		struct path lowerpath; -+		struct path upperpath; -+		struct ovl_readdir_data rdd = { .list = &od->cache }; -+ -+		ovl_path_lower(file->f_path.dentry, &lowerpath); -+		ovl_path_upper(file->f_path.dentry, &upperpath); -+ -+		res = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd); -+		if (res) { -+			ovl_cache_free(rdd.list); -+			return res; -+		} -+ -+		od->cache_version = 
ovl_dentry_version_get(file->f_path.dentry); -+		od->is_cached = true; -+ -+		ovl_seek_cursor(od, file->f_pos); -+	} -+ -+	while (od->cursor.next != &od->cache) { -+		int over; -+		loff_t off; -+		struct ovl_cache_entry *p; -+ -+		p = list_entry(od->cursor.next, struct ovl_cache_entry, l_node); -+		off = file->f_pos; -+		file->f_pos++; -+		list_move(&od->cursor, &p->l_node); -+ -+		if (p->is_whiteout) -+			continue; -+ -+		over = filler(buf, p->name, p->len, off, p->ino, p->type); -+		if (over) -+			break; -+	} -+ -+	return 0; -+} -+ -+static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) -+{ -+	loff_t res; -+	struct ovl_dir_file *od = file->private_data; -+ -+	mutex_lock(&file->f_dentry->d_inode->i_mutex); -+	if (!file->f_pos) -+		ovl_dir_reset(file); -+ -+	if (od->is_real) { -+		res = vfs_llseek(od->realfile, offset, origin); -+		file->f_pos = od->realfile->f_pos; -+	} else { -+		res = -EINVAL; -+ -+		switch (origin) { -+		case SEEK_CUR: -+			offset += file->f_pos; -+			break; -+		case SEEK_SET: -+			break; -+		default: -+			goto out_unlock; -+		} -+		if (offset < 0) -+			goto out_unlock; -+ -+		if (offset != file->f_pos) { -+			file->f_pos = offset; -+			if (od->is_cached) -+				ovl_seek_cursor(od, offset); -+		} -+		res = offset; -+	} -+out_unlock: -+	mutex_unlock(&file->f_dentry->d_inode->i_mutex); -+ -+	return res; -+} -+ -+static int ovl_dir_fsync(struct file *file, int datasync) -+{ -+	struct ovl_dir_file *od = file->private_data; -+ -+	/* May need to reopen directory if it got copied up */ -+	if (!od->realfile) { -+		struct path upperpath; -+ -+		ovl_path_upper(file->f_path.dentry, &upperpath); -+		od->realfile = vfs_open(&upperpath, O_RDONLY, current_cred()); -+		if (IS_ERR(od->realfile)) -+			return PTR_ERR(od->realfile); -+	} -+ -+	return vfs_fsync(od->realfile, datasync); -+} -+ -+static int ovl_dir_release(struct inode *inode, struct file *file) -+{ -+	struct ovl_dir_file *od = file->private_data; -+ -+	list_del(&od->cursor); -+	
ovl_cache_free(&od->cache); -+	if (od->realfile) -+		fput(od->realfile); -+	kfree(od); -+ -+	return 0; -+} -+ -+static int ovl_dir_open(struct inode *inode, struct file *file) -+{ -+	struct path realpath; -+	struct file *realfile; -+	struct ovl_dir_file *od; -+	enum ovl_path_type type; -+ -+	od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); -+	if (!od) -+		return -ENOMEM; -+ -+	type = ovl_path_real(file->f_path.dentry, &realpath); -+	realfile = vfs_open(&realpath, file->f_flags, current_cred()); -+	if (IS_ERR(realfile)) { -+		kfree(od); -+		return PTR_ERR(realfile); -+	} -+	INIT_LIST_HEAD(&od->cache); -+	INIT_LIST_HEAD(&od->cursor); -+	od->is_cached = false; -+	od->realfile = realfile; -+	od->is_real = (type != OVL_PATH_MERGE); -+	file->private_data = od; -+ -+	return 0; -+} -+ -+static const struct file_operations ovl_dir_operations = { -+	.read		= generic_read_dir, -+	.open		= ovl_dir_open, -+	.readdir	= ovl_readdir, -+	.llseek		= ovl_dir_llseek, -+	.fsync		= ovl_dir_fsync, -+	.release	= ovl_dir_release, -+}; -+ -+static const struct inode_operations ovl_dir_inode_operations; -+ -+static void ovl_entry_free(struct rcu_head *head) -+{ -+	struct ovl_entry *oe = container_of(head, struct ovl_entry, rcu); -+	kfree(oe); -+} -+ -+static void ovl_dentry_release(struct dentry *dentry) -+{ -+	struct ovl_entry *oe = dentry->d_fsdata; -+ -+	if (oe) { -+		dput(oe->__upperdentry); -+		dput(oe->lowerdentry); -+		call_rcu(&oe->rcu, ovl_entry_free); -+	} -+} -+ -+static const struct dentry_operations ovl_dentry_operations = { -+	.d_release = ovl_dentry_release, -+}; -+ -+static struct dentry *ovl_lookup_real(struct dentry *dir, struct qstr *name) -+{ -+	struct dentry *dentry; -+ -+	mutex_lock(&dir->d_inode->i_mutex); -+	dentry = lookup_one_len(name->name, dir, name->len); -+	mutex_unlock(&dir->d_inode->i_mutex); -+ -+	if (IS_ERR(dentry)) { -+		if (PTR_ERR(dentry) == -ENOENT) -+			dentry = NULL; -+	} else if (!dentry->d_inode) { -+		dput(dentry); -+		dentry = NULL; -+	} -+	
return dentry; -+} -+ -+static struct ovl_entry *ovl_alloc_entry(void) -+{ -+	return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL); -+} -+ -+static struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, -+				   struct ovl_entry *oe); -+ -+static int ovl_whiteout(struct dentry *upperdir, struct dentry *dentry) -+{ -+	int err; -+	struct dentry *newdentry; -+	const struct cred *old_cred; -+	struct cred *override_cred; -+ -+	/* FIXME: recheck lower dentry to see if whiteout is really needed */ -+ -+	err = -ENOMEM; -+	override_cred = prepare_creds(); -+	if (!override_cred) -+		goto out; -+ -+	/* -+	 * CAP_SYS_ADMIN for setxattr -+	 * CAP_DAC_OVERRIDE for symlink creation -+	 */ -+	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); -+	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); -+	override_cred->fsuid = 0; -+	override_cred->fsgid = 0; -+	old_cred = override_creds(override_cred); -+ -+	newdentry = lookup_one_len(dentry->d_name.name, upperdir, -+				   dentry->d_name.len); -+	err = PTR_ERR(newdentry); -+	if (IS_ERR(newdentry)) -+		goto out_put_cred; -+ -+	/* Just been removed within the same locked region */ -+	WARN_ON(newdentry->d_inode); -+ -+	err = vfs_symlink(upperdir->d_inode, newdentry, ovl_whiteout_symlink); -+	if (err) -+		goto out_dput; -+ -+	ovl_dentry_version_inc(dentry->d_parent); -+ -+	err = vfs_setxattr(newdentry, ovl_whiteout_xattr, "y", 1, 0); -+	if (err) -+		vfs_unlink(upperdir->d_inode, newdentry); -+ -+out_dput: -+	dput(newdentry); -+out_put_cred: -+	revert_creds(old_cred); -+	put_cred(override_cred); -+out: -+	if (err) { -+		/* -+		 * There's no way to recover from failure to whiteout. -+		 * What should we do?  Log a big fat error and... ? 
-+		 */ -+		printk(KERN_ERR "overlayfs: ERROR - failed to whiteout '%s'\n", -+		       dentry->d_name.name); -+	} -+ -+	return err; -+} -+ -+static struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, -+				   struct nameidata *nd) -+{ -+	struct ovl_entry *oe; -+	struct dentry *upperdir; -+	struct dentry *lowerdir; -+	struct dentry *upperdentry = NULL; -+	struct dentry *lowerdentry = NULL; -+	struct inode *inode = NULL; -+	int err; -+ -+	err = -ENOMEM; -+	oe = ovl_alloc_entry(); -+	if (!oe) -+		goto out; -+ -+	upperdir = ovl_dentry_upper(dentry->d_parent); -+	lowerdir = ovl_dentry_lower(dentry->d_parent); -+ -+	if (upperdir) { -+		upperdentry = ovl_lookup_real(upperdir, &dentry->d_name); -+		err = PTR_ERR(upperdentry); -+		if (IS_ERR(upperdentry)) -+			goto out_put_dir; -+ -+		if (lowerdir && upperdentry && -+		    (S_ISLNK(upperdentry->d_inode->i_mode) || -+		     S_ISDIR(upperdentry->d_inode->i_mode))) { -+			const struct cred *old_cred; -+			struct cred *override_cred; -+ -+			err = -ENOMEM; -+			override_cred = prepare_creds(); -+			if (!override_cred) -+				goto out_dput_upper; -+ -+			/* CAP_SYS_ADMIN needed for getxattr */ -+			cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); -+			old_cred = override_creds(override_cred); -+ -+			if (ovl_is_opaquedir(upperdentry)) { -+				oe->opaque = true; -+			} else if (ovl_is_whiteout(upperdentry)) { -+				dput(upperdentry); -+				upperdentry = NULL; -+				oe->opaque = true; -+			} -+			revert_creds(old_cred); -+			put_cred(override_cred); -+		} -+	} -+	if (lowerdir && !oe->opaque) { -+		lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name); -+		err = PTR_ERR(lowerdentry); -+		if (IS_ERR(lowerdentry)) -+			goto out_dput_upper; -+	} -+ -+	if (lowerdentry && upperdentry && -+	    (!S_ISDIR(upperdentry->d_inode->i_mode) || -+	     !S_ISDIR(lowerdentry->d_inode->i_mode))) { -+		dput(lowerdentry); -+		lowerdentry = NULL; -+		oe->opaque = true; -+	} -+ -+	if (lowerdentry || upperdentry) { -+		struct 
dentry *realdentry; -+ -+		realdentry = upperdentry ? upperdentry : lowerdentry; -+		err = -ENOMEM; -+		inode = ovl_new_inode(dir->i_sb, realdentry->d_inode->i_mode, oe); -+		if (!inode) -+			goto out_dput; -+	} -+ -+	if (upperdentry) -+		oe->__upperdentry = upperdentry; -+ -+	if (lowerdentry) -+		oe->lowerdentry = lowerdentry; -+ -+	dentry->d_fsdata = oe; -+	dentry->d_op = &ovl_dentry_operations; -+	d_add(dentry, inode); -+ -+	return NULL; -+ -+out_dput: -+	dput(lowerdentry); -+out_dput_upper: -+	dput(upperdentry); -+out_put_dir: -+	kfree(oe); -+out: -+	return ERR_PTR(err); -+} -+  +static int ovl_copy_up_xattr(struct dentry *old, struct dentry *new)  +{  +	ssize_t list_size, size; @@ -1168,120 +573,6 @@  +	return error;  +}  + -+static struct dentry *ovl_lookup_create(struct dentry *upperdir, -+					struct dentry *template) -+{ -+	int err; -+	struct dentry *newdentry; -+	struct qstr *name = &template->d_name; -+ -+	newdentry = lookup_one_len(name->name, upperdir, name->len); -+	if (IS_ERR(newdentry)) -+		return newdentry; -+ -+	if (newdentry->d_inode) { -+		const struct cred *old_cred; -+		struct cred *override_cred; -+ -+		/* No need to check whiteout if lower parent is non-existent */ -+		err = -EEXIST; -+		if (!ovl_dentry_lower(template->d_parent)) -+			goto out_dput; -+ -+		if (!S_ISLNK(newdentry->d_inode->i_mode)) -+			goto out_dput; -+ -+		err = -ENOMEM; -+		override_cred = prepare_creds(); -+		if (!override_cred) -+			goto out_dput; -+ -+		/* -+		 * CAP_SYS_ADMIN for getxattr -+		 * CAP_FOWNER for unlink in sticky directory -+		 */ -+		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); -+		cap_raise(override_cred->cap_effective, CAP_FOWNER); -+		old_cred = override_creds(override_cred); -+ -+		err = -EEXIST; -+		if (ovl_is_whiteout(newdentry)) -+			err = vfs_unlink(upperdir->d_inode, newdentry); -+ -+		revert_creds(old_cred); -+		put_cred(override_cred); -+		if (err) -+			goto out_dput; -+ -+		dput(newdentry); -+		newdentry = 
lookup_one_len(name->name, upperdir, name->len); -+		if (IS_ERR(newdentry)) { -+			ovl_whiteout(upperdir, template); -+			return newdentry; -+		} -+ -+		/* -+		 * Whiteout just been successfully removed, parent -+		 * i_mutex is still held, there's no way the lookup -+		 * could return positive. -+		 */ -+		WARN_ON(newdentry->d_inode); -+	} -+ -+	return newdentry; -+ -+out_dput: -+	dput(newdentry); -+	return ERR_PTR(err); -+} -+ -+static struct dentry *ovl_upper_create(struct dentry *upperdir, -+				       struct dentry *dentry, -+				       struct kstat *stat, const char *link) -+{ -+	int err; -+	struct dentry *newdentry; -+	struct inode *dir = upperdir->d_inode; -+ -+	newdentry = ovl_lookup_create(upperdir, dentry); -+	if (IS_ERR(newdentry)) -+		goto out; -+ -+	switch (stat->mode & S_IFMT) { -+	case S_IFREG: -+		err = vfs_create(dir, newdentry, stat->mode, NULL); -+		break; -+ -+	case S_IFDIR: -+		err = vfs_mkdir(dir, newdentry, stat->mode); -+		break; -+ -+	case S_IFCHR: -+	case S_IFBLK: -+	case S_IFIFO: -+	case S_IFSOCK: -+		err = vfs_mknod(dir, newdentry, stat->mode, stat->rdev); -+		break; -+ -+	case S_IFLNK: -+		err = vfs_symlink(dir, newdentry, link); -+		break; -+ -+	default: -+		err = -EPERM; -+	} -+	if (err) { -+		if (ovl_dentry_is_opaque(dentry)) -+			ovl_whiteout(upperdir, dentry); -+		dput(newdentry); -+		newdentry = ERR_PTR(err); -+	} -+ -+out: -+	return newdentry; -+ -+} -+  +static char *ovl_read_symlink(struct dentry *realdentry)  +{  +	int res; @@ -1337,46 +628,6 @@  +	return notify_change(upperdentry, &attr);  +}  + -+static int ovl_set_opaque(struct dentry *upperdentry) -+{ -+	int err; -+	const struct cred *old_cred; -+	struct cred *override_cred; -+ -+	override_cred = prepare_creds(); -+	if (!override_cred) -+		return -ENOMEM; -+ -+	/* CAP_SYS_ADMIN for setxattr of "trusted" namespace */ -+	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); -+	old_cred = override_creds(override_cred); -+	err = vfs_setxattr(upperdentry, ovl_opaque_xattr, 
"y", 1, 0); -+	revert_creds(old_cred); -+	put_cred(override_cred); -+ -+	return err; -+} -+ -+static int ovl_remove_opaque(struct dentry *upperdentry) -+{ -+	int err; -+	const struct cred *old_cred; -+	struct cred *override_cred; -+ -+	override_cred = prepare_creds(); -+	if (!override_cred) -+		return -ENOMEM; -+ -+	/* CAP_SYS_ADMIN for removexattr of "trusted" namespace */ -+	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); -+	old_cred = override_creds(override_cred); -+	err = vfs_removexattr(upperdentry, ovl_opaque_xattr); -+	revert_creds(old_cred); -+	put_cred(override_cred); -+ -+	return err; -+} -+  +static int ovl_copy_up_locked(struct dentry *upperdir, struct dentry *dentry,  +			      struct path *lowerpath, struct kstat *stat,  +			      const char *link) @@ -1384,22 +635,15 @@  +	int err;  +	struct path newpath;  +	umode_t mode = stat->mode; -+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;  +  +	/* Can't properly set mode on creation because of the umask */  +	stat->mode &= S_IFMT;  + -+	newpath.mnt = ofs->upper_mnt; ++	ovl_path_upper(dentry, &newpath); ++	WARN_ON(newpath.dentry);  +	newpath.dentry = ovl_upper_create(upperdir, dentry, stat, link); -+	if (IS_ERR(newpath.dentry)) { -+		err = PTR_ERR(newpath.dentry); -+ -+		/* Already copied up? */ -+		if (err == -EEXIST && ovl_path_type(dentry) != OVL_PATH_LOWER) -+			return 0; -+ -+		return err; -+	} ++	if (IS_ERR(newpath.dentry)) ++		return PTR_ERR(newpath.dentry);  +  +	if (S_ISREG(stat->mode)) {  +		err = ovl_copy_up_data(lowerpath, &newpath, stat->size); @@ -1443,6 +687,21 @@  +	return err;  +}  + ++/* ++ * Copy up a single dentry ++ * ++ * Directory renames only allowed on "pure upper" (already created on ++ * upper filesystem, never copied up).  Directories which are on lower or ++ * are merged may not be renamed.  For these -EXDEV is returned and ++ * userspace has to deal with it.  This means, when copying up a ++ * directory we can rely on it and ancestors being stable. 
++ * ++ * Non-directory renames start with copy up of source if necessary.  The ++ * actual rename will only proceed once the copy up was successful.  Copy ++ * up uses upper parent i_mutex for exclusion.  Since rename can change ++ * d_parent it is possible that the copy up will lock the old parent.  At ++ * that point the file will have already been copied up anyway. ++ */  +static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,  +			   struct path *lowerpath, struct kstat *stat)  +{ @@ -1489,13 +748,7 @@  +	old_cred = override_creds(override_cred);  +  +	mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT); -+	/* -+	 * Using upper filesystem locking to protect against copy up -+	 * racing with rename (rename means the copy up was already -+	 * successful). -+	 */ -+	if (dentry->d_parent != parent) { -+		WARN_ON((ovl_path_type(dentry) == OVL_PATH_LOWER)); ++	if (ovl_path_type(dentry) != OVL_PATH_LOWER) {  +		err = 0;  +	} else {  +		err = ovl_copy_up_locked(upperdir, dentry, lowerpath, @@ -1518,7 +771,7 @@  +	return err;  +}  + -+static int ovl_copy_up(struct dentry *dentry) ++int ovl_copy_up(struct dentry *dentry)  +{  +	int err;  + @@ -1559,7 +812,7 @@  +}  +  +/* Optimize by not copying up the file first and truncating later */ -+static int ovl_copy_up_truncate(struct dentry *dentry, loff_t size) ++int ovl_copy_up_truncate(struct dentry *dentry, loff_t size)  +{  +	int err;  +	struct kstat stat; @@ -1584,38 +837,247 @@  +	dput(parent);  +	return err;  +} +--- /dev/null ++++ b/fs/overlayfs/dir.c +@@ -0,0 +1,596 @@ ++/* ++ * ++ * Copyright (C) 2011 Novell Inc. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published by ++ * the Free Software Foundation. 
++ */ ++ ++#include <linux/fs.h> ++#include <linux/namei.h> ++#include <linux/xattr.h> ++#include <linux/security.h> ++#include "overlayfs.h" ++ ++static const char *ovl_whiteout_symlink = "(overlay-whiteout)";  + -+static int ovl_setattr(struct dentry *dentry, struct iattr *attr) ++static int ovl_whiteout(struct dentry *upperdir, struct dentry *dentry)  +{ -+	struct dentry *upperdentry;  +	int err; ++	struct dentry *newdentry; ++	const struct cred *old_cred; ++	struct cred *override_cred;  + -+	if ((attr->ia_valid & ATTR_SIZE) && !ovl_dentry_upper(dentry)) -+		err = ovl_copy_up_truncate(dentry, attr->ia_size); -+	else -+		err = ovl_copy_up(dentry); ++	/* FIXME: recheck lower dentry to see if whiteout is really needed */ ++ ++	err = -ENOMEM; ++	override_cred = prepare_creds(); ++	if (!override_cred) ++		goto out; ++ ++	/* ++	 * CAP_SYS_ADMIN for setxattr ++	 * CAP_DAC_OVERRIDE for symlink creation ++	 * CAP_FOWNER for unlink in sticky directory ++	 */ ++	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); ++	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); ++	cap_raise(override_cred->cap_effective, CAP_FOWNER); ++	override_cred->fsuid = 0; ++	override_cred->fsgid = 0; ++	old_cred = override_creds(override_cred); ++ ++	newdentry = lookup_one_len(dentry->d_name.name, upperdir, ++				   dentry->d_name.len); ++	err = PTR_ERR(newdentry); ++	if (IS_ERR(newdentry)) ++		goto out_put_cred; ++ ++	/* Just been removed within the same locked region */ ++	WARN_ON(newdentry->d_inode); ++ ++	err = vfs_symlink(upperdir->d_inode, newdentry, ovl_whiteout_symlink);  +	if (err) -+		return err; ++		goto out_dput;  + -+	upperdentry = ovl_dentry_upper(dentry); ++	ovl_dentry_version_inc(dentry->d_parent);  + -+	if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) -+		attr->ia_valid &= ~ATTR_MODE; ++	err = vfs_setxattr(newdentry, ovl_whiteout_xattr, "y", 1, 0); ++	if (err) ++		vfs_unlink(upperdir->d_inode, newdentry);  + -+	mutex_lock(&upperdentry->d_inode->i_mutex); -+	err 
= notify_change(upperdentry, attr); -+	mutex_unlock(&upperdentry->d_inode->i_mutex); ++out_dput: ++	dput(newdentry); ++out_put_cred: ++	revert_creds(old_cred); ++	put_cred(override_cred); ++out: ++	if (err) { ++		/* ++		 * There's no way to recover from failure to whiteout. ++		 * What should we do?  Log a big fat error and... ? ++		 */ ++		printk(KERN_ERR "overlayfs: ERROR - failed to whiteout '%s'\n", ++		       dentry->d_name.name); ++	}  +  +	return err;  +}  + -+static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry, -+			 struct kstat *stat) ++static struct dentry *ovl_lookup_create(struct dentry *upperdir, ++					struct dentry *template)  +{ -+	struct path realpath; ++	int err; ++	struct dentry *newdentry; ++	struct qstr *name = &template->d_name;  + -+	ovl_path_real(dentry, &realpath); -+	return vfs_getattr(realpath.mnt, realpath.dentry, stat); ++	newdentry = lookup_one_len(name->name, upperdir, name->len); ++	if (IS_ERR(newdentry)) ++		return newdentry; ++ ++	if (newdentry->d_inode) { ++		const struct cred *old_cred; ++		struct cred *override_cred; ++ ++		/* No need to check whiteout if lower parent is non-existent */ ++		err = -EEXIST; ++		if (!ovl_dentry_lower(template->d_parent)) ++			goto out_dput; ++ ++		if (!S_ISLNK(newdentry->d_inode->i_mode)) ++			goto out_dput; ++ ++		err = -ENOMEM; ++		override_cred = prepare_creds(); ++		if (!override_cred) ++			goto out_dput; ++ ++		/* ++		 * CAP_SYS_ADMIN for getxattr ++		 * CAP_FOWNER for unlink in sticky directory ++		 */ ++		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); ++		cap_raise(override_cred->cap_effective, CAP_FOWNER); ++		old_cred = override_creds(override_cred); ++ ++		err = -EEXIST; ++		if (ovl_is_whiteout(newdentry)) ++			err = vfs_unlink(upperdir->d_inode, newdentry); ++ ++		revert_creds(old_cred); ++		put_cred(override_cred); ++		if (err) ++			goto out_dput; ++ ++		dput(newdentry); ++		newdentry = lookup_one_len(name->name, upperdir, name->len); ++		if (IS_ERR(newdentry)) 
{ ++			ovl_whiteout(upperdir, template); ++			return newdentry; ++		} ++ ++		/* ++		 * Whiteout just been successfully removed, parent ++		 * i_mutex is still held, there's no way the lookup ++		 * could return positive. ++		 */ ++		WARN_ON(newdentry->d_inode); ++	} ++ ++	return newdentry; ++ ++out_dput: ++	dput(newdentry); ++	return ERR_PTR(err); ++} ++ ++struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry, ++				struct kstat *stat, const char *link) ++{ ++	int err; ++	struct dentry *newdentry; ++	struct inode *dir = upperdir->d_inode; ++ ++	newdentry = ovl_lookup_create(upperdir, dentry); ++	if (IS_ERR(newdentry)) ++		goto out; ++ ++	switch (stat->mode & S_IFMT) { ++	case S_IFREG: ++		err = vfs_create(dir, newdentry, stat->mode, NULL); ++		break; ++ ++	case S_IFDIR: ++		err = vfs_mkdir(dir, newdentry, stat->mode); ++		break; ++ ++	case S_IFCHR: ++	case S_IFBLK: ++	case S_IFIFO: ++	case S_IFSOCK: ++		err = vfs_mknod(dir, newdentry, stat->mode, stat->rdev); ++		break; ++ ++	case S_IFLNK: ++		err = vfs_symlink(dir, newdentry, link); ++		break; ++ ++	default: ++		err = -EPERM; ++	} ++	if (err) { ++		if (ovl_dentry_is_opaque(dentry)) ++			ovl_whiteout(upperdir, dentry); ++		dput(newdentry); ++		newdentry = ERR_PTR(err); ++	} else if (WARN_ON(!newdentry->d_inode)) { ++		/* ++		 * Not quite sure if non-instantiated dentry is legal or not. ++		 * VFS doesn't seem to care so check and warn here. 
++		 */ ++		dput(newdentry); ++		newdentry = ERR_PTR(-ENOENT); ++	} ++ ++out: ++	return newdentry; ++ ++} ++ ++static int ovl_set_opaque(struct dentry *upperdentry) ++{ ++	int err; ++	const struct cred *old_cred; ++	struct cred *override_cred; ++ ++	override_cred = prepare_creds(); ++	if (!override_cred) ++		return -ENOMEM; ++ ++	/* CAP_SYS_ADMIN for setxattr of "trusted" namespace */ ++	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); ++	old_cred = override_creds(override_cred); ++	err = vfs_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0); ++	revert_creds(old_cred); ++	put_cred(override_cred); ++ ++	return err; ++} ++ ++static int ovl_remove_opaque(struct dentry *upperdentry) ++{ ++	int err; ++	const struct cred *old_cred; ++	struct cred *override_cred; ++ ++	override_cred = prepare_creds(); ++	if (!override_cred) ++		return -ENOMEM; ++ ++	/* CAP_SYS_ADMIN for removexattr of "trusted" namespace */ ++	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); ++	old_cred = override_creds(override_cred); ++	err = vfs_removexattr(upperdentry, ovl_opaque_xattr); ++	revert_creds(old_cred); ++	put_cred(override_cred); ++ ++	return err;  +}  +  +static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, @@ -1644,79 +1106,6 @@  +	return 0;  +}  + -+static int ovl_permission(struct inode *inode, int mask, unsigned int flags) -+{ -+	struct ovl_entry *oe; -+	struct dentry *alias = NULL; -+	struct inode *realinode; -+	struct dentry *realdentry; -+	bool is_upper; -+	int err; -+ -+	if (S_ISDIR(inode->i_mode)) { -+		oe = inode->i_private; -+	} else if (flags & IPERM_FLAG_RCU) { -+		return -ECHILD; -+	} else { -+		/* -+		 * For non-directories find an alias and get the info -+		 * from there. 
-+		 */ -+		spin_lock(&inode->i_lock); -+		if (WARN_ON(list_empty(&inode->i_dentry))) { -+			spin_unlock(&inode->i_lock); -+			return -ENOENT; -+		} -+		alias = list_entry(inode->i_dentry.next, struct dentry, d_alias); -+		dget(alias); -+		spin_unlock(&inode->i_lock); -+		oe = alias->d_fsdata; -+	} -+ -+	realdentry = ovl_upperdentry_dereference(oe); -+	is_upper = true; -+	if (!realdentry) { -+		realdentry = oe->lowerdentry; -+		is_upper = false; -+	} -+ -+	/* Careful in RCU walk mode */ -+	realinode = ACCESS_ONCE(realdentry->d_inode); -+	if (!realinode) { -+		WARN_ON(!(flags & IPERM_FLAG_RCU)); -+		return -ENOENT; -+	} -+ -+	if (mask & MAY_WRITE) { -+		umode_t mode = realinode->i_mode; -+ -+		/* -+		 * Writes will always be redirected to upper layer, so -+		 * ignore lower layer being read-only. -+		 */ -+		err = -EROFS; -+		if (is_upper && IS_RDONLY(realinode) && -+		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) -+			goto out_dput; -+ -+		/* -+		 * Nobody gets write access to an immutable file. 
-+		 */ -+		err = -EACCES; -+		if (IS_IMMUTABLE(realinode)) -+			goto out_dput; -+	} -+ -+	if (realinode->i_op->permission) -+		err = realinode->i_op->permission(realinode, mask, flags); -+	else -+		err = generic_permission(realinode, mask, flags, -+					 realinode->i_op->check_acl); -+out_dput: -+	dput(alias); -+	return err; -+} -+  +static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,  +			     const char *link)  +{ @@ -1794,73 +1183,6 @@  +	return ovl_create_object(dentry, S_IFLNK, 0, link);  +}  + -+struct ovl_link_data { -+	struct dentry *realdentry; -+	void *cookie; -+}; -+ -+static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) -+{ -+	void *ret; -+	struct dentry *realdentry; -+	struct inode *realinode; -+ -+	realdentry = ovl_dentry_real(dentry); -+	realinode = realdentry->d_inode; -+ -+	if (WARN_ON(!realinode->i_op->follow_link)) -+		return ERR_PTR(-EPERM); -+ -+	ret = realinode->i_op->follow_link(realdentry, nd); -+	if (IS_ERR(ret)) -+		return ret; -+ -+	if (realinode->i_op->put_link) { -+		struct ovl_link_data *data; -+ -+		data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); -+		if (!data) { -+			realinode->i_op->put_link(realdentry, nd, ret); -+			return ERR_PTR(-ENOMEM); -+		} -+		data->realdentry = realdentry; -+		data->cookie = ret; -+ -+		return data; -+	} else { -+		return NULL; -+	} -+} -+ -+static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) -+{ -+	struct inode *realinode; -+	struct ovl_link_data *data = c; -+ -+	if (!data) -+		return; -+ -+	realinode = data->realdentry->d_inode; -+	realinode->i_op->put_link(data->realdentry, nd, data->cookie); -+	kfree(data); -+} -+ -+static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) -+{ -+	struct path realpath; -+	struct inode *realinode; -+ -+	ovl_path_real(dentry, &realpath); -+	realinode = realpath.dentry->d_inode; -+ -+	if (!realinode->i_op->readlink) -+		return -EINVAL; -+ -+	touch_atime(realpath.mnt, 
realpath.dentry); -+ -+	return realinode->i_op->readlink(realpath.dentry, buf, bufsiz); -+} -+  +static int ovl_do_remove(struct dentry *dentry, bool is_dir)  +{  +	int err; @@ -1880,6 +1202,8 @@  +		if (realpath.dentry->d_parent != upperdir)  +			goto out_d_drop;  + ++		/* FIXME: create whiteout up front and rename to target */ ++  +		if (is_dir)  +			err = vfs_rmdir(upperdir->d_inode, realpath.dentry);  +		else @@ -1911,100 +1235,6 @@  +	return ovl_do_remove(dentry, false);  +}  + -+static int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) -+{ -+	int err; -+	struct path lowerpath; -+	struct path upperpath; -+	struct ovl_cache_entry *p; -+	struct ovl_readdir_data rdd = { .list = list }; -+ -+	ovl_path_upper(dentry, &upperpath); -+	ovl_path_lower(dentry, &lowerpath); -+ -+	err = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd); -+	if (err) -+		return err; -+ -+	err = 0; -+ -+	list_for_each_entry(p, list, l_node) { -+		if (p->is_whiteout) -+			continue; -+ -+		if (p->name[0] == '.') { -+			if (p->len == 1) -+				continue; -+			if (p->len == 2 && p->name[1] == '.') -+				continue; -+		} -+		err = -ENOTEMPTY; -+		break; -+	} -+ -+	return err; -+} -+ -+static int ovl_remove_whiteouts(struct dentry *dir, struct list_head *list) -+{ -+	struct path upperpath; -+	struct dentry *upperdir; -+	struct ovl_cache_entry *p; -+	const struct cred *old_cred; -+	struct cred *override_cred; -+	int ret = 0; -+ -+	ovl_path_upper(dir, &upperpath); -+	upperdir = upperpath.dentry; -+ -+	override_cred = prepare_creds(); -+	if (!override_cred) -+		return -ENOMEM; -+ -+	/* -+	 * CAP_DAC_OVERRIDE for lookup and unlink -+	 */ -+	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); -+	old_cred = override_creds(override_cred); -+ -+	mutex_lock(&upperdir->d_inode->i_mutex); -+	list_for_each_entry(p, list, l_node) { -+		if (p->is_whiteout) { -+			struct dentry *dentry; -+ -+			dentry = lookup_one_len(p->name, upperdir, p->len); -+			if (IS_ERR(dentry)) { -+				ret = 
PTR_ERR(dentry); -+				break; -+			} -+			ret = vfs_unlink(upperdir->d_inode, dentry); -+			dput(dentry); -+			if (ret) -+				break; -+		} -+	} -+	mutex_unlock(&upperdir->d_inode->i_mutex); -+ -+	revert_creds(old_cred); -+	put_cred(override_cred); -+ -+	return ret; -+} -+ -+static int ovl_check_empty_and_clear(struct dentry *dentry, -+				     enum ovl_path_type type) -+{ -+	int err; -+	LIST_HEAD(list); -+ -+	err = ovl_check_empty_dir(dentry, &list); -+	if (!err && type == OVL_PATH_MERGE) -+		err = ovl_remove_whiteouts(dentry, &list); -+ -+	ovl_cache_free(&list); -+ -+	return err; -+}  +  +static int ovl_rmdir(struct inode *dir, struct dentry *dentry)  +{ @@ -2047,6 +1277,12 @@  +	olddentry = ovl_dentry_upper(old);  +	err = vfs_link(olddentry, upperdir->d_inode, newdentry);  +	if (!err) { ++		if (WARN_ON(!newdentry->d_inode)) { ++			dput(newdentry); ++			err = -ENOENT; ++			goto out_unlock; ++		} ++  +		ovl_dentry_version_inc(new->d_parent);  +		ovl_dentry_update(new, newdentry);  + @@ -2069,6 +1305,7 @@  +{  +	int err;  +	enum ovl_path_type old_type; ++	enum ovl_path_type new_type;  +	struct dentry *old_upperdir;  +	struct dentry *new_upperdir;  +	struct dentry *olddentry; @@ -2076,6 +1313,7 @@  +	struct dentry *trap;  +	bool old_opaque;  +	bool new_opaque; ++	bool new_create = false;  +	bool is_dir = S_ISDIR(old->d_inode->i_mode);  +  +	/* Don't copy up directory trees */ @@ -2084,8 +1322,6 @@  +		return -EXDEV;  +  +	if (new->d_inode) { -+		enum ovl_path_type new_type; -+  +		new_type = ovl_path_type(new);  +  +		if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) { @@ -2105,6 +1341,8 @@  +			if (err)  +				return err;  +		} ++	} else { ++		new_type = OVL_PATH_UPPER;  +	}  +  +	err = ovl_copy_up(old); @@ -2125,6 +1363,7 @@  +	if (newdentry) {  +		dget(newdentry);  +	} else { ++		new_create = true;  +		newdentry = ovl_lookup_create(new_upperdir, new);  +		err = PTR_ERR(newdentry);  +		if (IS_ERR(newdentry)) @@ -2142,8 +1381,7 @@  +		goto out_dput;  +  +	
old_opaque = ovl_dentry_is_opaque(old); -+	new_opaque = ovl_dentry_is_opaque(new) || -+		ovl_path_type(new) != OVL_PATH_UPPER; ++	new_opaque = ovl_dentry_is_opaque(new) || new_type != OVL_PATH_UPPER;  +  +	if (is_dir && !old_opaque && new_opaque) {  +		err = ovl_set_opaque(olddentry); @@ -2155,7 +1393,7 @@  +			 new_upperdir->d_inode, newdentry);  +  +	if (err) { -+		if (ovl_dentry_is_opaque(new)) ++		if (new_create && ovl_dentry_is_opaque(new))  +			ovl_whiteout(new_upperdir, new);  +		if (is_dir && !old_opaque && new_opaque)  +			ovl_remove_opaque(olddentry); @@ -2180,13 +1418,228 @@  +	return err;  +}  + ++const struct inode_operations ovl_dir_inode_operations = { ++	.lookup		= ovl_lookup, ++	.mkdir		= ovl_mkdir, ++	.symlink	= ovl_symlink, ++	.unlink		= ovl_unlink, ++	.rmdir		= ovl_rmdir, ++	.rename		= ovl_rename, ++	.link		= ovl_link, ++	.setattr	= ovl_setattr, ++	.create		= ovl_create, ++	.mknod		= ovl_mknod, ++	.permission	= ovl_permission, ++	.getattr	= ovl_dir_getattr, ++	.setxattr	= ovl_setxattr, ++	.getxattr	= ovl_getxattr, ++	.listxattr	= ovl_listxattr, ++	.removexattr	= ovl_removexattr, ++}; +--- /dev/null ++++ b/fs/overlayfs/inode.c +@@ -0,0 +1,384 @@ ++/* ++ * ++ * Copyright (C) 2011 Novell Inc. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published by ++ * the Free Software Foundation. 
++ */ ++ ++#include <linux/fs.h> ++#include <linux/slab.h> ++#include <linux/xattr.h> ++#include "overlayfs.h" ++ ++int ovl_setattr(struct dentry *dentry, struct iattr *attr) ++{ ++	struct dentry *upperdentry; ++	int err; ++ ++	if ((attr->ia_valid & ATTR_SIZE) && !ovl_dentry_upper(dentry)) ++		err = ovl_copy_up_truncate(dentry, attr->ia_size); ++	else ++		err = ovl_copy_up(dentry); ++	if (err) ++		return err; ++ ++	upperdentry = ovl_dentry_upper(dentry); ++ ++	if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) ++		attr->ia_valid &= ~ATTR_MODE; ++ ++	mutex_lock(&upperdentry->d_inode->i_mutex); ++	err = notify_change(upperdentry, attr); ++	mutex_unlock(&upperdentry->d_inode->i_mutex); ++ ++	return err; ++} ++ ++static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry, ++			 struct kstat *stat) ++{ ++	struct path realpath; ++ ++	ovl_path_real(dentry, &realpath); ++	return vfs_getattr(realpath.mnt, realpath.dentry, stat); ++} ++ ++int ovl_permission(struct inode *inode, int mask, unsigned int flags) ++{ ++	struct ovl_entry *oe; ++	struct dentry *alias = NULL; ++	struct inode *realinode; ++	struct dentry *realdentry; ++	bool is_upper; ++	int err; ++ ++	if (S_ISDIR(inode->i_mode)) { ++		oe = inode->i_private; ++	} else if (flags & IPERM_FLAG_RCU) { ++		return -ECHILD; ++	} else { ++		/* ++		 * For non-directories find an alias and get the info ++		 * from there. 
++		 */ ++		spin_lock(&inode->i_lock); ++		if (WARN_ON(list_empty(&inode->i_dentry))) { ++			spin_unlock(&inode->i_lock); ++			return -ENOENT; ++		} ++		alias = list_entry(inode->i_dentry.next, struct dentry, d_alias); ++		dget(alias); ++		spin_unlock(&inode->i_lock); ++		oe = alias->d_fsdata; ++	} ++ ++	realdentry = ovl_entry_real(oe, &is_upper); ++ ++	/* Careful in RCU walk mode */ ++	realinode = ACCESS_ONCE(realdentry->d_inode); ++	if (!realinode) { ++		WARN_ON(!(flags & IPERM_FLAG_RCU)); ++		err = -ENOENT; ++		goto out_dput; ++	} ++ ++	if (mask & MAY_WRITE) { ++		umode_t mode = realinode->i_mode; ++ ++		/* ++		 * Writes will always be redirected to upper layer, so ++		 * ignore lower layer being read-only. ++		 * ++		 * If the overlay itself is read-only then proceed ++		 * with the permission check, don't return EROFS. ++		 * This will only happen if this is the lower layer of ++		 * another overlayfs. ++		 * ++		 * If upper fs becomes read-only after the overlay was ++		 * constructed return EROFS to prevent modification of ++		 * upper layer. ++		 */ ++		err = -EROFS; ++		if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) && ++		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) ++			goto out_dput; ++ ++		/* ++		 * Nobody gets write access to an immutable file. 
++		 */ ++		err = -EACCES; ++		if (IS_IMMUTABLE(realinode)) ++			goto out_dput; ++	} ++ ++	if (realinode->i_op->permission) ++		err = realinode->i_op->permission(realinode, mask, flags); ++	else ++		err = generic_permission(realinode, mask, flags, ++					 realinode->i_op->check_acl); ++out_dput: ++	dput(alias); ++	return err; ++} ++ ++ ++struct ovl_link_data { ++	struct dentry *realdentry; ++	void *cookie; ++}; ++ ++static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) ++{ ++	void *ret; ++	struct dentry *realdentry; ++	struct inode *realinode; ++ ++	realdentry = ovl_dentry_real(dentry); ++	realinode = realdentry->d_inode; ++ ++	if (WARN_ON(!realinode->i_op->follow_link)) ++		return ERR_PTR(-EPERM); ++ ++	ret = realinode->i_op->follow_link(realdentry, nd); ++	if (IS_ERR(ret)) ++		return ret; ++ ++	if (realinode->i_op->put_link) { ++		struct ovl_link_data *data; ++ ++		data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); ++		if (!data) { ++			realinode->i_op->put_link(realdentry, nd, ret); ++			return ERR_PTR(-ENOMEM); ++		} ++		data->realdentry = realdentry; ++		data->cookie = ret; ++ ++		return data; ++	} else { ++		return NULL; ++	} ++} ++ ++static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) ++{ ++	struct inode *realinode; ++	struct ovl_link_data *data = c; ++ ++	if (!data) ++		return; ++ ++	realinode = data->realdentry->d_inode; ++	realinode->i_op->put_link(data->realdentry, nd, data->cookie); ++	kfree(data); ++} ++ ++static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) ++{ ++	struct path realpath; ++	struct inode *realinode; ++ ++	ovl_path_real(dentry, &realpath); ++	realinode = realpath.dentry->d_inode; ++ ++	if (!realinode->i_op->readlink) ++		return -EINVAL; ++ ++	touch_atime(realpath.mnt, realpath.dentry); ++ ++	return realinode->i_op->readlink(realpath.dentry, buf, bufsiz); ++} ++ ++  +static bool ovl_is_private_xattr(const char *name)  +{  +	return strncmp(name, 
"trusted.overlay.", 14) == 0;  +}  + -+static int ovl_setxattr(struct dentry *dentry, const char *name, -+			  const void *value, size_t size, int flags) ++int ovl_setxattr(struct dentry *dentry, const char *name, ++		 const void *value, size_t size, int flags)  +{  +	int err;  +	struct dentry *upperdentry; @@ -2202,8 +1655,8 @@  +	return  vfs_setxattr(upperdentry, name, value, size, flags);  +}  + -+static ssize_t ovl_getxattr(struct dentry *dentry, const char *name, -+			      void *value, size_t size) ++ssize_t ovl_getxattr(struct dentry *dentry, const char *name, ++		     void *value, size_t size)  +{  +	if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&  +	    ovl_is_private_xattr(name)) @@ -2212,7 +1665,7 @@  +	return vfs_getxattr(ovl_dentry_real(dentry), name, value, size);  +}  + -+static ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) ++ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)  +{  +	ssize_t res;  +	int off; @@ -2242,7 +1695,7 @@  +	return res;  +}  + -+static int ovl_removexattr(struct dentry *dentry, const char *name) ++int ovl_removexattr(struct dentry *dentry, const char *name)  +{  +	int err;  +	struct path realpath; @@ -2305,25 +1758,6 @@  +	return vfs_open(&realpath, flags, cred);  +}  + -+static const struct inode_operations ovl_dir_inode_operations = { -+	.lookup		= ovl_lookup, -+	.mkdir		= ovl_mkdir, -+	.symlink	= ovl_symlink, -+	.unlink		= ovl_unlink, -+	.rmdir		= ovl_rmdir, -+	.rename		= ovl_rename, -+	.link		= ovl_link, -+	.setattr	= ovl_setattr, -+	.create		= ovl_create, -+	.mknod		= ovl_mknod, -+	.permission	= ovl_permission, -+	.getattr	= ovl_dir_getattr, -+	.setxattr	= ovl_setxattr, -+	.getxattr	= ovl_getxattr, -+	.listxattr	= ovl_listxattr, -+	.removexattr	= ovl_removexattr, -+}; -+  +static const struct inode_operations ovl_file_inode_operations = {  +	.setattr	= ovl_setattr,  +	.permission	= ovl_permission, @@ -2347,8 +1781,8 @@  +	.removexattr	= ovl_removexattr,  +};  + -+static 
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, -+				   struct ovl_entry *oe) ++struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, ++			    struct ovl_entry *oe)  +{  +	struct inode *inode;  + @@ -2389,6 +1823,1011 @@  +	return inode;  +  +} +--- /dev/null ++++ b/fs/overlayfs/overlayfs.h +@@ -0,0 +1,63 @@ ++/* ++ * ++ * Copyright (C) 2011 Novell Inc. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published by ++ * the Free Software Foundation. ++ */ ++ ++struct ovl_entry; ++ ++enum ovl_path_type { ++	OVL_PATH_UPPER, ++	OVL_PATH_MERGE, ++	OVL_PATH_LOWER, ++}; ++ ++extern const char *ovl_opaque_xattr; ++extern const char *ovl_whiteout_xattr; ++extern const struct dentry_operations ovl_dentry_operations; ++ ++enum ovl_path_type ovl_path_type(struct dentry *dentry); ++u64 ovl_dentry_version_get(struct dentry *dentry); ++void ovl_dentry_version_inc(struct dentry *dentry); ++void ovl_path_upper(struct dentry *dentry, struct path *path); ++void ovl_path_lower(struct dentry *dentry, struct path *path); ++enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); ++struct dentry *ovl_dentry_upper(struct dentry *dentry); ++struct dentry *ovl_dentry_lower(struct dentry *dentry); ++struct dentry *ovl_dentry_real(struct dentry *dentry); ++struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper); ++bool ovl_dentry_is_opaque(struct dentry *dentry); ++void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); ++bool ovl_is_whiteout(struct dentry *dentry); ++void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); ++struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ++			  struct nameidata *nd); ++ ++struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry, ++				struct kstat *stat, const char *link); ++ ++/* readdir.c */ ++extern const struct file_operations 
ovl_dir_operations; ++int ovl_check_empty_and_clear(struct dentry *dentry, enum ovl_path_type type); ++ ++/* inode.c */ ++int ovl_setattr(struct dentry *dentry, struct iattr *attr); ++int ovl_permission(struct inode *inode, int mask, unsigned int flags); ++int ovl_setxattr(struct dentry *dentry, const char *name, ++		 const void *value, size_t size, int flags); ++ssize_t ovl_getxattr(struct dentry *dentry, const char *name, ++		     void *value, size_t size); ++ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); ++int ovl_removexattr(struct dentry *dentry, const char *name); ++ ++struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, ++			    struct ovl_entry *oe); ++/* dir.c */ ++extern const struct inode_operations ovl_dir_inode_operations; ++ ++/* copy_up.c */ ++int ovl_copy_up(struct dentry *dentry); ++int ovl_copy_up_truncate(struct dentry *dentry, loff_t size); +--- /dev/null ++++ b/fs/overlayfs/readdir.c +@@ -0,0 +1,558 @@ ++/* ++ * ++ * Copyright (C) 2011 Novell Inc. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published by ++ * the Free Software Foundation. 
++ */ ++ ++#include <linux/fs.h> ++#include <linux/slab.h> ++#include <linux/namei.h> ++#include <linux/file.h> ++#include <linux/xattr.h> ++#include <linux/rbtree.h> ++#include <linux/security.h> ++#include "overlayfs.h" ++ ++struct ovl_cache_entry { ++	const char *name; ++	unsigned int len; ++	unsigned int type; ++	u64 ino; ++	bool is_whiteout; ++	struct list_head l_node; ++	struct rb_node node; ++}; ++ ++struct ovl_readdir_data { ++	struct rb_root *root; ++	struct list_head *list; ++	struct list_head *middle; ++	struct dentry *dir; ++	int count; ++	int err; ++}; ++ ++struct ovl_dir_file { ++	bool is_real; ++	bool is_cached; ++	struct list_head cursor; ++	u64 cache_version; ++	struct list_head cache; ++	struct file *realfile; ++}; ++ ++static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) ++{ ++	return container_of(n, struct ovl_cache_entry, node); ++} ++ ++static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, ++						    const char *name, int len) ++{ ++	struct rb_node *node = root->rb_node; ++	int cmp; ++ ++	while (node) { ++		struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); ++ ++		cmp = strncmp(name, p->name, len); ++		if (cmp > 0) ++			node = p->node.rb_right; ++		else if (cmp < 0 || len < p->len) ++			node = p->node.rb_left; ++		else ++			return p; ++	} ++ ++	return NULL; ++} ++ ++static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, ++						   u64 ino, unsigned int d_type) ++{ ++	struct ovl_cache_entry *p; ++ ++	p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL); ++	if (p) { ++		char *name_copy = (char *) (p + 1); ++		memcpy(name_copy, name, len); ++		name_copy[len] = '\0'; ++		p->name = name_copy; ++		p->len = len; ++		p->type = d_type; ++		p->ino = ino; ++		p->is_whiteout = false; ++	} ++ ++	return p; ++} ++ ++static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, ++				  const char *name, int len, u64 ino, ++				  unsigned int d_type) ++{ ++	struct rb_node **newp = 
&rdd->root->rb_node; ++	struct rb_node *parent = NULL; ++	struct ovl_cache_entry *p; ++ ++	while (*newp) { ++		int cmp; ++		struct ovl_cache_entry *tmp; ++ ++		parent = *newp; ++		tmp = ovl_cache_entry_from_node(*newp); ++		cmp = strncmp(name, tmp->name, len); ++		if (cmp > 0) ++			newp = &tmp->node.rb_right; ++		else if (cmp < 0 || len < tmp->len) ++			newp = &tmp->node.rb_left; ++		else ++			return 0; ++	} ++ ++	p = ovl_cache_entry_new(name, len, ino, d_type); ++	if (p == NULL) ++		return -ENOMEM; ++ ++	list_add_tail(&p->l_node, rdd->list); ++	rb_link_node(&p->node, parent, newp); ++	rb_insert_color(&p->node, rdd->root); ++ ++	return 0; ++} ++ ++static int ovl_fill_lower(void *buf, const char *name, int namelen, ++			    loff_t offset, u64 ino, unsigned int d_type) ++{ ++	struct ovl_readdir_data *rdd = buf; ++	struct ovl_cache_entry *p; ++ ++	rdd->count++; ++	p = ovl_cache_entry_find(rdd->root, name, namelen); ++	if (p) { ++		list_move_tail(&p->l_node, rdd->middle); ++	} else { ++		p = ovl_cache_entry_new(name, namelen, ino, d_type); ++		if (p == NULL) ++			rdd->err = -ENOMEM; ++		else ++			list_add_tail(&p->l_node, rdd->middle); ++	} ++ ++	return rdd->err; ++} ++ ++static void ovl_cache_free(struct list_head *list) ++{ ++	struct ovl_cache_entry *p; ++	struct ovl_cache_entry *n; ++ ++	list_for_each_entry_safe(p, n, list, l_node) ++		kfree(p); ++ ++	INIT_LIST_HEAD(list); ++} ++ ++static int ovl_fill_upper(void *buf, const char *name, int namelen, ++			  loff_t offset, u64 ino, unsigned int d_type) ++{ ++	struct ovl_readdir_data *rdd = buf; ++ ++	rdd->count++; ++	return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); ++} ++ ++static inline int ovl_dir_read(struct path *realpath, ++			       struct ovl_readdir_data *rdd, filldir_t filler) ++{ ++	struct file *realfile; ++	int err; ++ ++	realfile = vfs_open(realpath, O_RDONLY | O_DIRECTORY, current_cred()); ++	if (IS_ERR(realfile)) ++		return PTR_ERR(realfile); ++ ++	do { ++		rdd->count = 0; ++		rdd->err = 0; 
++		err = vfs_readdir(realfile, filler, rdd); ++		if (err >= 0) ++			err = rdd->err; ++	} while (!err && rdd->count); ++	fput(realfile); ++ ++	return 0; ++} ++ ++static void ovl_dir_reset(struct file *file) ++{ ++	struct ovl_dir_file *od = file->private_data; ++	enum ovl_path_type type = ovl_path_type(file->f_path.dentry); ++ ++	if (ovl_dentry_version_get(file->f_path.dentry) != od->cache_version) { ++		list_del_init(&od->cursor); ++		ovl_cache_free(&od->cache); ++		od->is_cached = false; ++	} ++	WARN_ON(!od->is_real && type != OVL_PATH_MERGE); ++	if (od->is_real && type == OVL_PATH_MERGE) { ++		fput(od->realfile); ++		od->realfile = NULL; ++		od->is_real = false; ++	} ++} ++ ++static int ovl_dir_mark_whiteouts(struct ovl_readdir_data *rdd) ++{ ++	struct ovl_cache_entry *p; ++	struct dentry *dentry; ++	const struct cred *old_cred; ++	struct cred *override_cred; ++ ++	override_cred = prepare_creds(); ++	if (!override_cred) { ++		ovl_cache_free(rdd->list); ++		return -ENOMEM; ++	} ++ ++	/* ++	 * CAP_SYS_ADMIN for getxattr ++	 * CAP_DAC_OVERRIDE for lookup ++	 */ ++	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); ++	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); ++	old_cred = override_creds(override_cred); ++ ++	mutex_lock(&rdd->dir->d_inode->i_mutex); ++	list_for_each_entry(p, rdd->list, l_node) { ++		if (p->type != DT_LNK) ++			continue; ++ ++		dentry = lookup_one_len(p->name, rdd->dir, p->len); ++		if (IS_ERR(dentry)) ++			continue; ++ ++		p->is_whiteout = ovl_is_whiteout(dentry); ++		dput(dentry); ++	} ++	mutex_unlock(&rdd->dir->d_inode->i_mutex); ++ ++	revert_creds(old_cred); ++	put_cred(override_cred); ++ ++	return 0; ++} ++ ++static inline int ovl_dir_read_merged(struct path *upperpath, struct path *lowerpath, ++			       struct ovl_readdir_data *rdd) ++{ ++	int err; ++	struct rb_root root = RB_ROOT; ++	struct list_head middle; ++ ++	rdd->root = &root; ++	if (upperpath->dentry) { ++		rdd->dir = upperpath->dentry; ++		err = 
ovl_dir_read(upperpath, rdd, ovl_fill_upper); ++		if (err) ++			goto out; ++ ++		err = ovl_dir_mark_whiteouts(rdd); ++		if (err) ++			goto out; ++	} ++	/* ++	 * Insert lowerpath entries before upperpath ones, this allows ++	 * offsets to be reasonably constant ++	 */ ++	list_add(&middle, rdd->list); ++	rdd->middle = &middle; ++	err = ovl_dir_read(lowerpath, rdd, ovl_fill_lower); ++	list_del(&middle); ++out: ++	rdd->root = NULL; ++ ++	return err; ++} ++ ++static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) ++{ ++	struct list_head *l; ++	loff_t off; ++ ++	l = od->cache.next; ++	for (off = 0; off < pos; off++) { ++		if (l == &od->cache) ++			break; ++		l = l->next; ++	} ++	list_move_tail(&od->cursor, l); ++} ++ ++static int ovl_readdir(struct file *file, void *buf, filldir_t filler) ++{ ++	struct ovl_dir_file *od = file->private_data; ++	int res; ++ ++	if (!file->f_pos) ++		ovl_dir_reset(file); ++ ++	if (od->is_real) { ++		res = vfs_readdir(od->realfile, filler, buf); ++		file->f_pos = od->realfile->f_pos; ++ ++		return res; ++	} ++ ++	if (!od->is_cached) { ++		struct path lowerpath; ++		struct path upperpath; ++		struct ovl_readdir_data rdd = { .list = &od->cache }; ++ ++		ovl_path_lower(file->f_path.dentry, &lowerpath); ++		ovl_path_upper(file->f_path.dentry, &upperpath); ++ ++		res = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd); ++		if (res) { ++			ovl_cache_free(rdd.list); ++			return res; ++		} ++ ++		od->cache_version = ovl_dentry_version_get(file->f_path.dentry); ++		od->is_cached = true; ++ ++		ovl_seek_cursor(od, file->f_pos); ++	} ++ ++	while (od->cursor.next != &od->cache) { ++		int over; ++		loff_t off; ++		struct ovl_cache_entry *p; ++ ++		p = list_entry(od->cursor.next, struct ovl_cache_entry, l_node); ++		off = file->f_pos; ++		if (!p->is_whiteout) { ++			over = filler(buf, p->name, p->len, off, p->ino, p->type); ++			if (over) ++				break; ++		} ++		file->f_pos++; ++		list_move(&od->cursor, &p->l_node); ++	} ++ ++	return 0; ++} ++ 
++static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) ++{ ++	loff_t res; ++	struct ovl_dir_file *od = file->private_data; ++ ++	mutex_lock(&file->f_dentry->d_inode->i_mutex); ++	if (!file->f_pos) ++		ovl_dir_reset(file); ++ ++	if (od->is_real) { ++		res = vfs_llseek(od->realfile, offset, origin); ++		file->f_pos = od->realfile->f_pos; ++	} else { ++		res = -EINVAL; ++ ++		switch (origin) { ++		case SEEK_CUR: ++			offset += file->f_pos; ++			break; ++		case SEEK_SET: ++			break; ++		default: ++			goto out_unlock; ++		} ++		if (offset < 0) ++			goto out_unlock; ++ ++		if (offset != file->f_pos) { ++			file->f_pos = offset; ++			if (od->is_cached) ++				ovl_seek_cursor(od, offset); ++		} ++		res = offset; ++	} ++out_unlock: ++	mutex_unlock(&file->f_dentry->d_inode->i_mutex); ++ ++	return res; ++} ++ ++static int ovl_dir_fsync(struct file *file, int datasync) ++{ ++	struct ovl_dir_file *od = file->private_data; ++ ++	/* May need to reopen directory if it got copied up */ ++	if (!od->realfile) { ++		struct path upperpath; ++ ++		ovl_path_upper(file->f_path.dentry, &upperpath); ++		od->realfile = vfs_open(&upperpath, O_RDONLY, current_cred()); ++		if (IS_ERR(od->realfile)) ++			return PTR_ERR(od->realfile); ++	} ++ ++	return vfs_fsync(od->realfile, datasync); ++} ++ ++static int ovl_dir_release(struct inode *inode, struct file *file) ++{ ++	struct ovl_dir_file *od = file->private_data; ++ ++	list_del(&od->cursor); ++	ovl_cache_free(&od->cache); ++	if (od->realfile) ++		fput(od->realfile); ++	kfree(od); ++ ++	return 0; ++} ++ ++static int ovl_dir_open(struct inode *inode, struct file *file) ++{ ++	struct path realpath; ++	struct file *realfile; ++	struct ovl_dir_file *od; ++	enum ovl_path_type type; ++ ++	od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); ++	if (!od) ++		return -ENOMEM; ++ ++	type = ovl_path_real(file->f_path.dentry, &realpath); ++	realfile = vfs_open(&realpath, file->f_flags, current_cred()); ++	if (IS_ERR(realfile)) { ++		
kfree(od); ++		return PTR_ERR(realfile); ++	} ++	INIT_LIST_HEAD(&od->cache); ++	INIT_LIST_HEAD(&od->cursor); ++	od->is_cached = false; ++	od->realfile = realfile; ++	od->is_real = (type != OVL_PATH_MERGE); ++	file->private_data = od; ++ ++	return 0; ++} ++ ++const struct file_operations ovl_dir_operations = { ++	.read		= generic_read_dir, ++	.open		= ovl_dir_open, ++	.readdir	= ovl_readdir, ++	.llseek		= ovl_dir_llseek, ++	.fsync		= ovl_dir_fsync, ++	.release	= ovl_dir_release, ++}; ++ ++static int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) ++{ ++	int err; ++	struct path lowerpath; ++	struct path upperpath; ++	struct ovl_cache_entry *p; ++	struct ovl_readdir_data rdd = { .list = list }; ++ ++	ovl_path_upper(dentry, &upperpath); ++	ovl_path_lower(dentry, &lowerpath); ++ ++	err = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd); ++	if (err) ++		return err; ++ ++	err = 0; ++ ++	list_for_each_entry(p, list, l_node) { ++		if (p->is_whiteout) ++			continue; ++ ++		if (p->name[0] == '.') { ++			if (p->len == 1) ++				continue; ++			if (p->len == 2 && p->name[1] == '.') ++				continue; ++		} ++		err = -ENOTEMPTY; ++		break; ++	} ++ ++	return err; ++} ++ ++static int ovl_remove_whiteouts(struct dentry *dir, struct list_head *list) ++{ ++	struct path upperpath; ++	struct dentry *upperdir; ++	struct ovl_cache_entry *p; ++	const struct cred *old_cred; ++	struct cred *override_cred; ++	int err; ++ ++	ovl_path_upper(dir, &upperpath); ++	upperdir = upperpath.dentry; ++ ++	override_cred = prepare_creds(); ++	if (!override_cred) ++		return -ENOMEM; ++ ++	/* ++	 * CAP_DAC_OVERRIDE for lookup and unlink ++	 * CAP_SYS_ADMIN for setxattr of "trusted" namespace ++	 * CAP_FOWNER for unlink in sticky directory ++	 */ ++	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); ++	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); ++	cap_raise(override_cred->cap_effective, CAP_FOWNER); ++	old_cred = override_creds(override_cred); ++ ++	err = 
vfs_setxattr(upperdir, ovl_opaque_xattr, "y", 1, 0); ++	if (err) ++		goto out_revert_creds; ++ ++	mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT); ++	list_for_each_entry(p, list, l_node) { ++		struct dentry *dentry; ++		int ret; ++ ++		if (!p->is_whiteout) ++			continue; ++ ++		dentry = lookup_one_len(p->name, upperdir, p->len); ++		if (IS_ERR(dentry)) { ++			printk(KERN_WARNING "overlayfs: failed to lookup whiteout %.*s: %li\n", p->len, p->name, PTR_ERR(dentry)); ++			continue; ++		} ++		ret = vfs_unlink(upperdir->d_inode, dentry); ++		dput(dentry); ++		if (ret) ++			printk(KERN_WARNING "overlayfs: failed to unlink whiteout %.*s: %i\n", p->len, p->name, ret); ++	} ++	mutex_unlock(&upperdir->d_inode->i_mutex); ++ ++out_revert_creds: ++	revert_creds(old_cred); ++	put_cred(override_cred); ++ ++	return err; ++} ++ ++int ovl_check_empty_and_clear(struct dentry *dentry, enum ovl_path_type type) ++{ ++	int err; ++	LIST_HEAD(list); ++ ++	err = ovl_check_empty_dir(dentry, &list); ++	if (!err && type == OVL_PATH_MERGE) ++		err = ovl_remove_whiteouts(dentry, &list); ++ ++	ovl_cache_free(&list); ++ ++	return err; ++} +--- /dev/null ++++ b/fs/overlayfs/super.c +@@ -0,0 +1,656 @@ ++/* ++ * ++ * Copyright (C) 2011 Novell Inc. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published by ++ * the Free Software Foundation. 
++ */ ++ ++#include <linux/fs.h> ++#include <linux/namei.h> ++#include <linux/xattr.h> ++#include <linux/security.h> ++#include <linux/mount.h> ++#include <linux/slab.h> ++#include <linux/parser.h> ++#include <linux/module.h> ++#include <linux/seq_file.h> ++#include "overlayfs.h" ++ ++MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); ++MODULE_DESCRIPTION("Overlay filesystem"); ++MODULE_LICENSE("GPL"); ++ ++struct ovl_config { ++	char *lowerdir; ++	char *upperdir; ++}; ++ ++/* private information held for overlayfs's superblock */ ++struct ovl_fs { ++	struct vfsmount *upper_mnt; ++	struct vfsmount *lower_mnt; ++	/* pathnames of lower and upper dirs, for show_options */ ++	struct ovl_config config; ++}; ++ ++/* private information held for every overlayfs dentry */ ++struct ovl_entry { ++	/* ++	 * Keep "double reference" on upper dentries, so that ++	 * d_delete() doesn't think it's OK to reset d_inode to NULL. ++	 */ ++	struct dentry *__upperdentry; ++	struct dentry *lowerdentry; ++	union { ++		struct { ++			u64 version; ++			bool opaque; ++		}; ++		struct rcu_head rcu; ++	}; ++}; ++ ++const char *ovl_whiteout_xattr = "trusted.overlay.whiteout"; ++const char *ovl_opaque_xattr = "trusted.overlay.opaque"; ++ ++ ++enum ovl_path_type ovl_path_type(struct dentry *dentry) ++{ ++	struct ovl_entry *oe = dentry->d_fsdata; ++ ++	if (oe->__upperdentry) { ++		if (oe->lowerdentry && S_ISDIR(dentry->d_inode->i_mode)) ++			return OVL_PATH_MERGE; ++		else ++			return OVL_PATH_UPPER; ++	} else { ++		return OVL_PATH_LOWER; ++	} ++} ++ ++static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) ++{ ++	struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry); ++	smp_read_barrier_depends(); ++	return upperdentry; ++} ++ ++void ovl_path_upper(struct dentry *dentry, struct path *path) ++{ ++	struct ovl_fs *ofs = dentry->d_sb->s_fs_info; ++	struct ovl_entry *oe = dentry->d_fsdata; ++ ++	path->mnt = ofs->upper_mnt; ++	path->dentry = ovl_upperdentry_dereference(oe); ++} 
++ ++void ovl_path_lower(struct dentry *dentry, struct path *path) ++{ ++	struct ovl_fs *ofs = dentry->d_sb->s_fs_info; ++	struct ovl_entry *oe = dentry->d_fsdata; ++ ++	path->mnt = ofs->lower_mnt; ++	path->dentry = oe->lowerdentry; ++} ++ ++enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) ++{ ++ ++	enum ovl_path_type type = ovl_path_type(dentry); ++ ++	if (type == OVL_PATH_LOWER) ++		ovl_path_lower(dentry, path); ++	else ++		ovl_path_upper(dentry, path); ++ ++	return type; ++} ++ ++struct dentry *ovl_dentry_upper(struct dentry *dentry) ++{ ++	struct ovl_entry *oe = dentry->d_fsdata; ++ ++	return ovl_upperdentry_dereference(oe); ++} ++ ++struct dentry *ovl_dentry_lower(struct dentry *dentry) ++{ ++	struct ovl_entry *oe = dentry->d_fsdata; ++ ++	return oe->lowerdentry; ++} ++ ++struct dentry *ovl_dentry_real(struct dentry *dentry) ++{ ++	struct ovl_entry *oe = dentry->d_fsdata; ++	struct dentry *realdentry; ++ ++	realdentry = ovl_upperdentry_dereference(oe); ++	if (!realdentry) ++		realdentry = oe->lowerdentry; ++ ++	return realdentry; ++} ++ ++struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) ++{ ++	struct dentry *realdentry; ++ ++	realdentry = ovl_upperdentry_dereference(oe); ++	if (realdentry) { ++		*is_upper = true; ++	} else { ++		realdentry = oe->lowerdentry; ++		*is_upper = false; ++	} ++	return realdentry; ++} ++ ++bool ovl_dentry_is_opaque(struct dentry *dentry) ++{ ++	struct ovl_entry *oe = dentry->d_fsdata; ++	return oe->opaque; ++} ++ ++void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque) ++{ ++	struct ovl_entry *oe = dentry->d_fsdata; ++	oe->opaque = opaque; ++} ++ ++void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) ++{ ++	struct ovl_entry *oe = dentry->d_fsdata; ++ ++	WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); ++	WARN_ON(oe->__upperdentry); ++	BUG_ON(!upperdentry->d_inode); ++	smp_wmb(); ++	oe->__upperdentry = dget(upperdentry); ++} ++ ++void 
ovl_dentry_version_inc(struct dentry *dentry) ++{ ++	struct ovl_entry *oe = dentry->d_fsdata; ++ ++	WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); ++	oe->version++; ++} ++ ++u64 ovl_dentry_version_get(struct dentry *dentry) ++{ ++	struct ovl_entry *oe = dentry->d_fsdata; ++ ++	WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); ++	return oe->version; ++} ++ ++bool ovl_is_whiteout(struct dentry *dentry) ++{ ++	int res; ++	char val; ++ ++	if (!dentry) ++		return false; ++	if (!dentry->d_inode) ++		return false; ++	if (!S_ISLNK(dentry->d_inode->i_mode)) ++		return false; ++ ++	res = vfs_getxattr(dentry, ovl_whiteout_xattr, &val, 1); ++	if (res == 1 && val == 'y') ++		return true; ++ ++	return false; ++} ++ ++static bool ovl_is_opaquedir(struct dentry *dentry) ++{ ++	int res; ++	char val; ++ ++	if (!S_ISDIR(dentry->d_inode->i_mode)) ++		return false; ++ ++	res = vfs_getxattr(dentry, ovl_opaque_xattr, &val, 1); ++	if (res == 1 && val == 'y') ++		return true; ++ ++	return false; ++} ++ ++static void ovl_entry_free(struct rcu_head *head) ++{ ++	struct ovl_entry *oe = container_of(head, struct ovl_entry, rcu); ++	kfree(oe); ++} ++ ++static void ovl_dentry_release(struct dentry *dentry) ++{ ++	struct ovl_entry *oe = dentry->d_fsdata; ++ ++	if (oe) { ++		dput(oe->__upperdentry); ++		dput(oe->__upperdentry); ++		dput(oe->lowerdentry); ++		call_rcu(&oe->rcu, ovl_entry_free); ++	} ++} ++ ++const struct dentry_operations ovl_dentry_operations = { ++	.d_release = ovl_dentry_release, ++}; ++ ++static struct ovl_entry *ovl_alloc_entry(void) ++{ ++	return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL); ++} ++ ++static inline struct dentry *ovl_lookup_real(struct dentry *dir, struct qstr *name) ++{ ++	struct dentry *dentry; ++ ++	mutex_lock(&dir->d_inode->i_mutex); ++	dentry = lookup_one_len(name->name, dir, name->len); ++	mutex_unlock(&dir->d_inode->i_mutex); ++ ++	if (IS_ERR(dentry)) { ++		if (PTR_ERR(dentry) == -ENOENT) ++			dentry = NULL; ++	} else if (!dentry->d_inode) { 
++		dput(dentry); ++		dentry = NULL; ++	} ++	return dentry; ++} ++ ++static int ovl_do_lookup(struct dentry *dentry) ++{ ++	struct ovl_entry *oe; ++	struct dentry *upperdir; ++	struct dentry *lowerdir; ++	struct dentry *upperdentry = NULL; ++	struct dentry *lowerdentry = NULL; ++	struct inode *inode = NULL; ++	int err; ++ ++	err = -ENOMEM; ++	oe = ovl_alloc_entry(); ++	if (!oe) ++		goto out; ++ ++	upperdir = ovl_dentry_upper(dentry->d_parent); ++	lowerdir = ovl_dentry_lower(dentry->d_parent); ++ ++	if (upperdir) { ++		upperdentry = ovl_lookup_real(upperdir, &dentry->d_name); ++		err = PTR_ERR(upperdentry); ++		if (IS_ERR(upperdentry)) ++			goto out_put_dir; ++ ++		if (lowerdir && upperdentry && ++		    (S_ISLNK(upperdentry->d_inode->i_mode) || ++		     S_ISDIR(upperdentry->d_inode->i_mode))) { ++			const struct cred *old_cred; ++			struct cred *override_cred; ++ ++			err = -ENOMEM; ++			override_cred = prepare_creds(); ++			if (!override_cred) ++				goto out_dput_upper; ++ ++			/* CAP_SYS_ADMIN needed for getxattr */ ++			cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); ++			old_cred = override_creds(override_cred); ++ ++			if (ovl_is_opaquedir(upperdentry)) { ++				oe->opaque = true; ++			} else if (ovl_is_whiteout(upperdentry)) { ++				dput(upperdentry); ++				upperdentry = NULL; ++				oe->opaque = true; ++			} ++			revert_creds(old_cred); ++			put_cred(override_cred); ++		} ++	} ++	if (lowerdir && !oe->opaque) { ++		lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name); ++		err = PTR_ERR(lowerdentry); ++		if (IS_ERR(lowerdentry)) ++			goto out_dput_upper; ++	} ++ ++	if (lowerdentry && upperdentry && ++	    (!S_ISDIR(upperdentry->d_inode->i_mode) || ++	     !S_ISDIR(lowerdentry->d_inode->i_mode))) { ++		dput(lowerdentry); ++		lowerdentry = NULL; ++		oe->opaque = true; ++	} ++ ++	if (lowerdentry || upperdentry) { ++		struct dentry *realdentry; ++ ++		realdentry = upperdentry ? 
upperdentry : lowerdentry; ++		err = -ENOMEM; ++		inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode, oe); ++		if (!inode) ++			goto out_dput; ++	} ++ ++	if (upperdentry) ++		oe->__upperdentry = dget(upperdentry); ++ ++	if (lowerdentry) ++		oe->lowerdentry = lowerdentry; ++ ++	dentry->d_fsdata = oe; ++	dentry->d_op = &ovl_dentry_operations; ++	d_add(dentry, inode); ++ ++	return 0; ++ ++out_dput: ++	dput(lowerdentry); ++out_dput_upper: ++	dput(upperdentry); ++out_put_dir: ++	kfree(oe); ++out: ++	return err; ++} ++ ++struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ++			  struct nameidata *nd) ++{ ++	int err = ovl_do_lookup(dentry); ++ ++	if (err) ++		return ERR_PTR(err); ++ ++	return NULL; ++}  +  +static void ovl_put_super(struct super_block *sb)  +{ @@ -2400,6 +2839,8 @@  +	mntput(ufs->upper_mnt);  +	mntput(ufs->lower_mnt);  + ++	kfree(ufs->config.lowerdir); ++	kfree(ufs->config.upperdir);  +	kfree(ufs);  +}  + @@ -2441,15 +2882,27 @@  +	return path.dentry->d_sb->s_op->statfs(path.dentry, buf);  +}  + ++/** ++ * ovl_show_options ++ * ++ * Prints the mount options for a given superblock. ++ * Returns zero; does not fail. 
++ */ ++static int ovl_show_options(struct seq_file *m, struct vfsmount *mnt) ++{ ++	struct super_block *sb = mnt->mnt_sb; ++	struct ovl_fs *ufs = sb->s_fs_info; ++ ++	seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); ++	seq_printf(m, ",upperdir=%s", ufs->config.upperdir); ++	return 0; ++} ++  +static const struct super_operations ovl_super_operations = {  +	.put_super	= ovl_put_super,  +	.remount_fs	= ovl_remount_fs,  +	.statfs		= ovl_statfs, -+}; -+ -+struct ovl_config { -+	char *lowerdir; -+	char *upperdir; ++	.show_options	= ovl_show_options,  +};  +  +enum { @@ -2509,37 +2962,36 @@  +	struct dentry *root_dentry;  +	struct ovl_entry *oe;  +	struct ovl_fs *ufs; -+	struct ovl_config config;  +	int err;  + -+	err = ovl_parse_opt((char *) data, &config); -+	if (err) ++	err = -ENOMEM; ++	ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL); ++	if (!ufs)  +		goto out;  + ++	err = ovl_parse_opt((char *) data, &ufs->config); ++	if (err) ++		goto out_free_ufs; ++  +	err = -EINVAL; -+	if (!config.upperdir || !config.lowerdir) { ++	if (!ufs->config.upperdir || !ufs->config.lowerdir) {  +		printk(KERN_ERR "overlayfs: missing upperdir or lowerdir\n");  +		goto out_free_config;  +	}  + -+	err = -ENOMEM; -+	ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL); -+	if (!ufs) -+		goto out_free_config; -+  +	oe = ovl_alloc_entry();  +	if (oe == NULL) -+		goto out_free_ufs; ++		goto out_free_config;  +  +	root_inode = ovl_new_inode(sb, S_IFDIR, oe);  +	if (!root_inode)  +		goto out_free_oe;  + -+	err = kern_path(config.upperdir, LOOKUP_FOLLOW, &upperpath); ++	err = kern_path(ufs->config.upperdir, LOOKUP_FOLLOW, &upperpath);  +	if (err)  +		goto out_put_root;  + -+	err = kern_path(config.lowerdir, LOOKUP_FOLLOW, &lowerpath); ++	err = kern_path(ufs->config.lowerdir, LOOKUP_FOLLOW, &lowerpath);  +	if (err)  +		goto out_put_upperpath;  + @@ -2548,6 +3000,16 @@  +	    !S_ISDIR(lowerpath.dentry->d_inode->i_mode))  +		goto out_put_lowerpath;  + ++	sb->s_stack_depth = 
max(upperpath.mnt->mnt_sb->s_stack_depth, ++				lowerpath.mnt->mnt_sb->s_stack_depth) + 1; ++ ++	err = -EINVAL; ++	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { ++		printk(KERN_ERR "overlayfs: maximum fs stacking depth exceeded\n"); ++		goto out_put_lowerpath; ++	} ++ ++  +	ufs->upper_mnt = clone_private_mount(&upperpath);  +	err = PTR_ERR(ufs->upper_mnt);  +	if (IS_ERR(ufs->upper_mnt)) { @@ -2562,6 +3024,16 @@  +		goto out_put_upper_mnt;  +	}  + ++	/* ++	 * Make lower_mnt R/O.  That way fchmod/fchown on lower file ++	 * will fail instead of modifying lower fs. ++	 */ ++	ufs->lower_mnt->mnt_flags |= MNT_READONLY; ++ ++	/* If the upper fs is r/o, we mark overlayfs r/o too */ ++	if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY) ++		sb->s_flags |= MS_RDONLY; ++  +	if (!(sb->s_flags & MS_RDONLY)) {  +		err = mnt_want_write(ufs->upper_mnt);  +		if (err) @@ -2576,7 +3048,7 @@  +	mntput(upperpath.mnt);  +	mntput(lowerpath.mnt);  + -+	oe->__upperdentry = upperpath.dentry; ++	oe->__upperdentry = dget(upperpath.dentry);  +	oe->lowerdentry = lowerpath.dentry;  +  +	root_dentry->d_fsdata = oe; @@ -2603,11 +3075,11 @@  +	iput(root_inode);  +out_free_oe:  +	kfree(oe); ++out_free_config: ++	kfree(ufs->config.lowerdir); ++	kfree(ufs->config.upperdir);  +out_free_ufs:  +	kfree(ufs); -+out_free_config: -+	kfree(config.lowerdir); -+	kfree(config.upperdir);  +out:  +	return err;  +} @@ -2637,204 +3109,68 @@  +  +module_init(ovl_init);  +module_exit(ovl_exit); ---- a/fs/Kconfig -+++ b/fs/Kconfig -@@ -63,6 +63,7 @@ source "fs/quota/Kconfig" +--- a/fs/splice.c ++++ b/fs/splice.c +@@ -1296,6 +1296,7 @@ long do_splice_direct(struct file *in, l - source "fs/autofs4/Kconfig" - source "fs/fuse/Kconfig" -+source "fs/overlayfs/Kconfig" + 	return ret; + } ++EXPORT_SYMBOL(do_splice_direct); - config CUSE - 	tristate "Character device in Userspace support" ---- a/fs/Makefile -+++ b/fs/Makefile -@@ -105,6 +105,7 @@ obj-$(CONFIG_QNX4FS_FS)		+= qnx4/ - obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/ - 
obj-$(CONFIG_ADFS_FS)		+= adfs/ - obj-$(CONFIG_FUSE_FS)		+= fuse/ -+obj-$(CONFIG_OVERLAYFS_FS)	+= overlayfs/ - obj-$(CONFIG_UDF_FS)		+= udf/ - obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/ - obj-$(CONFIG_OMFS_FS)		+= omfs/ ---- /dev/null -+++ b/fs/overlayfs/Kconfig -@@ -0,0 +1,4 @@ -+config OVERLAYFS_FS -+	tristate "Overlay filesystem support" -+	help -+	  Add support for overlay filesystem. ---- /dev/null -+++ b/fs/overlayfs/Makefile -@@ -0,0 +1,5 @@ -+# -+# Makefile for the overlay filesystem. -+# -+ -+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o ---- /dev/null -+++ b/Documentation/filesystems/overlayfs.txt -@@ -0,0 +1,163 @@ -+Written by: Neil Brown <neilb@suse.de> -+ -+Overlay Filesystem -+================== -+ -+This document describes a prototype for a new approach to providing -+overlay-filesystem functionality in Linux (sometimes referred to as -+union-filesystems).  An overlay-filesystem tries to present a -+filesystem which is the result over overlaying one filesystem on top -+of the other. -+ -+The result will inevitably fail to look exactly like a normal -+filesystem for various technical reasons.  The expectation is that -+many use cases will be able to ignore these differences. -+ -+This approach is 'hybrid' because the objects that appear in the -+filesystem do not all appear to belong to that filesystem.  In many -+cases an object accessed in the union will be indistinguishable -+from accessing the corresponding object from the original filesystem. -+This is most obvious from the 'st_dev' field returned by stat(2). -+ -+While directories will report an st_dev from the overlay-filesystem, -+all non-directory objects will report an st_dev from the lower or -+upper filesystem that is providing the object.  Similarly st_ino will -+only be unique when combined with st_dev, and both of these can change -+over the lifetime of a non-directory object.  Many applications and -+tools ignore these values and will not be affected. 
-+ -+Upper and Lower -+--------------- -+ -+An overlay filesystem combines two filesystems - an 'upper' filesystem -+and a 'lower' filesystem.  When a name exists in both filesystems, the -+object in the 'upper' filesystem is visible while the object in the -+'lower' filesystem is either hidden or, in the case of directories, -+merged with the 'upper' object. -+ -+It would be more correct to refer to an upper and lower 'directory -+tree' rather than 'filesystem' as it is quite possible for both -+directory trees to be in the same filesystem and there is no -+requirement that the root of a filesystem be given for either upper or -+lower. -+ -+The lower filesystem can be any filesystem supported by Linux and does -+not need to be writable.  The lower filesystem can even be another -+overlayfs.  The upper filesystem will normally be writable and if it -+is it must support the creation of trusted.* extended attributes, and -+must provide valid d_type in readdir responses, at least for symbolic -+links - so NFS is not suitable. -+ -+A read-only overlay of two read-only filesystems may use any -+filesystem type. -+ -+Directories -+----------- -+ -+Overlaying mainly involved directories.  If a given name appears in both -+upper and lower filesystems and refers to a non-directory in either, -+then the lower object is hidden - the name refers only to the upper -+object. -+ -+Where both upper and lower objects are directories, a merged directory -+is formed. -+ -+At mount time, the two directories given as mount options are combined -+into a merged directory.  Then whenever a lookup is requested in such -+a merged directory, the lookup is performed in each actual directory -+and the combined result is cached in the dentry belonging to the overlay -+filesystem.  If both actual lookups find directories, both are stored -+and a merged directory is created, otherwise only one is stored: the -+upper if it exists, else the lower. 
-+ -+Only the lists of names from directories are merged.  Other content -+such as metadata and extended attributes are reported for the upper -+directory only.  These attributes of the lower directory are hidden. -+ -+whiteouts and opaque directories -+-------------------------------- -+ -+In order to support rm and rmdir without changing the lower -+filesystem, an overlay filesystem needs to record in the upper filesystem -+that files have been removed.  This is done using whiteouts and opaque -+directories (non-directories are always opaque). -+ -+The overlay filesystem uses extended attributes with a -+"trusted.overlay."  prefix to record these details. -+ -+A whiteout is created as a symbolic link with target -+"(overlay-whiteout)" and with xattr "trusted.overlay.whiteout" set to "y". -+When a whiteout is found in the upper level of a merged directory, any -+matching name in the lower level is ignored, and the whiteout itself -+is also hidden. -+ -+A directory is made opaque by setting the xattr "trusted.overlay.opaque" -+to "y".  Where the upper filesystem contains an opaque directory, any -+directory in the lower filesystem with the same name is ignored. -+ -+readdir -+------- -+ -+When a 'readdir' request is made on a merged directory, the upper and -+lower directories are each read and the name lists merged in the -+obvious way (upper is read first, then lower - entries that already -+exist are not re-added).  This merged name list is cached in the -+'struct file' and so remains as long as the file is kept open.  If the -+directory is opened and read by two processes at the same time, they -+will each have separate caches.  A seekdir to the start of the -+directory (offset 0) followed by a readdir will cause the cache to be -+discarded and rebuilt. -+ -+This means that changes to the merged directory do not appear while a -+directory is being read.  This is unlikely to be noticed by many -+programs. 
-+ -+seek offsets are assigned sequentially when the directories are read. -+Thus if -+  - read part of a directory -+  - remember an offset, and close the directory -+  - re-open the directory some time later -+  - seek to the remembered offset -+ -+there may be little correlation between the old and new locations in -+the list of filenames, particularly if anything has changed in the -+directory. -+ -+Readdir on directories that are not merged is simply handled by the -+underlying directory (upper or lower). -+ -+ -+Non-directories -+--------------- -+ -+Objects that are not directories (files, symlinks, device-special -+files etc.) are presented either from the upper or lower filesystem as -+appropriate.  When a file in the lower filesystem is accessed in a way -+the requires write-access, such as opening for write access, changing -+some metadata etc., the file is first copied from the lower filesystem -+to the upper filesystem (copy_up).  Note that creating a hard-link -+also requires copy_up, though of course creation of a symlink does -+not. -+ -+The copy_up process first makes sure that the containing directory -+exists in the upper filesystem - creating it and any parents as -+necessary.  It then creates the object with the same metadata (owner, -+mode, mtime, symlink-target etc.) and then if the object is a file, the -+data is copied from the lower to the upper filesystem.  Finally any -+extended attributes are copied up. -+ -+Once the copy_up is complete, the overlay filesystem simply -+provides direct access to the newly created file in the upper -+filesystem - future operations on the file are barely noticed by the -+overlay filesystem (though an operation on the name of the file such as -+rename or unlink will of course be noticed and handled). 
+ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + 			       struct pipe_inode_info *opipe, +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -476,6 +476,12 @@ struct iattr { +  */ + #include <linux/quota.h> +  ++/* ++ * Maximum number of layers of fs stack.  Needs to be limited to ++ * prevent kernel stack overflow ++ */ ++#define FILESYSTEM_MAX_STACK_DEPTH 2  + -+Changes to underlying filesystems -+--------------------------------- + /**  +  * enum positive_aop_returns - aop return codes with specific semantics +  * +@@ -1429,6 +1435,11 @@ struct super_block { + 	 */ + 	char __rcu *s_options; + 	const struct dentry_operations *s_d_op; /* default d_op for dentries */  + -+Offline changes, when the overlay is not mounted, are allowed to either -+the upper or the lower trees. ++	/* ++	 * Indicates how deep in a filesystem stack this SB is ++	 */ ++	int s_stack_depth; + }; +  + extern struct timespec current_fs_time(struct super_block *sb); +@@ -1594,6 +1605,7 @@ struct inode_operations { + 	void (*truncate_range)(struct inode *, loff_t, loff_t); + 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, + 		      u64 len); ++	struct file *(*open)(struct dentry *, int flags, const struct cred *); + } ____cacheline_aligned; +  + struct seq_file; +@@ -1988,6 +2000,7 @@ extern long do_sys_open(int dfd, const c + extern struct file *filp_open(const char *, int, int); + extern struct file *file_open_root(struct dentry *, struct vfsmount *, + 				   const char *, int); ++extern struct file *vfs_open(struct path *, int flags, const struct cred *); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, + 				 const struct cred *); + extern int filp_close(struct file *, fl_owner_t id); +--- a/include/linux/mount.h ++++ b/include/linux/mount.h +@@ -100,6 +100,9 @@ extern void mnt_pin(struct vfsmount *mnt + extern void mnt_unpin(struct vfsmount *mnt); + extern int __mnt_is_readonly(struct vfsmount *mnt); +  ++struct path; 
++extern struct vfsmount *clone_private_mount(struct path *path);  + -+Changes to the underlying filesystems while part of a mounted overlay -+filesystem are not allowed.  This is not yet enforced, but will be in -+the future. + extern struct vfsmount *do_kern_mount(const char *fstype, int flags, + 				      const char *name, void *data); +  diff --git a/target/linux/generic/patches-3.0/100-overlayfs_v10.patch b/target/linux/generic/patches-3.0/100-overlayfs_v11.patch index 179626324..1dccf7b1c 100644 --- a/target/linux/generic/patches-3.0/100-overlayfs_v10.patch +++ b/target/linux/generic/patches-3.0/100-overlayfs_v11.patch @@ -1,3 +1,283 @@ +--- /dev/null ++++ b/Documentation/filesystems/overlayfs.txt +@@ -0,0 +1,199 @@ ++Written by: Neil Brown <neilb@suse.de> ++ ++Overlay Filesystem ++================== ++ ++This document describes a prototype for a new approach to providing ++overlay-filesystem functionality in Linux (sometimes referred to as ++union-filesystems).  An overlay-filesystem tries to present a ++filesystem which is the result over overlaying one filesystem on top ++of the other. ++ ++The result will inevitably fail to look exactly like a normal ++filesystem for various technical reasons.  The expectation is that ++many use cases will be able to ignore these differences. ++ ++This approach is 'hybrid' because the objects that appear in the ++filesystem do not all appear to belong to that filesystem.  In many ++cases an object accessed in the union will be indistinguishable ++from accessing the corresponding object from the original filesystem. ++This is most obvious from the 'st_dev' field returned by stat(2). ++ ++While directories will report an st_dev from the overlay-filesystem, ++all non-directory objects will report an st_dev from the lower or ++upper filesystem that is providing the object.  Similarly st_ino will ++only be unique when combined with st_dev, and both of these can change ++over the lifetime of a non-directory object.  
Many applications and ++tools ignore these values and will not be affected. ++ ++Upper and Lower ++--------------- ++ ++An overlay filesystem combines two filesystems - an 'upper' filesystem ++and a 'lower' filesystem.  When a name exists in both filesystems, the ++object in the 'upper' filesystem is visible while the object in the ++'lower' filesystem is either hidden or, in the case of directories, ++merged with the 'upper' object. ++ ++It would be more correct to refer to an upper and lower 'directory ++tree' rather than 'filesystem' as it is quite possible for both ++directory trees to be in the same filesystem and there is no ++requirement that the root of a filesystem be given for either upper or ++lower. ++ ++The lower filesystem can be any filesystem supported by Linux and does ++not need to be writable.  The lower filesystem can even be another ++overlayfs.  The upper filesystem will normally be writable and if it ++is it must support the creation of trusted.* extended attributes, and ++must provide valid d_type in readdir responses, at least for symbolic ++links - so NFS is not suitable. ++ ++A read-only overlay of two read-only filesystems may use any ++filesystem type. ++ ++Directories ++----------- ++ ++Overlaying mainly involves directories.  If a given name appears in both ++upper and lower filesystems and refers to a non-directory in either, ++then the lower object is hidden - the name refers only to the upper ++object. ++ ++Where both upper and lower objects are directories, a merged directory ++is formed. ++ ++At mount time, the two directories given as mount options are combined ++into a merged directory: ++ ++  mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper /overlay ++ ++Then whenever a lookup is requested in such a merged directory, the ++lookup is performed in each actual directory and the combined result ++is cached in the dentry belonging to the overlay filesystem.  
If both ++actual lookups find directories, both are stored and a merged ++directory is created, otherwise only one is stored: the upper if it ++exists, else the lower. ++ ++Only the lists of names from directories are merged.  Other content ++such as metadata and extended attributes are reported for the upper ++directory only.  These attributes of the lower directory are hidden. ++ ++whiteouts and opaque directories ++-------------------------------- ++ ++In order to support rm and rmdir without changing the lower ++filesystem, an overlay filesystem needs to record in the upper filesystem ++that files have been removed.  This is done using whiteouts and opaque ++directories (non-directories are always opaque). ++ ++The overlay filesystem uses extended attributes with a ++"trusted.overlay."  prefix to record these details. ++ ++A whiteout is created as a symbolic link with target ++"(overlay-whiteout)" and with xattr "trusted.overlay.whiteout" set to "y". ++When a whiteout is found in the upper level of a merged directory, any ++matching name in the lower level is ignored, and the whiteout itself ++is also hidden. ++ ++A directory is made opaque by setting the xattr "trusted.overlay.opaque" ++to "y".  Where the upper filesystem contains an opaque directory, any ++directory in the lower filesystem with the same name is ignored. ++ ++readdir ++------- ++ ++When a 'readdir' request is made on a merged directory, the upper and ++lower directories are each read and the name lists merged in the ++obvious way (upper is read first, then lower - entries that already ++exist are not re-added).  This merged name list is cached in the ++'struct file' and so remains as long as the file is kept open.  If the ++directory is opened and read by two processes at the same time, they ++will each have separate caches.  A seekdir to the start of the ++directory (offset 0) followed by a readdir will cause the cache to be ++discarded and rebuilt. 
++ ++This means that changes to the merged directory do not appear while a ++directory is being read.  This is unlikely to be noticed by many ++programs. ++ ++seek offsets are assigned sequentially when the directories are read. ++Thus if ++  - read part of a directory ++  - remember an offset, and close the directory ++  - re-open the directory some time later ++  - seek to the remembered offset ++ ++there may be little correlation between the old and new locations in ++the list of filenames, particularly if anything has changed in the ++directory. ++ ++Readdir on directories that are not merged is simply handled by the ++underlying directory (upper or lower). ++ ++ ++Non-directories ++--------------- ++ ++Objects that are not directories (files, symlinks, device-special ++files etc.) are presented either from the upper or lower filesystem as ++appropriate.  When a file in the lower filesystem is accessed in a way ++that requires write-access, such as opening for write access, changing ++some metadata etc., the file is first copied from the lower filesystem ++to the upper filesystem (copy_up).  Note that creating a hard-link ++also requires copy_up, though of course creation of a symlink does ++not. ++ ++The copy_up may turn out to be unnecessary, for example if the file is ++opened for read-write but the data is not modified. ++ ++The copy_up process first makes sure that the containing directory ++exists in the upper filesystem - creating it and any parents as ++necessary.  It then creates the object with the same metadata (owner, ++mode, mtime, symlink-target etc.) and then if the object is a file, the ++data is copied from the lower to the upper filesystem.  Finally any ++extended attributes are copied up. 
++ ++Once the copy_up is complete, the overlay filesystem simply ++provides direct access to the newly created file in the upper ++filesystem - future operations on the file are barely noticed by the ++overlay filesystem (though an operation on the name of the file such as ++rename or unlink will of course be noticed and handled). ++ ++ ++Non-standard behavior ++--------------------- ++ ++The copy_up operation essentially creates a new, identical file and ++moves it over to the old name.  The new file may be on a different ++filesystem, so both st_dev and st_ino of the file may change. ++ ++Any open files referring to this inode will access the old data and ++metadata.  Similarly any file locks obtained before copy_up will not ++apply to the copied up file. ++ ++On a file opened with O_RDONLY fchmod(2), fchown(2), futimesat(2) ++and fsetxattr(2) will fail with EROFS. ++ ++If a file with multiple hard links is copied up, then this will ++"break" the link.  Changes will not be propagated to other names ++referring to the same inode. ++ ++Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory ++object in overlayfs will not contain valid absolute paths, only ++relative paths leading up to the filesystem's root.  This will be ++fixed in the future. ++ ++Some operations are not atomic, for example a crash during copy_up or ++rename will leave the filesystem in an inconsistent state.  This will ++be addressed in the future. ++ ++Changes to underlying filesystems ++--------------------------------- ++ ++Offline changes, when the overlay is not mounted, are allowed to either ++the upper or the lower trees. ++ ++Changes to the underlying filesystems while part of a mounted overlay ++filesystem are not allowed.  If the underlying filesystem is changed, ++the behavior of the overlay is undefined, though it will not result in ++a crash or deadlock. 
+--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -4727,6 +4727,13 @@ F:	drivers/scsi/osd/ + F:	include/scsi/osd_* + F:	fs/exofs/ +  ++OVERLAYFS FILESYSTEM ++M:	Miklos Szeredi <miklos@szeredi.hu> ++L:	linux-fsdevel@vger.kernel.org ++S:	Supported ++F:	fs/overlayfs/* ++F:	Documentation/filesystems/overlayfs.txt ++ + P54 WIRELESS DRIVER + M:	Christian Lamparter <chunkeey@googlemail.com> + L:	linux-wireless@vger.kernel.org +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -63,6 +63,7 @@ source "fs/quota/Kconfig" +  + source "fs/autofs4/Kconfig" + source "fs/fuse/Kconfig" ++source "fs/overlayfs/Kconfig" +  + config CUSE + 	tristate "Character device in Userspace support" +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -105,6 +105,7 @@ obj-$(CONFIG_QNX4FS_FS)		+= qnx4/ + obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/ + obj-$(CONFIG_ADFS_FS)		+= adfs/ + obj-$(CONFIG_FUSE_FS)		+= fuse/ ++obj-$(CONFIG_OVERLAYFS_FS)	+= overlayfs/ + obj-$(CONFIG_UDF_FS)		+= udf/ + obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/ + obj-$(CONFIG_OMFS_FS)		+= omfs/ +--- a/fs/ecryptfs/main.c ++++ b/fs/ecryptfs/main.c +@@ -544,6 +544,13 @@ static struct dentry *ecryptfs_mount(str + 	s->s_maxbytes = path.dentry->d_sb->s_maxbytes; + 	s->s_blocksize = path.dentry->d_sb->s_blocksize; + 	s->s_magic = ECRYPTFS_SUPER_MAGIC; ++	s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; ++ ++	rc = -EINVAL; ++	if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { ++		printk(KERN_ERR "eCryptfs: maximum fs stacking depth exceeded\n"); ++		goto out_free; ++	} +  + 	inode = ecryptfs_get_inode(path.dentry->d_inode, s); + 	rc = PTR_ERR(inode); +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -1492,6 +1492,23 @@ void drop_collected_mounts(struct vfsmou + 	release_mounts(&umount_list); + } +  ++struct vfsmount *clone_private_mount(struct path *path) ++{ ++	struct vfsmount *mnt; ++ ++	if (IS_MNT_UNBINDABLE(path->mnt)) ++		return ERR_PTR(-EINVAL); ++ ++	down_read(&namespace_sem); ++	mnt = clone_mnt(path->mnt, path->dentry, CL_PRIVATE); ++	
up_read(&namespace_sem); ++	if (!mnt) ++		return ERR_PTR(-ENOMEM); ++ ++	return mnt; ++} ++EXPORT_SYMBOL_GPL(clone_private_mount); ++ + int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, + 		   struct vfsmount *root) + {  --- a/fs/open.c  +++ b/fs/open.c  @@ -666,8 +666,7 @@ static inline int __get_file_write_acces @@ -154,92 +434,6 @@   static void __put_unused_fd(struct files_struct *files, unsigned int fd)   { ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -1603,6 +1603,7 @@ struct inode_operations { - 	void (*truncate_range)(struct inode *, loff_t, loff_t); - 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, - 		      u64 len); -+	struct file *(*open)(struct dentry *, int flags, const struct cred *); - } ____cacheline_aligned; -  - struct seq_file; -@@ -1998,6 +1999,7 @@ extern long do_sys_open(int dfd, const c - extern struct file *filp_open(const char *, int, int); - extern struct file *file_open_root(struct dentry *, struct vfsmount *, - 				   const char *, int); -+extern struct file *vfs_open(struct path *, int flags, const struct cred *); - extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, - 				 const struct cred *); - extern int filp_close(struct file *, fl_owner_t id); ---- a/fs/splice.c -+++ b/fs/splice.c -@@ -1300,6 +1300,7 @@ long do_splice_direct(struct file *in, l -  - 	return ret; - } -+EXPORT_SYMBOL(do_splice_direct); -  - static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, - 			       struct pipe_inode_info *opipe, ---- a/fs/namespace.c -+++ b/fs/namespace.c -@@ -1492,6 +1492,23 @@ void drop_collected_mounts(struct vfsmou - 	release_mounts(&umount_list); - } -  -+struct vfsmount *clone_private_mount(struct path *path) -+{ -+	struct vfsmount *mnt; -+ -+	if (IS_MNT_UNBINDABLE(path->mnt)) -+		return ERR_PTR(-EINVAL); -+ -+	down_read(&namespace_sem); -+	mnt = clone_mnt(path->mnt, path->dentry, CL_PRIVATE); -+	up_read(&namespace_sem); -+	if (!mnt) -+		return 
ERR_PTR(-ENOMEM); -+ -+	return mnt; -+} -+EXPORT_SYMBOL_GPL(clone_private_mount); -+ - int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, - 		   struct vfsmount *root) - { ---- a/include/linux/mount.h -+++ b/include/linux/mount.h -@@ -100,6 +100,9 @@ extern void mnt_pin(struct vfsmount *mnt - extern void mnt_unpin(struct vfsmount *mnt); - extern int __mnt_is_readonly(struct vfsmount *mnt); -  -+struct path; -+extern struct vfsmount *clone_private_mount(struct path *path); -+ - extern struct vfsmount *do_kern_mount(const char *fstype, int flags, - 				      const char *name, void *data); -  ---- a/fs/Kconfig -+++ b/fs/Kconfig -@@ -63,6 +63,7 @@ source "fs/quota/Kconfig" -  - source "fs/autofs4/Kconfig" - source "fs/fuse/Kconfig" -+source "fs/overlayfs/Kconfig" -  - config CUSE - 	tristate "Character device in Userspace support" ---- a/fs/Makefile -+++ b/fs/Makefile -@@ -105,6 +105,7 @@ obj-$(CONFIG_QNX4FS_FS)		+= qnx4/ - obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/ - obj-$(CONFIG_ADFS_FS)		+= adfs/ - obj-$(CONFIG_FUSE_FS)		+= fuse/ -+obj-$(CONFIG_OVERLAYFS_FS)	+= overlayfs/ - obj-$(CONFIG_UDF_FS)		+= udf/ - obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/ - obj-$(CONFIG_OMFS_FS)		+= omfs/  --- /dev/null  +++ b/fs/overlayfs/Kconfig  @@ -0,0 +1,4 @@ @@ -645,7 +839,7 @@  +}  --- /dev/null  +++ b/fs/overlayfs/dir.c -@@ -0,0 +1,607 @@ +@@ -0,0 +1,596 @@  +/*  + *  + * Copyright (C) 2011 Novell Inc. @@ -663,17 +857,6 @@  +  +static const char *ovl_whiteout_symlink = "(overlay-whiteout)";  + -+static struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, -+				 struct nameidata *nd) -+{ -+	int err = ovl_do_lookup(dentry); -+ -+	if (err) -+		return ERR_PTR(err); -+ -+	return NULL; -+} -+  +static int ovl_whiteout(struct dentry *upperdir, struct dentry *dentry)  +{  +	int err; @@ -1255,7 +1438,7 @@  +};  --- /dev/null  +++ b/fs/overlayfs/inode.c -@@ -0,0 +1,375 @@ +@@ -0,0 +1,384 @@  +/*  + *  + * Copyright (C) 2011 Novell Inc. 
@@ -1348,9 +1531,18 @@  +		/*  +		 * Writes will always be redirected to upper layer, so  +		 * ignore lower layer being read-only. ++		 * ++		 * If the overlay itself is read-only then proceed ++		 * with the permission check, don't return EROFS. ++		 * This will only happen if this is the lower layer of ++		 * another overlayfs. ++		 * ++		 * If upper fs becomes read-only after the overlay was ++		 * constructed return EROFS to prevent modification of ++		 * upper layer.  +		 */  +		err = -EROFS; -+		if (is_upper && IS_RDONLY(realinode) && ++		if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&  +		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))  +			goto out_dput;  + @@ -1633,7 +1825,7 @@  +}  --- /dev/null  +++ b/fs/overlayfs/overlayfs.h -@@ -0,0 +1,62 @@ +@@ -0,0 +1,63 @@  +/*  + *  + * Copyright (C) 2011 Novell Inc. @@ -1669,7 +1861,8 @@  +void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);  +bool ovl_is_whiteout(struct dentry *dentry);  +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); -+int ovl_do_lookup(struct dentry *dentry); ++struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ++			  struct nameidata *nd);  +  +struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,  +				struct kstat *stat, const char *link); @@ -1866,8 +2059,8 @@  +	return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);  +}  + -+static int ovl_dir_read(struct path *realpath, struct ovl_readdir_data *rdd, -+			  filldir_t filler) ++static inline int ovl_dir_read(struct path *realpath, ++			       struct ovl_readdir_data *rdd, filldir_t filler)  +{  +	struct file *realfile;  +	int err; @@ -1947,7 +2140,7 @@  +	return 0;  +}  + -+static int ovl_dir_read_merged(struct path *upperpath, struct path *lowerpath, ++static inline int ovl_dir_read_merged(struct path *upperpath, struct path *lowerpath,  +			       struct ovl_readdir_data *rdd)  +{  +	int err; @@ -2259,7 +2452,7 @@  +}  --- 
/dev/null  +++ b/fs/overlayfs/super.c -@@ -0,0 +1,625 @@ +@@ -0,0 +1,656 @@  +/*  + *  + * Copyright (C) 2011 Novell Inc. @@ -2510,7 +2703,7 @@  +	return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL);  +}  + -+static struct dentry *ovl_lookup_real(struct dentry *dir, struct qstr *name) ++static inline struct dentry *ovl_lookup_real(struct dentry *dir, struct qstr *name)  +{  +	struct dentry *dentry;  + @@ -2528,7 +2721,7 @@  +	return dentry;  +}  + -+int ovl_do_lookup(struct dentry *dentry) ++static int ovl_do_lookup(struct dentry *dentry)  +{  +	struct ovl_entry *oe;  +	struct dentry *upperdir; @@ -2625,6 +2818,17 @@  +	return err;  +}  + ++struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ++			  struct nameidata *nd) ++{ ++	int err = ovl_do_lookup(dentry); ++ ++	if (err) ++		return ERR_PTR(err); ++ ++	return NULL; ++} ++  +static void ovl_put_super(struct super_block *sb)  +{  +	struct ovl_fs *ufs = sb->s_fs_info; @@ -2796,6 +3000,16 @@  +	    !S_ISDIR(lowerpath.dentry->d_inode->i_mode))  +		goto out_put_lowerpath;  + ++	sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth, ++				lowerpath.mnt->mnt_sb->s_stack_depth) + 1; ++ ++	err = -EINVAL; ++	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { ++		printk(KERN_ERR "overlayfs: maximum fs stacking depth exceeded\n"); ++		goto out_put_lowerpath; ++	} ++ ++  +	ufs->upper_mnt = clone_private_mount(&upperpath);  +	err = PTR_ERR(ufs->upper_mnt);  +	if (IS_ERR(ufs->upper_mnt)) { @@ -2810,6 +3024,16 @@  +		goto out_put_upper_mnt;  +	}  + ++	/* ++	 * Make lower_mnt R/O.  That way fchmod/fchown on lower file ++	 * will fail instead of modifying lower fs. 
++	 */ ++	ufs->lower_mnt->mnt_flags |= MNT_READONLY; ++ ++	/* If the upper fs is r/o, we mark overlayfs r/o too */ ++	if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY) ++		sb->s_flags |= MS_RDONLY; ++  +	if (!(sb->s_flags & MS_RDONLY)) {  +		err = mnt_want_write(ufs->upper_mnt);  +		if (err) @@ -2885,189 +3109,68 @@  +  +module_init(ovl_init);  +module_exit(ovl_exit); ---- /dev/null -+++ b/Documentation/filesystems/overlayfs.txt -@@ -0,0 +1,167 @@ -+Written by: Neil Brown <neilb@suse.de> -+ -+Overlay Filesystem -+================== -+ -+This document describes a prototype for a new approach to providing -+overlay-filesystem functionality in Linux (sometimes referred to as -+union-filesystems).  An overlay-filesystem tries to present a -+filesystem which is the result over overlaying one filesystem on top -+of the other. -+ -+The result will inevitably fail to look exactly like a normal -+filesystem for various technical reasons.  The expectation is that -+many use cases will be able to ignore these differences. -+ -+This approach is 'hybrid' because the objects that appear in the -+filesystem do not all appear to belong to that filesystem.  In many -+cases an object accessed in the union will be indistinguishable -+from accessing the corresponding object from the original filesystem. -+This is most obvious from the 'st_dev' field returned by stat(2). -+ -+While directories will report an st_dev from the overlay-filesystem, -+all non-directory objects will report an st_dev from the lower or -+upper filesystem that is providing the object.  Similarly st_ino will -+only be unique when combined with st_dev, and both of these can change -+over the lifetime of a non-directory object.  Many applications and -+tools ignore these values and will not be affected. -+ -+Upper and Lower -+--------------- -+ -+An overlay filesystem combines two filesystems - an 'upper' filesystem -+and a 'lower' filesystem.  
When a name exists in both filesystems, the -+object in the 'upper' filesystem is visible while the object in the -+'lower' filesystem is either hidden or, in the case of directories, -+merged with the 'upper' object. -+ -+It would be more correct to refer to an upper and lower 'directory -+tree' rather than 'filesystem' as it is quite possible for both -+directory trees to be in the same filesystem and there is no -+requirement that the root of a filesystem be given for either upper or -+lower. -+ -+The lower filesystem can be any filesystem supported by Linux and does -+not need to be writable.  The lower filesystem can even be another -+overlayfs.  The upper filesystem will normally be writable and if it -+is it must support the creation of trusted.* extended attributes, and -+must provide valid d_type in readdir responses, at least for symbolic -+links - so NFS is not suitable. -+ -+A read-only overlay of two read-only filesystems may use any -+filesystem type. -+ -+Directories -+----------- -+ -+Overlaying mainly involved directories.  If a given name appears in both -+upper and lower filesystems and refers to a non-directory in either, -+then the lower object is hidden - the name refers only to the upper -+object. -+ -+Where both upper and lower objects are directories, a merged directory -+is formed. -+ -+At mount time, the two directories given as mount options are combined -+into a merged directory: -+ -+  mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper /overlay -+ -+Then whenever a lookup is requested in such a merged directory, the -+lookup is performed in each actual directory and the combined result -+is cached in the dentry belonging to the overlay filesystem.  If both -+actual lookups find directories, both are stored and a merged -+directory is created, otherwise only one is stored: the upper if it -+exists, else the lower. -+ -+Only the lists of names from directories are merged.  
Other content -+such as metadata and extended attributes are reported for the upper -+directory only.  These attributes of the lower directory are hidden. -+ -+whiteouts and opaque directories -+-------------------------------- -+ -+In order to support rm and rmdir without changing the lower -+filesystem, an overlay filesystem needs to record in the upper filesystem -+that files have been removed.  This is done using whiteouts and opaque -+directories (non-directories are always opaque). -+ -+The overlay filesystem uses extended attributes with a -+"trusted.overlay."  prefix to record these details. -+ -+A whiteout is created as a symbolic link with target -+"(overlay-whiteout)" and with xattr "trusted.overlay.whiteout" set to "y". -+When a whiteout is found in the upper level of a merged directory, any -+matching name in the lower level is ignored, and the whiteout itself -+is also hidden. -+ -+A directory is made opaque by setting the xattr "trusted.overlay.opaque" -+to "y".  Where the upper filesystem contains an opaque directory, any -+directory in the lower filesystem with the same name is ignored. -+ -+readdir -+------- -+ -+When a 'readdir' request is made on a merged directory, the upper and -+lower directories are each read and the name lists merged in the -+obvious way (upper is read first, then lower - entries that already -+exist are not re-added).  This merged name list is cached in the -+'struct file' and so remains as long as the file is kept open.  If the -+directory is opened and read by two processes at the same time, they -+will each have separate caches.  A seekdir to the start of the -+directory (offset 0) followed by a readdir will cause the cache to be -+discarded and rebuilt. -+ -+This means that changes to the merged directory do not appear while a -+directory is being read.  This is unlikely to be noticed by many -+programs. -+ -+seek offsets are assigned sequentially when the directories are read. 
-+Thus if -+  - read part of a directory -+  - remember an offset, and close the directory -+  - re-open the directory some time later -+  - seek to the remembered offset -+ -+there may be little correlation between the old and new locations in -+the list of filenames, particularly if anything has changed in the -+directory. -+ -+Readdir on directories that are not merged is simply handled by the -+underlying directory (upper or lower). -+ -+ -+Non-directories -+--------------- -+ -+Objects that are not directories (files, symlinks, device-special -+files etc.) are presented either from the upper or lower filesystem as -+appropriate.  When a file in the lower filesystem is accessed in a way -+the requires write-access, such as opening for write access, changing -+some metadata etc., the file is first copied from the lower filesystem -+to the upper filesystem (copy_up).  Note that creating a hard-link -+also requires copy_up, though of course creation of a symlink does -+not. -+ -+The copy_up process first makes sure that the containing directory -+exists in the upper filesystem - creating it and any parents as -+necessary.  It then creates the object with the same metadata (owner, -+mode, mtime, symlink-target etc.) and then if the object is a file, the -+data is copied from the lower to the upper filesystem.  Finally any -+extended attributes are copied up. -+ -+Once the copy_up is complete, the overlay filesystem simply -+provides direct access to the newly created file in the upper -+filesystem - future operations on the file are barely noticed by the -+overlay filesystem (though an operation on the name of the file such as -+rename or unlink will of course be noticed and handled). 
-+ -+Changes to underlying filesystems -+--------------------------------- +--- a/fs/splice.c ++++ b/fs/splice.c +@@ -1300,6 +1300,7 @@ long do_splice_direct(struct file *in, l +  + 	return ret; + } ++EXPORT_SYMBOL(do_splice_direct); +  + static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + 			       struct pipe_inode_info *opipe, +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -480,6 +480,12 @@ struct iattr { +  */ + #include <linux/quota.h> +  ++/* ++ * Maximum number of layers of fs stack.  Needs to be limited to ++ * prevent kernel stack overflow ++ */ ++#define FILESYSTEM_MAX_STACK_DEPTH 2  + -+Offline changes, when the overlay is not mounted, are allowed to either -+the upper or the lower trees. + /**  +  * enum positive_aop_returns - aop return codes with specific semantics +  * +@@ -1438,6 +1444,11 @@ struct super_block { + 	 * Saved pool identifier for cleancache (-1 means none) + 	 */ + 	int cleancache_poolid;  + -+Changes to the underlying filesystems while part of a mounted overlay -+filesystem are not allowed.  This is not yet enforced, but will be in -+the future. 
---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -4727,6 +4727,13 @@ F:	drivers/scsi/osd/ - F:	include/scsi/osd_* - F:	fs/exofs/ ++	/* ++	 * Indicates how deep in a filesystem stack this SB is ++	 */ ++	int s_stack_depth; + }; -+OVERLAYFS FILESYSTEM -+M:	Miklos Szeredi <miklos@szeredi.hu> -+L:	linux-fsdevel@vger.kernel.org -+S:	Supported -+F:	fs/overlayfs/* -+F:	Documentation/filesystems/overlayfs.txt + extern struct timespec current_fs_time(struct super_block *sb); +@@ -1603,6 +1614,7 @@ struct inode_operations { + 	void (*truncate_range)(struct inode *, loff_t, loff_t); + 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, + 		      u64 len); ++	struct file *(*open)(struct dentry *, int flags, const struct cred *); + } ____cacheline_aligned; +  + struct seq_file; +@@ -1998,6 +2010,7 @@ extern long do_sys_open(int dfd, const c + extern struct file *filp_open(const char *, int, int); + extern struct file *file_open_root(struct dentry *, struct vfsmount *, + 				   const char *, int); ++extern struct file *vfs_open(struct path *, int flags, const struct cred *); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, + 				 const struct cred *); + extern int filp_close(struct file *, fl_owner_t id); +--- a/include/linux/mount.h ++++ b/include/linux/mount.h +@@ -100,6 +100,9 @@ extern void mnt_pin(struct vfsmount *mnt + extern void mnt_unpin(struct vfsmount *mnt); + extern int __mnt_is_readonly(struct vfsmount *mnt); +  ++struct path; ++extern struct vfsmount *clone_private_mount(struct path *path);  + - P54 WIRELESS DRIVER - M:	Christian Lamparter <chunkeey@googlemail.com> - L:	linux-wireless@vger.kernel.org + extern struct vfsmount *do_kern_mount(const char *fstype, int flags, + 				      const char *name, void *data); +   | 
