[lxc-devel] [PATCH RFC] Introduce new security.nscapability xattr

Serge E. Hallyn serge.hallyn at ubuntu.com
Mon Nov 30 22:43:56 UTC 2015


A common way for daemons to run with minimal privilege is to start as root,
perhaps setuid-root, choose a desired capability set, set PR_SET_KEEPCAPS,
then change uid to non-root.  A simpler way to achieve this is to set file
capabilities on a not-setuid-root binary.  However, when installing a package
inside a (user-namespaced) container, packages cannot be installed with file
capabilities.  For this reason, containers must install ping setuid-root.

To achieve this, we would need for containers to be able to request file
capabilities be added to a file without causing these to be honored in the
initial user namespace.

To this end, the patch below introduces a new capability xattr format.  The
main enhancement over the existing security.capability xattr is that we
tag capability sets with a uid - the uid of the root user in the namespace
where the capabilities are set.  The capabilities will be ignored in any
other namespace.  The special case of uid == -1 (which must only ever be
able to be set by kuid 0) means use the capabilities in all namespaces.

An alternative format would use a pair of uids to indicate a range of rootids.
This would allow root in a user namespace with uids 100000-165536 mapped to
set the xattr once on a file, then launch nested containers wherein the file
could be used with privilege.  That's not what this patch does, but would be
a trivial change if people think it would be worthwhile.

This patch does not actually address the real problem, which is setting the
xattrs from inside containers.  For that, I think the best solution is to
add a pair of new system calls, setfcap and getfcap. Userspace would for
instance call fsetfcap(fd, cap_user_header_t, cap_user_data_t), to which
the kernel would, if not in init_user_ns, react by writing an appropriate
security.nscapability xattr.

The libcap2 library's cap_set_file/cap_get_file could be switched over
transparently to use this to hide its use from all callers.

Comments appreciated.

Note - In this patch, file capabilities only work for containers which have
a root uid defined.  We may want to allow -1 uids to work in all
namespaces.  There certainly would be uses for this, but I'm a bit unsettled
about the implications of allowing a program privilege in a container where
there is no uid with privilege.  This needs more thought.

Signed-off-by: Serge Hallyn <serge.hallyn at ubuntu.com>
---
 include/linux/capability.h      |  15 +++++
 include/uapi/linux/capability.h |  47 ++++++++++++++
 include/uapi/linux/xattr.h      |   3 +
 security/commoncap.c            | 135 ++++++++++++++++++++++++++++++++++++++--
 4 files changed, 194 insertions(+), 6 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index af9f0b9..24ac18e 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -13,6 +13,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include <uapi/linux/capability.h>
+#include <linux/uidgid.h>
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -31,6 +32,20 @@ struct cpu_vfs_cap_data {
 	kernel_cap_t inheritable;
 };
 
+struct cpu_vfs_ns_cap_data {
+	__u32 flags;
+	kuid_t rootid;
+	kernel_cap_t permitted;
+	kernel_cap_t inheritable;
+};
+
+struct cpu_vfs_ns_cap_header {
+	__u32 hdr_info;
+	struct cpu_vfs_ns_cap_data caps[0];
+};
+#define NS_CAPS_VERSION(x) (x & 0xFF)
+#define NS_CAPS_NCAPS(x) ( (x >> 8) & 0xFF )
+
 #define _USER_CAP_HEADER_SIZE  (sizeof(struct __user_cap_header_struct))
 #define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))
 
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 12c37a1..2211a33 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -62,10 +62,14 @@ typedef struct __user_cap_data_struct {
 #define VFS_CAP_U32_2           2
 #define XATTR_CAPS_SZ_2         (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2))
 
+/* version number for security.nscapability xattrs hdr->hdr_info */
+#define VFS_NS_CAP_REVISION     1
+
 #define XATTR_CAPS_SZ           XATTR_CAPS_SZ_2
 #define VFS_CAP_U32             VFS_CAP_U32_2
 #define VFS_CAP_REVISION	VFS_CAP_REVISION_2
 
+
 struct vfs_cap_data {
 	__le32 magic_etc;            /* Little endian */
 	struct {
@@ -74,6 +78,49 @@ struct vfs_cap_data {
 	} data[VFS_CAP_U32];
 };
 
+/*
+ * Q: do we want version in the header, or in the data?
+ * If it is in the header, then a container will need to
+ * make sure it is writing the same data.
+ *
+ * Actually, perhaps we simply do not support writing the
+ * xattr, we just use a new system call to get/set the fscap.
+ * The kernel can be in charge of watching the version numbers.
+ * After all, we can't allow the container to override the
+ * fscaps of the init ns.
+ *
+ * @flags currently only containers the effective bit.  The
+ * other bits are reserved, and must be 0 at the moment.
+ * @rootid contains the kuid value of the root in the namespace
+ * for which this capability should be used.  If -1, then this
+ * works for all namespaces.  Only root in the initial ns can
+ * use this.
+ *
+ * Q: do we want to use a range instead?  Then root in a container
+ * could allow one binary with one capability to be used by any
+ * nested containers.
+ */
+#define VFS_NS_CAP_EFFECTIVE    0x1
+struct vfs_ns_cap_data {
+	__le32 flags;
+	__le32 rootid;
+	struct {
+		__le32 permitted;    /* Little endian */
+		__le32 inheritable;  /* Little endian */
+	} data[VFS_CAP_U32];
+};
+
+/*
+ * 32-bit hdr_info contains
+ * 16 leftmost: reserved
+ * next 8: ncaps
+ * last 8: version
+ */
+struct vfs_ns_cap_header {
+	__le32 hdr_info;
+	/* ncaps * vfs_ns_cap_data */
+};
+
 #ifndef __KERNEL__
 
 /*
diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
index 1590c49..67c80ab 100644
--- a/include/uapi/linux/xattr.h
+++ b/include/uapi/linux/xattr.h
@@ -68,6 +68,9 @@
 #define XATTR_CAPS_SUFFIX "capability"
 #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX
 
+#define XATTR_NS_CAPS_SUFFIX "nscapability"
+#define XATTR_NAME_NS_CAPS XATTR_SECURITY_PREFIX XATTR_NS_CAPS_SUFFIX
+
 #define XATTR_POSIX_ACL_ACCESS  "posix_acl_access"
 #define XATTR_NAME_POSIX_ACL_ACCESS XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_ACCESS
 #define XATTR_POSIX_ACL_DEFAULT  "posix_acl_default"
diff --git a/security/commoncap.c b/security/commoncap.c
index 1832cf7..c44edf3 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -308,6 +308,10 @@ int cap_inode_need_killpriv(struct dentry *dentry)
 	if (!inode->i_op->getxattr)
 	       return 0;
 
+	error = inode->i_op->getxattr(dentry, XATTR_NAME_NS_CAPS, NULL, 0);
+	if (error > 0)
+		return 1;
+
 	error = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0);
 	if (error <= 0)
 		return 0;
@@ -325,11 +329,17 @@ int cap_inode_need_killpriv(struct dentry *dentry)
 int cap_inode_killpriv(struct dentry *dentry)
 {
 	struct inode *inode = d_backing_inode(dentry);
+	int ret1, ret2;;
 
 	if (!inode->i_op->removexattr)
 	       return 0;
 
-	return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
+	ret1 = inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
+	ret2 = inode->i_op->removexattr(dentry, XATTR_NAME_NS_CAPS);
+
+	if (ret1 != 0)
+		return ret1;
+	return ret2;
 }
 
 /*
@@ -433,6 +443,117 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
 	return 0;
 }
 
+int get_vfs_ns_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
+{
+	struct inode *inode = d_backing_inode(dentry);
+	unsigned tocopy, i;
+	int ret = 0, size, expected;
+	unsigned len = 0;
+	struct vfs_ns_cap_header *hdr;
+	struct vfs_ns_cap_data *cap, *nscap = NULL;
+	__u16 ncaps, version;
+	__u32 hdr_info;
+	kuid_t current_root, caprootuid;
+
+	memset(cpu_caps, 0, sizeof(*cpu_caps));
+
+	if (!inode || !inode->i_op->getxattr)
+		return -ENODATA;
+
+	/* get the size */
+	size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS,
+			NULL, 0);
+	if (size == -ENODATA || size == -EOPNOTSUPP)
+		/* no data, that's ok */
+		return -ENODATA;
+	if (size < 0)
+		return size;
+	if (size < sizeof(struct cpu_vfs_ns_cap_header))
+		return -EINVAL;
+	if (size > sizeof(struct cpu_vfs_ns_cap_header) + 255 * sizeof(struct vfs_ns_cap_data))
+		return -EINVAL;
+	len = size;
+
+	hdr = kmalloc(len + 1, GFP_NOFS);
+	if (!hdr)
+		return -ENOMEM;
+
+	size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS, hdr,
+				   len);
+	if (size < 0) {
+		ret = size;
+		goto out;
+	}
+
+	if (size != len) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	hdr_info = le32_to_cpu(hdr->hdr_info);
+	version = NS_CAPS_VERSION(hdr_info);
+	ncaps = NS_CAPS_NCAPS(hdr_info);
+
+	if (version != VFS_NS_CAP_REVISION) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	expected = sizeof(*hdr) + ncaps * sizeof(*cap);
+	if (size != expected) {
+		ret = -EINVAL;
+		goto out;
+	}
+	tocopy = VFS_CAP_U32;
+
+	/* find an applicable entry */
+	/* a global entry (uid == -1) takes precedence */
+	current_root = make_kuid(current_user_ns(), 0);
+	if (!uid_valid(current_root)) {
+		/* no root user in this namespace;  no capabilities */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (i = 0, cap = (void *) hdr + sizeof(*hdr); i < ncaps; cap += sizeof(*cap), i++) {
+		uid_t uid = le32_to_cpu(cap->rootid);
+		if (uid == -1) {
+			nscap = cap;
+			break;
+		}
+
+		caprootuid = make_kuid(&init_user_ns, uid);
+		if (uid_eq(caprootuid, current_root))
+			nscap = cap;
+	}
+
+	if (!nscap) {
+		/* nothing found for this namespace */
+		ret = -ENODATA;
+		goto out;
+	}
+
+	/* copy the entry */
+	CAP_FOR_EACH_U32(i) {
+		if (i >= tocopy)
+			break;
+		cpu_caps->permitted.cap[i] = le32_to_cpu(nscap->data[i].permitted);
+		cpu_caps->inheritable.cap[i] = le32_to_cpu(nscap->data[i].inheritable);
+	}
+
+	cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+	cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+
+	cpu_caps->magic_etc = VFS_CAP_REVISION_2;
+	if (nscap->flags & VFS_NS_CAP_EFFECTIVE)
+		cpu_caps->magic_etc |= VFS_CAP_FLAGS_EFFECTIVE;
+
+out:
+	kfree(hdr);
+
+	return ret;
+}
+
 /*
  * Attempt to get the on-exec apply capability sets for an executable file from
  * its xattrs and, if present, apply them to the proposed credentials being
@@ -451,11 +572,13 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c
 	if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
 		return 0;
 
-	rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+	rc = get_vfs_ns_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+	if (rc == -ENODATA)
+		rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
 	if (rc < 0) {
 		if (rc == -EINVAL)
-			printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
-				__func__, rc, bprm->filename);
+			printk(KERN_NOTICE "Got EINVAL reading file caps for %s\n",
+				bprm->filename);
 		else if (rc == -ENODATA)
 			rc = 0;
 		goto out;
@@ -651,7 +774,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
 int cap_inode_setxattr(struct dentry *dentry, const char *name,
 		       const void *value, size_t size, int flags)
 {
-	if (!strcmp(name, XATTR_NAME_CAPS)) {
+	if (!strcmp(name, XATTR_NAME_CAPS) || !strcmp(name, XATTR_NAME_NS_CAPS)) {
 		if (!capable(CAP_SETFCAP))
 			return -EPERM;
 		return 0;
@@ -677,7 +800,7 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
  */
 int cap_inode_removexattr(struct dentry *dentry, const char *name)
 {
-	if (!strcmp(name, XATTR_NAME_CAPS)) {
+	if (!strcmp(name, XATTR_NAME_CAPS) || !strcmp(name, XATTR_NAME_NS_CAPS)) {
 		if (!capable(CAP_SETFCAP))
 			return -EPERM;
 		return 0;
-- 
2.5.0



More information about the lxc-devel mailing list