[lxc-devel] [PATCH 1/1] Implement userid mappings (enable user namespaces)

Serge Hallyn serge.hallyn at canonical.com
Tue Jan 15 00:03:06 UTC 2013


The 3.8 kernel now supports uid mappings, so I believe it's appropriate
to proceed with this patchset.
The container config supports new entries of the form:
 lxc.id_map = U 100000 0 10000
 lxc.id_map = G 100000 0 10000
meaning map 'virtual' uids (in the container) 0-9999 to uids
100000-109999 on the host, and same for gids.  So long as there are
mappings specified in the container config, CLONE_NEWUSER will
be used when the container is cloned.  This means that container
setup is no longer done with root privilege on the host, only root
privilege in the container.  Therefore cgroup setup is moved from the
init task to the monitor task.

To use this patchset, you currently need to either use the raring
kernel at ppa:serge-hallyn/usern-natty, or build your own kernel
from git://kernel.ubuntu.com/serge/quantal-userns.git.
(Alternatively you can use Eric's tree at the latest userns-always-map-*
branch at
git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git
but you will likely want to at least enable tmpfs mounts in user namespaces)

You also need to chown the files in the container rootfs into the
mapped range.  There is a utility at
https://code.launchpad.net/~serge-hallyn/+junk/nsexec to do this.
uidmapshift does the chowning, while the container-userns-convert
script nicely wraps that program.  So simply running

	sudo lxc-create -t ubuntu -n r1
	sudo container-userns-convert r1 200000

will create a container which is shifted so uid 0 in the container
is uid 200000 on the host.

TODO: when doing setuid(0), need to only do that if 0 is one of the
ids we map to.  Similarly, we should skip dropping capabilities only
if 0 is one of the ids we map to.  However, the question
of what to do for 'weird' containers in private user namespaces is
one I'm punting for later.

Signed-off-by: Serge Hallyn <serge.hallyn at ubuntu.com>
---
 doc/lxc.conf.sgml.in |  40 +++++++++++++++
 src/lxc/conf.c       | 134 +++++++++++++++++++++++++++++++++++++++++++++++++--
 src/lxc/conf.h       |  26 ++++++++++
 src/lxc/confile.c    |  60 +++++++++++++++++++++++
 src/lxc/start.c      |  35 ++++++++++++++
 5 files changed, 292 insertions(+), 3 deletions(-)

diff --git a/doc/lxc.conf.sgml.in b/doc/lxc.conf.sgml.in
index 1298143..ae91221 100644
--- a/doc/lxc.conf.sgml.in
+++ b/doc/lxc.conf.sgml.in
@@ -690,6 +690,46 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
     </refsect2>
 
     <refsect2>
+      <title>UID mappings</title>
+      <para>
+        A container can be started in a private user namespace with
+	user and group id mappings.  For instance, you can map userid
+	0 in the container to userid 200000 on the host.  The root
+	user in the container will be privileged in the container,
+	but unprivileged on the host.  Normally a system container
+	will want a range of ids, so you would map, for instance,
+	user and group ids 0 through 19,999 in the container to the
+	ids 200,000 through 219,999.
+      </para>
+      <variablelist>
+	<varlistentry>
+	  <term>
+	    <option>lxc.id_map</option>
+	  </term>
+	  <listitem>
+	    <para>
+	      Four values must be provided.  First a character, either
+	      'U', or 'G', to specify whether user or group ids are
+	      being mapped.  Next is the first userid as seen on the
+	      host.  Next is the userid to be mapped in the container.
+	      Finally, a range indicating the number of consecutive
+	      ids to map.  For instance
+	     </para>
+<programlisting>
+	lxc.id_map = U 200000 0 20000
+	lxc.id_map = G 200000 0 20000
+</programlisting>
+	    <para>
+	      will map both user and group ids in the
+	      range 0-19999 in the container to the ids
+	      200000-219999 on the host.
+	    </para>
+	  </listitem>
+	</varlistentry>
+      </variablelist>
+    </refsect2>
+
+    <refsect2>
       <title>Startup hooks</title>
       <para>
         Startup hooks are programs or scripts which can be executed
diff --git a/src/lxc/conf.c b/src/lxc/conf.c
index b516d7d..10d713b 100644
--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -2053,6 +2053,7 @@ struct lxc_conf *lxc_conf_init(void)
 	lxc_list_init(&new->network);
 	lxc_list_init(&new->mount_list);
 	lxc_list_init(&new->caps);
+	lxc_list_init(&new->id_map);
 	for (i=0; i<NUM_LXC_HOOKS; i++)
 		lxc_list_init(&new->hooks[i]);
 #if HAVE_APPARMOR
@@ -2427,6 +2428,124 @@ int lxc_assign_network(struct lxc_list *network, pid_t pid)
 	return 0;
 }
 
+/*
+ * Write an already formatted mapping table to a process's id map file.
+ *
+ * idtype: ID_TYPE_UID selects /proc/<pid>/uid_map, any other value
+ *         selects /proc/<pid>/gid_map
+ * pid:    process whose map file is written
+ * buf:    the table, one "<ns id> <host id> <count>" entry per line
+ * len:    number of bytes of buf to write
+ *
+ * Returns 0 on success and a negative value on failure.
+ */
+static int write_id_map(enum idtype idtype, pid_t pid, const char *buf, size_t len)
+{
+	char path[PATH_MAX];
+	int ret;
+	FILE *f;
+
+	ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid, idtype == ID_TYPE_UID ? 'u' : 'g');
+	if (ret < 0 || ret >= PATH_MAX) {
+		fprintf(stderr, "%s: path name too long", __func__);
+		return -E2BIG;
+	}
+	f = fopen(path, "w");
+	if (!f) {
+		perror("open");
+		return -EINVAL;
+	}
+	/*
+	 * Turn off stdio buffering so the table is handed to the kernel by
+	 * fwrite() itself.  With buffering the data could be deferred until
+	 * fclose(), and a mapping rejected by the kernel would go
+	 * unnoticed unless fclose() were checked too.
+	 */
+	setvbuf(f, NULL, _IONBF, 0);
+	ret = 0;
+	if (fwrite(buf, 1, len, f) != len) {
+		perror("write");
+		ret = -1;
+	}
+	if (fclose(f) != 0 && !ret) {
+		perror("write");
+		ret = -1;
+	}
+	return ret;
+}
+
+/*
+ * Write a single mapping entry to the uid_map or gid_map of process pid.
+ *
+ * host_start is the first id of the entry as seen on the host, ns_start
+ * the first id as seen inside the namespace, and range the number of
+ * consecutive ids.  Each map file can only be written once, so use
+ * lxc_map_ids() when a map needs more than one entry.
+ *
+ * Returns 0 on success and a negative value on failure.
+ */
+int add_id_mapping(enum idtype idtype, pid_t pid, uid_t host_start, uid_t ns_start, int range)
+{
+	char line[64];
+	int ret;
+
+	/* uid_t is unsigned, so it is formatted with %u rather than %d */
+	ret = snprintf(line, sizeof(line), "%u %u %d",
+		       (unsigned int)ns_start, (unsigned int)host_start, range);
+	if (ret < 0 || (size_t)ret >= sizeof(line)) {
+		fprintf(stderr, "%s: mapping too long", __func__);
+		return -E2BIG;
+	}
+	return write_id_map(idtype, pid, line, (size_t)ret);
+}
+
+/*
+ * Install all configured id mappings for process pid.
+ *
+ * The kernel accepts only one write to each of uid_map and gid_map, so
+ * the entries of each type are collected into one table and written
+ * together rather than entry by entry.
+ *
+ * Returns 0 on success, including when idmap holds no entries, and a
+ * negative value on failure.
+ */
+int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
+{
+	static const enum idtype types[] = { ID_TYPE_UID, ID_TYPE_GID };
+	struct lxc_list *iterator;
+	struct id_map *map;
+	char buf[4096];
+	size_t used, left, i;
+	int ret;
+
+	for (i = 0; i < sizeof(types) / sizeof(types[0]); i++) {
+		used = 0;
+		lxc_list_for_each(iterator, idmap) {
+			map = iterator->elem;
+			if (map->idtype != types[i])
+				continue;
+
+			left = sizeof(buf) - used;
+			ret = snprintf(buf + used, left, "%d %d %d\n",
+				       map->nsid, map->hostid, map->range);
+			if (ret < 0 || (size_t)ret >= left) {
+				fprintf(stderr, "%s: too many id mappings", __func__);
+				return -E2BIG;
+			}
+			used += (size_t)ret;
+		}
+
+		/* nothing configured for this type: leave its map file alone */
+		if (!used)
+			continue;
+
+		ret = write_id_map(types[i], pid, buf, used);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
 int lxc_find_gateway_addresses(struct lxc_handler *handler)
 {
 	struct lxc_list *network = &handler->conf->network;
@@ -2535,6 +2574,116 @@ void lxc_delete_tty(struct lxc_tty_info *tty_info)
 	tty_info->nbtty = 0;
 }
 
+/*
+ * Translate a container id into the host id it is mapped to.
+ *
+ * If uid falls inside the namespace-side range of a mapping of type w,
+ * return the corresponding host id.  If no mapping of that type covers
+ * it, return uid unchanged.
+ */
+static int shiftid(struct lxc_conf *c, int uid, enum idtype w)
+{
+	struct lxc_list *iterator;
+	struct id_map *map;
+
+	lxc_list_for_each(iterator, &c->id_map) {
+		map = iterator->elem;
+		if (map->idtype != w)
+			continue;
+
+		/* compare offsets so that nsid + range cannot overflow */
+		if (uid < map->nsid || uid - map->nsid >= map->range)
+			continue;
+
+		return uid - map->nsid + map->hostid;
+	}
+
+	return uid;
+}
+
+/*
+ * Take a pathname for a file created on the host and chown it to the
+ * host ids that its current owner and group numbers are mapped to in
+ * the container, if they are mapped at all.  (Used for ttys)
+ *
+ * Returns 0 on success or when nothing needs changing, -1 if the file
+ * cannot be stat'ed or chowned.
+ */
+static int uid_shift_file(char *path, struct lxc_conf *c)
+{
+	struct stat statbuf;
+	int olduid, oldgid, newuid, newgid;
+
+	if (stat(path, &statbuf)) {
+		SYSERROR("stat(%s)", path);
+		return -1;
+	}
+
+	/* shiftid() works on plain ints; convert once so the comparison
+	 * and the %d conversions below see matching types */
+	olduid = (int)statbuf.st_uid;
+	oldgid = (int)statbuf.st_gid;
+	newuid = shiftid(c, olduid, ID_TYPE_UID);
+	newgid = shiftid(c, oldgid, ID_TYPE_GID);
+	if (newuid != olduid || newgid != oldgid) {
+		DEBUG("chowning %s from %d:%d to %d:%d\n", path, olduid, oldgid, newuid, newgid);
+		if (chown(path, newuid, newgid)) {
+			SYSERROR("chown(%s)", path);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Chown the console and tty device nodes under /proc/<pid>/root/dev
+ * according to the configured id mappings.
+ *
+ * pid:  process whose root directory holds the device nodes
+ * conf: container configuration holding the id mappings, the tty count
+ *       and the optional tty directory
+ *
+ * Returns 0 on success or when there is nothing to do, -1 on failure.
+ */
+int uid_shift_ttys(int pid, struct lxc_conf *conf)
+{
+	int i, ret;
+	struct lxc_tty_info *tty_info = &conf->tty_info;
+	char path[MAXPATHLEN];
+	char *ttydir = conf->ttydir;
+
+	/* without id mappings no owner can change, so skip the stat calls */
+	if (lxc_list_empty(&conf->id_map))
+		return 0;
+	if (!conf->rootfs.path)
+		return 0;
+	/* first the console */
+	ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/%s/console", pid, ttydir ? ttydir : "");
+	if (ret < 0 || (size_t)ret >= sizeof(path)) {
+		ERROR("console path too long\n");
+		return -1;
+	}
+	if (uid_shift_file(path, conf)) {
+		DEBUG("Failed to chown the console %s.\n", path);
+		return -1;
+	}
+	/* then the ttys, which are numbered from 1 */
+	for (i=0; i< tty_info->nbtty; i++) {
+		ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/%s/tty%d",
+			pid, ttydir ? ttydir : "", i + 1);
+		if (ret < 0 || (size_t)ret >= sizeof(path)) {
+			ERROR("pathname too long for ttys");
+			return -1;
+		}
+		if (uid_shift_file(path, conf)) {
+			DEBUG("Failed to chown pty %s.\n", path);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
 int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
 {
 #if HAVE_APPARMOR /* || HAVE_SMACK || HAVE_SELINUX */
@@ -2637,9 +2763,11 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
 		return -1;
 	}
 
-	if (setup_caps(&lxc_conf->caps)) {
-		ERROR("failed to drop capabilities");
-		return -1;
+	if (lxc_list_empty(&lxc_conf->id_map)) {
+		if (setup_caps(&lxc_conf->caps)) {
+			ERROR("failed to drop capabilities");
+			return -1;
+		}
 	}
 
 	NOTICE("'%s' is setup.", name);
diff --git a/src/lxc/conf.h b/src/lxc/conf.h
index e226859..4c48b46 100644
--- a/src/lxc/conf.h
+++ b/src/lxc/conf.h
@@ -142,6 +142,26 @@ struct lxc_cgroup {
 	char *value;
 };
 
+enum idtype {
+	ID_TYPE_UID,	/* entry maps user ids ('U' in lxc.id_map) */
+	ID_TYPE_GID	/* entry maps group ids ('G' in lxc.id_map) */
+};
+
+/*
+ * id_map is an id map entry.  Form in confile is:
+ * lxc.id_map = U 9800 0 100
+ * lxc.id_map = U 9900 1000 100
+ * lxc.id_map = G 9800 0 100
+ * lxc.id_map = G 9900 1000 100
+ * meaning the container can use uids and gids 0-99 and 1000-1099,
+ * with uid 0 mapping to uid 9800 on the host, and gid 1000 to
+ * gid 9900 on the host.
+ */
+struct id_map {
+	enum idtype idtype;	/* whether this entry maps uids or gids */
+	int hostid, nsid, range; /* first host id, first container id, count */
+};
+
 /*
  * Defines a structure containing a pty information for
  * virtualizing a tty
@@ -232,6 +252,7 @@ struct lxc_conf {
 	int personality;
 	struct utsname *utsname;
 	struct lxc_list cgroup;
+	struct lxc_list id_map;
 	struct lxc_list network;
 	struct saved_nic *saved_nics;
 	int num_savednics;
@@ -275,6 +296,7 @@ extern int pin_rootfs(const char *rootfs);
 extern int lxc_create_network(struct lxc_handler *handler);
 extern void lxc_delete_network(struct lxc_handler *handler);
 extern int lxc_assign_network(struct lxc_list *networks, pid_t pid);
+extern int lxc_map_ids(struct lxc_list *idmap, pid_t pid);
 extern int lxc_find_gateway_addresses(struct lxc_handler *handler);
 
 extern int lxc_create_tty(const char *name, struct lxc_conf *conf);
@@ -287,6 +309,10 @@ extern int lxc_clear_cgroups(struct lxc_conf *c, const char *key);
 extern int lxc_clear_mount_entries(struct lxc_conf *c);
 extern int lxc_clear_hooks(struct lxc_conf *c, const char *key);
 
+extern int setup_cgroup(const char *name, struct lxc_list *cgroups);
+
+extern int uid_shift_ttys(int pid, struct lxc_conf *conf);
+
 /*
  * Configure the container from inside
  */
diff --git a/src/lxc/confile.c b/src/lxc/confile.c
index 034136e..850894e 100644
--- a/src/lxc/confile.c
+++ b/src/lxc/confile.c
@@ -58,6 +58,7 @@ static int config_ttydir(const char *, const char *, struct lxc_conf *);
 static int config_aa_profile(const char *, const char *, struct lxc_conf *);
 #endif
 static int config_cgroup(const char *, const char *, struct lxc_conf *);
+static int config_idmap(const char *, const char *, struct lxc_conf *);
 static int config_loglevel(const char *, const char *, struct lxc_conf *);
 static int config_logfile(const char *, const char *, struct lxc_conf *);
 static int config_mount(const char *, const char *, struct lxc_conf *);
@@ -97,6 +98,7 @@ static struct lxc_config_t config[] = {
 	{ "lxc.aa_profile",            config_aa_profile          },
 #endif
 	{ "lxc.cgroup",               config_cgroup               },
+	{ "lxc.id_map",               config_idmap                },
 	{ "lxc.loglevel",             config_loglevel             },
 	{ "lxc.logfile",              config_logfile              },
 	{ "lxc.mount",                config_mount                },
@@ -1021,6 +1023,66 @@ out:
 	return -1;
 }
 
+/*
+ * Parse one lxc.id_map entry and append it to the container's id_map list.
+ *
+ * value has the form "<U|G> <host id> <container id> <count>", where U
+ * selects a user id mapping and G a group id mapping.
+ *
+ * Returns 0 on success; -1 if the entry is malformed or memory runs out,
+ * in which case the list is left untouched.
+ */
+static int config_idmap(const char *key, const char *value, struct lxc_conf *lxc_conf)
+{
+	struct lxc_list *idmaplist = NULL;
+	struct id_map *idmap = NULL;
+	int hostid, nsid, range;
+	enum idtype idtype;
+	char type;
+
+	if (!strstr(key, "lxc.id_map"))
+		return -1;
+
+	if (sscanf(value, "%c %d %d %d", &type, &hostid, &nsid, &range) != 4)
+		return -1;
+	INFO("read uid map: type %c hostid %d nsid %d range %d", type, hostid, nsid, range);
+	if (type == 'U')
+		idtype = ID_TYPE_UID;
+	else if (type == 'G')
+		idtype = ID_TYPE_GID;
+	else
+		return -1;
+
+	/* ids cannot be negative and an empty range maps nothing */
+	if (hostid < 0 || nsid < 0 || range <= 0)
+		return -1;
+
+	idmaplist = malloc(sizeof(*idmaplist));
+	idmap = malloc(sizeof(*idmap));
+	if (!idmaplist || !idmap)
+		goto out;
+
+	memset(idmap, 0, sizeof(*idmap));
+	idmap->idtype = idtype;
+	idmap->hostid = hostid;
+	idmap->nsid = nsid;
+	idmap->range = range;
+	idmaplist->elem = idmap;
+
+	/*
+	 * Link the node only now that it is complete: nothing below can
+	 * fail, so the list never holds an entry that gets freed.
+	 */
+	lxc_list_add_tail(&lxc_conf->id_map, idmaplist);
+
+	return 0;
+
+out:
+	free(idmaplist);
+	free(idmap);
+	return -1;
+}
+
 static int config_path_item(const char *key, const char *value,
 			    struct lxc_conf *lxc_conf, char **conf_item)
 {
diff --git a/src/lxc/start.c b/src/lxc/start.c
index ccec9ef..be738c8 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -581,6 +581,22 @@ static int do_start(void *data)
 	if (lxc_sync_barrier_parent(handler, LXC_SYNC_CONFIGURE))
 		return -1;
 
+	/*
+	 * if we are in a new user namespace, become root there to have
+	 * privilege over our namespace
+	 */
+	if (!lxc_list_empty(&handler->conf->id_map)) {
+		NOTICE("switching to gid/uid 0 in new user namespace");
+		if (setgid(0)) {
+			SYSERROR("setgid");
+			goto out_warn_father;
+		}
+		if (setuid(0)) {
+			SYSERROR("setuid");
+			goto out_warn_father;
+		}
+	}
+
 	#if HAVE_SYS_CAPABILITY_H
 	if (handler->conf->need_utmp_watch) {
 		if (prctl(PR_CAPBSET_DROP, CAP_SYS_BOOT, 0, 0, 0)) {
@@ -681,6 +697,10 @@ int lxc_spawn(struct lxc_handler *handler)
 		return -1;
 
 	handler->clone_flags = CLONE_NEWUTS|CLONE_NEWPID|CLONE_NEWIPC|CLONE_NEWNS;
+	if (!lxc_list_empty(&handler->conf->id_map)) {
+		INFO("Cloning a new user namespace");
+		handler->clone_flags |= CLONE_NEWUSER;
+	}
 	if (!lxc_list_empty(&handler->conf->network)) {
 
 		handler->clone_flags |= CLONE_NEWNET;
@@ -747,6 +767,16 @@ int lxc_spawn(struct lxc_handler *handler)
 		}
 	}
 
+	/* map the container uids - the container became an invalid
+	 * userid the moment it was cloned with CLONE_NEWUSER - this
+	 * call doesn't change anything immediately, but allows the
+	 * container to setuid(0) (0 being mapped to something else on
+	 * the host) later to become a valid uid again */
+	if (lxc_map_ids(&handler->conf->id_map, handler->pid)) {
+		ERROR("failed to set up id mapping");
+		goto out_delete_net;
+	}
+
 	/* Tell the child to continue its initialization.  we'll get
 	 * LXC_SYNC_CGROUP when it is ready for us to setup cgroups
 	 */
@@ -772,6 +802,11 @@ int lxc_spawn(struct lxc_handler *handler)
 	if (detect_shared_rootfs())
 		umount2(handler->conf->rootfs.mount, MNT_DETACH);
 
+	/* If child is in a fresh user namespace, chown his ptys for
+	 * him */
+	if (uid_shift_ttys(handler->pid, handler->conf))
+		DEBUG("Failed to chown ptys.\n");
+
 	if (handler->ops->post_start(handler, handler->data))
 		goto out_abort;
 
-- 
1.8.0





More information about the lxc-devel mailing list