[lxc-devel] [PATCH RFC] Enable use of user namespaces in containers
Stéphane Graber
stgraber at ubuntu.com
Thu Dec 6 19:37:15 UTC 2012
On 12/06/2012 10:02 AM, Serge Hallyn wrote:
> The rootfs will need to be chowned to the mapped userids, which can
> be done with the /usr/bin/uidmapshift tool shipped with the nsexec
> package in ppa:serge-hallyn/userns-natty.
> The container config supports new entries of the form:
> lxc.id_map = U 100000 0 10000
> lxc.id_map = G 100000 0 10000
> meaning map 'virtual' uids (in the container) 0-10000 to uids
> 100000-110000 on the host, and same for gids. So long as there are
> mappings specified in the container config, then CLONE_NEWUSER will
> be used when the container is cloned. This means that container
> setup is no longer done with root privilege on the host, only root
> privilege in the container. Therefore cgroup setup is moved from the
> init task to the monitor task.
>
> To use this patchset, you currently need to either use the raring
> kernel at ppa:serge-hallyn/userns-natty, or build your own kernel
> from either git://kernel.ubuntu.com/serge/quantal-userns.git branch
> master-next.dec3.userns or branch userns-always-map-user-v76 at
> git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git
> plus a patch enabling tmpfs mounts in userns.
>
> You also need to chown the files in the container rootfs into the
> mapped range. There is a utility at
> https://code.launchpad.net/~serge-hallyn/+junk/nsexec to do this.
> uidmapshift does the chowning, while the container-userns-convert
> script nicely wraps that program. So simply running
>
> sudo lxc-create -t ubuntu -n r1
> sudo container-userns-convert r1 200000
>
> will create a container which is shifted so uid 0 in the container
> is uid 200000 on the host.
>
> TODO: when doing setuid(0), we need to do that only if 0 is one of the
> ids we map to. Similarly, when dropping capabilities, we should skip
> the drop only if 0 is one of the ids we map to.
>
> Signed-off-by: Serge Hallyn <serge.hallyn at ubuntu.com>
Nice, the code is much simpler than I expected.
I only read through it pretty briefly and left a few comments, which are
more questions and cosmetics than problems with the code itself.
I'm also wondering: what's the state of lxc-attach with respect to user
namespaces? Does it need any updating too?
> ---
> src/lxc/conf.c | 141 +++++++++++++++++++++++++++++++++++++++++++++++++----
> src/lxc/conf.h | 26 ++++++++++
> src/lxc/confile.c | 60 +++++++++++++++++++++++
> src/lxc/start.c | 35 +++++++++++++
> 4 files changed, 253 insertions(+), 9 deletions(-)
>
> diff --git a/src/lxc/conf.c b/src/lxc/conf.c
> index 79d96d7..1a619d0 100644
> --- a/src/lxc/conf.c
> +++ b/src/lxc/conf.c
> @@ -1221,7 +1221,7 @@ static int setup_kmsg(const struct lxc_rootfs *rootfs,
> return 0;
> }
>
> -static int setup_cgroup(const char *name, struct lxc_list *cgroups)
> +int setup_cgroup(const char *name, struct lxc_list *cgroups)
Why the change?
> {
> struct lxc_list *iterator;
> struct lxc_cgroup *cg;
> @@ -1882,6 +1882,7 @@ struct lxc_conf *lxc_conf_init(void)
> lxc_list_init(&new->network);
> lxc_list_init(&new->mount_list);
> lxc_list_init(&new->caps);
> + lxc_list_init(&new->id_map);
> for (i=0; i<NUM_LXC_HOOKS; i++)
> lxc_list_init(&new->hooks[i]);
> #if HAVE_APPARMOR
> @@ -2256,6 +2257,44 @@ int lxc_assign_network(struct lxc_list *network, pid_t pid)
> return 0;
> }
>
> +int add_id_mapping(enum idtype idtype, pid_t pid, uid_t host_start, uid_t ns_start, int range)
> +{
> + char path[PATH_MAX];
> + int ret;
> + FILE *f;
> +
> + ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid, idtype == ID_TYPE_UID ? 'u' : 'g');
> + if (ret < 0 || ret >= PATH_MAX) {
> + fprintf(stderr, "%s: path name too long", __func__);
> + return -E2BIG;
> + }
> + f = fopen(path, "w");
> + if (!f) {
> + perror("open");
> + return -EINVAL;
> + }
> + ret = fprintf(f, "%d %d %d", ns_start, host_start, range);
> + if (ret < 0)
> + perror("write");
> + fclose(f);
> + return ret < 0 ? ret : 0;
> +}
> +
> +int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
> +{
> + struct lxc_list *iterator;
> + struct id_map *map;
> + int ret = 0;
> +
> + lxc_list_for_each(iterator, idmap) {
> + map = iterator->elem;
> + ret = add_id_mapping(map->idtype, pid, map->hostid, map->nsid, map->range);
> + if (ret)
> + break;
> + }
> + return ret;
> +}
> +
> int lxc_find_gateway_addresses(struct lxc_handler *handler)
> {
> struct lxc_list *network = &handler->conf->network;
> @@ -2364,6 +2403,93 @@ void lxc_delete_tty(struct lxc_tty_info *tty_info)
> tty_info->nbtty = 0;
> }
>
> +/*
> + * given a host uid, return the ns uid if it is mapped.
> + * if it is not mapped, return the original host id.
> + */
> +static int shiftid(struct lxc_conf *c, int uid, enum idtype w)
> +{
> + struct lxc_list *iterator;
> + struct id_map *map;
> + int low, high;
> +
> + lxc_list_for_each(iterator, &c->id_map) {
> + map = iterator->elem;
> + if (map->idtype != w)
> + continue;
> +
> + low = map->nsid;
> + high = map->nsid + map->range;
> + if (uid < low || uid >= high)
> + continue;
> +
> + return uid - low + map->hostid;
> + }
> +
> + return uid;
> +}
> +
> +/*
> + * Take a pathname for a file created on the host, and map the uid and gid
> + * into the container if needed. (Used for ttys)
> + */
> +static int uid_shift_file(char *path, struct lxc_conf *c)
> +{
> + struct stat statbuf;
> + int newuid, newgid;
> +
> + if (stat(path, &statbuf)) {
> + SYSERROR("stat(%s)", path);
> + return -1;
> + }
> +
> + newuid = shiftid(c, statbuf.st_uid, ID_TYPE_UID);
> + newgid = shiftid(c, statbuf.st_gid, ID_TYPE_GID);
> + if (newuid != statbuf.st_uid || newgid != statbuf.st_gid) {
> + DEBUG("chowning %s from %d:%d to %d:%d\n", path, statbuf.st_uid, statbuf.st_gid, newuid, newgid);
> + if (chown(path, newuid, newgid)) {
> + SYSERROR("chown(%s)", path);
> + return -1;
> + }
> + }
> + return 0;
> +}
> +
> +int uid_shift_ttys(int pid, struct lxc_conf *conf)
> +{
> + int i, ret;
> + struct lxc_tty_info *tty_info = &conf->tty_info;
> + char path[MAXPATHLEN];
> + char *ttydir = conf->ttydir;
> +
> + if (!conf->rootfs.path)
> + return 0;
> + /* first the console */
> + ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/%s/console", pid, ttydir ? ttydir : "");
> + if (ret < 0 || ret >= sizeof(path)) {
> + ERROR("console path too long\n");
> + return -1;
> + }
> + if (uid_shift_file(path, conf)) {
> + DEBUG("Failed to chown the console %s.\n", path);
> + return -1;
> + }
> + for (i=0; i< tty_info->nbtty; i++) {
> + ret = snprintf(path, sizeof(path), "/proc/%d/root/dev/%s/tty%d",
> + pid, ttydir ? ttydir : "", i + 1);
> + if (ret < 0 || ret >= sizeof(path)) {
> + ERROR("pathname too long for ttys");
> + return -1;
> + }
> + if (uid_shift_file(path, conf)) {
> + DEBUG("Failed to chown pty %s.\n", path);
> + return -1;
> + }
> + }
> +
> + return 0;
> +}
> +
> int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
> {
> #if HAVE_APPARMOR /* || HAVE_SMACK || HAVE_SELINUX */
> @@ -2419,11 +2545,6 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
> }
> }
>
> - if (setup_cgroup(name, &lxc_conf->cgroup)) {
> - ERROR("failed to setup the cgroups for '%s'", name);
> - return -1;
> - }
> -
> if (setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
> ERROR("failed to setup the console for '%s'", name);
> return -1;
> @@ -2467,9 +2588,11 @@ int lxc_setup(const char *name, struct lxc_conf *lxc_conf)
> return -1;
> }
>
> - if (setup_caps(&lxc_conf->caps)) {
> - ERROR("failed to drop capabilities");
> - return -1;
> + if (lxc_list_empty(&lxc_conf->id_map)) {
> + if (setup_caps(&lxc_conf->caps)) {
> + ERROR("failed to drop capabilities");
> + return -1;
> + }
> }
Why can't we drop capabilities in a user namespace?
> NOTICE("'%s' is setup.", name);
> diff --git a/src/lxc/conf.h b/src/lxc/conf.h
> index 694bce4..97b9274 100644
> --- a/src/lxc/conf.h
> +++ b/src/lxc/conf.h
> @@ -137,6 +137,26 @@ struct lxc_cgroup {
> char *value;
> };
>
> +enum idtype {
> + ID_TYPE_UID,
> + ID_TYPE_GID
> +};
> +
> +/*
> + * id_map is an id map entry. Form in confile is:
> + * lxc.id_map = U 9800 0 100
> + * lxc.id_map = U 9900 1000 100
> + * lxc.id_map = G 9800 0 100
> + * lxc.id_map = G 9900 1000 100
> + * meaning the container can use uids and gids 0-100 and 1000-1100,
> + * with uid 0 mapping to uid 9800 on the host, and gid 1000 to
> + * gid 9900 on the host.
> + */
> +struct id_map {
> + enum idtype idtype;
> + int hostid, nsid, range;
> +};
> +
> /*
> * Defines a structure containing a pty information for
> * virtualizing a tty
> @@ -220,6 +240,7 @@ struct lxc_conf {
> int personality;
> struct utsname *utsname;
> struct lxc_list cgroup;
> + struct lxc_list id_map;
> struct lxc_list network;
> struct lxc_list mount_list;
> struct lxc_list caps;
> @@ -256,6 +277,7 @@ extern int pin_rootfs(const char *rootfs);
> extern int lxc_create_network(struct lxc_handler *handler);
> extern void lxc_delete_network(struct lxc_handler *handler);
> extern int lxc_assign_network(struct lxc_list *networks, pid_t pid);
> +extern int lxc_map_ids(struct lxc_list *idmap, pid_t pid);
> extern int lxc_find_gateway_addresses(struct lxc_handler *handler);
>
> extern int lxc_create_tty(const char *name, struct lxc_conf *conf);
> @@ -268,6 +290,10 @@ extern int lxc_clear_cgroups(struct lxc_conf *c, const char *key);
> extern int lxc_clear_mount_entries(struct lxc_conf *c);
> extern int lxc_clear_hooks(struct lxc_conf *c, const char *key);
>
> +extern int setup_cgroup(const char *name, struct lxc_list *cgroups);
> +
> +extern int uid_shift_ttys(int pid, struct lxc_conf *conf);
> +
> /*
> * Configure the container from inside
> */
> diff --git a/src/lxc/confile.c b/src/lxc/confile.c
> index a64ae09..1fa6189 100644
> --- a/src/lxc/confile.c
> +++ b/src/lxc/confile.c
> @@ -55,6 +55,7 @@ static int config_ttydir(const char *, const char *, struct lxc_conf *);
> static int config_aa_profile(const char *, const char *, struct lxc_conf *);
> #endif
> static int config_cgroup(const char *, const char *, struct lxc_conf *);
> +static int config_idmap(const char *, const char *, struct lxc_conf *);
> static int config_loglevel(const char *, const char *, struct lxc_conf *);
> static int config_logfile(const char *, const char *, struct lxc_conf *);
> static int config_mount(const char *, const char *, struct lxc_conf *);
> @@ -94,6 +95,7 @@ static struct lxc_config_t config[] = {
> { "lxc.aa_profile", config_aa_profile },
> #endif
> { "lxc.cgroup", config_cgroup },
> + { "lxc.id_map", config_idmap },
> { "lxc.loglevel", config_loglevel },
> { "lxc.logfile", config_logfile },
> { "lxc.mount", config_mount },
> @@ -1021,6 +1023,64 @@ out:
> return -1;
> }
>
> +static int config_idmap(const char *key, const char *value, struct lxc_conf *lxc_conf)
> +{
> + char *token = "lxc.id_map";
> + char *subkey;
> + struct lxc_list *idmaplist = NULL;
> + struct id_map *idmap = NULL;
> + int hostid, nsid, range;
> + char type;
> + int ret;
> +
> + subkey = strstr(key, token);
> +
> + if (!subkey)
> + return -1;
> +
> + if (!strlen(subkey))
> + return -1;
> +
> + idmaplist = malloc(sizeof(*idmaplist));
> + if (!idmaplist)
> + goto out;
> +
> + idmap = malloc(sizeof(*idmap));
> + if (!idmap)
> + goto out;
> + memset(idmap, 0, sizeof(*idmap));
> +
> + idmaplist->elem = idmap;
> +
> + lxc_list_add_tail(&lxc_conf->id_map, idmaplist);
> +
> + ret = sscanf(value, "%c %d %d %d", &type, &hostid, &nsid, &range);
> + if (ret != 4)
> + goto out;
> + INFO("read uid map: type %c hostid %d nsid %d range %d", type, hostid, nsid, range);
> + if (type == 'U')
> + idmap->idtype = ID_TYPE_UID;
> + else if (type == 'G')
> + idmap->idtype = ID_TYPE_GID;
> + else
> + goto out;
> + idmap->hostid = hostid;
> + idmap->nsid = nsid;
> + idmap->range = range;
> +
> + return 0;
> +
> +out:
> + if (idmaplist)
> + free(idmaplist);
> +
> + if (idmap) {
> + free(idmap);
> + }
^ code style isn't really consistent here :)
> + return -1;
> +}
> +
> static int config_path_item(const char *key, const char *value,
> struct lxc_conf *lxc_conf, char **conf_item)
> {
> diff --git a/src/lxc/start.c b/src/lxc/start.c
> index 3e26b27..8d03b69 100644
> --- a/src/lxc/start.c
> +++ b/src/lxc/start.c
> @@ -542,6 +542,22 @@ static int do_start(void *data)
> if (lxc_sync_barrier_parent(handler, LXC_SYNC_CONFIGURE))
> return -1;
>
> + /*
> + * if we are in a new user namespace, become root there to have
> + * privilege over our namespace
> + */
> + if (!lxc_list_empty(&handler->conf->id_map)) {
> + NOTICE("switching to gid/uid 0 in new user namespace");
> + if (setgid(0)) {
> + SYSERROR("setgid");
> + exit(1);
> + }
> + if (setuid(0)) {
> + SYSERROR("setuid");
> + exit(1);
> + }
> + }
> +
> if (handler->conf->need_utmp_watch) {
> if (prctl(PR_CAPBSET_DROP, CAP_SYS_BOOT, 0, 0, 0)) {
> SYSERROR("failed to remove CAP_SYS_BOOT capability");
> @@ -589,6 +605,10 @@ int lxc_spawn(struct lxc_handler *handler)
> return -1;
>
> handler->clone_flags = CLONE_NEWUTS|CLONE_NEWPID|CLONE_NEWIPC|CLONE_NEWNS;
> + if (!lxc_list_empty(&handler->conf->id_map)) {
> + INFO("Cloning a new user namespace");
> + handler->clone_flags |= CLONE_NEWUSER;
> + }
> if (!lxc_list_empty(&handler->conf->network)) {
>
> handler->clone_flags |= CLONE_NEWNET;
> @@ -650,12 +670,27 @@ int lxc_spawn(struct lxc_handler *handler)
> }
> }
>
> + if (lxc_map_ids(&handler->conf->id_map, handler->pid)) {
> + ERROR("failed to set up id mapping");
> + goto out_delete_net;
> + }
> +
> /* Tell the child to continue its initialization and wait for
> * it to exec or return an error
> */
> if (lxc_sync_barrier_child(handler, LXC_SYNC_POST_CONFIGURE))
> return -1;
>
> + if (setup_cgroup(name, &handler->conf->cgroup)) {
> + ERROR("failed to setup the cgroups for '%s'", name);
> + goto out_delete_net;
> + }
> +
> + /* If child is in a fresh user namespace, chown his ptys for
> + * him */
> + if (uid_shift_ttys(handler->pid, handler->conf))
> + DEBUG("Failed to chown ptys.\n");
> +
> if (handler->ops->post_start(handler, handler->data))
> goto out_abort;
>
>
--
Stéphane Graber
Ubuntu developer
http://www.ubuntu.com
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 899 bytes
Desc: OpenPGP digital signature
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20121206/38aa3b46/attachment.pgp>
More information about the lxc-devel
mailing list