[lxc-devel] [PATCH 3/8] cgroup: introduce cgroup namespaces

Mon Jan 4 20:20:53 UTC 2016

On Mon, Jan 04, 2016 at 01:54:48PM -0600, serge.hallyn at ubuntu.com wrote:
> From: Aditya Kali <adityakali at google.com>
> 
> Introduce the ability to create a new cgroup namespace. The newly created
> cgroup namespace remembers the cgroup of the process at the point
> of creation of the cgroup namespace (referred to as cgroupns-root).
> The main purpose of cgroup namespace is to virtualize the contents
> of /proc/self/cgroup file. Processes inside a cgroup namespace
> are only able to see paths relative to their namespace root
> (unless they are moved outside of their cgroupns-root, at which point
>  they will see a relative path from their cgroupns-root).
> For a correctly set up container this enables container-tools
> (like libcontainer, lxc, lmctfy, etc.) to create completely virtualized
> containers without leaking system level cgroup hierarchy to the task.
> This patch only implements the 'unshare' part of the cgroupns.
> 
> Signed-off-by: Aditya Kali <adityakali at google.com>
> Signed-off-by: Serge Hallyn <serge.hallyn at canonical.com>
> ---
> Changelog: 2015-11-24
> 	- move cgroup_namespace.c into cgroup.c (and .h)
> 	- reformatting
> 	- make get_cgroup_ns return void
> 	- rename ns->root_cgrps to root_cset.
> Changelog: 2015-12-08
> 	- Move init_cgroup_ns to other variable declarations
> 	- Remove accidental conversion of put_css_set to inline
> 	- Drop BUG_ON(NULL)
> 	- Remove unneeded pre declaration of struct cgroupns_operations.
> 	- cgroup.h: collect common ns declarations
> Changelog: 2015-12-09
> 	- cgroup.h: move ns declarations to bottom
> 	- cgroup.c: undo all accidental conversions to inline
> Changelog: 2015-12-22
> 	- update for new kernfs_path_from_node() return value.  Since
> 	  cgroup_path was already gpl-exported, I abstained from updating
> 	  its return value.
> Changelog: 2015-12-23
> 	- cgroup_path(): use init_cgroup_ns when in interrupt context.
> Changelog: 2016-01-02
> 	- move to_cg_ns definition forward in patch series
> 	- cgroup_release_agent: grab css_set_lock around cgroup_path()
> 	- leave cgroup_path non-namespaced, use cgroup_path_ns when
> 	  namespaced path is desired.
> ---
>  fs/proc/namespaces.c    |    3 +
>  include/linux/cgroup.h  |   56 +++++++++++++--
>  include/linux/nsproxy.h |    2 +
>  include/linux/proc_ns.h |    4 ++
>  kernel/cgroup.c         |  177 ++++++++++++++++++++++++++++++++++++++++++++++-
>  kernel/cpuset.c         |    3 +-
>  kernel/fork.c           |    2 +-
>  kernel/nsproxy.c        |   21 +++++-
>  kernel/sched/debug.c    |    3 +-
>  9 files changed, 257 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
> index f6e8354..bd61075 100644
> --- a/fs/proc/namespaces.c
> +++ b/fs/proc/namespaces.c
> @@ -28,6 +28,9 @@ static const struct proc_ns_operations *ns_entries[] = {
>  	&userns_operations,
>  #endif
>  	&mntns_operations,
> +#ifdef CONFIG_CGROUPS
> +	&cgroupns_operations,
> +#endif
>  };
>  
>  static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie)
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 9d70b48..149ae0a 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -17,6 +17,11 @@
>  #include <linux/seq_file.h>
>  #include <linux/kernfs.h>
>  #include <linux/jump_label.h>
> +#include <linux/nsproxy.h>
> +#include <linux/types.h>
> +#include <linux/ns_common.h>
> +#include <linux/nsproxy.h>
> +#include <linux/user_namespace.h>
>  
>  #include <linux/cgroup-defs.h>
>  
> @@ -532,12 +537,6 @@ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
>  	return kernfs_name(cgrp->kn, buf, buflen);
>  }
>  
> -static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
> -					      size_t buflen)
> -{
> -	return kernfs_path(cgrp->kn, buf, buflen);
> -}
> -
>  static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
>  {
>  	pr_cont_kernfs_name(cgrp->kn);
> @@ -570,4 +569,49 @@ static inline int cgroup_init(void) { return 0; }
>  
>  #endif /* !CONFIG_CGROUPS */
>  
> +struct cgroup_namespace {
> +	atomic_t		count;
> +	struct ns_common	ns;
> +	struct user_namespace	*user_ns;
> +	struct css_set          *root_cset;
> +};
> +
> +extern struct cgroup_namespace init_cgroup_ns;
> +
> +#ifdef CONFIG_CGROUPS
> +
> +void free_cgroup_ns(struct cgroup_namespace *ns);
> +
> +struct cgroup_namespace *
> +copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
> +	       struct cgroup_namespace *old_ns);
> +
> +char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
> +		     struct cgroup_namespace *ns);
> +char *cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen);
> +
> +#else /* !CONFIG_CGROUPS */
> +
> +static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
> +static inline struct cgroup_namespace *
> +copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
> +	       struct cgroup_namespace *old_ns)
> +{
> +	return old_ns;
> +}
> +
> +#endif /* !CONFIG_CGROUPS */
> +
> +static inline void get_cgroup_ns(struct cgroup_namespace *ns)
> +{
> +	if (ns)
> +		atomic_inc(&ns->count);
> +}
> +
> +static inline void put_cgroup_ns(struct cgroup_namespace *ns)
> +{
> +	if (ns && atomic_dec_and_test(&ns->count))
> +		free_cgroup_ns(ns);
> +}
> +
>  #endif /* _LINUX_CGROUP_H */
> diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
> index 35fa08f..ac0d65b 100644
> --- a/include/linux/nsproxy.h
> +++ b/include/linux/nsproxy.h
> @@ -8,6 +8,7 @@ struct mnt_namespace;
>  struct uts_namespace;
>  struct ipc_namespace;
>  struct pid_namespace;
> +struct cgroup_namespace;
>  struct fs_struct;
>  
>  /*
> @@ -33,6 +34,7 @@ struct nsproxy {
>  	struct mnt_namespace *mnt_ns;
>  	struct pid_namespace *pid_ns_for_children;
>  	struct net 	     *net_ns;
> +	struct cgroup_namespace *cgroup_ns;
>  };
>  extern struct nsproxy init_nsproxy;
>  
> diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
> index 42dfc61..de0e771 100644
> --- a/include/linux/proc_ns.h
> +++ b/include/linux/proc_ns.h
> @@ -9,6 +9,8 @@
>  struct pid_namespace;
>  struct nsproxy;
>  struct path;
> +struct task_struct;
> +struct inode;
>  
>  struct proc_ns_operations {
>  	const char *name;
> @@ -24,6 +26,7 @@ extern const struct proc_ns_operations ipcns_operations;
>  extern const struct proc_ns_operations pidns_operations;
>  extern const struct proc_ns_operations userns_operations;
>  extern const struct proc_ns_operations mntns_operations;
> +extern const struct proc_ns_operations cgroupns_operations;
>  
>  /*
>   * We always define these enumerators
> @@ -34,6 +37,7 @@ enum {
>  	PROC_UTS_INIT_INO	= 0xEFFFFFFEU,
>  	PROC_USER_INIT_INO	= 0xEFFFFFFDU,
>  	PROC_PID_INIT_INO	= 0xEFFFFFFCU,
> +	PROC_CGROUP_INIT_INO	= 0xEFFFFFFBU,
>  };
>  
>  #ifdef CONFIG_PROC_FS
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 6b33631..60270b1 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -57,6 +57,9 @@
>  #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
>  #include <linux/kthread.h>
>  #include <linux/delay.h>
> +#include <linux/proc_ns.h>
> +#include <linux/nsproxy.h>
> +#include <linux/proc_ns.h>
>  
>  #include <linux/atomic.h>
>  
> @@ -208,6 +211,15 @@ static unsigned long have_fork_callback __read_mostly;
>  static unsigned long have_exit_callback __read_mostly;
>  static unsigned long have_free_callback __read_mostly;
>  
> +/* Cgroup namespace for init task */
> +struct cgroup_namespace init_cgroup_ns = {
> +	.count		= { .counter = 2, },
> +	.user_ns	= &init_user_ns,
> +	.ns.ops		= &cgroupns_operations,
> +	.ns.inum	= PROC_CGROUP_INIT_INO,
> +	.root_cset	= &init_css_set,
> +};
> +
>  /* Ditto for the can_fork callback. */
>  static unsigned long have_canfork_callback __read_mostly;
>  
> @@ -2166,6 +2178,43 @@ static struct file_system_type cgroup2_fs_type = {
>  	.kill_sb = cgroup_kill_sb,
>  };
>  
> +char *

Sorry, that one (the return type of cgroup_path_ns_locked) should be 'static char *'

> +cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
> +		      struct cgroup_namespace *ns)
> +{
> +	int ret;
> +	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
> +
> +	ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
> +	if (ret < 0 || ret >= buflen)
> +		return NULL;
> +	return buf;
> +}
> +
> +char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
> +		     struct cgroup_namespace *ns)
> +{
> +	char *ret;
> +
> +	mutex_lock(&cgroup_mutex);
> +	spin_lock_bh(&css_set_lock);
> +
> +	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
> +
> +	spin_unlock_bh(&css_set_lock);
> +	mutex_unlock(&cgroup_mutex);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(cgroup_path_ns);
> +
> +char *cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen)
> +{
> +	return cgroup_path_ns(cgrp, buf, buflen, &init_cgroup_ns);
> +}
> +
> +EXPORT_SYMBOL_GPL(cgroup_path);
> +
>  /**
>   * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
>   * @task: target task
> @@ -2193,7 +2242,8 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
>  
>  	if (root) {
>  		cgrp = task_cgroup_from_root(task, root);
> -		path = cgroup_path(cgrp, buf, buflen);
> +		path = cgroup_path_ns_locked(cgrp, buf, buflen,
> +					     &init_cgroup_ns);
>  	} else {
>  		/* if no hierarchy exists, everyone is in "/" */
>  		if (strlcpy(buf, "/", buflen) < buflen)
> @@ -5272,6 +5322,8 @@ int __init cgroup_init(void)
>  	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
>  	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
>  
> +	get_user_ns(init_cgroup_ns.user_ns);
> +
>  	mutex_lock(&cgroup_mutex);
>  
>  	/* Add init_css_set to the hash table */
> @@ -5409,7 +5461,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
>  		 * " (deleted)" is appended to the cgroup path.
>  		 */
>  		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
> -			path = cgroup_path(cgrp, buf, PATH_MAX);
> +			path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
> +						     current->nsproxy->cgroup_ns);
>  			if (!path) {
>  				retval = -ENAMETOOLONG;
>  				goto out_unlock;
> @@ -5691,7 +5744,10 @@ static void cgroup_release_agent(struct work_struct *work)
>  	if (!pathbuf || !agentbuf)
>  		goto out;
>  
> -	path = cgroup_path(cgrp, pathbuf, PATH_MAX);
> +	spin_lock_bh(&css_set_lock);
> +	path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX,
> +				     &init_cgroup_ns);
> +	spin_unlock_bh(&css_set_lock);
>  	if (!path)
>  		goto out;
>  
> @@ -5822,6 +5878,121 @@ struct cgroup *cgroup_get_from_path(const char *path)
>  }
>  EXPORT_SYMBOL_GPL(cgroup_get_from_path);
>  
> +/* cgroup namespaces */
> +
> +static struct cgroup_namespace *alloc_cgroup_ns(void)
> +{
> +	struct cgroup_namespace *new_ns;
> +	int ret;
> +
> +	new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
> +	if (!new_ns)
> +		return ERR_PTR(-ENOMEM);
> +	ret = ns_alloc_inum(&new_ns->ns);
> +	if (ret) {
> +		kfree(new_ns);
> +		return ERR_PTR(ret);
> +	}
> +	atomic_set(&new_ns->count, 1);
> +	new_ns->ns.ops = &cgroupns_operations;
> +	return new_ns;
> +}
> +
> +void free_cgroup_ns(struct cgroup_namespace *ns)
> +{
> +	put_css_set(ns->root_cset);
> +	put_user_ns(ns->user_ns);
> +	ns_free_inum(&ns->ns);
> +	kfree(ns);
> +}
> +EXPORT_SYMBOL(free_cgroup_ns);
> +
> +struct cgroup_namespace *
> +copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
> +	       struct cgroup_namespace *old_ns)
> +{
> +	struct cgroup_namespace *new_ns = NULL;
> +	struct css_set *cset = NULL;
> +	int err;
> +
> +	BUG_ON(!old_ns);
> +
> +	if (!(flags & CLONE_NEWCGROUP)) {
> +		get_cgroup_ns(old_ns);
> +		return old_ns;
> +	}
> +
> +	/* Allow only sysadmin to create cgroup namespace. */
> +	err = -EPERM;
> +	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
> +		goto err_out;
> +
> +	cset = task_css_set(current);
> +	get_css_set(cset);
> +
> +	err = -ENOMEM;
> +	new_ns = alloc_cgroup_ns();
> +	if (!new_ns)
> +		goto err_out;
> +
> +	new_ns->user_ns = get_user_ns(user_ns);
> +	new_ns->root_cset = cset;
> +
> +	return new_ns;
> +
> +err_out:
> +	if (cset)
> +		put_css_set(cset);
> +	kfree(new_ns);
> +	return ERR_PTR(err);
> +}
> +
> +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
> +{
> +	return container_of(ns, struct cgroup_namespace, ns);
> +}
> +
> +static int cgroupns_install(struct nsproxy *nsproxy, void *ns)
> +{
> +	pr_info("setns not supported for cgroup namespace");
> +	return -EINVAL;
> +}
> +
> +static struct ns_common *cgroupns_get(struct task_struct *task)
> +{
> +	struct cgroup_namespace *ns = NULL;
> +	struct nsproxy *nsproxy;
> +
> +	task_lock(task);
> +	nsproxy = task->nsproxy;
> +	if (nsproxy) {
> +		ns = nsproxy->cgroup_ns;
> +		get_cgroup_ns(ns);
> +	}
> +	task_unlock(task);
> +
> +	return ns ? &ns->ns : NULL;
> +}
> +
> +static void cgroupns_put(struct ns_common *ns)
> +{
> +	put_cgroup_ns(to_cg_ns(ns));
> +}
> +
> +const struct proc_ns_operations cgroupns_operations = {
> +	.name		= "cgroup",
> +	.type		= CLONE_NEWCGROUP,
> +	.get		= cgroupns_get,
> +	.put		= cgroupns_put,
> +	.install	= cgroupns_install,
> +};
> +
> +static __init int cgroup_namespaces_init(void)
> +{
> +	return 0;
> +}
> +subsys_initcall(cgroup_namespaces_init);
> +
>  #ifdef CONFIG_CGROUP_DEBUG
>  static struct cgroup_subsys_state *
>  debug_css_alloc(struct cgroup_subsys_state *parent_css)
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index 3e945fc..37c8eb0 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -2689,7 +2689,8 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
>  	retval = -ENAMETOOLONG;
>  	rcu_read_lock();
>  	css = task_css(tsk, cpuset_cgrp_id);
> -	p = cgroup_path(css->cgroup, buf, PATH_MAX);
> +	p = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
> +			   current->nsproxy->cgroup_ns);
>  	rcu_read_unlock();
>  	if (!p)
>  		goto out_free;
> diff --git a/kernel/fork.c b/kernel/fork.c
> index ba7d1c0..7982fee 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1880,7 +1880,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
>  	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
>  				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
>  				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
> -				CLONE_NEWUSER|CLONE_NEWPID))
> +				CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
>  		return -EINVAL;
>  	/*
>  	 * Not implemented, but pretend it works if there is nothing
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 49746c8..64fe865 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -25,6 +25,7 @@
>  #include <linux/proc_ns.h>
>  #include <linux/file.h>
>  #include <linux/syscalls.h>
> +#include <linux/cgroup.h>
>  
>  static struct kmem_cache *nsproxy_cachep;
>  
> @@ -39,6 +40,9 @@ struct nsproxy init_nsproxy = {
>  #ifdef CONFIG_NET
>  	.net_ns			= &init_net,
>  #endif
> +#ifdef CONFIG_CGROUPS
> +	.cgroup_ns		= &init_cgroup_ns,
> +#endif
>  };
>  
>  static inline struct nsproxy *create_nsproxy(void)
> @@ -92,6 +96,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
>  		goto out_pid;
>  	}
>  
> +	new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
> +					    tsk->nsproxy->cgroup_ns);
> +	if (IS_ERR(new_nsp->cgroup_ns)) {
> +		err = PTR_ERR(new_nsp->cgroup_ns);
> +		goto out_cgroup;
> +	}
> +
>  	new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
>  	if (IS_ERR(new_nsp->net_ns)) {
>  		err = PTR_ERR(new_nsp->net_ns);
> @@ -101,6 +112,9 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
>  	return new_nsp;
>  
>  out_net:
> +	if (new_nsp->cgroup_ns)
> +		put_cgroup_ns(new_nsp->cgroup_ns);
> +out_cgroup:
>  	if (new_nsp->pid_ns_for_children)
>  		put_pid_ns(new_nsp->pid_ns_for_children);
>  out_pid:
> @@ -128,7 +142,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
>  	struct nsproxy *new_ns;
>  
>  	if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
> -			      CLONE_NEWPID | CLONE_NEWNET)))) {
> +			      CLONE_NEWPID | CLONE_NEWNET |
> +			      CLONE_NEWCGROUP)))) {
>  		get_nsproxy(old_ns);
>  		return 0;
>  	}
> @@ -165,6 +180,8 @@ void free_nsproxy(struct nsproxy *ns)
>  		put_ipc_ns(ns->ipc_ns);
>  	if (ns->pid_ns_for_children)
>  		put_pid_ns(ns->pid_ns_for_children);
> +	if (ns->cgroup_ns)
> +		put_cgroup_ns(ns->cgroup_ns);
>  	put_net(ns->net_ns);
>  	kmem_cache_free(nsproxy_cachep, ns);
>  }
> @@ -180,7 +197,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
>  	int err = 0;
>  
>  	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
> -			       CLONE_NEWNET | CLONE_NEWPID)))
> +			       CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
>  		return 0;
>  
>  	user_ns = new_cred ? new_cred->user_ns : current_user_ns();
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 6415117..4c28523 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -104,7 +104,8 @@ static char *task_group_path(struct task_group *tg)
>  	if (autogroup_path(tg, group_path, PATH_MAX))
>  		return group_path;
>  
> -	return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
> +	return cgroup_path_ns(tg->css.cgroup, group_path, PATH_MAX,
> +			      current->nsproxy->cgroup_ns);
>  }
>  #endif
>  
> -- 
> 1.7.9.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo at vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/