[lxc-devel] [PATCH 5/7] cgroup: mount cgroupns-root when inside non-init cgroupns

Tue Dec 8 16:48:31 UTC 2015

On Tue, Dec 08, 2015 at 11:20:40AM -0500, Tejun Heo wrote:
> Hello,
> 
> On Mon, Dec 07, 2015 at 05:06:20PM -0600, serge.hallyn at ubuntu.com wrote:
> >  fs/kernfs/mount.c      |   74 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  include/linux/kernfs.h |    2 ++
> >  kernel/cgroup.c        |   39 ++++++++++++++++++++++++-
> >  3 files changed, 114 insertions(+), 1 deletion(-)
> 
> Please put kernfs changes in a spearate patch.
> 
> > diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
> > index 8eaf417..9219444 100644
> > --- a/fs/kernfs/mount.c
> > +++ b/fs/kernfs/mount.c
> > @@ -62,6 +63,79 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
> >  	return NULL;
> >  }
> >  
> > +/*
> > + * find the next ancestor in the path down to @child, where @parent was the
> > + * parent whose child we want to find.
> 
> s/parent/ancestor/ s/child/descendant/ ?
> 
> > + *
> > + * Say the path is /a/b/c/d.  @child is d, @parent is NULL.  We return the root
> > + * node.  If @parent is b, then we return the node for c.
> > + * Passing in d as @parent is not ok.
> > + */
> > +static struct kernfs_node *
> > +find_next_ancestor(struct kernfs_node *child, struct kernfs_node *parent)
> > +{
> > +	if (child == parent) {
> > +		pr_crit_once("BUG in find_next_ancestor: called with parent == child");
> > +		return NULL;
> > +	}
> > +
> > +	while (child->parent != parent) {
> > +		if (!child->parent)
> > +			return NULL;
> > +		child = child->parent;
> > +	}
> > +
> > +	return child;
> > +}
> > +
> > +/**
> > + * kernfs_obtain_root - get a dentry for the given kernfs_node
> > + * @sb: the kernfs super_block
> > + * @kn: kernfs_node for which a dentry is needed
> > + *
> > + * This can be used by callers which want to mount only a part of the kernfs
> > + * as root of the filesystem.
> > + */
> > +struct dentry *kernfs_obtain_root(struct super_block *sb,
> > +				  struct kernfs_node *kn)
> 
> Wouldn't @kn, @sb be a better order?  Also, kernfs super_blocks are
> determined by the kernfs_root and its namespace.  I wonder whether
> specifying @ns would be better.
> 
> > +{
> > +	struct dentry *dentry;
> > +	struct kernfs_node *knparent = NULL;
> > +
> > +	BUG_ON(sb->s_op != &kernfs_sops);
> > +
> > +	dentry = dget(sb->s_root);
> > +	if (!kn->parent) // this is the root
> 			^^^
> 			Do we do this now?
> 
> > +		return dentry;
> > +
> > +	knparent = find_next_ancestor(kn, NULL);
> > +	if (!knparent) {
> > +		pr_crit("BUG: find_next_ancestor for root dentry returned NULL\n");
> 
> Wouldn't stack dump helpful here?  Why not

Hm, yeah, that's a good reason to use WARN - thanks.

> 	if (WARN_ONCE(!knparent, "find_next..."))
> 		return ERR_PTR(-EINVAL);
> 
> or even just WARN_ON_ONCE().
> 
> > +		return ERR_PTR(-EINVAL);
> > +	}
> > +
> > +	do {
> > +		struct dentry *dtmp;
> > +		struct kernfs_node *kntmp;
> > +
> > +		if (kn == knparent)
> > +			return dentry;
> > +		kntmp = find_next_ancestor(kn, knparent);
> > +		if (!kntmp) {
> > +			pr_crit("BUG: find_next_ancestor returned NULL for node\n");
> 
> Ditto.  It'd be a kernel bug.  WARN is usually the better way.
> 
> > diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> > index a5ab74d..09cd718 100644
> > --- a/kernel/cgroup.c
> > +++ b/kernel/cgroup.c
> > @@ -2011,6 +2011,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
> >  	int ret;
> >  	int i;
> >  	bool new_sb;
> > +	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
> 
> Please move this upwards so that it's below other initialized
> variables.
> 
> > +
> > +	get_cgroup_ns(ns);
> > +
> > +	/* Check if the caller has permission to mount. */
> > +	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
> > +		put_cgroup_ns(ns);
> > +		return ERR_PTR(-EPERM);
> > +	}
> >  
> >  	/*
> >  	 * The first time anyone tries to mount a cgroup, enable the list
> > @@ -2127,6 +2136,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
> >  		goto out_unlock;
> >  	}
> >  
> > +	if (!opts.none && !capable(CAP_SYS_ADMIN)) {
> > +		ret = -EPERM;
> > +		goto out_unlock;
> > +	}
> 
> Hmmm... why is !opts.none necessary?  Please add a comment explaining
> why the above is necessary.
> 
> >  out_mount:
> >  	dentry = kernfs_mount(fs_type, flags, root->kf_root,
> >  			      is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
> >  			      &new_sb);
> > +
> > +	if (!IS_ERR(dentry)) {
> > +		/*
> > +		 * In non-init cgroup namespace, instead of root cgroup's
> > +		 * dentry, we return the dentry corresponding to the
> > +		 * cgroupns->root_cgrp.
> > +		 */
> > +		if (ns != &init_cgroup_ns) {
> 
> 	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
> 
> > +			struct dentry *nsdentry;
> > +			struct cgroup *cgrp;
> > +
> > +			cgrp = cset_cgroup_from_root(ns->root_cset, root);
> > +			nsdentry = kernfs_obtain_root(dentry->d_sb,
> > +				cgrp->kn);
> 
> Heh, is kernfs_obtain_root() the right name?  Maybe
> kernfs_node_to_inode()?
> 
> Thanks.
> 
> -- 
> tejun
> _______________________________________________
> Containers mailing list
> Containers at lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/containers