[lxc-devel] [lxc/master] lxc: support CLONE_INTO_CGROUP
brauner on Github
lxc-bot at linuxcontainers.org
Mon Jun 29 11:58:43 UTC 2020
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 364 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20200629/1446238e/attachment.bin>
-------------- next part --------------
From f7176c3ea944ce2b9968b7c4a18c266639927395 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Mon, 29 Jun 2020 11:34:01 +0200
Subject: [PATCH] lxc: support CLONE_INTO_CGROUP
Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
src/lxc/cgroups/cgfsng.c | 3 ++
src/lxc/cgroups/cgroup.h | 8 +++
src/lxc/process_utils.c | 4 +-
src/lxc/process_utils.h | 1 +
src/lxc/start.c | 105 ++++++++++++++++++++++++++++-----------
src/lxc/start.h | 20 ++++----
6 files changed, 100 insertions(+), 41 deletions(-)
diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index 6c64c996c2..bab4ba3409 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -1549,6 +1549,9 @@ __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
struct hierarchy *h = ops->hierarchies[i];
int ret;
+ if (is_unified_hierarchy(h) && handler->clone_flags & CLONE_INTO_CGROUP)
+ continue;
+
ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
if (ret != 0)
return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);
diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h
index c5bf7941ad..e3712b710e 100644
--- a/src/lxc/cgroups/cgroup.h
+++ b/src/lxc/cgroups/cgroup.h
@@ -194,4 +194,12 @@ static inline bool pure_unified_layout(const struct cgroup_ops *ops)
return ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED;
}
+/* Return the unified hierarchy's cgfd_con fd, or -EBADF if ops has no unified hierarchy. */
+static inline int cgroup_unified_fd(const struct cgroup_ops *ops)
+{
+ if (ops->unified)
+ return ops->unified->cgfd_con;
+ return -EBADF;
+}
+
#endif
diff --git a/src/lxc/process_utils.c b/src/lxc/process_utils.c
index 7494def46b..ccc4c0bf98 100644
--- a/src/lxc/process_utils.c
+++ b/src/lxc/process_utils.c
@@ -28,7 +28,7 @@ lxc_log_define(process_utils, lxc);
* The nice thing about this is that we get fork() behavior. That is
* lxc_raw_clone() returns 0 in the child and the child pid in the parent.
*/
-__returns_twice static pid_t __lxc_raw_clone(unsigned long flags, int *pidfd)
+__returns_twice pid_t lxc_raw_legacy_clone(unsigned long flags, int *pidfd)
{
#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
@@ -108,7 +108,7 @@ __returns_twice pid_t lxc_raw_clone(unsigned long flags, int *pidfd)
pid = lxc_clone3(&args, CLONE_ARGS_SIZE_VER0);
if (pid < 0 && errno == ENOSYS) {
SYSTRACE("Falling back to legacy clone");
- return __lxc_raw_clone(flags, pidfd);
+ return lxc_raw_legacy_clone(flags, pidfd);
}
return pid;
diff --git a/src/lxc/process_utils.h b/src/lxc/process_utils.h
index 4ea898a633..61b0e412b7 100644
--- a/src/lxc/process_utils.h
+++ b/src/lxc/process_utils.h
@@ -240,6 +240,7 @@ extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd);
* The child must use lxc_raw_getpid() to retrieve its pid.
*/
extern pid_t lxc_raw_clone(unsigned long flags, int *pidfd);
+extern pid_t lxc_raw_legacy_clone(unsigned long flags, int *pidfd);
/*
* lxc_raw_clone_cb() - create a new process
diff --git a/src/lxc/start.c b/src/lxc/start.c
index c49b249fb3..244de39dd1 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -1081,8 +1081,7 @@ static int do_start(void *data)
/* Unshare CLONE_NEWNET after CLONE_NEWUSER. See
* https://github.com/lxc/lxd/issues/1978.
*/
- if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
- (CLONE_NEWNET | CLONE_NEWUSER)) {
+ if (handler->ns_unshare_flags & CLONE_NEWNET) {
ret = unshare(CLONE_NEWNET);
if (ret < 0) {
SYSERROR("Failed to unshare CLONE_NEWNET");
@@ -1190,7 +1189,7 @@ static int do_start(void *data)
*
* 8:cpuset:/
*/
- if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
+ if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
ret = unshare(CLONE_NEWCGROUP);
if (ret < 0) {
if (errno != EINVAL) {
@@ -1205,7 +1204,7 @@ static int do_start(void *data)
}
}
- if (handler->ns_clone_flags & CLONE_NEWTIME) {
+ if (handler->ns_unshare_flags & CLONE_NEWTIME) {
ret = unshare(CLONE_NEWTIME);
if (ret < 0) {
if (errno != EINVAL) {
@@ -1537,6 +1536,22 @@ int resolve_clone_flags(struct lxc_handler *handler)
if (wants_timens && (conf->ns_keep & ns_info[LXC_NS_TIME].clone_flag))
return log_trace_errno(-1, EINVAL, "Requested to keep time namespace while also specifying offsets");
+ /* Deal with namespaces that are unshared. */
+ if (handler->ns_clone_flags & CLONE_NEWTIME)
+ handler->ns_unshare_flags |= CLONE_NEWTIME;
+
+ if (!pure_unified_layout(handler->cgroup_ops) && handler->ns_clone_flags & CLONE_NEWCGROUP)
+ handler->ns_unshare_flags |= CLONE_NEWCGROUP;
+
+ if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
+ (CLONE_NEWNET | CLONE_NEWUSER))
+ handler->ns_unshare_flags |= CLONE_NEWNET;
+
+ /* Deal with namespaces that are spawned. */
+ handler->ns_on_clone_flags = handler->ns_clone_flags & ~handler->ns_unshare_flags;
+
+ handler->clone_flags = handler->ns_on_clone_flags | CLONE_PIDFD;
+
return 0;
}
@@ -1659,21 +1674,6 @@ static int lxc_spawn(struct lxc_handler *handler)
}
/* Create a process in a new set of namespaces. */
- handler->ns_on_clone_flags = handler->ns_clone_flags;
- if (handler->ns_clone_flags & CLONE_NEWUSER) {
- /* If CLONE_NEWUSER and CLONE_NEWNET was requested, we need to
- * clone a new user namespace first and only later unshare our
- * network namespace to ensure that network devices ownership is
- * set up correctly.
- */
- handler->ns_on_clone_flags &= ~CLONE_NEWNET;
- }
- /* The cgroup namespace gets unshare()ed not clone()ed. */
- handler->ns_on_clone_flags &= ~CLONE_NEWCGROUP;
-
- /* The time namespace (currently) gets unshare()ed not clone()ed. */
- handler->ns_on_clone_flags &= ~CLONE_NEWTIME;
-
if (share_ns) {
pid_t attacher_pid;
@@ -1689,15 +1689,64 @@ static int lxc_spawn(struct lxc_handler *handler)
SYSERROR("Intermediate process failed");
goto out_delete_net;
}
+
+ if (handler->pid < 0) {
+ SYSERROR(LXC_CLONE_ERROR);
+ goto out_delete_net;
+ }
} else {
- handler->pid = lxc_raw_clone_cb(do_start, handler,
- CLONE_PIDFD | handler->ns_on_clone_flags,
- &handler->pidfd);
- }
- if (handler->pid < 0) {
- SYSERROR(LXC_CLONE_ERROR);
- goto out_delete_net;
+ int cgroup_fd = -EBADF; /* Must start out invalid: it is read below even when no cgroup2 fd was looked up. */
+
+ struct lxc_clone_args clone_args = {
+ .flags = handler->clone_flags,
+ .pidfd = ptr_to_u64(&handler->pidfd),
+ .exit_signal = SIGCHLD,
+ };
+
+ if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
+ cgroup_fd = cgroup_unified_fd(cgroup_ops);
+ if (cgroup_fd >= 0) {
+ handler->clone_flags |= CLONE_INTO_CGROUP;
+ clone_args.flags |= CLONE_INTO_CGROUP;
+ clone_args.cgroup = cgroup_fd;
+ }
+ }
+
+ /* Try to spawn directly into target cgroup. */
+ handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER2);
+ if (handler->pid < 0) {
+ SYSTRACE("Failed to spawn container directly into target cgroup");
+
+ /* Kernel might simply be too old for CLONE_INTO_CGROUP: unshare() the cgroup namespace later instead, but only if one was requested. */
+ handler->clone_flags &= ~(CLONE_INTO_CGROUP | CLONE_NEWCGROUP);
+ handler->ns_on_clone_flags &= ~CLONE_NEWCGROUP;
+ handler->ns_unshare_flags |= handler->ns_clone_flags & CLONE_NEWCGROUP;
+
+ clone_args.flags = handler->clone_flags;
+
+ handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER0);
+ } else if (cgroup_fd >= 0) {
+ TRACE("Spawned container directly into target cgroup via cgroup2 fd %d", cgroup_fd);
+ }
+
+ /* Kernel might be too old for clone3(). */
+ if (handler->pid < 0) {
+ SYSTRACE("Failed to spawn container via clone3()");
+ handler->pid = lxc_raw_legacy_clone(handler->clone_flags, &handler->pidfd);
+ }
+
+ if (handler->pid < 0) {
+ SYSERROR(LXC_CLONE_ERROR);
+ goto out_delete_net;
+ }
+
+ if (handler->pid == 0) {
+ (void)do_start(handler);
+ _exit(EXIT_FAILURE);
+ }
}
+ if (handler->pidfd < 0)
+ handler->clone_flags &= ~CLONE_PIDFD;
TRACE("Cloned child process %d", handler->pid);
/* Verify that we can actually make use of pidfds. */
/* Verify that we can actually make use of pidfds. */
@@ -1853,7 +1902,7 @@ static int lxc_spawn(struct lxc_handler *handler)
}
TRACE("Set up cgroup2 device controller limits");
- if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
+ if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
/* Now we're ready to preserve the cgroup namespace */
ret = lxc_try_preserve_ns(handler->pid, "cgroup");
if (ret < 0) {
@@ -1870,7 +1919,7 @@ static int lxc_spawn(struct lxc_handler *handler)
cgroup_ops->payload_finalize(cgroup_ops);
TRACE("Finished setting up cgroups");
- if (handler->ns_clone_flags & CLONE_NEWTIME) {
+ if (handler->ns_unshare_flags & CLONE_NEWTIME) {
/* Now we're ready to preserve the time namespace */
ret = lxc_try_preserve_ns(handler->pid, "time");
if (ret < 0) {
diff --git a/src/lxc/start.h b/src/lxc/start.h
index ece4aac472..6852f6e22d 100644
--- a/src/lxc/start.h
+++ b/src/lxc/start.h
@@ -26,20 +26,18 @@ struct lxc_handler {
* list the clone flags that were unshare()ed rather than clone()ed
* because of ordering requirements (e.g. CLONE_NEWNET and
* CLONE_NEWUSER) or implementation details.
- *
- * @ns_keep_flags;
- * - The clone flags for the namespaces that the container will inherit
- * from the parent. They are not recorded in the handler itself but
- * are present in the container's config.
*
- * @ns_share_flags;
- * - The clone flags for the namespaces that the container will share
- * with another process. They are not recorded in the handler itself
- * but are present in the container's config.
+ * @ns_unshare_flags
+ * - Flags for namespaces that were unshared, not cloned.
+ *
+ * @clone_flags
+ * - ns_on_clone_flags | other flags (e.g. CLONE_PIDFD) used to create the container.
*/
struct /* lxc_ns */ {
- int ns_clone_flags;
- int ns_on_clone_flags;
+ unsigned int ns_clone_flags;
+ unsigned int ns_on_clone_flags;
+ unsigned int ns_unshare_flags;
+ unsigned int clone_flags;
};
/* File descriptor to pin the rootfs for privileged containers. */
More information about the lxc-devel
mailing list