[lxc-users] unprivileged container with systemd?

Mon Feb 9 16:55:19 UTC 2015

I think I can help here. Systemd-based unprivileged containers will only work
properly with systemd compiled from git (master) or systemd 218 patched with:

From: Lennart Poettering <lennart at poettering.net>
Date: Thu, 8 Jan 2015 23:12:16 +0100
Subject: core: make EPERM errors when applying OOM adjustment for forked
 processes non-fatal

This should be useful for user namespaces.
---
 src/core/execute.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/core/execute.c b/src/core/execute.c
index 5e4135e..22b7862 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1359,12 +1359,16 @@ static int exec_child(ExecCommand *command,
         }
 
         if (context->oom_score_adjust_set) {
-                char t[16];
+                char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
 
-                snprintf(t, sizeof(t), "%i", context->oom_score_adjust);
-                char_array_0(t);
+                /* When we can't make this change due to EPERM, then
+                 * let's silently skip over it. User namespaces
+                 * prohibit write access to this file, and we
+                 * shouldn't trip up over that. */
 
-                if (write_string_file("/proc/self/oom_score_adj", t) < 0) {
+                sprintf(t, "%i", context->oom_score_adjust);
+                err = write_string_file("/proc/self/oom_score_adj", t);
+                if (err < 0 && err != -EPERM && err != -EACCES) {
                         *error = EXIT_OOM_ADJUST;
                         return -errno;
                 }

and

From: Martin Pitt <martin.pitt at ubuntu.com>
Date: Mon, 29 Dec 2014 14:18:28 +0100
Subject: Loopback setup / rtnl fixes

Various bug fixes in loopback device setup and netlink socket communication,
taken from upstream commits e95e909 ff. Fixes massive CPU usage due to tight
retry loops in user LXC containers.
---
 src/core/loopback-setup.c             | 42 +++++++++++++++--------------------
 src/libsystemd/sd-rtnl/rtnl-message.c | 18 ++++++++++++---
 src/libsystemd/sd-rtnl/sd-rtnl.c      |  4 +++-
 3 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/src/core/loopback-setup.c b/src/core/loopback-setup.c
index 98fc04d..ca10e20 100644
--- a/src/core/loopback-setup.c
+++ b/src/core/loopback-setup.c
@@ -56,30 +56,24 @@ static int start_loopback(sd_rtnl *rtnl) {
         return 0;
 }
 
-static int check_loopback(void) {
+static bool check_loopback(sd_rtnl *rtnl) {
+        _cleanup_rtnl_message_unref_ sd_rtnl_message *req = NULL, *reply = NULL;
+        unsigned flags;
         int r;
-        _cleanup_close_ int fd = -1;
-        union {
-                struct sockaddr sa;
-                struct sockaddr_in in;
-        } sa = {
-                .in.sin_family = AF_INET,
-                .in.sin_addr.s_addr = INADDR_LOOPBACK,
-        };
-
-        /* If we failed to set up the loop back device, check whether
-         * it might already be set up */
-
-        fd = socket(AF_INET, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
-        if (fd < 0)
-                return -errno;
-
-        if (bind(fd, &sa.sa, sizeof(sa.in)) >= 0)
-                r = 1;
-        else
-                r = errno == EADDRNOTAVAIL ? 0 : -errno;
-
-        return r;
+
+        r = sd_rtnl_message_new_link(rtnl, &req, RTM_GETLINK, LOOPBACK_IFINDEX);
+        if (r < 0)
+                return false;
+
+        r = sd_rtnl_call(rtnl, req, 0, &reply);
+        if (r < 0)
+                return false;
+
+        r = sd_rtnl_message_link_get_flags(reply, &flags);
+        if (r < 0)
+                return false;
+
+        return flags & IFF_UP;
 }
 
 int loopback_setup(void) {
@@ -92,7 +86,7 @@ int loopback_setup(void) {
 
         r = start_loopback(rtnl);
         if (r == -EPERM) {
-                if (check_loopback() < 0)
+                if (!check_loopback(rtnl))
                         return log_warning_errno(EPERM, "Failed to configure loopback device: %m");
         } else if (r < 0)
                 return log_warning_errno(r, "Failed to configure loopback device: %m");
diff --git a/src/libsystemd/sd-rtnl/rtnl-message.c b/src/libsystemd/sd-rtnl/rtnl-message.c
index 165e84d..e5a69bd 100644
--- a/src/libsystemd/sd-rtnl/rtnl-message.c
+++ b/src/libsystemd/sd-rtnl/rtnl-message.c
@@ -1294,8 +1294,10 @@ static int socket_recv_message(int fd, struct iovec *iov, uint32_t *_group, bool
                 /* no data */
                 if (errno == ENOBUFS)
                         log_debug("rtnl: kernel receive buffer overrun");
+                else if (errno == EAGAIN)
+                        log_debug("rtnl: no data in socket");
 
-                return (errno == EAGAIN) ? 0 : -errno;
+                return (errno == EAGAIN || errno == EINTR) ? 0 : -errno;
         } else if (r == 0)
                 /* connection was closed by the kernel */
                 return -ECONNRESET;
@@ -1307,8 +1309,10 @@ static int socket_recv_message(int fd, struct iovec *iov, uint32_t *_group, bool
                         struct ucred *ucred = (void *)CMSG_DATA(cmsg);
 
                         /* from the kernel */
-                        if (ucred->uid == 0 && ucred->pid == 0)
+                        if (ucred->pid == 0)
                                 auth = true;
+                        else
+                                log_debug("rtnl: ignoring message from pid %u", ucred->pid);
                 } else if (cmsg->cmsg_level == SOL_NETLINK &&
                            cmsg->cmsg_type == NETLINK_PKTINFO &&
                            cmsg->cmsg_len == CMSG_LEN(sizeof(struct nl_pktinfo))) {
@@ -1319,9 +1323,17 @@ static int socket_recv_message(int fd, struct iovec *iov, uint32_t *_group, bool
                 }
         }
 
-        if (!auth)
+        if (!auth) {
                 /* not from the kernel, ignore */
+                if (peek) {
+                        /* drop the message */
+                        r = recvmsg(fd, &msg, 0);
+                        if (r < 0)
+                                return (errno == EAGAIN || errno == EINTR) ? 0 : -errno;
+                }
+
                 return 0;
+        }
 
         if (group)
                 *_group = group;
diff --git a/src/libsystemd/sd-rtnl/sd-rtnl.c b/src/libsystemd/sd-rtnl/sd-rtnl.c
index abb011e..7d388c9 100644
--- a/src/libsystemd/sd-rtnl/sd-rtnl.c
+++ b/src/libsystemd/sd-rtnl/sd-rtnl.c
@@ -489,7 +489,7 @@ static int rtnl_poll(sd_rtnl *rtnl, bool need_more, uint64_t timeout_usec) {
         if (need_more)
                 /* Caller wants more data, and doesn't care about
                  * what's been read or any other timeouts. */
-                return e |= POLLIN;
+                e |= POLLIN;
         else {
                 usec_t until;
                 /* Caller wants to process if there is something to
@@ -701,6 +701,8 @@ int sd_rtnl_call(sd_rtnl *rtnl,
                 r = rtnl_poll(rtnl, true, left);
                 if (r < 0)
                         return r;
+                else if (r == 0)
+                        return -ETIMEDOUT;
 
                 r = dispatch_wqueue(rtnl);
                 if (r < 0)

Both are upstream patches and will therefore be included in the next
systemd release by default. If you want to run a distribution that is not
running systemd 218, the second patch cannot be applied; in that case you need
to install systemd 218 first. In any case, for now you can either apply the
first patch or disable the OOMScoreAdjust setting in the dbus.service file.
This will get you around the problem that dbus is not starting, but it won't
get you around the problem of systemd respawning endlessly and too fast (for
which the second patch exists) when it shows you the greeting screen "Welcome
to DISTRIBUTION-NAME-HERE!".

But I can say with some confidence that patching systemd for now is worth it. I
run Archlinux and Fedora 21 with rawhide enabled or Fedora Rawhide unprivileged
with a clean, nice, and fast boot.

If you decide to patch systemd 218 for now, here is a suggestion:

(1) Start the container (even if it hangs) with a folder bind-mounted from the
    host which contains the patched systemd version.
(2) Attach to it with lxc-attach and get a shell in the container.
(3) Install the distribution specific patched systemd and uninstall the old
    one.
(4) Stop the container.
(5) Clone the container and keep the original one untouched.
(6) Fiddle around with the clone.
(7) Delete the clone when you don't need it anymore or it breaks.
(8) Clone a new container from the original one (which has the patched
    systemd) whenever you need a fresh one.

This will get you through the period until all distros have the release that is
coming after systemd 218-1.

Best,
Christian

P.S. If you're just interested in running Debian Jessie then you can probably
download the systemd package from Debian Sid or Ubuntu Vivid, which contain
both patches, and install it.

> Hi Serge,
> 
> > > I just tried to follow
> > > 
> > >    https://www.stgraber.org/2014/01/17/lxc-1-0-unprivileged-containers/
> > > 
> > > once more to install a new container and it fails. First of all it
> > > was a problem with the access to the directory 
> > > 
> > >    ~/.local/share/lxc/jessie1
> > > 
> > > The owner changed to a mapped one -> 100000 and then there was no
> > > access for the lxcuser, which has uid 1001. I fixed this via setting
> > > write access for the users group.
> > > 
> > > But then I installed a download template:
> > > 
> > >    lxc-create -t download -n jessie1 -- -d debian -r jessie -a amd64
> > > 
> > > which worked without problems (except warnings regarding reopen tty).
> > > 
> > > If I try to start the container it ends up with:
> > > 
> > >    ~$ lxc-start -n jessie1
> > >    lxc_container: Permission denied - Unable to create /dev/.lxc for autodev
> > >    Failed to mount cgroup at /sys/fs/cgroup/systemd: Operation not permitted
> > > 
> > > Here it ends, nothing more happens and only a kill -9 works...
> > > 
> > > And yes, /sbin/init in the container is now a link to systemd:
> > > 
> > >    /sbin/init -> /lib/systemd/systemd
> > > 
> > > I suspect, this does not work at all without cgroup namespace support
> > > in the kernel? Or am I missing something else?
> > 
> > There's something else you're missing, but I'm not sure what.  What is
> > your environment (os/release and any custom installs)?  Try 1.1.0, and
> > make sure to re-create the container as the new config file should be
> > more correct for systemd backed containers.
> 
> the host is Debian wheezy, kernel 3.18.4 and a backported shadow
> package for newuidmap & Co. 
> 
> For LXC I have now lxc-1.1, cgmanager-0.35 and lxcfs-0.5. This works
> fine with an debian-container for wheezy and jessie. But if I switch
> jessie to systemd, it fails. I just tried an upgrade to experimental,
> but this fails, too.
> 
> Now I started with a fresh template download of ubuntu vivid. This
> works fine, too, but with upstart. If I install systemd and put a
> lxc.init_cmd=/bin/systemd, it fails too:
> 
>    $ lxc-start -n ubuntu -F -l DEBUG
>    WARN: could not reopen tty: Permission denied
>    systemd 218 running in system mode. (+PAM +AUDIT +SELINUX +IMA
>    +APPARMOR +SMACK +SYSVINIT +UTMP +LIBCRYPTSETUP +GCRYPT -GNUTLS +ACL
>    +XZ -LZ4 -SECCOMP +BLKID -ELFUTILS +KMOD -IDN)
>    Detected virtualization 'lxc'.
>    Detected architecture 'x86-64'.
> 
>    Welcome to Ubuntu Vivid Vervet (development branch)!
> 
>    Set hostname to <ubuntu>.
>    /etc/mtab is not a symlink or not pointing to /proc/self/mounts. This
>    is not supported anymore. Please make sure to replace this file by a
>    symlink to avoid incorrect or misleading mount(8) output.
>    Failed to install release agent, ignoring: No such file or directory
> 
> That's the same behaviour as with debian jessie and systemd. At this
> point even an lxc-attach does not work...
> 
> I have no idea, what is going on, maybe there is something too old on
> debian wheezy as a host?
> 
> Best regards
> 
> Dirk
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 819 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-users/attachments/20150209/53f7ae34/attachment.sig>