[lxc-users] unprivileged container with systemd?
Christian Brauner
christianvanbrauner at gmail.com
Mon Feb 9 17:06:29 UTC 2015
For the ones unfamiliar with patching software, here are links to my
Google Drive with patched systemd-versions for Fedora 21 and Rawhide,
and Archlinux:
Fedora:
(1) systemd
https://drive.google.com/file/d/0B_UAut69TSAiTmI3SnN4TzRTaXM/view?usp=sharing
(2) systemd-libs
https://drive.google.com/file/d/0B_UAut69TSAiQkJKTmFjVDRyYVU/view?usp=sharing
Archlinux:
(1) systemd
https://drive.google.com/file/d/0B_UAut69TSAidi1wdzJKNmc3X1E/view?usp=sharing
(2) libsystemd
https://drive.google.com/file/d/0B_UAut69TSAiNWpvWVVxZ2dSSnc/view?usp=sharing
(3) systemd-sysvcompat
https://drive.google.com/file/d/0B_UAut69TSAiODFDZXF4MFVKMXM/view?usp=sharing
Have fun!
Christian
> I think I can help here. Systemd-based unprivileged containers will only work
> properly with systemd compiled from git (master) or systemd 218 patched with:
>
> From: Lennart Poettering <lennart at poettering.net>
> Date: Thu, 8 Jan 2015 23:12:16 +0100
> Subject: core: make EPERM errors when applying OOM adjustment for forked
> processes non-fatal
>
> This should be useful for user namespaces.
> ---
> src/core/execute.c | 12 ++++++++----
> 1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/src/core/execute.c b/src/core/execute.c
> index 5e4135e..22b7862 100644
> --- a/src/core/execute.c
> +++ b/src/core/execute.c
> @@ -1359,12 +1359,16 @@ static int exec_child(ExecCommand *command,
> }
>
> if (context->oom_score_adjust_set) {
> - char t[16];
> + char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
>
> - snprintf(t, sizeof(t), "%i", context->oom_score_adjust);
> - char_array_0(t);
> + /* When we can't make this change due to EPERM, then
> + * let's silently skip over it. User namespaces
> + * prohibit write access to this file, and we
> + * shouldn't trip up over that. */
>
> - if (write_string_file("/proc/self/oom_score_adj", t) < 0) {
> + sprintf(t, "%i", context->oom_score_adjust);
> + err = write_string_file("/proc/self/oom_score_adj", t);
> + if (err < 0 && err != -EPERM && err != -EACCES) {
> *error = EXIT_OOM_ADJUST;
> return -errno;
> }
>
> and
>
> From: Martin Pitt <martin.pitt at ubuntu.com>
> Date: Mon, 29 Dec 2014 14:18:28 +0100
> Subject: Loopback setup / rtnl fixes
>
> Various bug fixes in loopback device setup and netlink socket communication,
> taken from upstream commits e95e909 ff. Fixes massive CPU usage due to tight
> retry loops in user LXC containers.
> ---
> src/core/loopback-setup.c | 42 +++++++++++++++--------------------
> src/libsystemd/sd-rtnl/rtnl-message.c | 18 ++++++++++++---
> src/libsystemd/sd-rtnl/sd-rtnl.c | 4 +++-
> 3 files changed, 36 insertions(+), 28 deletions(-)
>
> diff --git a/src/core/loopback-setup.c b/src/core/loopback-setup.c
> index 98fc04d..ca10e20 100644
> --- a/src/core/loopback-setup.c
> +++ b/src/core/loopback-setup.c
> @@ -56,30 +56,24 @@ static int start_loopback(sd_rtnl *rtnl) {
> return 0;
> }
>
> -static int check_loopback(void) {
> +static bool check_loopback(sd_rtnl *rtnl) {
> + _cleanup_rtnl_message_unref_ sd_rtnl_message *req = NULL, *reply = NULL;
> + unsigned flags;
> int r;
> - _cleanup_close_ int fd = -1;
> - union {
> - struct sockaddr sa;
> - struct sockaddr_in in;
> - } sa = {
> - .in.sin_family = AF_INET,
> - .in.sin_addr.s_addr = INADDR_LOOPBACK,
> - };
> -
> - /* If we failed to set up the loop back device, check whether
> - * it might already be set up */
> -
> - fd = socket(AF_INET, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
> - if (fd < 0)
> - return -errno;
> -
> - if (bind(fd, &sa.sa, sizeof(sa.in)) >= 0)
> - r = 1;
> - else
> - r = errno == EADDRNOTAVAIL ? 0 : -errno;
> -
> - return r;
> +
> + r = sd_rtnl_message_new_link(rtnl, &req, RTM_GETLINK, LOOPBACK_IFINDEX);
> + if (r < 0)
> + return false;
> +
> + r = sd_rtnl_call(rtnl, req, 0, &reply);
> + if (r < 0)
> + return false;
> +
> + r = sd_rtnl_message_link_get_flags(reply, &flags);
> + if (r < 0)
> + return false;
> +
> + return flags & IFF_UP;
> }
>
> int loopback_setup(void) {
> @@ -92,7 +86,7 @@ int loopback_setup(void) {
>
> r = start_loopback(rtnl);
> if (r == -EPERM) {
> - if (check_loopback() < 0)
> + if (!check_loopback(rtnl))
> return log_warning_errno(EPERM, "Failed to configure loopback device: %m");
> } else if (r < 0)
> return log_warning_errno(r, "Failed to configure loopback device: %m");
> diff --git a/src/libsystemd/sd-rtnl/rtnl-message.c b/src/libsystemd/sd-rtnl/rtnl-message.c
> index 165e84d..e5a69bd 100644
> --- a/src/libsystemd/sd-rtnl/rtnl-message.c
> +++ b/src/libsystemd/sd-rtnl/rtnl-message.c
> @@ -1294,8 +1294,10 @@ static int socket_recv_message(int fd, struct iovec *iov, uint32_t *_group, bool
> /* no data */
> if (errno == ENOBUFS)
> log_debug("rtnl: kernel receive buffer overrun");
> + else if (errno == EAGAIN)
> + log_debug("rtnl: no data in socket");
>
> - return (errno == EAGAIN) ? 0 : -errno;
> + return (errno == EAGAIN || errno == EINTR) ? 0 : -errno;
> } else if (r == 0)
> /* connection was closed by the kernel */
> return -ECONNRESET;
> @@ -1307,8 +1309,10 @@ static int socket_recv_message(int fd, struct iovec *iov, uint32_t *_group, bool
> struct ucred *ucred = (void *)CMSG_DATA(cmsg);
>
> /* from the kernel */
> - if (ucred->uid == 0 && ucred->pid == 0)
> + if (ucred->pid == 0)
> auth = true;
> + else
> + log_debug("rtnl: ignoring message from pid %u", ucred->pid);
> } else if (cmsg->cmsg_level == SOL_NETLINK &&
> cmsg->cmsg_type == NETLINK_PKTINFO &&
> cmsg->cmsg_len == CMSG_LEN(sizeof(struct nl_pktinfo))) {
> @@ -1319,9 +1323,17 @@ static int socket_recv_message(int fd, struct iovec *iov, uint32_t *_group, bool
> }
> }
>
> - if (!auth)
> + if (!auth) {
> /* not from the kernel, ignore */
> + if (peek) {
> + /* drop the message */
> + r = recvmsg(fd, &msg, 0);
> + if (r < 0)
> + return (errno == EAGAIN || errno == EINTR) ? 0 : -errno;
> + }
> +
> return 0;
> + }
>
> if (group)
> *_group = group;
> diff --git a/src/libsystemd/sd-rtnl/sd-rtnl.c b/src/libsystemd/sd-rtnl/sd-rtnl.c
> index abb011e..7d388c9 100644
> --- a/src/libsystemd/sd-rtnl/sd-rtnl.c
> +++ b/src/libsystemd/sd-rtnl/sd-rtnl.c
> @@ -489,7 +489,7 @@ static int rtnl_poll(sd_rtnl *rtnl, bool need_more, uint64_t timeout_usec) {
> if (need_more)
> /* Caller wants more data, and doesn't care about
> * what's been read or any other timeouts. */
> - return e |= POLLIN;
> + e |= POLLIN;
> else {
> usec_t until;
> /* Caller wants to process if there is something to
> @@ -701,6 +701,8 @@ int sd_rtnl_call(sd_rtnl *rtnl,
> r = rtnl_poll(rtnl, true, left);
> if (r < 0)
> return r;
> + else if (r == 0)
> + return -ETIMEDOUT;
>
> r = dispatch_wqueue(rtnl);
> if (r < 0)
>
> Both are upstream patches and will therefore be included in the next
> systemd-release per default. If you want to run a distribution that is not
> running systemd 218 the second patch cannot be applied. Hence you need to
> install systemd 218 in that case. In any case, for now you can either patch or
> disable the OOMScoreAdjust in the dbus.service file. This will get you around
> the problem that dbus is not starting but it won't get you around the problem
> of systemd respawning endlessly and too fast (for which the second patch
> exists) when it shows you the greeting screen "Welcome to
> DISTRIBUTION-NAME-HERE!".
>
> But I can say with some confidence that patching systemd for now is worth it. I
> run Archlinux and Fedora 21 with rawhide enabled or Fedora Rawhide unprivileged
> with a clean, nice, and fast boot.
>
> If you decide to patch systemd 218 for now, here is a suggestion:
>
> (1) Start the container, even if it hangs with a folder bind-mounted from the
> host which contains the patched systemd version.
> (2) Attach to it with lxc-attach and get a shell in the container.
> (3) Install the distribution specific patched systemd and uninstall the old
> one.
> (4) Stop the container.
> (5) Clone the container and keep the original one untouched.
> (6) Fiddle around with the clone.
> (7) Delete the clone when you don't need it anymore or it breaks.
> (8) Clone a new one from the original container with the patched-systemd to
> create a new one.
>
> This will get you through the period until all distros have the release that is
> coming after systemd 218-1.
>
> Best,
> Christian
>
> P.S. If you're just interested in running Debian Jessie then you can probably
> download the systemd package from Debian Sid or Ubuntu Vivid which
> contain both patches and install.
>
> > Hi Serge,
> >
> > > > I just tried to follow
> > > >
> > > > https://www.stgraber.org/2014/01/17/lxc-1-0-unprivileged-containers/
> > > >
> > > > once more to install a new container and it fails. First of all it
> > > > was a problem with the access to the directory
> > > >
> > > > ~/.local/share/lxc/jessie1
> > > >
> > > > The owner changed to a mapped one -> 100000 and then there was no
> > > > access for the lxcuser, which has uid 1001. I fixed this via setting
> > > > write access for the users group.
> > > >
> > > > But then I installed a download template:
> > > >
> > > > lxc-create -t download -n jessie1 -- -d debian -r jessie -a amd64
> > > >
> > > > which worked without problems (except warnings regarding reopen tty).
> > > >
> > > > If I try to start the container it ends up with:
> > > >
> > > > ~$ lxc-start -n jessie1
> > > > lxc_container: Permission denied - Unable to create /dev/.lxc for autodev
> > > > Failed to mount cgroup at /sys/fs/cgroup/systemd: Operation not permitted
> > > >
> > > > Here it ends, nothing more happens and only a kill -9 works...
> > > >
> > > > And yes, /sbin/init in the container is now a link to systemd:
> > > >
> > > > /sbin/init -> /lib/systemd/systemd
> > > >
> > > > I suspect, this does not work at all without cgroup namespace support
> > > > in the kernel? Or am I missing something else?
> > >
> > > There's something else you're missing, but I'm not sure what. What is
> > > your environment (os/release and any custom installs)? Try 1.1.0, and
> > > make sure to re-create the container as the new config file should be
> > > more correct for systemd backed containers.
> >
> > the host is Debian wheezy, kernel 3.18.4 and a backported shadow
> > package for newuidmap & Co.
> >
> > For LXC I have now lxc-1.1, cgmanager-0.35 and lxcfs-0.5. This works
> > fine with a debian-container for wheezy and jessie. But if I switch
> > jessie to systemd, it fails. I just tried an upgrade to experimental,
> > but this fails, too.
> >
> > Now I started with a fresh template download of ubuntu vivid. This
> > works fine, too, but with upstart. If I install systemd and put a
> > lxc.init_cmd=/bin/systemd, it fails too:
> >
> > $ lxc-start -n ubuntu -F -l DEBUG
> > WARN: could not reopen tty: Permission denied
> > systemd 218 running in system mode. (+PAM +AUDIT +SELINUX +IMA
> > +APPARMOR +SMACK +SYSVINIT +UTMP +LIBCRYPTSETUP +GCRYPT -GNUTLS +ACL
> > +XZ -LZ4 -SECCOMP +BLKID -ELFUTILS +KMOD -IDN)
> > Detected virtualization 'lxc'.
> > Detected architecture 'x86-64'.
> >
> > Welcome to Ubuntu Vivid Vervet (development branch)!
> >
> > Set hostname to <ubuntu>.
> > /etc/mtab is not a symlink or not pointing to /proc/self/mounts. This
> > is not supported anymore. Please make sure to replace this file by a
> > symlink to avoid incorrect or misleading mount(8) output.
> > Failed to install release agent, ignoring: No such file or directory
> >
> > That's the same behaviour as with debian jessie and systemd. At this
> > point even an lxc-attach does not work...
> >
> > I have no idea, what is going on, maybe there is something too old on
> > debian wheezy as a host?
> >
> > Best regards
> >
> > Dirk
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 819 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-users/attachments/20150209/6ce4e94c/attachment-0001.sig>
More information about the lxc-users
mailing list