[lxc-devel] [lxc/master] RFC: Generated Apparmor profiles, namespaces, stacking

Blub on Github lxc-bot at linuxcontainers.org
Wed Jul 18 12:30:42 UTC 2018


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 1058 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20180718/79c08645/attachment.bin>
-------------- next part --------------
From dd3da3788c8103974ef78735e0797938f78c661c Mon Sep 17 00:00:00 2001
From: Wolfgang Bumiller <w.bumiller at proxmox.com>
Date: Thu, 12 Jul 2018 15:16:40 +0200
Subject: [PATCH 1/3] lsm: fixup lsm_process_label_set_at return values

Always return -1 on error (previously some code paths returned
-1 while others returned negative error codes). Callers must
not assume 'errno' is set afterwards: the function already
prints its own errors, and not all code paths leave a usable
errno value.

Signed-off-by: Wolfgang Bumiller <w.bumiller at proxmox.com>
---
 src/lxc/lsm/apparmor.c |  2 +-
 src/lxc/lsm/lsm.c      | 12 ++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/lxc/lsm/apparmor.c b/src/lxc/lsm/apparmor.c
index 1507917c8..95b61943e 100644
--- a/src/lxc/lsm/apparmor.c
+++ b/src/lxc/lsm/apparmor.c
@@ -241,7 +241,7 @@ static int apparmor_process_label_set(const char *inlabel, struct lxc_conf *conf
 	ret = lsm_process_label_set_at(label_fd, label, on_exec);
 	close(label_fd);
 	if (ret < 0) {
-		SYSERROR("Failed to change apparmor profile to %s", label);
+		ERROR("Failed to change apparmor profile to %s", label);
 		return -1;
 	}
 
diff --git a/src/lxc/lsm/lsm.c b/src/lxc/lsm/lsm.c
index f4500ae20..8d7de2dbe 100644
--- a/src/lxc/lsm/lsm.c
+++ b/src/lxc/lsm/lsm.c
@@ -142,18 +142,20 @@ int lsm_process_label_set_at(int label_fd, const char *label, bool on_exec)
 
 		if (on_exec) {
 			ERROR("Changing AppArmor profile on exec not supported");
-			return -EINVAL;
+			return -1;
 		}
 
 		len = strlen(label) + strlen("changeprofile ") + 1;
 		command = malloc(len);
 		if (!command)
-			return -1;
+			goto on_error;
 
 		ret = snprintf(command, len, "changeprofile %s", label);
 		if (ret < 0 || (size_t)ret >= len) {
+			int saved_errno = errno;
 			free(command);
-			return -1;
+			errno = saved_errno;
+			goto on_error;
 		}
 
 		ret = lxc_write_nointr(label_fd, command, len - 1);
@@ -161,9 +163,11 @@ int lsm_process_label_set_at(int label_fd, const char *label, bool on_exec)
 	} else if (strcmp(name, "SELinux") == 0) {
 		ret = lxc_write_nointr(label_fd, label, strlen(label));
 	} else {
-		ret = -EINVAL;
+		errno = EINVAL;
+		ret = -1;
 	}
 	if (ret < 0) {
+on_error:
 		SYSERROR("Failed to set %s label \"%s\"", name, label);
 		return -1;
 	}

From ce79df8fd102cdef9228aace914bedebb0a6ff51 Mon Sep 17 00:00:00 2001
From: Wolfgang Bumiller <w.bumiller at proxmox.com>
Date: Wed, 18 Jul 2018 12:43:37 +0200
Subject: [PATCH 2/3] utils: add must_concat helper

Signed-off-by: Wolfgang Bumiller <w.bumiller at proxmox.com>
---
 src/lxc/utils.c | 24 ++++++++++++++++++++++++
 src/lxc/utils.h |  1 +
 2 files changed, 25 insertions(+)

diff --git a/src/lxc/utils.c b/src/lxc/utils.c
index 6bb05df00..ad101ae50 100644
--- a/src/lxc/utils.c
+++ b/src/lxc/utils.c
@@ -2425,6 +2425,30 @@ int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
 	return fret;
 }
 
+char *must_concat(const char *first, ...)
+{
+	va_list args;
+	char *cur, *dest;
+	size_t cur_len, it_len = strlen(first);
+
+	dest = must_copy_string(first);
+	cur_len = it_len;
+
+	va_start(args, first);
+	while ((cur = va_arg(args, char *)) != NULL) {
+		it_len = strlen(cur);
+
+		dest = must_realloc(dest, cur_len + it_len + 1);
+
+		(void)memcpy(dest + cur_len, cur, it_len);
+		cur_len += it_len;
+	}
+	va_end(args);
+
+	dest[cur_len] = 0;
+	return dest;
+}
+
 char *must_make_path(const char *first, ...)
 {
 	va_list args;
diff --git a/src/lxc/utils.h b/src/lxc/utils.h
index 295e7862c..ed486853a 100644
--- a/src/lxc/utils.h
+++ b/src/lxc/utils.h
@@ -567,6 +567,7 @@ extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *),
 /* Concatenate all passed-in strings into one path. Do not fail. If any piece
  * is not prefixed with '/', add a '/'.
  */
+__attribute__((sentinel)) extern char *must_concat(const char *first, ...);
 __attribute__((sentinel)) extern char *must_make_path(const char *first, ...);
 __attribute__((sentinel)) extern char *must_append_path(char *first, ...);
 

From 1121d33d6e04c7c098443dbcc3e456d3152c59b6 Mon Sep 17 00:00:00 2001
From: Wolfgang Bumiller <w.bumiller at proxmox.com>
Date: Mon, 25 Jun 2018 14:38:11 +0200
Subject: [PATCH 3/3] apparmor profile generation

This copies lxd's apparmor profile generation. It tries to
detect features such as cgroup namespaces, apparmor
namespaces and stacking support, and conditionally includes
profile parts for unprivileged containers.

This introduces the following changes to the configuration:
  lxc.apparmor.profile = generated
    The fixed value 'generated' will cause this
    functionality to be used. Otherwise there should be no
    functional changes unless specifically requested with
    the next key:
  lxc.apparmor.allow_nesting
    This is a boolean which, if enabled, causes the
    following changes: When generated apparmor profiles are
    used, they will contain the necessary changes to allow
    creating a nested container. In addition to the usual
    mount points, /dev/.lxc/proc and /dev/.lxc/sys will
    contain procfs and sysfs mount points without the lxcfs
    overlays, which, if generated apparmor profiles are
    being used, will not be read/writable directly.
  lxc.apparmor.raw
    A list of raw apparmor profile lines to append to the
    profile. Only valid when using generated profiles.

Signed-off-by: Wolfgang Bumiller <w.bumiller at proxmox.com>
---
 src/lxc/conf.c         |  43 ++-
 src/lxc/conf.h         |   7 +-
 src/lxc/confile.c      |  93 +++++
 src/lxc/criu.c         |   3 +-
 src/lxc/lsm/apparmor.c | 908 ++++++++++++++++++++++++++++++++++++++++++++++---
 src/lxc/lsm/lsm.c      |  26 +-
 src/lxc/lsm/lsm.h      |   8 +-
 src/lxc/lsm/nop.c      |   2 +-
 src/lxc/lsm/selinux.c  |   4 +-
 src/lxc/start.c        |  18 +-
 10 files changed, 1059 insertions(+), 53 deletions(-)

diff --git a/src/lxc/conf.c b/src/lxc/conf.c
index 33beb43d1..c19b0ae19 100644
--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -2324,7 +2324,23 @@ static int setup_mount(const struct lxc_conf *conf,
 	return ret;
 }
 
-FILE *make_anonymous_mount_file(struct lxc_list *mount)
+/*
+ * In order for nested containers to be able to mount /proc and /sys they need
+ * to see "pure" proc and sysfs mount points with nothing mounted on top
+ * (like lxcfs).
+ * For this we provide proc and sysfs in /dev/.lxc/{proc,sys} while using an
+ * apparmor rule to deny access to them. This is mostly for convenience: The
+ * container's root user can mount them anyway and thus has access to the two
+ * file systems. But a non-root user in the container should not be allowed to
+ * access them as a side effect without explicitly allowing it.
+ */
+static const char nesting_helpers[] =
+"proc dev/.lxc/proc proc create=dir,optional\n"
+"sys dev/.lxc/sys sysfs create=dir,optional\n"
+;
+
+FILE *make_anonymous_mount_file(struct lxc_list *mount,
+				bool include_nesting_helpers)
 {
 	int ret;
 	char *mount_entry;
@@ -2366,6 +2382,13 @@ FILE *make_anonymous_mount_file(struct lxc_list *mount)
 			goto on_error;
 	}
 
+	if (include_nesting_helpers) {
+		ret = lxc_write_nointr(fd, nesting_helpers,
+				       sizeof(nesting_helpers)-1);
+		if (ret != sizeof(nesting_helpers)-1)
+			goto on_error;
+	}
+
 	ret = lseek(fd, 0, SEEK_SET);
 	if (ret < 0)
 		goto on_error;
@@ -2386,7 +2409,7 @@ static int setup_mount_entries(const struct lxc_conf *conf,
 	int ret;
 	FILE *f;
 
-	f = make_anonymous_mount_file(mount);
+	f = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
 	if (!f)
 		return -1;
 
@@ -2702,6 +2725,7 @@ struct lxc_conf *lxc_conf_init(void)
 	lxc_list_init(&new->groups);
 	lxc_list_init(&new->state_clients);
 	new->lsm_aa_profile = NULL;
+	lxc_list_init(&new->lsm_aa_raw);
 	new->lsm_se_context = NULL;
 	new->tmp_umount_proc = false;
 
@@ -3986,6 +4010,19 @@ void lxc_clear_includes(struct lxc_conf *conf)
 	}
 }
 
+int lxc_clear_apparmor_raw(struct lxc_conf *c)
+{
+	struct lxc_list *it, *next;
+
+	lxc_list_for_each_safe (it, &c->lsm_aa_raw, next) {
+		lxc_list_del(it);
+		free(it->elem);
+		free(it);
+	}
+
+	return 0;
+}
+
 void lxc_conf_free(struct lxc_conf *conf)
 {
 	if (!conf)
@@ -4013,6 +4050,7 @@ void lxc_conf_free(struct lxc_conf *conf)
 	free(conf->syslog);
 	lxc_free_networks(&conf->network);
 	free(conf->lsm_aa_profile);
+	free(conf->lsm_aa_profile_computed);
 	free(conf->lsm_se_context);
 	lxc_seccomp_free(conf);
 	lxc_clear_config_caps(conf);
@@ -4029,6 +4067,7 @@ void lxc_conf_free(struct lxc_conf *conf)
 	lxc_clear_limits(conf, "lxc.prlimit");
 	lxc_clear_sysctls(conf, "lxc.sysctl");
 	lxc_clear_procs(conf, "lxc.proc");
+	lxc_clear_apparmor_raw(conf);
 	free(conf->cgroup_meta.dir);
 	free(conf->cgroup_meta.controllers);
 	free(conf);
diff --git a/src/lxc/conf.h b/src/lxc/conf.h
index f7a879c30..bcc97277c 100644
--- a/src/lxc/conf.h
+++ b/src/lxc/conf.h
@@ -272,7 +272,10 @@ struct lxc_conf {
 	};
 
 	char *lsm_aa_profile;
+	char *lsm_aa_profile_computed;
+	unsigned int lsm_aa_allow_nesting;
 	unsigned int lsm_aa_allow_incomplete;
+	struct lxc_list lsm_aa_raw;
 	char *lsm_se_context;
 	bool tmp_umount_proc;
 	char *seccomp;  /* filename with the seccomp rules */
@@ -417,7 +420,8 @@ extern int parse_mntopts(const char *mntopts, unsigned long *mntflags,
 extern void tmp_proc_unmount(struct lxc_conf *lxc_conf);
 extern void remount_all_slave(void);
 extern void suggest_default_idmap(void);
-extern FILE *make_anonymous_mount_file(struct lxc_list *mount);
+extern FILE *make_anonymous_mount_file(struct lxc_list *mount,
+				       bool include_nesting_helpers);
 extern struct lxc_list *sort_cgroup_settings(struct lxc_list *cgroup_settings);
 extern unsigned long add_required_remount_flags(const char *s, const char *d,
 						unsigned long flags);
@@ -431,5 +435,6 @@ extern int setup_sysctl_parameters(struct lxc_list *sysctls);
 extern int lxc_clear_sysctls(struct lxc_conf *c, const char *key);
 extern int setup_proc_filesystem(struct lxc_list *procs, pid_t pid);
 extern int lxc_clear_procs(struct lxc_conf *c, const char *key);
+extern int lxc_clear_apparmor_raw(struct lxc_conf *c);
 
 #endif /* __LXC_CONF_H */
diff --git a/src/lxc/confile.c b/src/lxc/confile.c
index 5a18d11bf..36a38af03 100644
--- a/src/lxc/confile.c
+++ b/src/lxc/confile.c
@@ -84,7 +84,9 @@ lxc_log_define(confile, lxc);
 
 lxc_config_define(autodev);
 lxc_config_define(apparmor_allow_incomplete);
+lxc_config_define(apparmor_allow_nesting);
 lxc_config_define(apparmor_profile);
+lxc_config_define(apparmor_raw);
 lxc_config_define(cap_drop);
 lxc_config_define(cap_keep);
 lxc_config_define(cgroup_controller);
@@ -158,6 +160,8 @@ static struct lxc_config_t config[] = {
 	{ "lxc.arch",                      set_config_personality,                 get_config_personality,                 clr_config_personality,               },
 	{ "lxc.apparmor.profile",          set_config_apparmor_profile,            get_config_apparmor_profile,            clr_config_apparmor_profile,          },
 	{ "lxc.apparmor.allow_incomplete", set_config_apparmor_allow_incomplete,   get_config_apparmor_allow_incomplete,   clr_config_apparmor_allow_incomplete, },
+	{ "lxc.apparmor.allow_nesting",    set_config_apparmor_allow_nesting,      get_config_apparmor_allow_nesting,      clr_config_apparmor_allow_nesting,    },
+	{ "lxc.apparmor.raw",              set_config_apparmor_raw,                get_config_apparmor_raw,                clr_config_apparmor_raw,              },
 	{ "lxc.autodev",                   set_config_autodev,                     get_config_autodev,                     clr_config_autodev,                   },
 	{ "lxc.cap.drop",                  set_config_cap_drop,                    get_config_cap_drop,                    clr_config_cap_drop,                  },
 	{ "lxc.cap.keep",                  set_config_cap_keep,                    get_config_cap_keep,                    clr_config_cap_keep,                  },
@@ -1132,6 +1136,52 @@ static int set_config_apparmor_allow_incomplete(const char *key,
 	return 0;
 }
 
+static int set_config_apparmor_allow_nesting(const char *key,
+					     const char *value,
+					     struct lxc_conf *lxc_conf,
+					     void *data)
+{
+	if (lxc_config_value_empty(value)) {
+		lxc_conf->lsm_aa_allow_nesting = 0;
+		return 0;
+	}
+
+	if (lxc_safe_uint(value, &lxc_conf->lsm_aa_allow_nesting) < 0)
+		return -1;
+
+	if (lxc_conf->lsm_aa_allow_nesting > 1)
+		return -1;
+
+	return 0;
+}
+
+static int set_config_apparmor_raw(const char *key,
+				   const char *value,
+				   struct lxc_conf *lxc_conf,
+				   void *data)
+{
+	char *elem;
+	struct lxc_list *list;
+
+	if (lxc_config_value_empty(value))
+		return lxc_clear_apparmor_raw(lxc_conf);
+
+	list = malloc(sizeof(*list));
+	if (!list)
+		return -1;
+
+	elem = strdup(value);
+	if (!elem) {
+		free(list);
+		return -1;
+	}
+	list->elem = elem;
+
+	lxc_list_add_tail(&lxc_conf->lsm_aa_raw, list);
+
+	return 0;
+}
+
 static int set_config_selinux_context(const char *key, const char *value,
 				      struct lxc_conf *lxc_conf, void *data)
 {
@@ -2973,6 +3023,33 @@ static int get_config_apparmor_allow_incomplete(const char *key, char *retv,
 				c->lsm_aa_allow_incomplete);
 }
 
+static int get_config_apparmor_allow_nesting(const char *key, char *retv,
+					     int inlen, struct lxc_conf *c,
+					     void *data)
+{
+	return lxc_get_conf_int(c, retv, inlen,
+				c->lsm_aa_allow_nesting);
+}
+
+static int get_config_apparmor_raw(const char *key, char *retv,
+				   int inlen, struct lxc_conf *c,
+				   void *data)
+{
+	int len, fulllen = 0;
+	struct lxc_list *it;
+
+	if (!retv)
+		inlen = 0;
+	else
+		memset(retv, 0, inlen);
+
+	lxc_list_for_each(it, &c->lsm_aa_raw) {
+		strprint(retv, inlen, "%s\n", (char *)it->elem);
+	}
+
+	return fulllen;
+}
+
 static int get_config_selinux_context(const char *key, char *retv, int inlen,
 				      struct lxc_conf *c, void *data)
 {
@@ -3763,6 +3840,21 @@ static inline int clr_config_apparmor_allow_incomplete(const char *key,
 	return 0;
 }
 
+static inline int clr_config_apparmor_allow_nesting(const char *key,
+						    struct lxc_conf *c,
+						    void *data)
+{
+	c->lsm_aa_allow_nesting = 0;
+	return 0;
+}
+
+static inline int clr_config_apparmor_raw(const char *key,
+					  struct lxc_conf *c,
+					  void *data)
+{
+	return lxc_clear_apparmor_raw(c);
+}
+
 static inline int clr_config_selinux_context(const char *key,
 					     struct lxc_conf *c, void *data)
 {
@@ -4955,6 +5047,7 @@ int lxc_list_subkeys(struct lxc_conf *conf, const char *key, char *retv,
 
 	if (!strcmp(key, "lxc.apparmor")) {
 		strprint(retv, inlen, "allow_incomplete\n");
+		strprint(retv, inlen, "allow_nesting\n");
 		strprint(retv, inlen, "profile\n");
 	} else if (!strcmp(key, "lxc.cgroup")) {
 		strprint(retv, inlen, "dir\n");
diff --git a/src/lxc/criu.c b/src/lxc/criu.c
index c36421627..64ea4f024 100644
--- a/src/lxc/criu.c
+++ b/src/lxc/criu.c
@@ -378,7 +378,8 @@ static void exec_criu(struct cgroup_ops *cgroup_ops, struct criu_opts *opts)
 		DECLARE_ARG(opts->user->action_script);
 	}
 
-	mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list);
+	mnts = make_anonymous_mount_file(&opts->c->lxc_conf->mount_list,
+	                                 opts->c->lxc_conf->lsm_aa_allow_nesting);
 	if (!mnts)
 		goto err;
 
diff --git a/src/lxc/lsm/apparmor.c b/src/lxc/lsm/apparmor.c
index 95b61943e..7caffbc78 100644
--- a/src/lxc/lsm/apparmor.c
+++ b/src/lxc/lsm/apparmor.c
@@ -32,19 +32,358 @@
 #include "lsm.h"
 #include "conf.h"
 #include "utils.h"
+#include "caps.h"
+#include "parse.h"
 
 lxc_log_define(apparmor, lsm);
 
 /* set by lsm_apparmor_drv_init if true */
 static int aa_enabled = 0;
+static bool aa_supports_unix;
+static bool aa_can_stack;
+static bool aa_is_stacked;
+static bool aa_admin;
 
 static int mount_features_enabled = 0;
 
+static char sanitized_lxcpath[sizeof(LXCPATH)];
+static char aa_cache_path[PATH_MAX];
+static char aa_profile_path[PATH_MAX];
+
 #define AA_DEF_PROFILE "lxc-container-default"
 #define AA_DEF_PROFILE_CGNS "lxc-container-default-cgns"
 #define AA_MOUNT_RESTR "/sys/kernel/security/apparmor/features/mount/mask"
 #define AA_ENABLED_FILE "/sys/module/apparmor/parameters/enabled"
 #define AA_UNCHANGED "unchanged"
+#define AA_GENERATED "generated"
+
+#define AA_CMD_LOAD   'r'
+#define AA_CMD_UNLOAD 'R'
+#define AA_CMD_PARSE  'Q'
+
+static const char AA_PROFILE_BASE[] =
+"  ### Base profile\n"
+"  capability,\n"
+"  dbus,\n"
+"  file,\n"
+"  network,\n"
+"  umount,\n"
+"\n"
+"  # Allow us to receive signals from anywhere.\n"
+"  signal (receive),\n"
+"\n"
+"  # Allow us to send signals to ourselves\n"
+"  signal peer=@{profile_name},\n"
+"\n"
+"  # Allow other processes to read our /proc entries, futexes, perf tracing and\n"
+"  # kcmp for now (they will need 'read' in the first place). Administrators can\n"
+"  # override with:\n"
+"  #   deny ptrace (readby) ...\n"
+"  ptrace (readby),\n"
+"\n"
+"  # Allow other processes to trace us by default (they will need 'trace' in\n"
+"  # the first place). Administrators can override with:\n"
+"  #   deny ptrace (tracedby) ...\n"
+"  ptrace (tracedby),\n"
+"\n"
+"  # Allow us to ptrace ourselves\n"
+"  ptrace peer=@{profile_name},\n"
+"\n"
+"  # ignore DENIED message on / remount\n"
+"  deny mount options=(ro, remount) -> /,\n"
+"  deny mount options=(ro, remount, silent) -> /,\n"
+"\n"
+"  # allow tmpfs mounts everywhere\n"
+"  mount fstype=tmpfs,\n"
+"\n"
+"  # allow hugetlbfs mounts everywhere\n"
+"  mount fstype=hugetlbfs,\n"
+"\n"
+"  # allow mqueue mounts everywhere\n"
+"  mount fstype=mqueue,\n"
+"\n"
+"  # allow fuse mounts everywhere\n"
+"  mount fstype=fuse,\n"
+"  mount fstype=fuse.*,\n"
+"\n"
+"  # deny access under /proc/bus to avoid e.g. messing with pci devices directly\n"
+"  deny @{PROC}/bus/** wklx,\n"
+"\n"
+"  # deny writes in /proc/sys/fs but allow binfmt_misc to be mounted\n"
+"  mount fstype=binfmt_misc -> /proc/sys/fs/binfmt_misc/,\n"
+"  deny @{PROC}/sys/fs/** wklx,\n"
+"\n"
+"  # allow efivars to be mounted, writing to it will be blocked though\n"
+"  mount fstype=efivarfs -> /sys/firmware/efi/efivars/,\n"
+"\n"
+"  # block some other dangerous paths\n"
+"  deny @{PROC}/kcore rwklx,\n"
+"  deny @{PROC}/sysrq-trigger rwklx,\n"
+"\n"
+"  # deny writes in /sys except for /sys/fs/cgroup, also allow\n"
+"  # fusectl, securityfs and debugfs to be mounted there (read-only)\n"
+"  mount fstype=fusectl -> /sys/fs/fuse/connections/,\n"
+"  mount fstype=securityfs -> /sys/kernel/security/,\n"
+"  mount fstype=debugfs -> /sys/kernel/debug/,\n"
+"  deny mount fstype=debugfs -> /var/lib/ureadahead/debugfs/,\n"
+"  mount fstype=proc -> /proc/,\n"
+"  mount fstype=sysfs -> /sys/,\n"
+"  mount options=(rw, nosuid, nodev, noexec, remount) -> /sys/,\n"
+"  deny /sys/firmware/efi/efivars/** rwklx,\n"
+"  # note, /sys/kernel/security/** handled below\n"
+"  mount options=(move) /sys/fs/cgroup/cgmanager/ -> /sys/fs/cgroup/cgmanager.lower/,\n"
+"  mount options=(ro, nosuid, nodev, noexec, remount, strictatime) -> /sys/fs/cgroup/,\n"
+"\n"
+"  # deny reads from debugfs\n"
+"  deny /sys/kernel/debug/{,**} rwklx,\n"
+"\n"
+"  # allow paths to be made slave, shared, private or unbindable\n"
+"  # FIXME: This currently doesn't work due to the apparmor parser treating those as allowing all mounts.\n"
+"#  mount options=(rw,make-slave) -> **,\n"
+"#  mount options=(rw,make-rslave) -> **,\n"
+"#  mount options=(rw,make-shared) -> **,\n"
+"#  mount options=(rw,make-rshared) -> **,\n"
+"#  mount options=(rw,make-private) -> **,\n"
+"#  mount options=(rw,make-rprivate) -> **,\n"
+"#  mount options=(rw,make-unbindable) -> **,\n"
+"#  mount options=(rw,make-runbindable) -> **,\n"
+"\n"
+"  # allow bind-mounts of anything except /proc, /sys and /dev\n"
+"  mount options=(rw,bind) /[^spd]*{,/**},\n"
+"  mount options=(rw,bind) /d[^e]*{,/**},\n"
+"  mount options=(rw,bind) /de[^v]*{,/**},\n"
+"  mount options=(rw,bind) /dev/.[^l]*{,/**},\n"
+"  mount options=(rw,bind) /dev/.l[^x]*{,/**},\n"
+"  mount options=(rw,bind) /dev/.lx[^c]*{,/**},\n"
+"  mount options=(rw,bind) /dev/.lxc?*{,/**},\n"
+"  mount options=(rw,bind) /dev/[^.]*{,/**},\n"
+"  mount options=(rw,bind) /dev?*{,/**},\n"
+"  mount options=(rw,bind) /p[^r]*{,/**},\n"
+"  mount options=(rw,bind) /pr[^o]*{,/**},\n"
+"  mount options=(rw,bind) /pro[^c]*{,/**},\n"
+"  mount options=(rw,bind) /proc?*{,/**},\n"
+"  mount options=(rw,bind) /s[^y]*{,/**},\n"
+"  mount options=(rw,bind) /sy[^s]*{,/**},\n"
+"  mount options=(rw,bind) /sys?*{,/**},\n"
+"\n"
+"  # allow read-only bind-mounts of anything except /proc, /sys and /dev\n"
+"  mount options=(ro,remount,bind) -> /[^spd]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /d[^e]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /de[^v]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /dev/.[^l]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /dev/.l[^x]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /dev/.lx[^c]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /dev/.lxc?*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /dev/[^.]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /dev?*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /p[^r]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /pr[^o]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /pro[^c]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /proc?*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /s[^y]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /sy[^s]*{,/**},\n"
+"  mount options=(ro,remount,bind) -> /sys?*{,/**},\n"
+"\n"
+"  # allow moving mounts except for /proc, /sys and /dev\n"
+"  mount options=(rw,move) /[^spd]*{,/**},\n"
+"  mount options=(rw,move) /d[^e]*{,/**},\n"
+"  mount options=(rw,move) /de[^v]*{,/**},\n"
+"  mount options=(rw,move) /dev/.[^l]*{,/**},\n"
+"  mount options=(rw,move) /dev/.l[^x]*{,/**},\n"
+"  mount options=(rw,move) /dev/.lx[^c]*{,/**},\n"
+"  mount options=(rw,move) /dev/.lxc?*{,/**},\n"
+"  mount options=(rw,move) /dev/[^.]*{,/**},\n"
+"  mount options=(rw,move) /dev?*{,/**},\n"
+"  mount options=(rw,move) /p[^r]*{,/**},\n"
+"  mount options=(rw,move) /pr[^o]*{,/**},\n"
+"  mount options=(rw,move) /pro[^c]*{,/**},\n"
+"  mount options=(rw,move) /proc?*{,/**},\n"
+"  mount options=(rw,move) /s[^y]*{,/**},\n"
+"  mount options=(rw,move) /sy[^s]*{,/**},\n"
+"  mount options=(rw,move) /sys?*{,/**},\n"
+"\n"
+"  # generated by: lxc-generate-aa-rules.py container-rules.base\n"
+"  deny /proc/sys/[^kn]*{,/**} wklx,\n"
+"  deny /proc/sys/k[^e]*{,/**} wklx,\n"
+"  deny /proc/sys/ke[^r]*{,/**} wklx,\n"
+"  deny /proc/sys/ker[^n]*{,/**} wklx,\n"
+"  deny /proc/sys/kern[^e]*{,/**} wklx,\n"
+"  deny /proc/sys/kerne[^l]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/[^smhd]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/d[^o]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/do[^m]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/dom[^a]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/doma[^i]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/domai[^n]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/domain[^n]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/domainn[^a]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/domainna[^m]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/domainnam[^e]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/domainname?*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/h[^o]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/ho[^s]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/hos[^t]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/host[^n]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/hostn[^a]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/hostna[^m]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/hostnam[^e]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/hostname?*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/m[^s]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/ms[^g]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/msg*/** wklx,\n"
+"  deny /proc/sys/kernel/s[^he]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/se[^m]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/sem*/** wklx,\n"
+"  deny /proc/sys/kernel/sh[^m]*{,/**} wklx,\n"
+"  deny /proc/sys/kernel/shm*/** wklx,\n"
+"  deny /proc/sys/kernel?*{,/**} wklx,\n"
+"  deny /proc/sys/n[^e]*{,/**} wklx,\n"
+"  deny /proc/sys/ne[^t]*{,/**} wklx,\n"
+"  deny /proc/sys/net?*{,/**} wklx,\n"
+"  deny /sys/[^fdck]*{,/**} wklx,\n"
+"  deny /sys/c[^l]*{,/**} wklx,\n"
+"  deny /sys/cl[^a]*{,/**} wklx,\n"
+"  deny /sys/cla[^s]*{,/**} wklx,\n"
+"  deny /sys/clas[^s]*{,/**} wklx,\n"
+"  deny /sys/class/[^n]*{,/**} wklx,\n"
+"  deny /sys/class/n[^e]*{,/**} wklx,\n"
+"  deny /sys/class/ne[^t]*{,/**} wklx,\n"
+"  deny /sys/class/net?*{,/**} wklx,\n"
+"  deny /sys/class?*{,/**} wklx,\n"
+"  deny /sys/d[^e]*{,/**} wklx,\n"
+"  deny /sys/de[^v]*{,/**} wklx,\n"
+"  deny /sys/dev[^i]*{,/**} wklx,\n"
+"  deny /sys/devi[^c]*{,/**} wklx,\n"
+"  deny /sys/devic[^e]*{,/**} wklx,\n"
+"  deny /sys/device[^s]*{,/**} wklx,\n"
+"  deny /sys/devices/[^v]*{,/**} wklx,\n"
+"  deny /sys/devices/v[^i]*{,/**} wklx,\n"
+"  deny /sys/devices/vi[^r]*{,/**} wklx,\n"
+"  deny /sys/devices/vir[^t]*{,/**} wklx,\n"
+"  deny /sys/devices/virt[^u]*{,/**} wklx,\n"
+"  deny /sys/devices/virtu[^a]*{,/**} wklx,\n"
+"  deny /sys/devices/virtua[^l]*{,/**} wklx,\n"
+"  deny /sys/devices/virtual/[^n]*{,/**} wklx,\n"
+"  deny /sys/devices/virtual/n[^e]*{,/**} wklx,\n"
+"  deny /sys/devices/virtual/ne[^t]*{,/**} wklx,\n"
+"  deny /sys/devices/virtual/net?*{,/**} wklx,\n"
+"  deny /sys/devices/virtual?*{,/**} wklx,\n"
+"  deny /sys/devices?*{,/**} wklx,\n"
+"  deny /sys/f[^s]*{,/**} wklx,\n"
+"  deny /sys/fs/[^c]*{,/**} wklx,\n"
+"  deny /sys/fs/c[^g]*{,/**} wklx,\n"
+"  deny /sys/fs/cg[^r]*{,/**} wklx,\n"
+"  deny /sys/fs/cgr[^o]*{,/**} wklx,\n"
+"  deny /sys/fs/cgro[^u]*{,/**} wklx,\n"
+"  deny /sys/fs/cgrou[^p]*{,/**} wklx,\n"
+"  deny /sys/fs/cgroup?*{,/**} wklx,\n"
+"  deny /sys/fs?*{,/**} wklx,\n"
+;
+
+static const char AA_PROFILE_UNIX_SOCKETS[] =
+"\n"
+"  ### Feature: unix\n"
+"  # Allow receive via unix sockets from anywhere\n"
+"  unix (receive),\n"
+"\n"
+"  # Allow all unix in the container\n"
+"  unix peer=(label=@{profile_name}),\n"
+;
+
+static const char AA_PROFILE_CGROUP_NAMESPACES[] =
+"\n"
+"  ### Feature: cgroup namespace\n"
+"  mount fstype=cgroup -> /sys/fs/cgroup/**,\n"
+;
+
+/* '_BASE' because we still need to append generated change_profile rules */
+static const char AA_PROFILE_STACKING_BASE[] =
+"\n"
+"  ### Feature: apparmor stacking\n"
+"  ### Configuration: apparmor profile loading (in namespace)\n"
+"  deny /sys/k[^e]*{,/**} wklx,\n"
+"  deny /sys/ke[^r]*{,/**} wklx,\n"
+"  deny /sys/ker[^n]*{,/**} wklx,\n"
+"  deny /sys/kern[^e]*{,/**} wklx,\n"
+"  deny /sys/kerne[^l]*{,/**} wklx,\n"
+"  deny /sys/kernel/[^s]*{,/**} wklx,\n"
+"  deny /sys/kernel/s[^e]*{,/**} wklx,\n"
+"  deny /sys/kernel/se[^c]*{,/**} wklx,\n"
+"  deny /sys/kernel/sec[^u]*{,/**} wklx,\n"
+"  deny /sys/kernel/secu[^r]*{,/**} wklx,\n"
+"  deny /sys/kernel/secur[^i]*{,/**} wklx,\n"
+"  deny /sys/kernel/securi[^t]*{,/**} wklx,\n"
+"  deny /sys/kernel/securit[^y]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/[^a]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/a[^p]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/ap[^p]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/app[^a]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/appa[^r]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/appar[^m]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/apparm[^o]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/apparmo[^r]*{,/**} wklx,\n"
+"  deny /sys/kernel/security/apparmor?*{,/**} wklx,\n"
+"  deny /sys/kernel/security?*{,/**} wklx,\n"
+"  deny /sys/kernel?*{,/**} wklx,\n"
+;
+
+static const char AA_PROFILE_NO_STACKING[] =
+"\n"
+"  ### Feature: apparmor stacking (not present)\n"
+"  deny /sys/k*{,/**} rwklx,\n"
+;
+
+/* '_BASE' because we need to append change_profile for stacking */
+static const char AA_PROFILE_NESTING_BASE[] =
+"\n"
+"  ### Configuration: nesting\n"
+"  pivot_root,\n"
+"  ptrace,\n"
+"  signal,\n"
+"\n"
+   /* NOTE: See conf.c's "nesting_helpers" for details. */
+"  deny /dev/.lxc/proc/** rw,\n"
+"  deny /dev/.lxc/sys/** rw,\n"
+"\n"
+"  mount /var/lib/lxd/shmounts/ -> /var/lib/lxd/shmounts/,\n"
+"  mount none -> /var/lib/lxd/shmounts/,\n"
+"  mount fstype=proc -> /usr/lib/*/lxc/**,\n"
+"  mount fstype=sysfs -> /usr/lib/*/lxc/**,\n"
+"  mount options=(rw,bind),\n"
+"  mount options=(rw,rbind),\n"
+"  mount options=(rw,make-rshared),\n"
+"\n"
+   /* FIXME: What's the state here on apparmor's side? */
+"  # there doesn't seem to be a way to ask for:\n"
+"  # mount options=(ro,nosuid,nodev,noexec,remount,bind),\n"
+"  # as we always get mount to $cdir/proc/sys with those flags denied\n"
+"  # So allow all mounts until that is straightened out:\n"
+"  mount,\n"
+"  mount options=bind /var/lib/lxd/shmounts/** -> /var/lib/lxd/**,\n"
+;
+
+static const char AA_PROFILE_UNPRIVILEGED[] =
+"\n"
+"  ### Configuration: unprivileged container\n"
+"  pivot_root,\n"
+"\n"
+"  # Allow modifying mount propagation\n"
+"  mount options=(rw,make-slave) -> **,\n"
+"  mount options=(rw,make-rslave) -> **,\n"
+"  mount options=(rw,make-shared) -> **,\n"
+"  mount options=(rw,make-rshared) -> **,\n"
+"  mount options=(rw,make-private) -> **,\n"
+"  mount options=(rw,make-rprivate) -> **,\n"
+"  mount options=(rw,make-unbindable) -> **,\n"
+"  mount options=(rw,make-runbindable) -> **,\n"
+"\n"
+"  # Allow all bind-mounts\n"
+"  mount options=(rw,bind),\n"
+"  mount options=(rw,rbind),\n"
+"\n"
+"  # Allow remounting things read-only\n"
+"  mount options=(ro,remount),\n"
+;
 
 static bool check_mount_feature_enabled(void)
 {
@@ -143,11 +482,6 @@ static bool apparmor_am_unconfined(void)
 	return ret;
 }
 
-/* aa stacking is not yet supported */
-static bool aa_stacking_supported(void) {
-	return false;
-}
-
 static bool aa_needs_transition(char *curlabel)
 {
 	if (!curlabel)
@@ -159,61 +493,471 @@ static bool aa_needs_transition(char *curlabel)
 	return true;
 }
 
+/*
+ * shorten_apparmor_name: Replace an overly long name by a fixed-size hash
+ *
+ * @name : heap-allocated string; ownership passes to this function
+ *
+ * When strlen(name) + 7 exceeds 253 the buffer is reallocated and filled
+ * with the 16 hex digits of the 64 bit FNV-1a hash of the original name.
+ * Shorter names are handed back untouched.
+ *
+ * Returns the (possibly reallocated) string, to be freed by the caller.
+ */
+static char *shorten_apparmor_name(char *name)
+{
+	const size_t name_len = strlen(name);
+	uint64_t digest;
+
+	if (name_len + 7 <= 253)
+		return name;
+
+	digest = fnv_64a_buf(name, name_len, FNV1A_64_INIT);
+	name = must_realloc(name, 16 + 1);
+	(void)snprintf(name, 16 + 1, "%016" PRIx64, digest);
+
+	return name;
+}
+
+/*
+ * Build "lxc-<ctname>" and pass it through shorten_apparmor_name().
+ * The returned string is heap-allocated and must be freed by the caller.
+ */
+static char *apparmor_profile_short(const char *ctname)
+{
+	char *short_name = must_concat("lxc-", ctname, NULL);
+
+	return shorten_apparmor_name(short_name);
+}
+
+/*
+ * Build "lxc-<ctname>_<LXCPATH>" (with literal angle brackets around the
+ * path) and pass it through shorten_apparmor_name().
+ * The returned string is heap-allocated and must be freed by the caller.
+ */
+static char *apparmor_profile_full(const char *ctname)
+{
+	char *full_name;
+
+	full_name = must_concat("lxc-", ctname, "_<", LXCPATH, ">", NULL);
+	return shorten_apparmor_name(full_name);
+}
+
+/*
+ * Build "lxc-<ctname>_<sanitized_lxcpath>" (with literal angle brackets
+ * around the sanitized path) and pass it through shorten_apparmor_name().
+ * The returned string is heap-allocated and must be freed by the caller.
+ */
+static char *apparmor_namespace(const char *ctname)
+{
+	char *ns_name;
+
+	ns_name = must_concat("lxc-", ctname, "_<", sanitized_lxcpath, ">", NULL);
+	return shorten_apparmor_name(ns_name);
+}
+
+/*
+ * check_apparmor_parser_version: Query "apparmor_parser --version"
+ *
+ * Parses the "major.minor.micro" triple printed by the parser and records in
+ * aa_supports_unix whether the version is at least 2.10.95.
+ *
+ * Returns true when the parser ran successfully and its output could be
+ * parsed, false otherwise (an error is logged in that case).
+ */
+static bool check_apparmor_parser_version()
+{
+	int major, minor, micro;
+	int fields, status;
+	struct lxc_popen_FILE *parser;
+
+	parser = lxc_popen("apparmor_parser --version");
+	if (!parser) {
+		SYSERROR("Failed to run apparmor_parser");
+		return false;
+	}
+
+	fields = fscanf(parser->f, "AppArmor parser version %d.%d.%d",
+	                &major, &minor, &micro);
+	if (fields != 3) {
+		lxc_pclose(parser);
+		ERROR("failed to parse version output of apparmor_parser");
+		return false;
+	}
+
+	status = lxc_pclose(parser);
+	if (status < 0) {
+		SYSERROR("Error waiting for child process");
+		return false;
+	}
+	if (status != 0) {
+		ERROR("'apparmor_parser --version' executed with an error status");
+		return false;
+	}
+
+	/* Compare against 2.10.95 one component at a time. */
+	if (major != 2)
+		aa_supports_unix = major > 2;
+	else if (minor != 10)
+		aa_supports_unix = minor > 10;
+	else
+		aa_supports_unix = micro >= 95;
+
+	return true;
+}
+
+/*
+ * file_is_yes: Check whether a file starts with the line "yes"
+ *
+ * @path : file to read
+ *
+ * Returns true only if the file could be opened, at least four bytes were
+ * read and they are exactly "yes\n"; false in every other case.
+ */
+static bool file_is_yes(const char *path)
+{
+	char content[8]; /* we expect "yes" or "no" */
+	ssize_t nread;
+	FILE *stream = fopen(path, "r");
+
+	if (!stream)
+		return false;
+
+	nread = fread(content, 1, sizeof(content), stream);
+	fclose(stream);
+
+	if (nread < 4)
+		return false;
+
+	return strncmp(content, "yes\n", 4) == 0;
+}
+
+/*
+ * apparmor_can_stack: Check whether the kernel advertises profile stacking
+ *
+ * Stacking is considered usable when the "domain/stack" feature file reads
+ * "yes" and the "domain/version" feature file holds a "major.minor" version
+ * of at least 1.2.
+ *
+ * Returns true if both conditions hold, false otherwise (including when
+ * either file cannot be read or parsed).
+ */
+static bool apparmor_can_stack()
+{
+	int major, minor, scanned;
+	FILE *f;
+
+	if (!file_is_yes("/sys/kernel/security/apparmor/features/domain/stack"))
+		return false;
+
+	f = fopen("/sys/kernel/security/apparmor/features/domain/version", "r");
+	if (!f)
+		return false;
+
+	scanned = fscanf(f, "%d.%d", &major, &minor);
+	fclose(f);
+	if (scanned != 2)
+		return false;
+
+	/* Compare as a version: any major above 1 qualifies whatever its
+	 * minor is; only for major 1 does the minor have to be >= 2.
+	 */
+	return major > 1 || (major == 1 && minor >= 2);
+}
+
+/*
+ * make_sanitized_lxcpath: Fill the global sanitized_lxcpath buffer
+ *
+ * Copies LXCPATH without its leading slashes and with every remaining '/'
+ * replaced by '-', then nul-terminates the result.
+ */
+static void make_sanitized_lxcpath()
+{
+	const char *src = LXCPATH;
+	char *dst = sanitized_lxcpath;
+
+	while (*src == '/')
+		src++;
+
+	for (; *src; src++, dst++)
+		*dst = (*src == '/') ? '-' : *src;
+	*dst = 0;
+}
+
+/*
+ * must_append_sized_full: Grow a buffer and append raw bytes to it
+ *
+ * @buf            : in/out pointer to the heap buffer (reallocated here)
+ * @bufsz          : in/out number of bytes currently used in *buf
+ * @data           : bytes to append
+ * @size           : number of bytes to copy from @data
+ * @append_newline : when true, one '\n' byte is added after the data
+ *
+ * No nul terminator is added; callers that need one include it in @size.
+ */
+static void must_append_sized_full(char **buf, size_t *bufsz, const char *data,
+				   size_t size, bool append_newline)
+{
+	const size_t used = *bufsz;
+	const size_t grown = used + size + (append_newline ? 1 : 0);
+	char *out = must_realloc(*buf, grown);
+
+	memcpy(out + used, data, size);
+	if (append_newline)
+		out[grown - 1] = '\n';
+
+	*buf = out;
+	*bufsz = grown;
+}
+
+/*
+ * must_append_sized: Append @size bytes of @data to *buf without a trailing
+ * newline. Thin wrapper around must_append_sized_full(); see there for the
+ * meaning of @buf and @bufsz.
+ */
+static void must_append_sized(char **buf, size_t *bufsz, const char *data, size_t size)
+{
+	/* No "return <expr>;" here: ISO C does not allow a return statement
+	 * with an expression in a function returning void.
+	 */
+	must_append_sized_full(buf, bufsz, data, size, false);
+}
+
+/* A container counts as privileged when its configuration has no id map. */
+static bool is_privileged(struct lxc_conf *conf)
+{
+	bool no_id_mapping = lxc_list_empty(&conf->id_map);
+
+	return no_id_mapping;
+}
+
+/*
+ * get_apparmor_profile_content: Assemble the text of the generated profile
+ *
+ * @conf : container configuration; its name, nesting flag, id map and raw
+ *         apparmor lines are read
+ *
+ * The profile is named after apparmor_profile_full(conf->name) and is built
+ * from the static fragments of this file. Optional fragments are chosen
+ * from the detected features and the configuration, every entry of
+ * conf->lsm_aa_raw is appended on a line of its own, and the profile block
+ * is closed at the end.
+ *
+ * Returns a newly allocated, nul-terminated string owned by the caller.
+ */
+static char *get_apparmor_profile_content(struct lxc_conf *conf)
+{
+	char *text, *fragment;
+	char *full_name = apparmor_profile_full(conf->name);
+	size_t len;
+	struct lxc_list *cur;
+	/* Same condition under which make_apparmor_namespace() creates a
+	 * policy namespace for the container.
+	 */
+	bool own_namespace = aa_can_stack && !aa_is_stacked;
+
+	text = must_concat("#include <tunables/global>\n"
+	                   "profile \"", full_name,
+	                   "\" flags=(attach_disconnected,mediate_deleted) {\n",
+	                   NULL);
+	len = strlen(text);
+
+	must_append_sized(&text, &len, AA_PROFILE_BASE,
+	                  sizeof(AA_PROFILE_BASE) - 1);
+
+	if (aa_supports_unix)
+		must_append_sized(&text, &len, AA_PROFILE_UNIX_SOCKETS,
+		                  sizeof(AA_PROFILE_UNIX_SOCKETS) - 1);
+
+	if (file_exists("/proc/self/ns/cgroup"))
+		must_append_sized(&text, &len, AA_PROFILE_CGROUP_NAMESPACES,
+		                  sizeof(AA_PROFILE_CGROUP_NAMESPACES) - 1);
+
+	if (!own_namespace) {
+		must_append_sized(&text, &len, AA_PROFILE_NO_STACKING,
+		                  sizeof(AA_PROFILE_NO_STACKING) - 1);
+	} else {
+		char *ns;
+
+		must_append_sized(&text, &len, AA_PROFILE_STACKING_BASE,
+		                  sizeof(AA_PROFILE_STACKING_BASE) - 1);
+
+		/* Allow transitions into the container's own namespace. */
+		ns = apparmor_namespace(conf->name);
+		fragment = must_concat("  change_profile -> \":", ns, ":*\",\n"
+		                       "  change_profile -> \":", ns, "://*\",\n",
+		                       NULL);
+		free(ns);
+		must_append_sized(&text, &len, fragment, strlen(fragment));
+		free(fragment);
+	}
+
+	if (conf->lsm_aa_allow_nesting) {
+		must_append_sized(&text, &len, AA_PROFILE_NESTING_BASE,
+		                  sizeof(AA_PROFILE_NESTING_BASE) - 1);
+
+		/* Without a namespace of its own, the profile is allowed to
+		 * transition to itself instead.
+		 */
+		if (!own_namespace) {
+			fragment = must_concat("  change_profile -> \"",
+			                       full_name, "\",\n",
+			                       NULL);
+			must_append_sized(&text, &len, fragment, strlen(fragment));
+			free(fragment);
+		}
+	}
+
+	if (!is_privileged(conf) || am_host_unpriv())
+		must_append_sized(&text, &len, AA_PROFILE_UNPRIVILEGED,
+		                  sizeof(AA_PROFILE_UNPRIVILEGED) - 1);
+
+	lxc_list_for_each(cur, &conf->lsm_aa_raw) {
+		const char *raw = cur->elem;
+
+		must_append_sized_full(&text, &len, raw, strlen(raw), true);
+	}
+
+	/* sizeof() counts the terminating \0 byte, which ends the string */
+	must_append_sized(&text, &len, "}\n", sizeof("}\n"));
+
+	free(full_name);
+	return text;
+}
+
+/*
+ * make_apparmor_profile_path: Build the on-disk path of a generated profile
+ *
+ * @ctname         : container name, used only when @cached_profile is NULL
+ * @cached_profile : already computed short profile name, or NULL to have it
+ *                   computed from @ctname
+ *
+ * Returns a newly allocated path below aa_profile_path, owned by the caller.
+ */
+static char *make_apparmor_profile_path(const char *ctname, const char *cached_profile)
+{
+	char *path, *profile = NULL;
+
+	if (!cached_profile)
+		cached_profile = profile = apparmor_profile_short(ctname);
+
+	/* Use cached_profile, not profile: the latter stays NULL whenever the
+	 * caller supplied a name and would cut the path short at the
+	 * directory.
+	 */
+	path = must_make_path(aa_profile_path, cached_profile, NULL);
+	free(profile);
+	return path;
+}
+
+/*
+ * Build the securityfs directory path of the container's policy namespace.
+ * The returned string is heap-allocated and must be freed by the caller.
+ */
+static char *make_apparmor_namespace_path(const char *ctname)
+{
+	char *ns_name = apparmor_namespace(ctname);
+	char *ns_path;
+
+	ns_path = must_make_path("/sys/kernel/security/apparmor/policy/namespaces", ns_name, NULL);
+	free(ns_name);
+
+	return ns_path;
+}
+
+/*
+ * make_apparmor_namespace: Create the container's policy namespace
+ *
+ * Nothing is done (and true is returned) unless stacking is available and
+ * we are not already stacked. An already existing namespace directory is
+ * not treated as an error.
+ *
+ * Returns true on success, false if the directory could not be created.
+ */
+static bool make_apparmor_namespace(struct lxc_conf *conf)
+{
+	bool created = true;
+	char *ns_path;
+
+	if (!aa_can_stack || aa_is_stacked)
+		return true;
+
+	ns_path = make_apparmor_namespace_path(conf->name);
+
+	errno = 0;
+	if (mkdir(ns_path, 0755) < 0 && errno != EEXIST) {
+		SYSERROR("error creating apparmor namespace: %s", ns_path);
+		created = false;
+	}
+
+	free(ns_path);
+	return created;
+}
+
+/*
+ * Remove the container's policy namespace directory. A failure is only
+ * logged.
+ */
+static void remove_apparmor_namespace(struct lxc_conf *conf)
+{
+	char *ns_path = make_apparmor_namespace_path(conf->name);
+	int rc = rmdir(ns_path);
+
+	if (rc != 0)
+		SYSERROR("Error removing apparmor namespace");
+
+	free(ns_path);
+}
+
+/* Arguments handed to apparmor_parser_exec() through its void pointer. */
+struct apparmor_parser_args {
+	char cmd;	/* option letter placed right after '-' in the flag argument */
+	char *file;	/* path passed as the last apparmor_parser argument */
+};
+
+/*
+ * apparmor_parser_exec: exec apparmor_parser for one profile file
+ *
+ * @data : pointer to a struct apparmor_parser_args
+ *
+ * Runs "apparmor_parser -<cmd>WL <aa_cache_path> <file>". Only returns
+ * (with -1) if the exec itself failed.
+ */
+static int apparmor_parser_exec(void *data)
+{
+	struct apparmor_parser_args *parser_args = data;
+	char flags[5] = "-?WL";
+
+	/* slot 1 carries the requested command letter */
+	flags[1] = parser_args->cmd;
+
+	execlp("apparmor_parser", "apparmor_parser", flags, aa_cache_path, parser_args->file, NULL);
+	return -1;
+}
+
+/*
+ * run_apparmor_parser: Run apparmor_parser on the container's profile file
+ *
+ * @command : command letter forwarded to apparmor_parser_exec()
+ * @conf    : container configuration; conf->name selects the profile file
+ *
+ * Returns 0 on success, -1 if run_command() reported a failure (the
+ * captured output is logged in that case).
+ */
+static int run_apparmor_parser(char command, struct lxc_conf *conf)
+{
+	char output[MAXPATHLEN];
+	int rc = 0;
+	struct apparmor_parser_args args = {
+		.cmd = command,
+		.file = make_apparmor_profile_path(conf->name, NULL),
+	};
+
+	if (run_command(output, sizeof(output), apparmor_parser_exec, (void*)&args) < 0) {
+		ERROR("Failed to run apparmor_parser on \"%s\": %s", args.file, output);
+		rc = -1;
+	}
+
+	free(args.file);
+	return rc;
+}
+
+/*
+ * remove_apparmor_profile: Delete the generated profile and its cache entry
+ *
+ * @conf : container configuration; only conf->name is used
+ *
+ * The profile is a regular file (load_apparmor_profile() creates it with
+ * lxc_write_to_file()), so it has to be removed with unlink(); rmdir() can
+ * never succeed on it. The cache entry is treated the same way.
+ * NOTE(review): the cache entry is assumed to be a regular file named like
+ * the profile directly below aa_cache_path - confirm against the
+ * apparmor_parser version in use.
+ */
+static void remove_apparmor_profile(struct lxc_conf *conf)
+{
+	char *profile, *path;
+
+	/* It's ok if these deletes fail: if the container was never started,
+	 * we'll have never written a profile or cached it.
+	 */
+	profile = apparmor_profile_short(conf->name);
+
+	path = must_make_path(aa_cache_path, profile, NULL);
+	if (path) {
+		(void)unlink(path);
+		free(path);
+	}
+
+	/* Same location load_apparmor_profile() writes the profile to. */
+	path = must_make_path(aa_profile_path, profile, NULL);
+	if (path) {
+		(void)unlink(path);
+		free(path);
+	}
+
+	free(profile);
+}
+
+/*
+ * load_apparmor_profile: Write out and load the generated profile
+ *
+ * @conf : container configuration
+ *
+ * Creates the policy namespace if needed, compares the freshly generated
+ * profile text with the file already on disk, rewrites the file only when
+ * they differ, and finally runs apparmor_parser with AA_CMD_LOAD.
+ *
+ * Returns 0 on success or when aa_admin is not set (nothing is done then),
+ * non-zero on failure.
+ */
+static int load_apparmor_profile(struct lxc_conf *conf)
+{
+	int ret = -1;
+	size_t content_len, old_len = 0;
+	char *profile_path = NULL, *old_content = NULL, *new_content = NULL;
+	int profile_fd = -1;
+	struct stat profile_sb;
+
+	if (!aa_admin)
+		return 0;
+
+	if (!make_apparmor_namespace(conf))
+		return -1;
+
+	/* In order to avoid forcing a profile parse (potentially slow) on
+	 * every container start, let's use apparmor's binary policy cache,
+	 * which checks mtime of the files to figure out if the policy needs to
+	 * be regenerated.
+	 *
+	 * Since it uses mtimes, we shouldn't just always write out our local
+	 * apparmor template; instead we should check to see whether the
+	 * template is the same as ours. If it isn't we should write our
+	 * version out so that the new changes are reflected and we definitely
+	 * force a recompile.
+	 */
+
+	profile_path = make_apparmor_profile_path(conf->name, NULL);
+	profile_fd = open(profile_path, O_RDONLY);
+	if (profile_fd >= 0) {
+		if (fstat(profile_fd, &profile_sb) < 0) {
+			SYSERROR("Error accessing old profile from %s",
+			         profile_path);
+			goto out;
+		}
+		old_len = profile_sb.st_size;
+		old_content = lxc_strmmap(NULL, old_len, PROT_READ,
+		                          MAP_PRIVATE, profile_fd, 0);
+		/* NOTE(review): failure is tested against NULL here; confirm
+		 * that lxc_strmmap() does not report errors as MAP_FAILED.
+		 */
+		if (!old_content) {
+			SYSERROR("Failed to mmap old profile from %s",
+			         profile_path);
+			goto out;
+		}
+	} else if (errno != ENOENT) {
+		/* a missing file just means no profile was written yet */
+		SYSERROR("Error reading old profile from %s", profile_path);
+		goto out;
+	}
+
+	new_content = get_apparmor_profile_content(conf);
+	if (!new_content)
+		goto out;
+
+	content_len = strlen(new_content);
+
+	/* Only touch the file when there is no old profile or it differs. */
+	if (!old_content || old_len != content_len || memcmp(old_content, new_content, content_len) != 0) {
+		if (mkdir_p(aa_cache_path, 0755) != 0) {
+			SYSERROR("Error creating apparmor profile cache directory %s", aa_cache_path);
+			goto out;
+		}
+
+		(void)mkdir_p(aa_profile_path, 0755);
+		if (lxc_write_to_file(profile_path, new_content, content_len, false, 0600) == -1) {
+			SYSERROR("Error writing profile to %s", profile_path);
+			goto out;
+		}
+	}
+
+	ret = run_apparmor_parser(AA_CMD_LOAD, conf);
+	if (ret != 0)
+		goto out_remove_profile;
+
+	goto out_ok;
+
+	/* The labels below fall through into each other: a failed load removes
+	 * the profile files, every failure removes the namespace, and all
+	 * paths release the local resources.
+	 */
+out_remove_profile:
+	remove_apparmor_profile(conf);
+out:
+	remove_apparmor_namespace(conf);
+out_ok:
+	if (profile_fd >= 0) {
+		if (old_content)
+			lxc_strmunmap(old_content, old_len);
+		close(profile_fd);
+	}
+	free(profile_path);
+	free(new_content);
+	return ret;
+}
+
 /*
- * apparmor_process_label_set: Set AppArmor process profile
- *
- * @label   : the profile to set
- * @conf    : the container configuration to use if @label is NULL
- * @default : use the default profile if @label is NULL
- * @on_exec : this is ignored.  Apparmor profile will be changed immediately
- *
- * Returns 0 on success, < 0 on failure
- *
- * Notes: This relies on /proc being available.
+ * Ensure that the container's policy namespace and profile are unloaded to
+ * free kernel memory, then try to remove the profile from disk and cache.
  */
-static int apparmor_process_label_set(const char *inlabel, struct lxc_conf *conf,
-				      bool use_default, bool on_exec)
+static void apparmor_cleanup(struct lxc_conf *conf)
 {
-	int label_fd, ret;
-	pid_t tid;
-	const char *label = inlabel ? inlabel : conf->lsm_aa_profile;
-	char *curlabel;
+	if (!aa_admin)
+		return;
 
-	if (!aa_enabled)
+	remove_apparmor_namespace(conf);
+	(void)run_apparmor_parser(AA_CMD_UNLOAD, conf);
+
+	remove_apparmor_profile(conf);
+}
+
+static int apparmor_prepare(struct lxc_conf *conf)
+{
+	int ret = -1;
+	const char *label;
+	char *curlabel = NULL, *genlabel = NULL;
+
+	if(!aa_enabled)
 		return 0;
 
+	label = conf->lsm_aa_profile;
+
 	/* user may request that we just ignore apparmor */
 	if (label && strcmp(label, AA_UNCHANGED) == 0) {
 		INFO("apparmor profile unchanged per user request");
 		return 0;
 	}
 
+	if (label && strcmp(label, AA_GENERATED) == 0) {
+		/* auto-generate profile based on available/requested security features */
+		if (load_apparmor_profile(conf) != 0) {
+			ERROR("failed to load generated apparmor profile");
+			goto out;
+		}
+		genlabel = apparmor_profile_full(conf->name);
+		if (!genlabel) {
+			ERROR("failed to build apparmor profile name");
+			goto out;
+		}
+		if (aa_can_stack && !aa_is_stacked) {
+			char *namespace = apparmor_namespace(conf->name);
+			size_t llen = strlen(genlabel);
+			must_append_sized(&genlabel, &llen, "//&:", sizeof("//&:")-1);
+			must_append_sized(&genlabel, &llen, namespace, strlen(namespace));
+			must_append_sized(&genlabel, &llen, ":", sizeof(":")); /* with the nul byte */
+			free(namespace);
+		}
+		label = genlabel;
+	}
+
 	curlabel = apparmor_process_label_get(lxc_raw_getpid());
 
-	if (!aa_stacking_supported() && aa_needs_transition(curlabel)) {
+	if (!aa_can_stack && aa_needs_transition(curlabel)) {
 		/* we're already confined, and stacking isn't supported */
 
 		if (!label || strcmp(curlabel, label) == 0) {
 			/* no change requested */
-			free(curlabel);
-			return 0;
+			ret = 0;
+			goto out;
 		}
 
 		ERROR("already apparmor confined, but new label requested.");
-		free(curlabel);
-		return -1;
+		goto out;
 	}
-	free(curlabel);
 
 	if (!label) {
-		if (use_default) {
-			if (cgns_supported())
-				label = AA_DEF_PROFILE_CGNS;
-			else
-				label = AA_DEF_PROFILE;
-		}
+		if (cgns_supported())
+			label = AA_DEF_PROFILE_CGNS;
 		else
-			label = "unconfined";
+			label = AA_DEF_PROFILE;
 	}
 
 	if (!check_mount_feature_enabled() && strcmp(label, "unconfined") != 0) {
@@ -222,31 +966,80 @@ static int apparmor_process_label_set(const char *inlabel, struct lxc_conf *conf
 			ERROR("If you really want to start this container, set");
 			ERROR("lxc.apparmor.allow_incomplete = 1");
 			ERROR("in your container configuration file");
-			return -1;
+			goto out;
 		}
 	}
 
+	conf->lsm_aa_profile_computed = must_copy_string(label);
+	ret = 0;
+
+out:
+	if (genlabel) {
+		free(genlabel);
+		if (ret != 0)
+			apparmor_cleanup(conf);
+	}
+	free(curlabel);
+	return ret;
+}
+
+/*
+ * apparmor_process_label_set: Set AppArmor process profile
+ *
+ * @label   : the profile to set
+ * @conf    : the container configuration to use if @label is NULL
+ * @default : use the default profile if @label is NULL
+ * @on_exec : this is ignored.  Apparmor profile will be changed immediately
+ *
+ * Returns 0 on success, < 0 on failure
+ *
+ * Notes: This relies on /proc being available.
+ */
+static int apparmor_process_label_set(const char *inlabel, struct lxc_conf *conf,
+				      bool on_exec)
+{
+	int label_fd, ret = -1;
+	pid_t tid;
+	const char *label = inlabel ? inlabel : conf->lsm_aa_profile_computed;
+
+	if (!aa_enabled)
+		return 0;
+
+	if (!label) {
+		ERROR("lsm wasn't prepared");
+		return -1;
+	}
+
+	/* user may request that we just ignore apparmor */
+	if (strcmp(label, AA_UNCHANGED) == 0) {
+		INFO("apparmor profile unchanged per user request");
+		return 0;
+	}
 
 	if (strcmp(label, "unconfined") == 0 && apparmor_am_unconfined()) {
 		INFO("apparmor profile unchanged");
-		return 0;
+		ret = 0;
+		goto out;
 	}
 	tid = lxc_raw_gettid();
 	label_fd = lsm_process_label_fd_get(tid, on_exec);
 	if (label_fd < 0) {
 		SYSERROR("Failed to change apparmor profile to %s", label);
-		return -1;
+		goto out;
 	}
 
 	ret = lsm_process_label_set_at(label_fd, label, on_exec);
 	close(label_fd);
 	if (ret < 0) {
 		ERROR("Failed to change apparmor profile to %s", label);
-		return -1;
+		goto out;
 	}
 
 	INFO("Changed apparmor profile to %s", label);
-	return 0;
+	ret = 0;
+
+out:
+	return ret;
 }
 
 static struct lsm_drv apparmor_drv = {
@@ -254,12 +1047,47 @@ static struct lsm_drv apparmor_drv = {
 	.enabled           = apparmor_enabled,
 	.process_label_get = apparmor_process_label_get,
 	.process_label_set = apparmor_process_label_set,
+	.prepare           = apparmor_prepare,
+	.cleanup           = apparmor_cleanup,
 };
 
 struct lsm_drv *lsm_apparmor_drv_init(void)
 {
+	bool have_mac_admin = false;
 	if (!apparmor_enabled())
 		return NULL;
+
+	if (snprintf(aa_cache_path, sizeof(aa_cache_path), "%s/.apparmor/cache", LXCPATH) >= sizeof(aa_cache_path)) {
+		ERROR("Failed to build apparmor cache path");
+		return NULL;
+	}
+	if (snprintf(aa_profile_path, sizeof(aa_profile_path), "%s/.apparmor/profiles", LXCPATH) >= sizeof(aa_profile_path)) {
+		ERROR("Failed to build apparmor profile path");
+		return NULL;
+	}
+
+	aa_can_stack = apparmor_can_stack();
+	if (aa_can_stack)
+		aa_is_stacked = file_is_yes("/sys/kernel/security/apparmor/.ns_stacked");
+
+	if (!check_apparmor_parser_version())
+		return NULL;
+
+	make_sanitized_lxcpath();
+
+	#if HAVE_LIBCAP
+	have_mac_admin = lxc_proc_cap_is_set(CAP_SETGID, CAP_EFFECTIVE);
+	#endif
+
+	aa_admin = true;
+	if (!have_mac_admin) {
+		WARN("Per-container AppArmor profiles are disabled because the mac_admin capability is missing");
+	} else if (!am_host_unpriv() || !aa_is_stacked) {
+		WARN("Per-container AppArmor profiles are disabled because LXC is running in an unprivileged container without stacking");
+	} else {
+		aa_admin = true;
+	}
+
 	aa_enabled = 1;
 	return &apparmor_drv;
 }
diff --git a/src/lxc/lsm/lsm.c b/src/lxc/lsm/lsm.c
index 8d7de2dbe..f0b575a17 100644
--- a/src/lxc/lsm/lsm.c
+++ b/src/lxc/lsm/lsm.c
@@ -177,11 +177,33 @@ int lsm_process_label_set_at(int label_fd, const char *label, bool on_exec)
 }
 
 int lsm_process_label_set(const char *label, struct lxc_conf *conf,
-			  bool use_default, bool on_exec)
+			  bool on_exec)
 {
 	if (!drv) {
 		ERROR("LSM driver not inited");
 		return -1;
 	}
-	return drv->process_label_set(label, conf, use_default, on_exec);
+	return drv->process_label_set(label, conf, on_exec);
+}
+
+/*
+ * lsm_process_prepare: Invoke the active LSM driver's prepare hook
+ *
+ * Returns the hook's result, or 0 when no driver is initialized (an error
+ * is logged) or the driver has no prepare hook.
+ */
+int lsm_process_prepare(struct lxc_conf *conf)
+{
+	if (drv)
+		return drv->prepare ? drv->prepare(conf) : 0;
+
+	ERROR("LSM driver not inited");
+	return 0;
+}
+
+/*
+ * lsm_process_cleanup: Invoke the active LSM driver's cleanup hook, if any.
+ * Logs an error and does nothing else when no driver is initialized.
+ */
+void lsm_process_cleanup(struct lxc_conf *conf)
+{
+	if (!drv) {
+		ERROR("LSM driver not inited");
+	} else if (drv->cleanup) {
+		drv->cleanup(conf);
+	}
 }
diff --git a/src/lxc/lsm/lsm.h b/src/lxc/lsm/lsm.h
index cafb2ac7c..5361a24f8 100644
--- a/src/lxc/lsm/lsm.h
+++ b/src/lxc/lsm/lsm.h
@@ -38,17 +38,21 @@ struct lsm_drv {
 	int (*enabled)(void);
 	char *(*process_label_get)(pid_t pid);
 	int (*process_label_set)(const char *label, struct lxc_conf *conf,
-				 bool use_default, bool on_exec);
+				 bool on_exec);
+	int (*prepare)(struct lxc_conf *conf);
+	void (*cleanup)(struct lxc_conf *conf);
 };
 
 extern void lsm_init(void);
 extern int lsm_enabled(void);
 extern const char *lsm_name(void);
 extern char *lsm_process_label_get(pid_t pid);
+extern int lsm_process_prepare(struct lxc_conf *conf);
 extern int lsm_process_label_set(const char *label, struct lxc_conf *conf,
-				 bool use_default, bool on_exec);
+				 bool on_exec);
 extern int lsm_process_label_fd_get(pid_t pid, bool on_exec);
 extern int lsm_process_label_set_at(int label_fd, const char *label,
 				    bool on_exec);
+extern void lsm_process_cleanup(struct lxc_conf *conf);
 
 #endif /* __LXC_LSM_H */
diff --git a/src/lxc/lsm/nop.c b/src/lxc/lsm/nop.c
index 7bb8121b8..9397f2bfb 100644
--- a/src/lxc/lsm/nop.c
+++ b/src/lxc/lsm/nop.c
@@ -30,7 +30,7 @@ static char *nop_process_label_get(pid_t pid)
 }
 
 static int nop_process_label_set(const char *label, struct lxc_conf *conf,
-				 bool use_default, bool on_exec)
+				 bool on_exec)
 {
 	return 0;
 }
diff --git a/src/lxc/lsm/selinux.c b/src/lxc/lsm/selinux.c
index c88c18e3d..9f7b7bc31 100644
--- a/src/lxc/lsm/selinux.c
+++ b/src/lxc/lsm/selinux.c
@@ -75,15 +75,13 @@ static char *selinux_process_label_get(pid_t pid)
  * Notes: This relies on /proc being available.
  */
 static int selinux_process_label_set(const char *inlabel, struct lxc_conf *conf,
-				     bool use_default, bool on_exec)
+				     bool on_exec)
 {
 	int ret;
 	const char *label;
 
 	label = inlabel ? inlabel : conf->lsm_se_context;
 	if (!label) {
-		if (!use_default)
-			return -EINVAL;
 
 		label = DEFAULT_LABEL;
 	}
diff --git a/src/lxc/start.c b/src/lxc/start.c
index 180a37ab4..863f047eb 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -854,9 +854,23 @@ int lxc_init(const char *name, struct lxc_handler *handler)
 	}
 	TRACE("Initialized cgroup driver");
 
+	ret = lsm_process_prepare(conf);
+	if (ret < 0) {
+		ERROR("Failed to initialize LSM");
+		goto out_destroy_cgroups;
+	}
+	TRACE("Initialized LSM");
+
 	INFO("Container \"%s\" is initialized", name);
 	return 0;
 
+#if 0
+/* For when more initializations are added: */
+out_lsm_cleanup:
+	lsm_process_cleanup(conf);
+#endif
+out_destroy_cgroups:
+	handler->cgroup_ops->destroy(handler->cgroup_ops, handler);
 out_delete_terminal:
 	lxc_terminal_delete(&handler->conf->console);
 out_restore_sigmask:
@@ -943,6 +957,8 @@ void lxc_fini(const char *name, struct lxc_handler *handler)
 	while (namespace_count--)
 		free(namespaces[namespace_count]);
 
+	lsm_process_cleanup(handler->conf);
+
 	cgroup_ops->destroy(cgroup_ops, handler);
 	cgroup_exit(cgroup_ops);
 
@@ -1222,7 +1238,7 @@ static int do_start(void *data)
 	}
 
 	/* Set the label to change to when we exec(2) the container's init. */
-	ret = lsm_process_label_set(NULL, handler->conf, 1, 1);
+	ret = lsm_process_label_set(NULL, handler->conf, 1);
 	if (ret < 0)
 		goto out_warn_father;
 


More information about the lxc-devel mailing list