[lxc-devel] [PATCH 1/2] Modify lxc_attach and lxc_init to use clone instead of fork.

Greg Kurz gkurz at fr.ibm.com
Wed Aug 24 22:07:06 UTC 2011


On Wed, 2011-08-24 at 14:17 +0400, Vladimir Smirnov wrote:
> Clone allows more flexible control. Currently, if there is any inherited fd,
> lxc-start exits with error. With clone it's possible not to pass open fd's to childs.
> 

Hmm... when it comes to file descriptors, you have two flavours:
- CLONE_FILES: the child will share the file descriptor table with the
parent. If any of the two tasks open or close a fd, the other will also
see the change.
- without CLONE_FILES: the child will get a copy of the parent's file
descriptor table. Then, each task may open or close a fd without effect
for the other.

As you see, the child shares or at least inherits a copy of all the fd's
with its parent. Honestly, I don't know any clone flag that would
prevent the child to be passed a fd...

> Signed-off-by: Vladimir Smirnov <civil at yandex-team.ru>
> ---
>  src/lxc/lxc_attach.c |   79 +++++++++++++++++++++++++++++---------------------
>  src/lxc/lxc_init.c   |   46 ++++++++++++++++++-----------
>  2 files changed, 75 insertions(+), 50 deletions(-)
> 
> diff --git a/src/lxc/lxc_attach.c b/src/lxc/lxc_attach.c

Why do you need to patch lxc-attach ? AFAIK it doesn't check for
inherited file descriptors...

> index ed3d5a4..930865e 100644
> --- a/src/lxc/lxc_attach.c
> +++ b/src/lxc/lxc_attach.c
> @@ -20,6 +20,7 @@
>   * License along with this library; if not, write to the Free Software
>   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
>   */
> +#define STACKSIZE 16384
> 
>  #define _GNU_SOURCE
>  #include <unistd.h>
> @@ -29,6 +30,7 @@
>  #include <sys/param.h>
>  #include <sys/types.h>
>  #include <sys/wait.h>
> +#include <sched.h>
> 
>  #include "commands.h"
>  #include "arguments.h"
> @@ -56,14 +58,48 @@ Options :\n\
>  	.checker  = NULL,
>  };
> 
> +static char *envp_global;
> +
> +int child_main()
> +{
> +	struct passwd *passwd;
> +	uid_t uid;
> +
> +	if (my_args.argc) {
> +		execve(my_args.argv[0], my_args.argv, envp_global);
> +		SYSERROR("failed to exec '%s'", my_args.argv[0]);
> +		return -1;
> +	}
> +
> +	uid = getuid();
> +
> +	passwd = getpwuid(uid);
> +	if (!passwd) {
> +		SYSERROR("failed to get passwd "		\
> +			 "entry for uid '%d'", uid);
> +		return -1;
> +	}
> +
> +	{
> +		char *const args[] = {
> +			passwd->pw_shell,
> +			NULL,
> +		};
> +
> +		execve(args[0], args, envp_global);
> +		SYSERROR("failed to exec '%s'", args[0]);
> +		return -1;
> +	}
> +}
> +
>  int main(int argc, char *argv[], char *envp[])
>  {
>  	int ret;
>  	pid_t pid;
> -	struct passwd *passwd;
> -	uid_t uid;
>  	char *curdir;
> 
> +	envp_global = envp;
> +
>  	ret = lxc_caps_init();
>  	if (ret)
>  		return ret;
> @@ -96,7 +132,14 @@ int main(int argc, char *argv[], char *envp[])
> 
>  	free(curdir);
> 
> -	pid = fork();
> +	void **newstack;
> +	newstack = (void **) malloc(STACKSIZE);
> +	if (!newstack)
> +		exit(newstack);
> +
> +	newstack = (void **)(STACKSIZE + (char *)newstack);
> +	*--newstack = 0;
> +	pid = clone(child_main, newstack, CLONE_VM | CLONE_FS | CLONE_SIGHAND, 0);
> 
>  	if (pid < 0) {
>  		SYSERROR("failed to fork");
> @@ -120,35 +163,5 @@ int main(int argc, char *argv[], char *envp[])
>  		return -1;
>  	}
> 
> -	if (!pid) {
> -
> -		if (my_args.argc) {
> -			execve(my_args.argv[0], my_args.argv, envp);
> -			SYSERROR("failed to exec '%s'", my_args.argv[0]);
> -			return -1;
> -		}
> -
> -		uid = getuid();
> -
> -		passwd = getpwuid(uid);
> -		if (!passwd) {
> -			SYSERROR("failed to get passwd "		\
> -				 "entry for uid '%d'", uid);
> -			return -1;
> -		}
> -
> -		{
> -			char *const args[] = {
> -				passwd->pw_shell,
> -				NULL,
> -			};
> -
> -			execve(args[0], args, envp);
> -			SYSERROR("failed to exec '%s'", args[0]);
> -			return -1;
> -		}
> -
> -	}
> -
>  	return 0;
>  }
> diff --git a/src/lxc/lxc_init.c b/src/lxc/lxc_init.c
> index a534b51..7debd6f 100644
> --- a/src/lxc/lxc_init.c
> +++ b/src/lxc/lxc_init.c
> @@ -20,6 +20,7 @@
>   * License along with this library; if not, write to the Free Software
>   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
>   */
> +#define STACKSIZE 16384
> 
>  #include <stdio.h>
>  #include <unistd.h>
> @@ -32,6 +33,7 @@
>  #include <sys/wait.h>
>  #define _GNU_SOURCE
>  #include <getopt.h>
> +#include <sched.h>
> 
>  #include "log.h"
>  #include "caps.h"
> @@ -40,6 +42,9 @@
> 
>  lxc_log_define(lxc_init, lxc);
> 
> +static sigset_t mask, omask;
> +static char **aargv;
> +
>  static int quiet;
> 
>  static struct option options[] = {
> @@ -49,6 +54,23 @@ static struct option options[] = {
> 
>  static	int was_interrupted = 0;
> 
> +int child_main()
> +{
> +	int i;
> +	int err = -1;
> +	/* restore default signal handlers */
> +	for (i = 1; i < NSIG; i++)
> +		signal(i, SIG_DFL);
> +
> +	sigprocmask(SIG_SETMASK, &omask, NULL);
> +
> +	NOTICE("about to exec '%s'", aargv[0]);
> +
> +	execvp(aargv[0], aargv);
> +	ERROR("failed to exec: '%s' : %m", aargv[0]);
> +	exit(err);
> +}
> +
>  int main(int argc, char *argv[])
>  {
> 
> @@ -61,8 +83,6 @@ int main(int argc, char *argv[])
>  	pid_t pid;
>  	int nbargs = 0;
>  	int err = -1;
> -	char **aargv;
> -	sigset_t mask, omask;
>  	int i, shutdown = 0;
> 
>  	while (1) {
> @@ -115,25 +135,17 @@ int main(int argc, char *argv[])
>  	if (lxc_caps_reset())
>  		exit(err);
> 
> -	pid = fork();
> -
> -	if (pid < 0)
> +	void **newstack;
> +	newstack = (void **) malloc(STACKSIZE);
> +	if (!newstack)
>  		exit(err);
> 
> -	if (!pid) {
> -
> -		/* restore default signal handlers */
> -		for (i = 1; i < NSIG; i++)
> -			signal(i, SIG_DFL);
> +	newstack = (void **)(STACKSIZE + (char *)newstack);
> +	*--newstack = 0;
> +	pid = clone(child_main, newstack, CLONE_VM | CLONE_FS | CLONE_SIGHAND, 0);
> 

Passing CLONE_VM here means execvp() will flush the memory for both
parent and child... This means the lxc-init code doesn't run anymore and
this isn't acceptable, as it honors the child ripping work in an
application container.

> -		sigprocmask(SIG_SETMASK, &omask, NULL);
> -
> -		NOTICE("about to exec '%s'", aargv[0]);
> -
> -		execvp(aargv[0], aargv);
> -		ERROR("failed to exec: '%s' : %m", aargv[0]);
> +	if (pid < 0)
>  		exit(err);
> -	}
> 
>  	/* let's process the signals now */
>  	sigdelset(&omask, SIGALRM);

-- 
Gregory Kurz                                     gkurz at fr.ibm.com
Software Engineer @ IBM/Meiosys                  http://www.ibm.com
Tel +33 (0)534 638 479                           Fax +33 (0)561 400 420

"Anarchy is about taking complete responsibility for yourself."
        Alan Moore.





More information about the lxc-devel mailing list