66#include <linux/types.h>
77
88#include <sys/capability.h>
9+ #include <sys/mman.h>
910#include <sys/mount.h>
1011#include <sys/prctl.h>
1112#include <sys/resource.h>
13+ #include <sys/sendfile.h>
14+ #include <sys/stat.h>
1215#include <sys/syscall.h>
1316#include <sys/types.h>
1417#include <sys/wait.h>
1518
1619#include <errno.h>
20+ #include <fcntl.h>
1721#include <paths.h>
1822#include <sched.h>
1923#ifdef WITH_SECCOMP
@@ -36,6 +40,10 @@ static int adjust_capabilities(struct error *, uid_t, bool);
3640static int adjust_privileges (struct error * , uid_t , gid_t , bool );
3741static int limit_resources (struct error * );
3842static int limit_syscalls (struct error * );
43+ static ssize_t sendfile_nointr (int , int , off_t * , size_t );
44+ static int open_as_memfd (struct error * , const char * );
45+ int memfd_create (const char * , unsigned int );
46+
3947
4048static inline bool
4149secure_mode (void )
@@ -242,7 +250,7 @@ limit_resources(struct error *err)
242250 limit = (struct rlimit ){64 , 64 };
243251 if (setrlimit (RLIMIT_NOFILE , & limit ) < 0 )
244252 goto fail ;
245- limit = (struct rlimit ){1024 * 1024 , 1024 * 1024 };
253+ limit = (struct rlimit ){2 * 1024 * 1024 , 2 * 1024 * 1024 };
246254 if (setrlimit (RLIMIT_FSIZE , & limit ) < 0 )
247255 goto fail ;
248256 return (0 );
@@ -279,14 +287,17 @@ limit_syscalls(struct error *err)
279287 SCMP_SYS (getegid ),
280288 SCMP_SYS (geteuid ),
281289 SCMP_SYS (getgid ),
282- SCMP_SYS (getpgrp ),
290+ SCMP_SYS (getpgrp ),
283291 SCMP_SYS (getpid ),
284292 SCMP_SYS (gettid ),
285293 SCMP_SYS (gettimeofday ),
286294 SCMP_SYS (getuid ),
287295 SCMP_SYS (_llseek ),
288296 SCMP_SYS (lseek ),
289297 SCMP_SYS (lstat ),
298+ #ifdef SYS_memfd_create
299+ SCMP_SYS (memfd_create ),
300+ #endif
290301 SCMP_SYS (mkdir ),
291302 SCMP_SYS (mmap ),
292303 SCMP_SYS (mprotect ),
@@ -303,6 +314,7 @@ limit_syscalls(struct error *err)
303314 SCMP_SYS (rt_sigaction ),
304315 SCMP_SYS (rt_sigprocmask ),
305316 SCMP_SYS (rt_sigreturn ),
317+ SCMP_SYS (sendfile ),
306318 SCMP_SYS (stat ),
307319 SCMP_SYS (symlink ),
308320 SCMP_SYS (tgkill ),
@@ -352,6 +364,98 @@ limit_syscalls(struct error *err)
352364}
353365#endif /* WITH_SECCOMP */
354366
367+ /* memfd_create(2) flags -- copied from <linux/memfd.h>. */
368+ #ifndef MFD_CLOEXEC
369+ # define MFD_CLOEXEC 0x0001U
370+ # define MFD_ALLOW_SEALING 0x0002U
371+ #endif
372+ #ifndef MFD_EXEC
373+ # define MFD_EXEC 0x0010U
374+ #endif
375+
376+ /* This comes directly from <linux/fcntl.h>. */
377+ #ifndef F_LINUX_SPECIFIC_BASE
378+ # define F_LINUX_SPECIFIC_BASE 1024
379+ #endif
380+ #ifndef F_ADD_SEALS
381+ # define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
382+ #endif
383+ #ifndef F_SEAL_SEAL
384+ # define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
385+ # define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
386+ # define F_SEAL_GROW 0x0004 /* prevent file from growing */
387+ # define F_SEAL_WRITE 0x0008 /* prevent writes */
388+ #endif
389+
/*
 * Fallback wrapper for memfd_create(2) that issues the raw system call.
 *
 * When the build environment does not define SYS_memfd_create the call
 * cannot be made, so the wrapper fails with errno set to ENOSYS.
 *
 * Returns whatever the system call returns, or -1 when it is unavailable.
 */
int
memfd_create(const char *name, unsigned int flags)
{
#ifndef SYS_memfd_create
        /* No syscall number known at build time: report "not implemented". */
        errno = ENOSYS;
        return -1;
#else
        return syscall(SYS_memfd_create, name, flags);
#endif
}
399+
/*
 * sendfile(2) wrapper that transparently restarts the call whenever it is
 * interrupted by a signal (fails with errno == EINTR).
 *
 * Arguments are forwarded to sendfile unchanged. Returns the first result
 * that is not an EINTR failure: a byte count, or a negative value with
 * errno describing the error.
 */
static ssize_t
sendfile_nointr(int out_fd, int in_fd, off_t *offset, size_t count)
{
        for (;;) {
                ssize_t n = sendfile(out_fd, in_fd, offset, count);

                /* Anything other than an interrupted call is final. */
                if (n >= 0 || errno != EINTR)
                        return n;
        }
}
411+
412+ static int
413+ open_as_memfd (struct error * err , const char * path )
414+ {
415+ int fd , memfd , ret ;
416+ ssize_t bytes_sent = 0 ;
417+ struct stat st = {0 };
418+ off_t offset = 0 ;
419+
420+ if ((fd = xopen (err , path , O_RDONLY )) < 0 )
421+ return (-1 );
422+
423+ log_info ("creating a virtual copy of the ldconfig binary" );
424+ memfd = memfd_create (path , MFD_ALLOW_SEALING | MFD_CLOEXEC );
425+ if (memfd == -1 ) {
426+ error_set (err , "error creating memfd for path: %s" , path );
427+ return (-1 );
428+ }
429+
430+ ret = fstat (fd , & st );
431+ if (ret == -1 ) {
432+ error_set (err , "error running fstat for path: %s" , path );
433+ goto fail ;
434+ }
435+
436+ while (bytes_sent < st .st_size ) {
437+ ssize_t sent ;
438+ sent = sendfile_nointr (memfd , fd , & offset , st .st_size - bytes_sent );
439+ if (sent == -1 ) {
440+ error_set (err , "failed to copy ldconfig binary to virtual copy" );
441+ goto fail ;
442+ }
443+ bytes_sent += sent ;
444+ }
445+
446+ if (fcntl (memfd , F_ADD_SEALS , F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE ) == -1 ) {
447+ error_set (err , "failed to seal virtual copy of the ldconfig binary" );
448+ goto fail ;
449+ }
450+
451+ close (fd );
452+ return memfd ;
453+ fail :
454+ close (fd );
455+ close (memfd );
456+ return (-1 );
457+ }
458+
355459int
356460nvc_ldcache_update (struct nvc_context * ctx , const struct nvc_container * cnt )
357461{
@@ -374,8 +478,11 @@ nvc_ldcache_update(struct nvc_context *ctx, const struct nvc_container *cnt)
374478 * Force proc to be remounted since we're creating a PID namespace and fexecve depends on it.
375479 */
376480 ++ argv [0 ];
377- if ((fd = xopen (& ctx -> err , argv [0 ], O_RDONLY |O_CLOEXEC )) < 0 )
378- return (-1 );
481+ if ((fd = open_as_memfd (& ctx -> err , argv [0 ])) < 0 ) {
482+ log_warn ("failed to create virtual copy of the ldconfig binary" );
483+ if ((fd = xopen (& ctx -> err , argv [0 ], O_RDONLY |O_CLOEXEC )) < 0 )
484+ return (-1 );
485+ }
379486 host_ldconfig = true;
380487 log_infof ("executing %s from host at %s" , argv [0 ], cnt -> cfg .rootfs );
381488 } else {
0 commit comments