diff --git a/debian/changelog b/debian/changelog index 088b4157c..ad3018786 100644 --- a/debian/changelog +++ b/debian/changelog @@ -27,7 +27,7 @@ linux-2.6 (2.6.32-12) UNRELEASED; urgency=low [ maximilian attems] * [ia64] Built in fbcon. - * Update openvz patch to 6b5607eeec54. (closes: #574598) + * Update openvz patch to c05f95fcb04e. (closes: #574598) * Reenable nouveau autoloading. * reiserfs: Fix permissions on .reiserfs_priv. CVE-2010-1146 * libata,ata_piix: detect and clear spurious IRQs. diff --git a/debian/patches/features/all/openvz/openvz.patch b/debian/patches/features/all/openvz/openvz.patch index 4aaca1c39..3e10023e4 100644 --- a/debian/patches/features/all/openvz/openvz.patch +++ b/debian/patches/features/all/openvz/openvz.patch @@ -1,3 +1,1794 @@ +commit c05f95fcb04e896c898218d12a8f37c43d2f9cc6 +Author: Pavel Emelyanov +Date: Tue Apr 27 15:10:13 2010 +0400 + + OpenVZ kernel 2.6.32-avdeyev released + + Named after Sergei Vasilyevich Avdeyev - a Russian cosmonaut. + + Signed-off-by: Pavel Emelyanov + +commit b4a419d9abd11e3efd02e9fccd4a14180866cf99 +Merge: 455792e 5bf3475 +Author: Pavel Emelyanov +Date: Tue Apr 27 14:01:27 2010 +0400 + + Merged linux-2.6.32.12 + + Conflicts: + + Makefile + + Signed-off-by: Pavel Emelyanov + +commit 455792e7712fac15bba7ca187c244f30c9d0e825 +Author: Konstantin Khlebnikov +Date: Thu Apr 22 19:08:13 2010 +0400 + + ipv6: fix sysctl unregistering order + + call addrconf_ifdown for loopback at last last ipv6 addr delete with how=0 + to fix sysctl tables undergister ordering: all other interfaces attach their + sysctl paths to lo's, so unregister lo sysctl tables only at namespace destroy. 
+ + https://bugzilla.sw.ru/show_bug.cgi?id=473430 + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit fa86dba2b6213e770f102d1e688f6527d759aecf +Author: Konstantin Khlebnikov +Date: Mon Apr 5 15:43:18 2010 +0400 + + ve: fix ve task state percpu counters + + Counters overlap detection for ve tasks in running/uninterruptible/iowait state + was broken due to type mismatch: + nr_{running/unin..e/iowait}_ve() uses _long_ for summing _int_ percpu counters. + + As a result, it broke ve loadavg calculation after the first int overlap. + + This patch expands all these percpu counters to unsigned long. + + http://bugzilla.openvz.org/show_bug.cgi?id=1396 + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit b484e22d951a02bd7ce25aaac396742766142790 +Author: Konstantin Khlebnikov +Date: Mon Apr 5 15:41:30 2010 +0400 + + check flags on parsed structure + + http://bugzilla.openvz.org/show_bug.cgi?id=1464 + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit d8a86ef5a6c747ddb2896696269c0feef5d6fe1e +Author: Konstantin Khlebnikov +Date: Mon Apr 5 15:38:29 2010 +0400 + + CPT: check signal curr_target at restore + + set signal curr_target to current if the right task was not found. + fix oops after broken restore. + + "curr_target" controls round robin signal target balance over process + threads, so there is no reason to care about migration accuracy. + + http://bugzilla.openvz.org/show_bug.cgi?id=1467 + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 61845b781db7d86180977270c73f6ea3885485f3 +Author: Pavel Emelyanov +Date: Mon Apr 5 15:35:58 2010 +0400 + + cpt: Don't mind the tsk->splice_pipe cache at cpt time + + This field is just a cache for the sendfile system call. It can be dropped + safely during migration - the first sendfile after restore will create + it back. 
+ + http://bugzilla.openvz.org/show_bug.cgi?id=881 + + Signed-off-by: Pavel Emelyanov + +commit fcd86ff706b309999e526dc4a37e9de88ec051fb +Author: Peter Volkov +Date: Sun Mar 28 18:04:44 2010 +0400 + + Fix /proc/kmsg permissions with capabilities active + + Whenever application sets cap_sys_admin=ep it is unable to read + /proc/kmsg with EPERM. This patch makes /proc/kmsg readable on HN. + http://bugzilla.openvz.org/show_bug.cgi?id=1360 + + Signed-off-by: Peter Volkov + Signed-off-by: Pavel Emelyanov + +commit 8c6af363b89ebf94d3982d786dd21c64fb41528f +Author: Konstantin Khlebnikov +Date: Fri Mar 12 15:58:35 2010 +0300 + + quota: fix compilation 32-bit compat quota, remove size checks. + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 26aeb82fc7ef70e83a4e0640fcb77c7b6f31d81b +Author: Konstantin Khlebnikov +Date: Fri Mar 12 15:58:34 2010 +0300 + + x86: fix compilation for 32-bit kernel + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 92875e3c49a15885ffbf40cbb0f2bd82cf423e43 +Author: Konstantin Khlebnikov +Date: Mon Mar 1 13:03:59 2010 +0300 + + CPT: update image version to CPT_VERSION_27_3 + + sync cpt minor version with rhel5 branch + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit f7dd75ba9debbd60b12eec93128a5742d6876d28 +Author: Konstantin Khlebnikov +Date: Mon Mar 1 12:56:27 2010 +0300 + + CPT: ignore deleted linked chr blk fifo nodes + + Ignore unlinked but referenced pipes, character and block device nodes. + Restore process will create it itself. + + Bug #455855 + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit d7c68b191825cbbf6c7a40a75d38d09330b3abca +Author: Pavel Emelianov +Date: Mon Mar 1 12:55:36 2010 +0300 + + CPT: Dump fake hardlinks on inotify watch's inodes + + When a watch is attached to unlinked and closed file it + will not be restored, since the inode will not be in image. 
+ + To fix this the proposal is to create a fake link on the + inode in a temp dir and dump it. + + Bug #454944 + + Signed-off-by: Pavel Emelyanov + +commit 7cf74bdd35d9559c671362cf8ce7016bb51aedaa +Author: Vitaliy Gusev +Date: Mon Mar 1 12:52:42 2010 +0300 + + CPT: Open hardlinked files only if is set 'hardlinked_on' + + Signed-off-by: Vitaliy Gusev + Signed-off-by: Pavel Emelyanov + +commit 52c2eb6da3f09f44d652eb7156a793b5f50e8e08 +Author: Vitaliy Gusev +Date: Mon Mar 1 12:52:09 2010 +0300 + + CPT: Add ioctl CPT_HARDLNK_ON for rst + + vzctl has to call ioctl CPT_HARDLNK_ON to enable opening of hardlinked + files by the kernel during restore. + + This protection is needed to prevent mixing a new kernel + old vzctl (which + doesn't do cleaning). In other words, it prevents creating/opening files + which will not be removed, as this issue can lead to + a security problem. + + Signed-off-by: Vitaliy Gusev + Signed-off-by: Pavel Emelyanov + +commit 72dfa44429c57c924ec4ac4d25d9ef6a343ddade +Author: Vitaliy Gusev +Date: Mon Mar 1 12:51:39 2010 +0300 + + CPT: Add CPT_DENTRY_HARDLINKED flag to cpt_file_image + + This flag tells that the file was hardlinked. + + Signed-off-by: Vitaliy Gusev + Signed-off-by: Pavel Emelyanov + +commit 80d2ce353aa41820eca28c15abd6c1421d537736 +Author: Vitaliy Gusev +Date: Mon Mar 1 12:49:48 2010 +0300 + + CPT: Create hard links to "deleted but referenced" during checkpoint + + For "deleted but referenced" files, the kernel creates a hard link in + the directory (that was set via CPT_LINKDIR_ADD) in format: + + .cpt_hardlink.xxxxxxxx + + x - digit, from 0 to 9 + + Note - this policy is used only when no other ways of dumping the unlinked + file helped. 
+ + Signed-off-by: Vitaliy Gusev + Signed-off-by: Pavel Emelyanov + +commit c24ab545f53ae07a2bfb3a6df100b56d49b57281 +Author: Vitaliy Gusev +Date: Mon Mar 1 12:47:30 2010 +0300 + + CPT: Add ioctl CPT_LINKDIR_ADD for cpt + + vzctl has to call ioctl CPT_LINKDIR_ADD to tell the kernel where + to create hardlinked files during checkpoint. Without this ioctl + the kernel assumes that creating hardlinked files is off. + + Signed-off-by: Vitaliy Gusev + Signed-off-by: Pavel Emelyanov + +commit d4ef97ff64464126b459ef8d9a0adbb95fb9dc09 +Author: Konstantin Khorenko +Date: Sat Feb 27 16:58:11 2010 +0300 + + CPT: stop the migration if shm restoration failed + + Bug #268163 + + Signed-off-by: Konstantin Khorenko + Signed-off-by: Pavel Emelyanov + +commit 089c01a6503ec6fc1ce66841d049bb65aa3c212c +Author: Marat Stanichenko +Date: Sat Feb 27 16:58:11 2010 +0300 + + CPT: restart local_kernel_thread in case of -ERESTARTNOINTR + + This is essential in case of migration to SLM node. + + We can bump into a situation when SLM refuses to fork during the + undumping process because it thinks that subgroup's resources + are to be redistributed. When this happens fork is delayed with + the -ERESTARTNOINTR error and the undumping process fails. + + As Den (den@) noticed, userspace is not intended to see the + -ERESTARTNOINTR error so we should handle this situation in the + kernel. According to the logic in the do_signal() function the + interrupted system call is immediately restarted in case of the + -ERESTARTNOINTR error. + + We borrow this policy and apply it to the local_kernel_thread() + cpt helper function. 
+ + [ xemul: this is quite a rare case, so simple cond_resched() + is OK here all the more so the redistribution should + happen in a timer ] + + Bug #116787 + + Signed-off-by: Pavel Emelyanov + +commit 8551a850a459df659d7b14a66dfc8cf6da5065d6 +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:11 2010 +0300 + + CPT: save/restore only classic task flags + + Task flags were restored as they were saved in image. That is not correct as + flags differ in 2.6.9, 2.6.16 and 2.6.18 kernels. + Actually we just need to save/restore only classic flags (PF_EXITING, PF_DEAD, + PF_FORKNOEXEC, PF_SUPERPRIV, PF_DUMPCORE and PF_SIGNALED). + + The problems can occur because during migration from 2.6.9 to 2.6.18 kernel + flag PF_USED_MATH was not restored on tsk->flags correctly. + + In 2.6.9 kernel there was field tsk->used_math for this purpose, in 2.6.18 + kernel it is transformed into one of the tsk->flags. + + And it was a bug, that after restore of fpu state and PF_USED_MATH flag, it + was cleared by "tsk->flags = ti->cpt_flags & ~PF_FROZEN", as old cpt_flags do + not contain PF_USED_MATH flag. + + Bugs #115977 #115980 #115982 + + Signed-off-by: Pavel Emelyanov + +commit 75f2abfa9f92fc7ac512a8ed9a34c2df0edd133d +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:11 2010 +0300 + + CPT: udp sockets restore fix + + Some applications (like ntpd) set on udp sockets sk_reuse to 1. So any other + applications can bind to the same port. During restore we must skip this + check and restore and bind all sockets. On IPv6 we must also force DAD + (Duplicate Address Detection) procedure to be sure that IFA_F_TENTATIVE flag + will be cleared on IPv6 address and socket can be bound to it. + + http://bugzilla.openvz.org/show_bug.cgi?id=784 + + Signed-off-by: Pavel Emelyanov + +commit ba94d3fa2bb8636a7dceaa01fbf6fecdb8edacd5 +Author: Vitaliy Gusev +Date: Sat Feb 27 16:58:11 2010 +0300 + + CPT: screw up udev bindmounts knot + + Ubuntu's udev on boot does: + + if ! 
mountpoint -q /dev; then + # initramfs didn't mount /dev, so we'll need to do that + mount -n --bind /dev /etc/udev + mount -n -t tmpfs -o mode=0755 udev /dev + mkdir -m 0700 -p /dev/.static/dev + mount -n --move /etc/udev /dev/.static/dev + fi + + So, workaround is dumping "/dev" as bindmount's source. + + Bug #120852 + http://bugzilla.openvz.org/show_bug.cgi?id=1198 + + Signed-off-by: Vitaliy Gusev + Signed-off-by: Pavel Emelyanov + +commit faa9a6dd94c072b38c8f963ce314fc1d6ff69ddf +Author: Vitaliy Gusev +Date: Sat Feb 27 16:58:10 2010 +0300 + + CPT: restore dead tasks proc files + + If some process opened /proc/ and process with will die + after some time then checkpoint fails with error: + + Can not dump VE: Invalid argument + Error: d_path cannot be looked up /proc/125/cmdline + + The fix is to catch this situation at the dump time, mark the image respectively + and restore a fake file on restore. + + http://bugzilla.openvz.org/show_bug.cgi?id=1047 + + Signed-off-by: Vitaliy Gusev + Signed-off-by: Pavel Emelyanov + +commit 977418edceabb4705f5012e562d4e5e04a19f138 +Author: Vitaliy Gusev +Date: Sat Feb 27 16:58:10 2010 +0300 + + CPT: adjust vfsmounts restore order + + Idea is: Dump parent before dump his children + + This order is needed during checkpoint/restore: + + mount /A /B -o bind + mount none /C -t tmpfs + mkdir /C/D + mount /B /C/D --move + + After this, checkpoint (w/o this patch) will dump vfsmounts in order: + + - vfsmount, bind to /A, mounted to /C/D + - vfsmount, mounted to /C (tmpfs) + and will restore in the same order, that causes error. + + Bug #132951 + + Signed-off-by: Vitaliy Gusev + Signed-off-by: Pavel Emelyanov + +commit c42b985195cc8e7c2bbeb644e92d98a066aacc18 +Author: Vitaliy Gusev +Date: Sat Feb 27 16:58:10 2010 +0300 + + CPT: dont cpt requiresdev fs + + Don't allow chkpnt VE with mounted ext2/ext3, etc filesystems. + + Allow checkpoint only for mounted nodev and "external" filesystem. 
+ + This check protects from error on restore: + CPT ERR: ffff810007113000,102 :-2 mounting /root/some_dir ext3 40000000 + + as do_one_mount() doesn't pass mntdev to mount(). + + [xemul: actually, the reason we don't support filesystems other than + virtual and tmpfs is because we simply can't (easily) get the + mount options for them to cpt and restore ] + + Bug #131737 + + Signed-off-by: Vitaliy Gusev + Signed-off-by: Pavel Emelyanov + +commit a1d028ce2f1e87b5d64fb9fb7ed46740c1d73ed2 +Author: Vitaliy Gusev +Date: Sat Feb 27 16:58:10 2010 +0300 + + CPT: Restore information about tcp listening sockets + + Not all options are important. Only missed ipv6only can cause + error if other application want to listen the same port for IPv4 any address. + + tp->XXX are inherited by children (noticed by Alexey Kuznetsov), so we need also + to restore these options. + + Signed-off-by: Vitaliy Gusev + + Comment from Alexey: + It [everything before] was not OK. The feature which are broken are important, + but not actually critical except for ipv6only. + + F.e. DEFER_ACCEPT is broken -> but nobody will notice, it just will not + be deferred. + Signed-off-by: Pavel Emelyanov + +commit 6364b5498e48bcb600472bb2fafb865206f35068 +Author: Vitaliy Gusev +Date: Sat Feb 27 16:58:10 2010 +0300 + + CPT: put 'expect' after insert to the 'conntrack' + + During restore conntrack, we need to put expect after allocating + ip_conntrack_expect and do something with one. Expect will be + freed or immediate (if nobody has this expect) or during cleanup/timer + hooks. Otherwise expect never will be freed. + + Note: Approaches for kernels 2.6.18 and 2.6.9 are different. 
For example + see help() in "net/ipv4/netfilter/ip_conntrack_netbios_ns.c" + + Signed-off-by: Vitaliy Gusev + Signed-off-by: Pavel Emelyanov + +commit b3d4348ca6322edad5a0a0d56b15d1eb8db718bd +Author: Vitaliy Gusev +Date: Sat Feb 27 16:58:09 2010 +0300 + + CPT: Fix ip_conntrack_ftp usage counter leak + + Function ip_conntrack_helper_find_get() gets module counter. So put a + conntrack after putting in the hash and handling the conntrack's expect + list. + + Signed-off-by: Vitaliy Gusev + Signed-off-by: Pavel Emelyanov + +commit 74e373eeb5e71b1c8253c04bee92250e5f6640cf +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:58:08 2010 +0300 + + CPT: dump and restore global snmp statistics + + Per device exists for ipv6 only and is probably not used now, but + anyway - I'll do it later. + + This patch adds new section CPT_SECT_SNMP_STATS that is populated + with CPT_OBJ_BITS set of objects - one for each type of statistics. + Objects have variable length. Stats are stored as a plain array of + __u32 numbers and thus the order in which stats types are stored is + implicitly hard-coded. + + In case we do not have an IPV6 turned on all ipv6 stats are dumped + as CPT_OBJ_BITS/CPT_CONTENT_VOID and are skipped on restore. + + When we restore from an image with more stats in any type, the not + supported ones are dropped with a warning. + + Stats add 28K to image file. + + Bug #113930 + + Signed-off-by: Pavel Emelyanov + +commit 3b0f4b2e0503c157d596d7426ffcba01e30e930f +Author: Vitaliy Gusev +Date: Sat Feb 27 16:58:08 2010 +0300 + + CPT: Fix memory corruption if cpt_family is wrong. + + During restore, if parent socket is AF_INET but cpt_family is + wrong (not initialized, see bug ##95113), then considering the request as + related to AF_INET6 is not right and leads to memory corruption. + + As there are a lot of buggy images, we can't check only on values + AF_INET and AF_INET6. + + Decision: + - Check request on AF_INET6 first, and consider + request as AF_INET by default. 
+ - Additionally check the AF_INET6 request (protect from + random value cpt_family == AF_INET6) + + Bug #118912 + + Signed-off-by: Vitaliy Gusev + Acked-by: Denis V. Lunev + Signed-off-by: Pavel Emelyanov + +commit 4a7ddd3db9a8030d514d120341bffd904ef57315 +Author: Pavel Emelianov +Date: Sat Feb 27 16:58:07 2010 +0300 + + CPT: fix restoring of /dev/null opened early by init + + The problem is the following: + * init from fc9 starts and opens /dev/null for its stdin, stdout + and stderr + * udev starts and overmounts /dev with tmpfs + + After this cpt cannot dump this ve, since one process holds a file, + that is inaccessible from ve root. + + The proposed solution is the following: + 1. allow for /dev/null to be over-mounted + 2. restore init's files in two stages: + stage1: *before* we restored mounts restore init's 0, 1 and + 2 file descriptors, since most likely (in fc9 case - definitely) + init opened them before any other manipulations with fs; + stage2: restore the rest of the files later, at the usual time, to make + sure that e.g. sockets etc are restored properly. + + Comment from Alexey: + + ACK. + + Though this is really ugly, it really produces 100% correct result + for this particular situation. + + Bug #116261 + + Signed-off-by: Pavel Emelyanov + +commit 937a5462e54d42a70ca0a66c7d3147d02ff40767 +Author: Pavel Emelianov +Date: Sat Feb 27 16:58:07 2010 +0300 + + CPT: lock sock before restoring its synwait queue + + This new socket already has all the necessary TCP timers armed, + so tcp_keepalive_timer can fire during the rst_restore_synwait_queue + and (for the latter being lockless) can spoil the queue. + + Bug #118912 + + Signed-off-by: Pavel Emelyanov + +commit c5d30bd0194b026df7684e08f1b6e8e77d06305c +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:58:07 2010 +0300 + + CPT: sysctl randomize_va_space + + implement checkpointing for virtualized sysctl kernel.randomize_va_space. + + reuse existing unused pad1 field in cpt_veinfo_image. 
+ 0 -> image without rnd_va_space virtualization (default value is used) + 1 -> rnd = 0 + 2 -> rnd = 1 + etc... + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit bbdcbaadf794e4a6c579cdac4c92ecc278d7606c +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:07 2010 +0300 + + CPT: add check for presence of module slm_dmprst if SLM is enabled + + Add a check in "checks" for presence of module slm_dmprst if SLM is enabled. + Check will be performed for both source and destination nodes. Changes in + vzmigrate are not needed. + + Bug #114312 + + Signed-off-by: Pavel Emelyanov + +commit 04c139f6c20e5c80a19db1439f8cd2f7e2715b4e +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:07 2010 +0300 + + CPT: add diagnostics in case of iptables-restore fail + + It is not clear right now what is wrong if iptables-restore fails. + Add some diagnostics in case of error. + + Bug #95952 + + Signed-off-by: Pavel Emelyanov + +commit f06677625bf53b6aad0a3742b5f01d1376715e1d +Author: Denis Lunev +Date: Sat Feb 27 16:58:06 2010 +0300 + + CPT: Check that VE is not running on restore. + + Bug #99679 + + Signed-off-by: Denis V. Lunev + Signed-off-by: Pavel Emelyanov + +commit dcda94043007a5d005e92c2df31ba63eeb1b8a70 +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:06 2010 +0300 + + CPT: fix check in decode_tuple() + + Tuple structure can be used as a mask and protonum can be 0xffff in 2.6.9 + kernel. In 2.6.18 kernel all masks for protonum are 0xff and 0xffff will + be shrunken to 0xff. + + Signed-off-by: Pavel Emelyanov + +commit 5a889e32263292bec6e2d4c2710ee41985f35716 +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:06 2010 +0300 + + CPT: fix restore of conntrack expect timer + + One more fix of restore conntrack procedure. + Following code: + + if (ct->helper->timeout && !del_timer(&exp->timeout)) { + ... + } + + can lead to oops, as exp->timeout is not initialized at this point. + + Actually this optimization is not needed at all. 
+ If expectation is dying, then we will let it die by its own death. + + Also in ip_conntrack_expect_insert() there is an initialization of + exp->timeout. And we can't just do add_timer() after that (as in add_timer() + we have BUG_ON(timer_pending(timer))), we must do mod_timer() instead. + + Signed-off-by: Pavel Emelyanov + +commit 19dce010faff8960e80b1778afa9f4ad07dd365f +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:06 2010 +0300 + + CPT: restore mark value on conntracks + + Restore mark value in conntracks as it is needed for connmark module. + + Signed-off-by: Pavel Emelyanov + +commit 7ec63fdedf332db285f71d857cf395da8cf674d5 +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:06 2010 +0300 + + CPT: convert conntrack tuple from 2.6.9 kernel image + + Add conversion for conntrack tuple from 2.6.9 kernel image. + Check for correct value is added in decode_tuple(). + + Signed-off-by: Pavel Emelyanov + +commit c34d6367f6cc5ee7f60fdee828c41de7b633a779 +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:06 2010 +0300 + + CPT: convert conntrack image from 2.6.9 to 2.6.18 + + CPT structure in image file for conntracks is different in 2.6.9 and 2.6.18 + kernels (array cpt_help_data was enlarged in the middle of the structure), so + conntracks from 2.6.9 kernel are restored incorrectly on 2.6.18 kernel and + lead to kernel oops. + + A simple conversion from 2.6.9 to 2.6.18 is introduced to restore conntracks + correctly on 2.6.18 kernel. + + Bug #113290 + + Signed-off-by: Pavel Emelyanov + +commit 21644501b4651df2c7f271cae528f1996fc23a8d +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:05 2010 +0300 + + CPT: create kernel threads in VE0 context + + In current implementation master process which performs checkpointing has + owner_env set to VE0 and exec_env set to VE. 
All auxiliary kernel threads + are created with exec_env set to VE and owner_env set to VE0, so after the + do_fork_pid() we have the follwing: + + * new thread has owner_env == ve0, exec env == ve + * its pid belongs to ve (pid->veid != 0) + + That is why if ve_enter() in thread fails, then we hit BUG_ON in + release_task -> detach_pid -> free_pid + sequence, since task owner env != pid's veid. + + When enter succeeds the task's owner env becomes ve and this BUG_ON + is not triggered. + + To solve this problem exec_env is switched to VE before kernel thread + creation and switched back after. Veid is passed to kernel via args. All + kernel threads are created with CLONE_VFORK to be sure that parent + process will not exit before doing exec() in thread. + + Bug #97124 + + Signed-off-by: Pavel Emelyanov + +commit 686bb3916a1247b46893078f8d87b8df6b1e305a +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:05 2010 +0300 + + CPT: restore rlimits correctly during 32bit-64bit migration + + During 32bit to 64bit migration rlimits were restored incorrectly due to + different size of long on 32bit and 64bit archs. Now simple conversion is + introduced in case of 32bit-64bit migration. Infinity values are restored as + infinity values. Error is returned if value greater than RLIM_INFINITY32 is + found in dump during restore on 32bit arch. + + Bug #111965 + + Signed-off-by: Pavel Emelyanov + +commit c3e4a29b420b871a6543955728b1f8a5de75e955 +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:05 2010 +0300 + + CPT: restore packet control block from kernels with and without IPv6 + + More generic mechanism for restoring packet control blocks. Unfortunately we + do not save length of control block in dump and we can only try to calculate + it during restore. This method is based on knowledge that the flags value in + TCP control block is not zero for all packets in queue. + Since this image version TCP control block will be saved in IPv6 form + regardless to IPv6 config option. 
+ Restore of control block is splitted in 4 ways for any IPv6 and non-IPv6 + kernel combinations. + Check is added to be sure that all control block were restored in the same + way. If it will be found that some control blocks were restored incorrectly, + then undump process will be terminated. + + Bug #111370. + + Merged 4 patches sent earlier: + 1. Increase image version. + 2. Save TCP control block regardless to IPv6 config option. + 3. Restore of control block is splitted in 4 ways... + 4. Add appropriate comment on TCP control block restore procedure. + + [xemul: + Added do { } while (0) around macro body + Mention Alexey in comment about skb_cb->flags being non-zero + ] + + Signed-off-by: Pavel Emelyanov + +commit 1f218bb8d606af3b95cd089b68b44800f91ac7d1 +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:05 2010 +0300 + + CPT: add binfmt_misc fs in supported list + + Just add binfmt_misc in list of supported file systems. With this small + quick fix migration will be allowed, but all binfmt_misc entries will + be dropped during migration. + + This fix is only for the first time. Later will be implemented generic + mechanism for checkpointing/restore of external modules. And this quick + fix will be replaced with full support for binfmt_misc in CPT. + + Bugs #100709, #101061 + + Signed-off-by: Pavel Emelyanov + +commit 85da0ddab187bb9e6000ba6c98b7454095055799 +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:05 2010 +0300 + + CPT: relax check for several bind mounts on the same mount point + + Relax check for special bind mounts which mounted several times on the same + mount point. We need to check only dentry, mount check can be skipped in this + case. + We can't remove completely mount check as there are exist cases when we need + to check mnt too. E.g. /dev is mounted with NODEV over /dev and some file is + opened from underlying mount. If mount check is removed, then we will be able + to checkpoint such state, but we will not be able to restore it. 
+ + The correct solution will be to dump/restore whole mount tree with overmounts. + But we can't implement this right now for a number of reasons. + + Bug #84310 + + Signed-off-by: Pavel Emelyanov + +commit bc4769bb4acc7547f4e537b23a093019e78652d7 +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:04 2010 +0300 + + CPT: fix reopen dentries procedure + + Dentries were not reopened correctly during checkpointing and restore. + Two bugs fixed: + 1. In case of huge files (more than 2Gb) dentry_open() returns -EFBIG if + O_LARGEFILE flag is not set. This flag should be used for temporary files + used during checkpointing and restore process. + Bug #99544 + https://bugzilla.sw.ru/show_bug.cgi?id=99544 + + 2. In dump_content_regular() we have the following code: + file = dentry_open(dget(file->f_dentry), + mntget(file->f_vfsmnt), O_RDONLY); + if (IS_ERR(file)) { + cpt_printk_dentry(file->f_dentry, file->f_vfsmnt); + eprintk_ctx("cannot reopen file for read %ld\n", PTR_ERR(file)); + return PTR_ERR(file); + } + + Which results in a kernel oops if dentry_open() returns an error + (e.g. -EFBIG because of bug #99544) + + Bug #99542 + + Signed-off-by: Pavel Emelyanov + +commit 08b8f8ba476ec8e67b2eac74028fa5f4a3586c2f +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:04 2010 +0300 + + CPT: fix save/restore of open requests + + Open requests were saved and restored sometimes incorrectly: + + 1. Family of open request was not saved (commented out) + 2. Restore was broken, would crash because rsk_ops was cleared by memset. + 3. And finally, all the code restoring open requests was skipped. + + Tested with http_load. + + Bug #95113 + http://bugzilla.openvz.org/show_bug.cgi?id=784 + + Signed-off-by: Pavel Emelyanov + +commit 0a6789976c6ff602e11a4f00123ae70b62738f21 +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:04 2010 +0300 + + cpt: add lost dcache_lock protection around __d_path() + + Protect __d_path() call with dcache_lock spinlock. + Protect other checks with env->op_sem semaphore. 
+ + Bug #98833 + + Signed-off-by: Pavel Emelyanov + +commit 22c792c3605e5d0f916308678319e25eb18cf4a6 +Author: Andrey Mirkin +Date: Sat Feb 27 16:58:04 2010 +0300 + + cpt: fix restore of inotify on symlink + + Inside VE file /etc/mtab is a symlink to /proc/mounts. + FreeNX server with KDE creates inotify on /etc/mtab file. + To restore such inotify we need to obtain dentry with path_lookup() and + restore inotify on it. + + Bug #96464 + + Signed-off-by: Pavel Emelyanov + +commit 66a6c3e51c35096b204b8866ee50afe0b1d13d59 +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:58:04 2010 +0300 + + quota: compat layer for compat quota + + This patch implements compatibility quotactls for old quota tools. + + replace: + diff-fs-quotcompat-ia32emul-fix-20050921 + diff-fs-quotcompat-comp-fix-20080710 + diff-fs-quotcompat-xencomp-fix-20080806 + diff-fs-quota-compat-proper-split-20081027 + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 1b04f79cb59f8cd8fb1ca26e19a6a4e8295a088f +Author: Pavel Emelianov +Date: Sat Feb 27 16:58:03 2010 +0300 + + ve: Don't check for CAP_SETVEID - use more ... imagination + + This patch: + The proposed check correctly detects the root in ve0. + However, we lose the ability to create containers with + some fancy tool, that has the CAP_SETVEID capability + *only*, but we don't have such. + + The cap itself is declared to be obsoleted, but there's + no need in rewriting vzctl in a rush - things will still + work. If we'll want to manipulate audit caps from the + vzctl we'll make it via features. + + Overall history: + + Don't ban CAP_AUDIT_XXX capabilities in container to make the + dbus-daemon work. + + After two (maybe tree) days of brain storm me and Den finally + gave birth to this solution. So... + + First of all AUDIT will be banned in container. Since dbus refused + not to set audit caps we don't want it to mess with it in any case. 
+ + Next step is to note, that CAP_AUDIT_CONTROL coincides with the + CAP_VE_ADMIN, which is not that bad (besides, dbus doesn't try to + set this one up) and we leave one alone. + + And finally - the CAP_AUDIT_WRITE, which coincides with the most + delicate one - CAP_SETVEID. The latter one is explicitly dropped + on container start and there's no way to set one (dbus tries this + and fails) back. Simple "don't clear it" solution is too dangerous. + + TO handle *this* case we + 1. replace all checks to capable(CAP_SETVEID) to more complicated, + but still matching ve0's root only; + 2. don't ban the CAP_SETVEID (== CAP_AUDIT_WRITE == the_one_dbus_needs); + 3. remember, that this capability is present on ve startup and thus + we automatically have the CAP_AUDIT_WRITE required by dbus; + 4. carefully handle the case, when we enter container in do_env_create + and try to call fairsched system calls. + + That's it. No fraud, just manual dexterity ;) + + Bug #117448 + + Signed-off-by: Pavel Emelyanov + +commit 153eca7d4bf56bd34e7c5957b1ff8ec331713a0b +Author: Pavel Emelianov +Date: Sat Feb 27 16:58:03 2010 +0300 + + fairsched: Sanitize fairsched manipulations on ve startup + + First of all we won't be able to call them after we fix + capability checks. Second of it is that taking the fairsched + mutex 4 times on startup is an overkill. + + Signed-off-by: Pavel Emelyanov + +commit e2fb9c79fd348a0603c4b881c4e1f179945b55b5 +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:58:03 2010 +0300 + + ms: lutime lchmod syscalls + + Add possibility to change owner/permissions on symbolic links + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 0b7042d24abe59baba84a78e37b95a88624f9308 +Author: Konstantin Khorenko +Date: Sat Feb 27 16:58:02 2010 +0300 + + ve-net: permit changing of netdev's tx_queue_len from inside a CT + + In particular it makes OpenVPN happy. 
+ + Bug #457318 + + Signed-off-by: Konstantin Khorenko + Signed-off-by: Pavel Emelyanov + +commit eb3139203f525babc452556dd5071c73382050dd +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:58:02 2010 +0300 + + venet: Core support for external ip filtering + + Allow VE emit packets with configured source IP address. + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 98ec6de33c046e4f053c6b21152d3e07bead7804 +Author: Marat Stanichenko +Date: Sat Feb 27 16:58:01 2010 +0300 + + vzethdev: stat tx dropped acount + + Veth get_stats() should return the number of tx_dropped packets + + Signed-off-by: Pavel Emelyanov + +commit 57a5848f98e677abefa203f9ad5f1b4bf3d28ace +Author: Vitaliy Gusev +Date: Sat Feb 27 16:58:01 2010 +0300 + + venet: add TSO support in venet and vzethdev + + venet and veth support checksumming and scatter-gather features, but TSO + feature still wasn't added. + + TSO increases bandwidth up to 50% or appreciably decreases CPU usage. + + Approach is the same as for checksumming: + 1. TSO is off by default + 2. For veth: tso can be enabled/disabled in VE or VE0 for + pair {veth in VE, veth in VE0} + 3. For venet: tso can be enabled/disabled only in VE0 (for + all venet devices at once) + + To use this feature just enable: + 1. Tx checksumming: ethtool -K DEVNAME tx on + 2. Scatter-gather: ethtool -K DEVNAME sg on + 3. TSO: ethtool -K DEVNAME ts on + + Some performance info (tested via netperf): + + 1. Traffic VE->VE0 (via venet), TCP STREAM test, message size 32K, socket size 256K: + + TSO off 2300 10^6 bits/s + TSO on 5600 10^6 bits/s + + Notes: + Admins need to set TSO on {venet,veth} only if physical ethernet device supports TSO. 
+ + Signed-off-by: Pavel Emelyanov + +commit f0fe2ba7ff9d91a2bfef1ec95fddbeada5be14d3 +Author: Vasily Averin +Date: Sat Feb 27 16:58:01 2010 +0300 + + ve: Kill not-yet-closed TCP sockets on VE stop herder + + Idea proposed by Alexey Kuznetsov + tcp_v4_kill_ve_sockets() can hangs in loop because NFS can hold some sockets in + host node rpciod/nfsdiod queues. + This patch resets such sockets if it's possible or delays its cleanup. + + changes in 20090429: fixed wrong locking and another xemul@ notices + Bug #429296 + + Signed-off-by: Pavel Emelyanov + +commit 5ad4c74a16b2f9812a1d79287bba724243454ecc +Author: Pavel Emelianov +Date: Sat Feb 27 16:58:00 2010 +0300 + + bc: compat system calls for bc and fairsched + + correct UB_MAXVALUE convertion and wire compat syscalls + + Signed-off-by: Pavel Emelyanov + +commit 20fd4dd54736b40a815ad07d34c4339d5c627f7e +Author: Denis Lunev +Date: Sat Feb 27 16:58:00 2010 +0300 + + ub-dcache: sleep in dput + + ub: dentry->dentry_bc.d_ub is unreliable after the sleep + + d_kill can sleep inside. In this case dentry->dentry_bc.d_ub saved before + is unreliable as we can have dcache accounting on event during sleep. In this + case we'll have saved ub == NULL and OOPS/leak inside dcache_uncharge. + + Another problem here is that we should decrement inuse count on the + dentry appropriately. + + Bug #116095 + + Signed-off-by: Pavel Emelyanov + +commit 76038f85b0523d4d2a48b20b5443a81dee3531e4 +Author: Cyrill Gorcunov +Date: Sat Feb 27 16:58:00 2010 +0300 + + ve-fs: implement "ve-xattr-policy" sysctl entry + + "ve-xattr-policy" sysctl entry allows to control how to react on xattr + change from inside of a container. + + There are three options allowed: + + 0 - accept any xattr modifications (VE0 always and VE by default) + 1 - ignore + 2 - reject + + Note that any other value assigned to "ve-xattr-policy" + leads to "accept" policy being applied without any warning. + + The sysctl is placed at /proc/sys/fs/ve-xattr-policy on HW node. 
+ + http://bugzilla.openvz.org/show_bug.cgi?id=1050 + + Signed-off-by: Pavel Emelyanov + +commit 5cab8bf42b5da73a02d5288951aeeec8fd8b4716 +Author: Marat Stanichenko +Date: Sat Feb 27 16:57:59 2010 +0300 + + ve-kmsg: printk va copy add + + Copy args variable in ve_printk() function + + x64 can corrupt va_list after return from the called function. + + Bug #440939 + + Signed-off-by: Pavel Emelyanov + +commit b55fc66f70948758037a4639e8a63663792ec1f5 +Author: Vitaliy Gusev +Date: Sat Feb 27 16:57:59 2010 +0300 + + ve-kmsg: printk lockdep fixup + + printk: fix lockdep warnings if kernel compiled with CONFIG_LOCKDEP + + vprintk() to VE causes: + + ===================================== + [ BUG: lock held at task exit time! ] + ------------------------------------- + iptables/8203 is exiting with locks still held! + 1 lock held by iptables/8203: + #0: (sk_lock-AF_INET){--..}, at: [] ip_setsockopt+0x61/0xa0 + + stack backtrace: + + Call Trace: + [] show_trace+0xca/0x3b0 + [] dump_stack+0x15/0x20 + [] debug_check_no_locks_held+0x89/0xa0 + [] do_exit+0xe2e/0xe80 + [] sys_exit_group+0x0/0x20 + [<0000000000000001>] + + Note: to reproduce this you can type in VE: + iptables -A INPUT -m tcp --dport 22 -j DROP + + Signed-off-by: Vitaliy Gusev + Signed-off-by: Pavel Emelyanov + +commit 84ac295d2315ecf649e3910735d81e8d217396c3 +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:58 2010 +0300 + + ve-proc: mangle mounts devname harder + + mounts: show /dev/xxx devices near ve root mounts, rather than just xxx + Required for fixing autofs in rhel5 container: + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 454ad87b41380655cb31a85f682ddb8289e8e1f9 +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:58 2010 +0300 + + ve-sysctl: randomize_va_space + + virtualize sysctl kernel.randomize_va_space + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit a44c3498bcf70065a85236b7daa77fe0320313f2 +Author: Konstantin Khlebnikov 
+Date: Sat Feb 27 16:57:58 2010 +0300 + + ve-sysctl: add proc_dointvec_ve helper + + add generic method for proc access to per ve int values. + + extra1 field of ctl_table contains data field offset from ve_struct begin. + without CONFIG_VE use address from .data field. + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 34e6684b531637ad4fd34502d32f6e3c74e2dac6 +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:57 2010 +0300 + + ve: drop oom immunity at enter + + At CT enter switch to default OOM adjustment level if task is OOM-immune. + + This is a very bad idea to have OOM-unkillable tasks inside container, + because all forked tasks inherit this setting. + + Proc interface for changing OOM adjustment (/proc//oom_adj) + allready restricted in CT by diff-ve-oom-adjust-20070604. + + On some systems sshd got OOM protection at start and not drop it after fork. + (example: ssh root@HN -> vzctl enter -> restart apache -- apache now OOM immune) + (example from xemul@: ssh root@HN vzctl start - VE is now OOM immune) + + http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=480020 + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit c7cf5c388378abf4d6e8e2e18c6c815eccab4fd7 +Author: Pavel Emelianov +Date: Sat Feb 27 16:57:57 2010 +0300 + + ms: ext4 use get host + + Force ext4 page fault handlers use ->get_host callbacks + This is required not to use vzfs file in ->page_mkwrite callback. + Bug #454968 + + Signed-off-by: Pavel Emelyanov + +commit a7de88181858ae8f9ec51cee11ae7f955e76430d +Author: Denis Lunev +Date: Sat Feb 27 16:57:57 2010 +0300 + + nfs: disable nfs-v2 + + nfs: disable NFSv2 as it is broken + According to Alexey: "who is going to turn v2 on, having + a v3, which works better, nearby?" + + Bug #114720 + + Signed-off-by: Denis V. 
Lunev + Signed-off-by: Pavel Emelyanov + +commit 7805f36534f20e530fb84e83a360993ec78f3bb6 +Author: Denis Lunev +Date: Sat Feb 27 16:57:56 2010 +0300 + + ve: vfs sillyrename + + i_nlink count on private inodes after silly rename is 1. So, virtual inodes + gain i_nlink == 1 and remain in unused_list instead of being cleaned. + + Bug #114672 #112999 + + Signed-off-by: Denis V. Lunev + Signed-off-by: Pavel Emelyanov + +commit d252a93b32d6d251fcc73863b75b91edaa801b95 +Author: Andrey Mirkin +Date: Sat Feb 27 16:57:56 2010 +0300 + + mm mmap zero length kludge + + Return -EINVAL in case of zero length file to all applications except + rpm. For (legacy) rpm address will be returned. + + Such hack is introduced just not to break compatibility with old + tools, sorry :( + + Bug #74964 + + Signed-off-by: Pavel Emelyanov + +commit 437d113149802cb91254246f29134e3ade55e411 +Author: Alexey Kuznetsov +Date: Sat Feb 27 16:57:56 2010 +0300 + + nfs: use file private macro + + Minor fix to nfs, which allows to use vzfs over nfs mounts. + + It survives fsstress test. I think normal vzfs tests can be started + asap to catch the points of possible misbehaviour.
+ + Signed-off-by: Pavel Emelyanov + +commit 3c07eb700d9bbe7fd6b7dcf52103faf58ef4a035 +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:55 2010 +0300 + + vzdq: cleanup fake qmblk destroy + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 8d622018ad2a3d025576578c0838c18ebfd3fdab +Author: Konstantin Ozerkov +Date: Sat Feb 27 16:57:55 2010 +0300 + + vzdq: qmblk dq_sem to mutex + + vzquota: replace quota master block semaphore with mutex + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 769b3bbe8d7859d168b42daa35720f12372e10db +Author: Konstantin Ozerkov +Date: Sat Feb 27 16:57:54 2010 +0300 + + vzdq: vz_quota sem to mutex + + vzquota: replace master lock semaphore with mutex + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 085883fb2366ae47c84fb18aa50f832e93ab56aa +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:54 2010 +0300 + + vzdq: vzaquota proc nlink + + Produce correct nlink count for /proc/vz/vzaquota + + Use the count of mountpoints accessible from VE as an upper estimate for + the count of subdirectories inside /proc/vz/vzaquota. + Concept stolen from vzdq_aquotd_readdir. + + Disable enumeration in VE0 for performance reason (like in _readdir and _lookup) + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit b9a8ce596cba9f5161769ca0408c71f8e6a059c7 +Author: Alexey Kuznetsov +Date: Sat Feb 27 16:57:54 2010 +0300 + + vzdq: swap noquota + + swap_inode did not do anything for inodes not covered by vzquota, + which was wrong. F.e. mkdir, which creates inode with i_blocks!=0, + triggered message "detached inode not in creation". + + Signed-off-by: Pavel Emelyanov + +commit 20d11fba2ae882456b343ae78f466e27cc19d000 +Author: Alexey Kuznetsov +Date: Sat Feb 27 16:57:54 2010 +0300 + + vzdq: nfs support + + It works differently and requires different interface.
+ Block accounting and quota check are separate now, we account + without checks and check for space in places, where an operation + could allocate more space. + + Chunk-by-chunk: + + 1. Added new operation - swap_inode. Normally, virtual inode + is created/accounted/checked simultaneously. It is impossible for NFS. + So, each operation creating a new inode starts from allocating + space in quota using a dummy inode. If the operation succeeds and real + inode is created, we swap quota accounting information. + TODO: optimize out dummy inode. All that we need is qlnk. + + 2. DQUOT_CHECK_SPACE() to check that quota is not full. + + 3. DQUOT_SYNC_BLOCKS() to resync i_blocks obtained from NFS server + with our accounting. + + 4. is_nfs_root(). NFS does not have root inode. Instead each mount + has pointer to a disconnected inode. vzquota has to undestand this. + + Signed-off-by: Pavel Emelyanov + +commit fd4f6b28860495f939f10abfaec8f255797a4fe8 +Author: Alexey Kuznetsov +Date: Sat Feb 27 16:57:53 2010 +0300 + + vzdq: fix oops is inode_drop_call + + I suppose this happens when vzcache moves to template a file, + which was not under vzquota. + Bug #97782 + + Signed-off-by: Pavel Emelyanov + +commit 71208971e69657168517194564e045781b054526 +Author: Denis Lunev +Date: Sat Feb 27 16:57:53 2010 +0300 + + simfs: statfs on root + + Do not use s_root dentry of underlying for statfs + The real problem is that s_root on the NFS super block is a crap. + Unfortunately, the original dentry (which is asked to be statfs-ed) + is not available at this point. The only visible solution for this + is to use the dentry to which simfs is point to. + + Signed-off-by: Denis V. 
Lunev + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 11d902b2933c3292b8e1305e38e37c6419cb9cf2 +Author: Konstant Khorenko +Date: Sat Feb 27 16:57:52 2010 +0300 + + virtinfo hook in daemonize + + #427726 + + Signed-off-by: Pavel Emelyanov + +commit 95a5273372efb164d0b3a4ab6eefca8b671d13e4 +Author: Andrey Mirkin +Date: Sat Feb 27 16:57:52 2010 +0300 + + virtinfo add cpttest + + Add VIRTINFO_SCP_TEST event to virtinfo calls + + This will be responsible for checking CPT features + during checkpoint/restore process. + + Signed-off-by: Pavel Emelyanov + +commit e2e5984d43c91b3aa674123af73849e9643bffb3 +Author: Konstantin Khorenko +Date: Sat Feb 27 16:57:52 2010 +0300 + + ve-proc: fake sysrq trigger + + Add dummy /proc/sysrq-trigger file inside a Container + + Oracle 11g Release 1 RAC tries to open one and refuses to start on fail. + Writing to the file inside a CT leads to nothing, first 10 writes are logged. + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit fc17c7e942ccbcf6909ef9fdb7c4f170acaf1d72 +Author: Vitaliy Gusev +Date: Sat Feb 27 16:57:51 2010 +0300 + + ve-proc: add devices + + Proc: add empty /proc/devices to CT + + Signed-off-by: Pavel Emelyanov + +commit 3cfd7ac2a553a88af0053a59ac9870f1ce82760f +Author: Denis Lunev +Date: Sat Feb 27 16:57:51 2010 +0300 + + ve: decrease ve_struct size in case of huge nr_cpus + + kstat_lat_pcpu_struct contains array of NR_CPUS elements. + Replace it with alloc_percpu data which helps to keep ve_struct + relatively small and prevents allocation fails of huge order. + + Signed-off-by: Pavel Emelyanov + +commit 010370ec6b62618648c8b8882d3887e5e4073fc8 +Author: Pavel Emelyanov +Date: Mon Apr 26 17:22:10 2010 +0400 + + percpu: Return ve0/ub0 percpu-s back + + With the DEFINE_PER_CPU and init-s made in proper place we can + use them as alloc_percpu-ed ones. 
+ + Signed-off-by: Pavel Emelyanov + +commit 541c4b4da4f9c522593f3fd622e5d20fa6a6b294 +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:51 2010 +0300 + + ve: fix fs umount at ct stop + + Don't umount some mount multiple times on ct stop + + umount_tree kill argument must be empty list, + otherwise it can detach each vfsmount multiple times and + produce negative d_mounted count on mountpoint dentry. + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 543578c2947332cda5aea3b195c4d6a80a3d317b +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:50 2010 +0300 + + ve: ptys idr mem leak + + Plug minor memory leak in idr_layer_cache slab on ve start-stop + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 965adae71aaa774796aeac8087806b77bbb0709f +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:50 2010 +0300 + + ve: tmpfs virtualize default size + + set default size to half of physpages from meminfo + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 79c0a2ab51af39b665f7e8162c26c5573eca1872 +Author: Denis Lunev +Date: Sat Feb 27 16:57:50 2010 +0300 + + ve: meminfo dont use subub + + Get parent UB instead of sub-group one to calculate usage + + Signed-off-by: Denis V. Lunev + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 223f044cc32146df3a5f6dc61aab2bd053277de8 +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:50 2010 +0300 + + ve: move veinfo to vzmon + + Since some people wish to run openvz w/o venet device, but + vzlist tool relies on /proc/vz/veinfo file presence, vzmon + module is a better place for this file. + + http://bugzilla.openvz.org/show_bug.cgi?id=394 + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit f267ef18a62f50bd5293a876e43b89467c8253f4 +Author: Pavel Emelianov +Date: Sat Feb 27 16:57:49 2010 +0300 + + ve: virtualize binfmt-misc + + Nothing special. 
SUN jdk complains since can't use binfmt. + Not serious and java surely works fine w/o it, but just to + make it and its users happy let's virtualize binfmt_misc. + + Signed-off-by: Pavel Emelianov + Signed-off-by: Pavel Emelyanov + +commit 1ff4faada1dabfdc4592e2824ce53a357373c83e +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:49 2010 +0300 + + bc: pb hash cookie + + add random hash cookie to ub to use in pb_hash instead of non-random ub_uid + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 31f588463c8294df47ff6357829b286abd580782 +Author: Marat Stanichenko +Date: Sat Feb 27 16:57:49 2010 +0300 + + bc: uncharge files harder + + There is a chance when we do not start uncharging because + ub_barrier_farnr() is not hit for UB_NUMFILE and ub_barrier_farsz() + is not hit for UB_KMEMSIZE (SLM for example set ubc barrier to a + huge value). + + This fact can lead us to the situation when two tasks are able + to consume all of UB_NUMFILE and UB_KMEMSIZE despite they close + opened files. + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 76cd7c1686940c2eeef94926e978b8893f9bb9e2 +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:48 2010 +0300 + + ve: show proc swaps in ct + + Fill the size/used values with the ones from the meminfo virtinfo notifier. + + Show one fake swap partition (/dev/null) with the same size/used as in + /proc/meminfo. If --meminfo == none show overall swap statisctics from HN. + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit bf8c54dbd1c7b09abdab952da58e1f2c8f439ea4 +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:48 2010 +0300 + + ve: mangle swapinfo + + Fill swap size/usage with data from UB_SWAPPAGES in meminfo notifier. + Don't show swap if the limit is unlimited (default state). 
+ + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 1c2b5b4b1cbaafa707cb56da94dd5099dbdcc73d +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:48 2010 +0300 + + cpt: bc resources array + + restore only bc resources really presented in cpt image. + + store UB_RESOURCES in cpt_beancounter_image while checkpointing. + (leave all new added resources with default limits filled at bc alloc) + + change cpt_content of cpt_beancounter_image to CPT_CONTENT_ARRAY to detect + structure version without bumping cpt image version, because in old images + __cpt_pad field (reused for cpt_ub_resources) is uninitialized. + + add missed error handling inside rst_undump_ubc -- toss errors + from restore_one_bc to higher level. + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 7b8bbb51527e58abadcd0eeb3e7103ba4048a57f +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:47 2010 +0300 + + bc-swap: add swappages bc resource + + The limit value will be used as configured CT swap size to show + in /proc/swaps and /proc/meminfo. Default is UB_MAXVALUE + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit e7416bee163fb262076d9b7dfa93c0dbf304891d +Author: Pavel Emelianov +Date: Sat Feb 27 16:57:47 2010 +0300 + + bc-rss: show how much page beancounters each bc has + + Essentially, this is the per-UB rss value calculated + (unlike physpages and privvmpages) w/o taking sharing + into account. + + With these statistics (shown via /proc/bc/XXX/vmaux:rss) + we can evaluate the portion of pages, that are shared + across beancounters (i.e.
CTs) like this: + + (\sum (bc.rss + bc.tmpfs_respages) - \sum (bc.physpages)) / + (\sum (bc.rss + bc.tmpfs_respages)) + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit b03577fcbea66508aca033f9c9c78bc060c02c24 +Author: Denis Lunev +Date: Sat Feb 27 16:57:47 2010 +0300 + + bc-ioacct: define page_io_mark in right place + + fix compilation without CONFIG_BC_IO_ACCOUNTING + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 35fe6d0b31e36227f572550dff53154491760fb1 +Author: Marat Stanichenko +Date: Sat Feb 27 16:57:47 2010 +0300 + + bc-ioprio: sys_ioprio_set lost unlock + + sys_ioprio_set() may exit without releasing tasklist_lock. Fix it. + + Acked-by: Pavel Emelyanov + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 2cba7730c015206352563731d9f25cd027bd88f5 +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:45 2010 +0300 + + ve-proc: fix root entry nlink + + * Add entries from local tree, similar as in proc_getattr; + * Use per-ve process count for VE's root, rather than the + total number of processes in the system. + + All of the above is an upper estimation, that is perfectly + fine with 'find' utlity. + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit a2a22de6b8939570239c99973d3be7fb2eb4e70a +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:45 2010 +0300 + + ve-proc: fix nlink in getattr + + Fix nlink correction in proc_getattr + and change it right in the stat buffer insted of inode nlink + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit f665309226859e081bcae5c0c7fd3a3bdd9ecfbc +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:45 2010 +0300 + + bc-proc: bc nlink count + + Override getattr callback on /proc/bc and ubc entries to get correct nlink. 
+ + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 85051b1c71ad37949ef448ff8ddb342b75d706b0 +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:45 2010 +0300 + + bc-proc: add bc and sub-bc counters + + Add counter of ubc, protected with ub_hash_lock. + Needed for correct proc n_link calculation. + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit d5ee7014d3f4995249cdadf3d00d1be778a3b10a +Author: Pavel Emelianov +Date: Sat Feb 27 16:57:44 2010 +0300 + + bc-proc: fix sub-bc inode number + + fix subbeancounter inode number calculations in /proc/bc + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit b93ef081a586e08e226273599bcf7800907c731b +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:44 2010 +0300 + + simfs: compilation without quota + + fix simfs compilation if CONFIG_QUOTA=n + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit 4fa1e482478bcde0552e9a97db1ddca620ebbe05 +Author: Konstantin Khlebnikov +Date: Sat Feb 27 16:57:43 2010 +0300 + + sysrq: smp nmi show regs v2 + + Rework nmi show regs, make it clean and tollerable to nmi ipi losts. + + Signed-off-by: Konstantin Khlebnikov + Signed-off-by: Pavel Emelyanov + +commit cab0d970b18692b61e62e2095392e63c5097bf29 +Author: Pavel Emelyanov +Date: Mon Apr 26 15:09:43 2010 +0400 + + sysrq: revert nmi ipi callback + + next patch will implement this in less intrusive manner, + and without deadlocks at nmi ipi loss + + Signed-off-by: Pavel Emelyanov + commit 6b5607eeec54fcef60c25fa7a72bc30f69446933 Author: Pavel Emelyanov Date: Fri Apr 16 12:34:01 2010 +0400 @@ -2799,14 +4590,14 @@ index 0000000..9856a2b +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/Makefile b/Makefile -index 78611d9..6c58263 100644 +index 573578f..12ba193 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 32 EXTRAVERSION = -+VZVERSION = atkov ++VZVERSION = avdeyev NAME = Man-Eating Seals of Antiquity # *DOCUMENTATION* @@ -2849,7 +4640,7 @@ index 4fdb669..1334638 100644 + +source "kernel/bc/Kconfig" diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S -index 5294d84..cd218a8 100644 +index 5294d84..a920d42 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -617,7 +617,7 @@ ia32_sys_call_table: @@ -2870,6 +4661,32 @@ index 5294d84..cd218a8 100644 .quad quiet_ni_syscall /* query_module */ .quad sys_poll .quad compat_sys_nfsservctl +@@ -841,4 +841,25 @@ ia32_sys_call_table: + .quad compat_sys_pwritev + .quad compat_sys_rt_tgsigqueueinfo /* 335 */ + .quad sys_perf_event_open ++ .rept 500-(.-ia32_sys_call_table)/8 ++ .quad sys_ni_syscall ++ .endr ++ .quad sys_fairsched_mknod /* 500 */ ++ .quad sys_fairsched_rmnod ++ .quad sys_fairsched_chwt ++ .quad sys_fairsched_mvpr ++ .quad sys_fairsched_rate ++ .quad sys_fairsched_vcpus /* 505 */ ++ .quad sys_ni_syscall ++ .quad sys_ni_syscall ++ .quad sys_ni_syscall ++ .quad sys_ni_syscall ++ .quad sys_getluid /* 510 */ ++ .quad sys_setluid ++ .quad compat_sys_setublimit ++ .quad compat_sys_ubstat ++ .quad sys_ni_syscall ++ .quad sys_ni_syscall /* 515 */ ++ .quad sys_lchmod ++ .quad compat_sys_lutime + ia32_syscall_end: diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 016218c..f368a9a 100644 --- a/arch/x86/ia32/sys_ia32.c @@ -2922,21 +4739,6 @@ index 8ac9d9a..6f2fd90 100644 #define compat_arch_setup_additional_pages syscall32_setup_pages extern unsigned long arch_randomize_brk(struct mm_struct *mm); -diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h -index 139d4c1..5fd7d01 100644 ---- a/arch/x86/include/asm/nmi.h -+++ b/arch/x86/include/asm/nmi.h -@@ -25,6 +25,10 @@ extern void 
release_perfctr_nmi(unsigned int); - extern int reserve_evntsel_nmi(unsigned int); - extern void release_evntsel_nmi(unsigned int); - -+typedef int (*nmi_callback_t)(struct pt_regs *regs, int cpu); -+void set_nmi_ipi_callback(nmi_callback_t callback); -+void unset_nmi_ipi_callback(void); -+ - extern void setup_apic_nmi_watchdog(void *); - extern void stop_apic_nmi_watchdog(void *); - extern void disable_timer_nmi_watchdog(void); diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 271de94..e255a04 100644 --- a/arch/x86/include/asm/pgalloc.h @@ -3017,10 +4819,10 @@ index c042729..6e7f232 100644 #endif rdtscll(ret); diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h -index 6fb3c20..e7a2442 100644 +index 6fb3c20..c870519 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h -@@ -342,10 +342,20 @@ +@@ -342,10 +342,22 @@ #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 #define __NR_perf_event_open 336 @@ -3034,6 +4836,8 @@ index 6fb3c20..e7a2442 100644 +#define __NR_setluid 511 +#define __NR_setublimit 512 +#define __NR_ubstat 513 ++#define __NR_lchmod 516 ++#define __NR_lutime 517 #ifdef __KERNEL__ @@ -3043,10 +4847,10 @@ index 6fb3c20..e7a2442 100644 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h -index 8d3ad0a..dc19a9c 100644 +index 8d3ad0a..15bc00e 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h -@@ -661,6 +661,26 @@ __SYSCALL(__NR_pwritev, sys_pwritev) +@@ -661,6 +661,30 @@ __SYSCALL(__NR_pwritev, sys_pwritev) __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) #define __NR_perf_event_open 298 __SYSCALL(__NR_perf_event_open, sys_perf_event_open) @@ -3070,10 +4874,14 @@ index 8d3ad0a..dc19a9c 100644 +__SYSCALL(__NR_fairsched_mvpr, sys_fairsched_mvpr) +#define __NR_fairsched_rate 508 +__SYSCALL(__NR_fairsched_rate, 
sys_fairsched_rate) ++#define __NR_lchmod 509 ++__SYSCALL(__NR_lchmod, sys_lchmod) ++#define __NR_lutime 510 ++__SYSCALL(__NR_lutime, sys_lutime) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR -@@ -685,6 +705,7 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open) +@@ -685,6 +709,7 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __ARCH_WANT_SYS_RT_SIGSUSPEND #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_COMPAT_SYS_TIME @@ -3094,32 +4902,10 @@ index 9064052..2cf267b 100644 * Given a pointer to the vDSO image, find the pointer to VDSO32_name * as that symbol is defined in the vDSO sources or linker script. diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c -index 7ff61d6..e5c7f78 100644 +index 7ff61d6..ee58297 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c -@@ -386,6 +386,21 @@ void touch_nmi_watchdog(void) - } - EXPORT_SYMBOL(touch_nmi_watchdog); - -+void smp_show_regs(struct pt_regs *regs, void *info) -+{ -+ static DEFINE_SPINLOCK(show_regs_lock); -+ -+ if (regs == NULL) -+ return; -+ -+ spin_lock(&show_regs_lock); -+ bust_spinlocks(1); -+ printk("----------- IPI show regs -----------"); -+ show_regs(regs); -+ bust_spinlocks(0); -+ spin_unlock(&show_regs_lock); -+} -+ - notrace __kprobes int - nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) - { -@@ -435,10 +450,10 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) +@@ -435,10 +435,10 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... @@ -3132,40 +4918,30 @@ index 7ff61d6..e5c7f78 100644 /* * die_nmi will return ONLY if NOTIFY_STOP happens.. 
*/ +diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c +index bb62b3e..ce8a3f5 100644 +--- a/arch/x86/kernel/cpu/transmeta.c ++++ b/arch/x86/kernel/cpu/transmeta.c +@@ -1,6 +1,7 @@ + #include + #include + #include ++#include + #include + #include + #include "cpu.h" diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c -index 2d8a371..155d6c6 100644 +index 2d8a371..0d1ce00 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c -@@ -303,6 +303,21 @@ void die(const char *str, struct pt_regs *regs, long err) - oops_end(flags, regs, sig); - } - -+/* -+ * Voyager doesn't implement these -+ */ -+void __attribute__((weak)) smp_show_regs(struct pt_regs *regs, void *info) -+{ -+} -+ -+#ifdef CONFIG_SMP -+int __attribute__((weak)) -+smp_nmi_call_function(smp_nmi_function func, void *info, int wait) -+{ -+ return 0; -+} -+#endif -+ - void notrace __kprobes - die_nmi(char *str, struct pt_regs *regs, int do_panic) - { -@@ -319,6 +334,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) - printk(KERN_EMERG "%s", str); +@@ -320,6 +320,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); -+ smp_nmi_call_function(smp_show_regs, NULL, 1); show_registers(regs); ++ nmi_show_regs(regs, 1); oops_end(flags, regs, 0); if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index f7dd2a7..24c02de 100644 --- a/arch/x86/kernel/dumpstack_32.c @@ -3506,107 +5282,21 @@ index 6a44a76..6ecea3a 100644 if (syscall_get_nr(current, regs) >= 0) { /* Restart the system call - no handlers present */ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c -index ec1de97..b74f73d 100644 +index ec1de97..29df6fd 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c -@@ -22,6 +22,7 @@ - #include - #include - -+#include - #include - #include - #include -@@ 
-146,6 +147,89 @@ void native_send_call_func_ipi(const struct cpumask *mask) - free_cpumask_var(allbutself); +@@ -221,6 +221,11 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) + irq_exit(); } -+static DEFINE_SPINLOCK(nmi_call_lock); -+static struct nmi_call_data_struct { -+ smp_nmi_function func; -+ void *info; -+ atomic_t started; -+ atomic_t finished; -+ cpumask_t cpus_called; -+ int wait; -+} *nmi_call_data; -+ -+static int smp_nmi_callback(struct pt_regs *regs, int cpu) ++void send_nmi_ipi_allbutself(void) +{ -+ smp_nmi_function func; -+ void *info; -+ int wait; -+ -+ func = nmi_call_data->func; -+ info = nmi_call_data->info; -+ wait = nmi_call_data->wait; -+ ack_APIC_irq(); -+ /* prevent from calling func() multiple times */ -+ if (cpu_test_and_set(cpu, nmi_call_data->cpus_called)) -+ return 0; -+ /* -+ * notify initiating CPU that I've grabbed the data and am -+ * about to execute the function -+ */ -+ mb(); -+ atomic_inc(&nmi_call_data->started); -+ /* at this point the nmi_call_data structure is out of scope */ -+ irq_enter(); -+ func(regs, info); -+ irq_exit(); -+ if (wait) -+ atomic_inc(&nmi_call_data->finished); -+ -+ return 1; ++ apic->send_IPI_allbutself(NMI_VECTOR); +} + -+/* -+ * This function tries to call func(regs, info) on each cpu. -+ * Func must be fast and non-blocking. -+ * May be called with disabled interrupts and from any context. 
-+ */ -+int smp_nmi_call_function(smp_nmi_function func, void *info, int wait) -+{ -+ struct nmi_call_data_struct data; -+ int cpus; -+ -+ cpus = num_online_cpus() - 1; -+ if (!cpus) -+ return 0; -+ -+ data.func = func; -+ data.info = info; -+ data.wait = wait; -+ atomic_set(&data.started, 0); -+ atomic_set(&data.finished, 0); -+ cpus_clear(data.cpus_called); -+ /* prevent this cpu from calling func if NMI happens */ -+ cpu_set(smp_processor_id(), data.cpus_called); -+ -+ if (!spin_trylock(&nmi_call_lock)) -+ return -1; -+ -+ nmi_call_data = &data; -+ set_nmi_ipi_callback(smp_nmi_callback); -+ mb(); -+ -+ /* Send a message to all other CPUs and wait for them to respond */ -+ apic->send_IPI_allbutself(APIC_DM_NMI); -+ while (atomic_read(&data.started) != cpus) -+ barrier(); -+ -+ unset_nmi_ipi_callback(); -+ if (wait) -+ while (atomic_read(&data.finished) != cpus) -+ barrier(); -+ spin_unlock(&nmi_call_lock); -+ -+ return 0; -+} -+ - /* - * this function calls the 'stop' function on all other CPUs in the system. 
- */ + struct smp_ops smp_ops = { + .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, + .smp_prepare_cpus = native_smp_prepare_cpus, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 28e963d..54a0ecf 100644 --- a/arch/x86/kernel/smpboot.c @@ -3625,10 +5315,10 @@ index 28e963d..54a0ecf 100644 start_ip = setup_trampoline(); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S -index 76d70a4..0defa11 100644 +index 76d70a4..477e261 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S -@@ -336,3 +336,22 @@ ENTRY(sys_call_table) +@@ -336,3 +336,24 @@ ENTRY(sys_call_table) .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open @@ -3650,52 +5340,23 @@ index 76d70a4..0defa11 100644 + .long sys_setublimit + .long sys_ubstat + .long sys_ni_syscall -+ .long sys_ni_syscall ++ .long sys_ni_syscall /* 515 */ ++ .long sys_lchmod ++ .long sys_lutime diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c -index 7e37dce..e1ceccb 100644 +index 7e37dce..d1fd061 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c -@@ -385,6 +385,13 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); - } - -+static int dummy_nmi_callback(struct pt_regs *regs, int cpu) -+{ -+ return 0; -+} -+ -+static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback; -+ - static notrace __kprobes void default_do_nmi(struct pt_regs *regs) - { - unsigned char reason = 0; -@@ -439,12 +446,24 @@ do_nmi(struct pt_regs *regs, long error_code) - - inc_irq_stat(__nmi_count); - -- if (!ignore_nmis) -- default_do_nmi(regs); -+ if (!ignore_nmis) { -+ if (!nmi_ipi_callback(regs, smp_processor_id())) -+ default_do_nmi(regs); -+ } - - nmi_exit(); - } - -+void set_nmi_ipi_callback(nmi_callback_t callback) -+{ -+ nmi_ipi_callback = callback; -+} -+ -+void unset_nmi_ipi_callback(void) -+{ -+ nmi_ipi_callback = 
dummy_nmi_callback; -+} -+ - void stop_nmi(void) - { - acpi_nmi_disable(); +@@ -405,7 +405,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) + * Ok, so this is none of the documented NMI sources, + * so it must be the NMI watchdog. + */ +- if (nmi_watchdog_tick(regs, reason)) ++ if (nmi_watchdog_tick(regs, reason) + ++ do_nmi_show_regs(regs, cpu)) + return; + if (!do_nmi_callback(regs, cpu)) + unknown_nmi_error(reason, regs); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index f379309..6c44e77 100644 --- a/arch/x86/kernel/tsc_sync.c @@ -4762,34 +6423,32 @@ index 62f282e..2dd6714 100644 #else diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c -index 44203ff..2f26e57 100644 +index 44203ff..4288c77 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c -@@ -37,6 +37,8 @@ +@@ -37,7 +37,10 @@ #include #include #include +#include +#include #include ++#include #include -@@ -250,8 +252,14 @@ static struct sysrq_key_op sysrq_showallcpus_op = { + #include +@@ -250,8 +253,8 @@ static struct sysrq_key_op sysrq_showallcpus_op = { static void sysrq_handle_showregs(int key, struct tty_struct *tty) { struct pt_regs *regs = get_irq_regs(); +- if (regs) +- show_regs(regs); + -+ bust_spinlocks(1); - if (regs) - show_regs(regs); -+ bust_spinlocks(0); -+#if defined(__i386__) || defined(__x86_64__) -+ smp_nmi_call_function(smp_show_regs, NULL, 1); -+#endif ++ nmi_show_regs(regs, 0); perf_event_print_debug(); } static struct sysrq_key_op sysrq_showregs_op = { -@@ -303,6 +311,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = { +@@ -303,6 +306,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = { static void sysrq_handle_showmem(int key, struct tty_struct *tty) { show_mem(); @@ -4797,7 +6456,7 @@ index 44203ff..2f26e57 100644 } static struct sysrq_key_op sysrq_showmem_op = { .handler = sysrq_handle_showmem, -@@ -318,7 +327,7 @@ static void send_sig_all(int sig) +@@ -318,7 +322,7 @@ static void send_sig_all(int sig) { struct 
task_struct *p; @@ -4806,7 +6465,7 @@ index 44203ff..2f26e57 100644 if (p->mm && !is_global_init(p)) /* Not swapper, init nor kernel thread */ force_sig(sig, p); -@@ -394,7 +403,267 @@ static struct sysrq_key_op sysrq_unrt_op = { +@@ -394,7 +398,267 @@ static struct sysrq_key_op sysrq_unrt_op = { /* Key Operations table and lock */ static DEFINE_SPINLOCK(sysrq_key_table_lock); @@ -5075,7 +6734,7 @@ index 44203ff..2f26e57 100644 &sysrq_loglevel_op, /* 0 */ &sysrq_loglevel_op, /* 1 */ &sysrq_loglevel_op, /* 2 */ -@@ -417,7 +686,11 @@ static struct sysrq_key_op *sysrq_key_table[36] = { +@@ -417,7 +681,11 @@ static struct sysrq_key_op *sysrq_key_table[36] = { &sysrq_term_op, /* e */ &sysrq_moom_op, /* f */ /* g: May be registered for the kernel debugger */ @@ -5087,7 +6746,7 @@ index 44203ff..2f26e57 100644 NULL, /* h - reserved for help */ &sysrq_kill_op, /* i */ #ifdef CONFIG_BLOCK -@@ -449,8 +722,11 @@ static struct sysrq_key_op *sysrq_key_table[36] = { +@@ -449,8 +717,11 @@ static struct sysrq_key_op *sysrq_key_table[36] = { /* y: May be registered on sparc64 for global register dump */ NULL, /* y */ &sysrq_ftrace_dump_op, /* z */ @@ -5099,7 +6758,7 @@ index 44203ff..2f26e57 100644 /* key2index calculation, -1 on invalid index */ static int sysrq_key_table_key2index(int key) { -@@ -460,6 +736,10 @@ static int sysrq_key_table_key2index(int key) +@@ -460,6 +731,10 @@ static int sysrq_key_table_key2index(int key) retval = key - '0'; else if ((key >= 'a') && (key <= 'z')) retval = key + 10 - 'a'; @@ -5110,7 +6769,7 @@ index 44203ff..2f26e57 100644 else retval = -1; return retval; -@@ -470,21 +750,21 @@ static int sysrq_key_table_key2index(int key) +@@ -470,21 +745,21 @@ static int sysrq_key_table_key2index(int key) */ struct sysrq_key_op *__sysrq_get_key_op(int key) { @@ -5139,7 +6798,7 @@ index 44203ff..2f26e57 100644 } /* -@@ -507,25 +787,25 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) +@@ -507,25 +782,25 @@ void __handle_sysrq(int key, 
struct tty_struct *tty, int check_mask) */ orig_log_level = console_loglevel; console_loglevel = 7; @@ -5171,7 +6830,7 @@ index 44203ff..2f26e57 100644 if (sysrq_key_table[i]) { int j; -@@ -555,7 +835,7 @@ void handle_sysrq(int key, struct tty_struct *tty) +@@ -555,7 +830,7 @@ void handle_sysrq(int key, struct tty_struct *tty) EXPORT_SYMBOL(handle_sysrq); static int __sysrq_swap_key_ops(int key, struct sysrq_key_op *insert_op_p, @@ -5180,9 +6839,13 @@ index 44203ff..2f26e57 100644 { int retval; -@@ -592,11 +872,16 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, +@@ -591,12 +866,29 @@ EXPORT_SYMBOL(unregister_sysrq_key); + static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { ++ struct ve_struct *cur = get_exec_env(); ++ static int pnum = 10; ++ if (count) { - char c; + int i, cnt; @@ -5195,13 +6858,31 @@ index 44203ff..2f26e57 100644 - __handle_sysrq(c, NULL, 0); + + -+ for (i = 0; i < cnt && c[i] != '\n'; i++) ++ for (i = 0; i < cnt && c[i] != '\n'; i++) { ++ if (!ve_is_super(cur)) { ++ if (!pnum) ++ continue; ++ printk("SysRq: CT#%u sent '%c' magic key.\n", ++ cur->veid, c[i]); ++ pnum--; ++ continue; ++ } + __handle_sysrq(c[i], NULL, 0); ++ } } return count; } +@@ -607,7 +899,7 @@ static const struct file_operations proc_sysrq_trigger_operations = { + + static int __init sysrq_init(void) + { +- proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations); ++ proc_create("sysrq-trigger", S_IWUSR, &glob_proc_root, &proc_sysrq_trigger_operations); + return 0; + } + module_init(sysrq_init); diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c -index 05cab2c..f973a9f 100644 +index 53ffcfc..2571f59 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -96,6 +96,8 @@ @@ -5287,7 +6968,7 @@ index 05cab2c..f973a9f 100644 { struct tty_struct *tty; int retval; -@@ -1705,7 +1729,7 @@ void tty_release_dev(struct file *filp) +@@ -1707,7 +1731,7 @@ void 
tty_release_dev(struct file *filp) static int __tty_open(struct inode *inode, struct file *filp) { @@ -5296,7 +6977,7 @@ index 05cab2c..f973a9f 100644 int noctty, retval; struct tty_driver *driver; int index; -@@ -1729,6 +1753,7 @@ retry_open: +@@ -1731,6 +1755,7 @@ retry_open: } driver = tty_driver_kref_get(tty->driver); index = tty->index; @@ -5304,7 +6985,7 @@ index 05cab2c..f973a9f 100644 filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ /* noctty = 1; */ /* FIXME: Should we take a driver reference ? */ -@@ -1738,6 +1763,12 @@ retry_open: +@@ -1740,6 +1765,12 @@ retry_open: #ifdef CONFIG_VT if (device == MKDEV(TTY_MAJOR, 0)) { extern struct tty_driver *console_driver; @@ -5317,7 +6998,7 @@ index 05cab2c..f973a9f 100644 driver = tty_driver_kref_get(console_driver); index = fg_console; noctty = 1; -@@ -1746,6 +1777,12 @@ retry_open: +@@ -1748,6 +1779,12 @@ retry_open: #endif if (device == MKDEV(TTYAUX_MAJOR, 1)) { struct tty_driver *console_driver = console_device(&index); @@ -5330,7 +7011,7 @@ index 05cab2c..f973a9f 100644 if (console_driver) { driver = tty_driver_kref_get(console_driver); if (driver) { -@@ -1780,7 +1817,7 @@ got_driver: +@@ -1782,7 +1819,7 @@ got_driver: if (retval) tty = ERR_PTR(retval); } else @@ -5339,7 +7020,7 @@ index 05cab2c..f973a9f 100644 mutex_unlock(&tty_mutex); tty_driver_kref_put(driver); -@@ -2076,6 +2113,8 @@ static int tioccons(struct file *file) +@@ -2078,6 +2115,8 @@ static int tioccons(struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -5348,7 +7029,7 @@ index 05cab2c..f973a9f 100644 if (file->f_op->write == redirected_tty_write) { struct file *f; spin_lock(&redirect_lock); -@@ -2656,7 +2695,7 @@ void __do_SAK(struct tty_struct *tty) +@@ -2658,7 +2697,7 @@ void __do_SAK(struct tty_struct *tty) /* Now kill any processes that happen to have the * tty open. 
*/ @@ -5357,7 +7038,7 @@ index 05cab2c..f973a9f 100644 if (p->signal->tty == tty) { printk(KERN_NOTICE "SAK: killed process %d" " (%s): task_session(p)==tty->session\n", -@@ -2688,7 +2727,7 @@ void __do_SAK(struct tty_struct *tty) +@@ -2690,7 +2729,7 @@ void __do_SAK(struct tty_struct *tty) spin_unlock(&p->files->file_lock); } task_unlock(p); @@ -5366,7 +7047,7 @@ index 05cab2c..f973a9f 100644 read_unlock(&tasklist_lock); #endif } -@@ -2755,6 +2794,7 @@ void initialize_tty_struct(struct tty_struct *tty, +@@ -2757,6 +2796,7 @@ void initialize_tty_struct(struct tty_struct *tty, tty->ops = driver->ops; tty->index = idx; tty_line_name(driver, idx, tty->name); @@ -5374,7 +7055,7 @@ index 05cab2c..f973a9f 100644 } /** -@@ -2847,6 +2887,7 @@ struct tty_driver *alloc_tty_driver(int lines) +@@ -2849,6 +2889,7 @@ struct tty_driver *alloc_tty_driver(int lines) driver->magic = TTY_DRIVER_MAGIC; driver->num = lines; /* later we'll move allocation of tables here */ @@ -5382,7 +7063,7 @@ index 05cab2c..f973a9f 100644 } return driver; } -@@ -2881,6 +2922,7 @@ static void destruct_tty_driver(struct kref *kref) +@@ -2883,6 +2924,7 @@ static void destruct_tty_driver(struct kref *kref) kfree(p); cdev_del(&driver->cdev); } @@ -5390,7 +7071,7 @@ index 05cab2c..f973a9f 100644 kfree(driver); } -@@ -2955,6 +2997,7 @@ int tty_register_driver(struct tty_driver *driver) +@@ -2957,6 +2999,7 @@ int tty_register_driver(struct tty_driver *driver) } mutex_lock(&tty_mutex); @@ -5398,7 +7079,7 @@ index 05cab2c..f973a9f 100644 list_add(&driver->tty_drivers, &tty_drivers); mutex_unlock(&tty_mutex); -@@ -3128,3 +3171,43 @@ static int __init tty_init(void) +@@ -3130,3 +3173,43 @@ static int __init tty_init(void) return 0; } module_init(tty_init); @@ -6228,10 +7909,10 @@ index 4fdfa2a..37d414d 100644 } diff --git a/drivers/net/venet_core.c b/drivers/net/venet_core.c new file mode 100644 -index 0000000..5aeb82b +index 0000000..317fbb0 --- /dev/null +++ b/drivers/net/venet_core.c -@@ -0,0 +1,775 @@ +@@ 
-0,0 +1,864 @@ +/* + * venet_core.c + * @@ -6321,6 +8002,86 @@ index 0000000..5aeb82b + return NULL; +} + ++struct ext_entry_struct *venet_ext_lookup(struct ve_struct *ve, ++ struct ve_addr_struct *addr) ++{ ++ struct ext_entry_struct *entry; ++ ++ if (ve->veip == NULL) ++ return NULL; ++ ++ list_for_each_entry (entry, &ve->veip->ext_lh, list) ++ if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0) ++ return entry; ++ return NULL; ++} ++ ++int venet_ext_add(struct ve_struct *ve, struct ve_addr_struct *addr) ++{ ++ struct ext_entry_struct *entry, *found; ++ int err; ++ ++ if (ve->veip == NULL) ++ return -ENONET; ++ ++ entry = kzalloc(sizeof(struct ext_entry_struct), GFP_KERNEL); ++ if (entry == NULL) ++ return -ENOMEM; ++ ++ write_lock_irq(&veip_hash_lock); ++ err = -EADDRINUSE; ++ found = venet_ext_lookup(ve, addr); ++ if (found != NULL) ++ goto out_unlock; ++ ++ entry->addr = *addr; ++ list_add(&entry->list, &ve->veip->ext_lh); ++ err = 0; ++ entry = NULL; ++out_unlock: ++ write_unlock_irq(&veip_hash_lock); ++ if (entry != NULL) ++ kfree(entry); ++ return err; ++} ++ ++int venet_ext_del(struct ve_struct *ve, struct ve_addr_struct *addr) ++{ ++ struct ext_entry_struct *found; ++ int err; ++ ++ if (ve->veip == NULL) ++ return -ENONET; ++ ++ err = -EADDRNOTAVAIL; ++ write_lock_irq(&veip_hash_lock); ++ found = venet_ext_lookup(ve, addr); ++ if (found == NULL) ++ goto out; ++ ++ list_del(&found->list); ++ kfree(found); ++ err = 0; ++out: ++ write_unlock_irq(&veip_hash_lock); ++ return err; ++} ++ ++void venet_ext_clean(struct ve_struct *ve) ++{ ++ struct ext_entry_struct *entry, *tmp; ++ ++ if (ve->veip == NULL) ++ return; ++ ++ write_lock_irq(&veip_hash_lock); ++ list_for_each_entry_safe (entry, tmp, &ve->veip->ext_lh, list) { ++ list_del(&entry->list); ++ kfree(entry); ++ } ++ write_unlock_irq(&veip_hash_lock); ++} ++ +struct veip_struct *veip_find(envid_t veid) +{ + struct veip_struct *ptr; @@ -6348,6 +8109,7 @@ index 0000000..5aeb82b + INIT_LIST_HEAD(&ptr->ip_lh); + 
INIT_LIST_HEAD(&ptr->src_lh); + INIT_LIST_HEAD(&ptr->dst_lh); ++ INIT_LIST_HEAD(&ptr->ext_lh); + ptr->veid = veid; + list_add(&ptr->list, &veip_lh); + return ptr; @@ -6641,6 +8403,20 @@ index 0000000..5aeb82b + return venet_set_op(dev, data, ethtool_op_set_tx_csum); +} + ++static int ++venet_op_set_tso(struct net_device *dev, u32 data) ++{ ++ if (!ve_is_super(get_exec_env())) ++ return -EPERM; ++ ++ if (data) ++ common_features |= NETIF_F_TSO; ++ else ++ common_features &= ~NETIF_F_TSO; ++ ++ return venet_set_op(dev, data, ethtool_op_set_tso); ++} ++ +#define venet_op_set_rx_csum venet_op_set_tx_csum + +static struct ethtool_ops venet_ethtool_ops = { @@ -6651,6 +8427,7 @@ index 0000000..5aeb82b + .get_rx_csum = ethtool_op_get_tx_csum, + .set_rx_csum = venet_op_set_rx_csum, + .get_tso = ethtool_op_get_tso, ++ .set_tso = venet_op_set_tso, +}; + +static void venet_cpt(struct net_device *dev, @@ -6685,15 +8462,10 @@ index 0000000..5aeb82b +} + +#ifdef CONFIG_PROC_FS -+static int veinfo_seq_show(struct seq_file *m, void *v) ++static void veaddr_seq_print(struct seq_file *m, struct ve_struct *ve) +{ -+ struct ve_struct *ve; + struct ip_entry_struct *entry; + -+ ve = list_entry((struct list_head *)v, struct ve_struct, ve_list); -+ -+ seq_printf(m, "%10u %5u %5u", ve->veid, -+ ve->class_id, atomic_read(&ve->pcounter)); + read_lock(&veip_hash_lock); + if (ve->veip == NULL) + goto unlock; @@ -6711,29 +8483,8 @@ index 0000000..5aeb82b + } +unlock: + read_unlock(&veip_hash_lock); -+ seq_putc(m, '\n'); -+ return 0; +} + -+static struct seq_operations veinfo_seq_op = { -+ .start = ve_seq_start, -+ .next = ve_seq_next, -+ .stop = ve_seq_stop, -+ .show = veinfo_seq_show, -+}; -+ -+static int veinfo_open(struct inode *inode, struct file *file) -+{ -+ return seq_open(file, &veinfo_seq_op); -+} -+ -+static struct file_operations proc_veinfo_operations = { -+ .open = veinfo_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ +static void 
*veip_seq_start(struct seq_file *m, loff_t *pos) +{ + loff_t l; @@ -6804,7 +8555,7 @@ index 0000000..5aeb82b + struct ve_addr_struct addr; + + err = -EPERM; -+ if (!capable(CAP_SETVEID)) ++ if (!capable_setveid()) + goto out; + + err = sockaddr_to_veaddr(uaddr, addrlen, &addr); @@ -6829,6 +8580,28 @@ index 0000000..5aeb82b + case VE_IP_DEL: + err = veip_entry_del(veid, &addr); + break; ++ case VE_IP_EXT_ADD: ++ ve = get_ve_by_id(veid); ++ err = -ESRCH; ++ if (!ve) ++ goto out; ++ ++ down_read(&ve->op_sem); ++ err = venet_ext_add(ve, &addr); ++ up_read(&ve->op_sem); ++ put_ve(ve); ++ break; ++ case VE_IP_EXT_DEL: ++ ve = get_ve_by_id(veid); ++ err = -ESRCH; ++ if (!ve) ++ goto out; ++ ++ down_read(&ve->op_sem); ++ err = venet_ext_del(ve, &addr); ++ up_read(&ve->op_sem); ++ put_ve(ve); ++ break; + default: + err = -EINVAL; + } @@ -6940,6 +8713,7 @@ index 0000000..5aeb82b + struct net_device *dev; + + env = (struct ve_struct *)data; ++ venet_ext_clean(env); + veip_stop(env); + + dev = env->_venet_dev; @@ -6976,11 +8750,6 @@ index 0000000..5aeb82b + return err; + +#ifdef CONFIG_PROC_FS -+ de = proc_create("veinfo", S_IFREG | S_IRUSR, glob_proc_vz_dir, -+ &proc_veinfo_operations); -+ if (de == NULL) -+ printk(KERN_WARNING "venet: can't make veinfo proc entry\n"); -+ + de = proc_create("veip", S_IFREG | S_IRUSR, proc_vz_dir, + &proc_veip_operations); + if (de == NULL) @@ -6989,17 +8758,18 @@ index 0000000..5aeb82b + + ve_hook_register(VE_SS_CHAIN, &venet_ve_hook); + vzioctl_register(&venetcalls); ++ vzmon_register_veaddr_print_cb(veaddr_seq_print); + return 0; +} + +__exit void venet_exit(void) +{ ++ vzmon_unregister_veaddr_print_cb(veaddr_seq_print); + vzioctl_unregister(&venetcalls); + ve_hook_unregister(&venet_ve_hook); + +#ifdef CONFIG_PROC_FS + remove_proc_entry("veip", proc_vz_dir); -+ remove_proc_entry("veinfo", glob_proc_vz_dir); +#endif + venet_stop(get_ve0()); + veip_cleanup(); @@ -7021,10 +8791,10 @@ index 52af501..68b47b9 100644 diff --git 
a/drivers/net/vzethdev.c b/drivers/net/vzethdev.c new file mode 100644 -index 0000000..e073e3e +index 0000000..ed8ed97 --- /dev/null +++ b/drivers/net/vzethdev.c -@@ -0,0 +1,741 @@ +@@ -0,0 +1,749 @@ +/* + * veth.c + * @@ -7278,6 +9048,7 @@ index 0000000..e073e3e + stats->tx_bytes += dev_stats->tx_bytes; + stats->rx_packets += dev_stats->rx_packets; + stats->tx_packets += dev_stats->tx_packets; ++ stats->tx_dropped += dev_stats->tx_dropped; + } + + return stats; @@ -7418,6 +9189,12 @@ index 0000000..e073e3e + return veth_set_op(dev, data, ethtool_op_set_tx_csum); +} + ++static int ++veth_op_set_tso(struct net_device *dev, u32 data) ++{ ++ return veth_set_op(dev, data, ethtool_op_set_tso); ++} ++ +#define veth_op_set_rx_csum veth_op_set_tx_csum + +static struct ethtool_ops veth_ethtool_ops = { @@ -7428,6 +9205,7 @@ index 0000000..e073e3e + .get_rx_csum = ethtool_op_get_tx_csum, + .set_rx_csum = veth_op_set_rx_csum, + .get_tso = ethtool_op_get_tso, ++ .set_tso = veth_op_set_tso, +}; + +static void veth_cpt(struct net_device *dev, @@ -7799,16 +9577,15 @@ index 47291bc..142a991 100644 starget->id = id; starget->channel = channel; diff --git a/fs/Kconfig b/fs/Kconfig -index 64d44ef..998c68e 100644 +index 64d44ef..f48e240 100644 --- a/fs/Kconfig +++ b/fs/Kconfig -@@ -63,6 +63,15 @@ source "fs/autofs/Kconfig" +@@ -63,6 +63,14 @@ source "fs/autofs/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" +config SIM_FS + tristate "VPS filesystem" -+ depends on VZ_QUOTA + default m + help + This file system is a part of Virtuozzo. 
It intoduces a fake @@ -8202,11 +9979,257 @@ index 1ed37ba..e8ef26b 100644 if (retval < 0) { send_sig(SIGKILL, current, 0); goto out; +diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c +index c4e8353..8180165 100644 +--- a/fs/binfmt_misc.c ++++ b/fs/binfmt_misc.c +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + #include + +@@ -35,8 +36,15 @@ enum { + VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ + }; + ++#ifdef CONFIG_VE ++#define bm_entries(ve) ((ve)->bm_entries) ++#define bm_enabled(ve) ((ve)->bm_enabled) ++#else + static LIST_HEAD(entries); + static int enabled = 1; ++#define bm_entries(ve) (entries) ++#define bm_enabled(ve) (enabled) ++#endif + + enum {Enabled, Magic}; + #define MISC_FMT_PRESERVE_ARGV0 (1<<31) +@@ -56,21 +64,30 @@ typedef struct { + } Node; + + static DEFINE_RWLOCK(entries_lock); ++#ifdef CONFIG_VE ++#define bm_fs_type(ve) (*(ve)->bm_fs_type) ++#define bm_mnt(ve) ((ve)->bm_mnt) ++#define bm_entry_count(ve) ((ve)->bm_entry_count) ++#else + static struct file_system_type bm_fs_type; + static struct vfsmount *bm_mnt; + static int entry_count; ++#define bm_fs_type(ve) (bm_fs_type) ++#define bm_mnt(ve) (bm_mnt) ++#define bm_entry_count(ve) (bm_entry_count) ++#endif + + /* + * Check if we support the binfmt + * if we do, return the node, else NULL + * locking is done in load_misc_binary + */ +-static Node *check_file(struct linux_binprm *bprm) ++static Node *check_file(struct ve_struct *ve, struct linux_binprm *bprm) + { + char *p = strrchr(bprm->interp, '.'); + struct list_head *l; + +- list_for_each(l, &entries) { ++ list_for_each(l, &bm_entries(ve)) { + Node *e = list_entry(l, Node, list); + char *s; + int j; +@@ -111,9 +128,10 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) + char *iname_addr = iname; + int retval; + int fd_binary = -1; ++ struct ve_struct *ve = get_exec_env(); + + retval = -ENOEXEC; +- if (!enabled) ++ if (!bm_enabled(ve)) + goto _ret; + + retval = 
-ENOEXEC; +@@ -122,7 +140,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) + + /* to keep locking time low, we copy the interpreter string */ + read_lock(&entries_lock); +- fmt = check_file(bprm); ++ fmt = check_file(ve, bprm); + if (fmt) + strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); + read_unlock(&entries_lock); +@@ -507,7 +525,7 @@ static void bm_clear_inode(struct inode *inode) + kfree(inode->i_private); + } + +-static void kill_node(Node *e) ++static void kill_node(struct ve_struct *ve, Node *e) + { + struct dentry *dentry; + +@@ -523,7 +541,7 @@ static void kill_node(Node *e) + dentry->d_inode->i_nlink--; + d_drop(dentry); + dput(dentry); +- simple_release_fs(&bm_mnt, &entry_count); ++ simple_release_fs(&bm_mnt(ve), &bm_entry_count(ve)); + } + } + +@@ -562,7 +580,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, + case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); + mutex_lock(&root->d_inode->i_mutex); + +- kill_node(e); ++ kill_node(get_exec_env(), e); + + mutex_unlock(&root->d_inode->i_mutex); + dput(root); +@@ -587,6 +605,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, + struct dentry *root, *dentry; + struct super_block *sb = file->f_path.mnt->mnt_sb; + int err = 0; ++ struct ve_struct *ve = get_exec_env(); + + e = create_entry(buffer, count); + +@@ -610,7 +629,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, + if (!inode) + goto out2; + +- err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count); ++ err = simple_pin_fs(&bm_fs_type(ve), &bm_mnt(ve), &bm_entry_count(ve)); + if (err) { + iput(inode); + inode = NULL; +@@ -623,7 +642,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, + + d_instantiate(dentry, inode); + write_lock(&entries_lock); +- list_add(&e->list, &entries); ++ list_add(&e->list, &bm_entries(ve)); + write_unlock(&entries_lock); + + err = 0; +@@ -649,26 +668,31 @@ 
static const struct file_operations bm_register_operations = { + static ssize_t + bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) + { +- char *s = enabled ? "enabled\n" : "disabled\n"; ++ struct ve_struct *ve = get_exec_env(); ++ char *s = bm_enabled(ve) ? "enabled\n" : "disabled\n"; + + return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); + } + ++static void dm_genocide(struct ve_struct *ve) ++{ ++ while (!list_empty(&bm_entries(ve))) ++ kill_node(ve, list_entry(bm_entries(ve).next, Node, list)); ++} ++ + static ssize_t bm_status_write(struct file * file, const char __user * buffer, + size_t count, loff_t *ppos) + { ++ struct ve_struct *ve = get_exec_env(); + int res = parse_command(buffer, count); + struct dentry *root; + + switch (res) { +- case 1: enabled = 0; break; +- case 2: enabled = 1; break; ++ case 1: bm_enabled(ve) = 0; break; ++ case 2: bm_enabled(ve) = 1; break; + case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); + mutex_lock(&root->d_inode->i_mutex); +- +- while (!list_empty(&entries)) +- kill_node(list_entry(entries.next, Node, list)); +- ++ dm_genocide(ve); + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + default: return res; +@@ -719,6 +743,53 @@ static struct file_system_type bm_fs_type = { + .kill_sb = kill_litter_super, + }; + ++#ifdef CONFIG_VE ++static void __ve_binfmt_init(struct ve_struct *ve, struct file_system_type *fs) ++{ ++ ve->bm_fs_type = fs; ++ INIT_LIST_HEAD(&ve->bm_entries); ++ ve->bm_enabled = 1; ++ ve->bm_mnt = NULL; ++ ve->bm_entry_count = 0; ++} ++ ++static int ve_binfmt_init(void *x) ++{ ++ struct ve_struct *ve = x; ++ struct file_system_type *fs_type; ++ int err; ++ ++ err = register_ve_fs_type(ve, &bm_fs_type, &fs_type, NULL); ++ if (err == 0) ++ __ve_binfmt_init(ve, fs_type); ++ ++ return err; ++} ++ ++static void ve_binfmt_fini(void *x) ++{ ++ struct ve_struct *ve = x; ++ ++ /* ++ * no locks since exec_ve is dead and noone will ++ * mess with bm_xxx fields any 
longer ++ */ ++ if (!ve->bm_fs_type) ++ return; ++ dm_genocide(ve); ++ unregister_ve_fs_type(ve->bm_fs_type, NULL); ++ kfree(ve->bm_fs_type); ++ ve->bm_fs_type = NULL; ++} ++ ++static struct ve_hook ve_binfmt_hook = { ++ .init = ve_binfmt_init, ++ .fini = ve_binfmt_fini, ++ .priority = HOOK_PRIO_FS, ++ .owner = THIS_MODULE, ++}; ++#endif ++ + static int __init init_misc_binfmt(void) + { + int err = register_filesystem(&bm_fs_type); +@@ -727,11 +798,17 @@ static int __init init_misc_binfmt(void) + if (err) + unregister_filesystem(&bm_fs_type); + } ++ ++ if (!err) { ++ __ve_binfmt_init(get_ve0(), &bm_fs_type); ++ ve_hook_register(VE_SS_CHAIN, &ve_binfmt_hook); ++ } + return err; + } + + static void __exit exit_misc_binfmt(void) + { ++ ve_hook_unregister(&ve_binfmt_hook); + unregister_binfmt(&misc_format); + unregister_filesystem(&bm_fs_type); + } diff --git a/fs/block_dev.c b/fs/block_dev.c -index 34e2d20..b170595 100644 +index 9b9e3dc..fe0cca1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c -@@ -1601,7 +1601,7 @@ int __invalidate_device(struct block_device *bdev) +@@ -1602,7 +1602,7 @@ int __invalidate_device(struct block_device *bdev) * hold). */ shrink_dcache_sb(sb); @@ -8240,7 +10263,7 @@ index 6fa5302..34c1563 100644 } diff --git a/fs/compat.c b/fs/compat.c -index 6c19040..204915d 100644 +index 6c19040..5141257 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -26,6 +26,7 @@ @@ -8270,7 +10293,29 @@ index 6c19040..204915d 100644 /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. -@@ -269,6 +282,8 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta +@@ -91,6 +104,21 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __ + return do_utimes(AT_FDCWD, filename, t ? 
tv : NULL, 0); + } + ++asmlinkage long compat_sys_lutime(char __user * filename, ++ struct compat_utimbuf __user *t) ++{ ++ struct timespec tv[2]; ++ ++ if (t) { ++ if (get_user(tv[0].tv_sec, &t->actime) || ++ get_user(tv[1].tv_sec, &t->modtime)) ++ return -EFAULT; ++ tv[0].tv_nsec = 0; ++ tv[1].tv_nsec = 0; ++ } ++ return do_utimes(AT_FDCWD, filename, t ? tv : NULL, AT_SYMLINK_NOFOLLOW); ++} ++ + asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, struct compat_timespec __user *t, int flags) + { + struct timespec tv[2]; +@@ -269,6 +297,8 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta struct kstatfs tmp; error = vfs_statfs(path.dentry, &tmp); if (!error) @@ -8279,7 +10324,7 @@ index 6c19040..204915d 100644 error = put_compat_statfs(buf, &tmp); path_put(&path); } -@@ -287,6 +302,8 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user +@@ -287,6 +317,8 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user goto out; error = vfs_statfs(file->f_path.dentry, &tmp); if (!error) @@ -8288,7 +10333,7 @@ index 6c19040..204915d 100644 error = put_compat_statfs(buf, &tmp); fput(file); out: -@@ -337,6 +354,8 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s +@@ -337,6 +369,8 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s struct kstatfs tmp; error = vfs_statfs(path.dentry, &tmp); if (!error) @@ -8297,7 +10342,7 @@ index 6c19040..204915d 100644 error = put_compat_statfs64(buf, &tmp); path_put(&path); } -@@ -358,6 +377,8 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c +@@ -358,6 +392,8 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c goto out; error = vfs_statfs(file->f_path.dentry, &tmp); if (!error) @@ -8306,7 +10351,7 @@ index 6c19040..204915d 100644 error = put_compat_statfs64(buf, &tmp); fput(file); out: -@@ -1469,6 
+1490,10 @@ int compat_do_execve(char * filename, +@@ -1469,6 +1505,10 @@ int compat_do_execve(char * filename, bool clear_in_exec; int retval; @@ -8331,7 +10376,7 @@ index d84e705..960f82f 100644 current->comm, current->pid, (int)fd, (unsigned int)cmd, buf, diff --git a/fs/dcache.c b/fs/dcache.c -index a100fa3..48c4d04 100644 +index a100fa3..7fce87d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -26,6 +26,7 @@ @@ -8375,16 +10420,7 @@ index a100fa3..48c4d04 100644 /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); if (IS_ROOT(dentry)) -@@ -214,21 +223,31 @@ static struct dentry *d_kill(struct dentry *dentry) - - void dput(struct dentry *dentry) - { -+ struct user_beancounter *ub; -+ unsigned long d_ubsize; -+ - if (!dentry) - return; - +@@ -220,15 +229,22 @@ void dput(struct dentry *dentry) repeat: if (atomic_read(&dentry->d_count) == 1) might_sleep(); @@ -8414,7 +10450,7 @@ index a100fa3..48c4d04 100644 /* * AV: ->d_delete() is _NOT_ allowed to block now. 
-@@ -244,8 +263,12 @@ repeat: +@@ -244,8 +260,12 @@ repeat: dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); } @@ -8427,20 +10463,23 @@ index a100fa3..48c4d04 100644 return; unhash_it: -@@ -253,9 +276,18 @@ unhash_it: +@@ -253,9 +273,21 @@ unhash_it: kill_it: /* if dentry was on the d_lru list delete it from there */ dentry_lru_del(dentry); + -+ ub = dentry->dentry_bc.d_ub; -+ d_ubsize = dentry->dentry_bc.d_ubsize; ++ if (unlikely(ub_dentry_on)) { ++ struct user_beancounter *ub; ++ ++ ub = dentry->dentry_bc.d_ub; ++ BUG_ON(!ub_dput_testzero(dentry)); ++ uncharge_dcache(ub, dentry->dentry_bc.d_ubsize); ++ put_beancounter(ub); ++ } ++ dentry = d_kill(dentry); - if (dentry) + preempt_disable(); -+ if (unlikely(ub_dentry_on)) { -+ uncharge_dcache(ub, d_ubsize); -+ put_beancounter(ub); -+ } + if (dentry) goto repeat; + preempt_enable(); @@ -9482,10 +11521,10 @@ index aad6400..7be0b93 100644 /* diff --git a/fs/ext3/super.c b/fs/ext3/super.c -index 427496c..a7a6210 100644 +index ca3068f..0c4978f 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c -@@ -2988,7 +2988,7 @@ static struct file_system_type ext3_fs_type = { +@@ -2986,7 +2986,7 @@ static struct file_system_type ext3_fs_type = { .name = "ext3", .get_sb = ext3_get_sb, .kill_sb = kill_block_super, @@ -9494,6 +11533,27 @@ index 427496c..a7a6210 100644 }; static int __init init_ext3_fs(void) +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 16efcee..3833fe9 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -5770,9 +5770,14 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) + int ret = -EINVAL; + void *fsdata; + struct file *file = vma->vm_file; +- struct inode *inode = file->f_path.dentry->d_inode; +- struct address_space *mapping = inode->i_mapping; ++ struct inode *inode; ++ struct address_space *mapping; ++ ++ if (file->f_op->get_host) ++ file = file->f_op->get_host(file); + ++ inode = file->f_path.dentry->d_inode; ++ mapping = inode->i_mapping; + /* + * Get 
i_alloc_sem to stop truncates messing with the inode. We cannot + * get i_mutex because we are already holding mmap_sem. diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index b63d193..0ae6e52 100644 --- a/fs/ext4/ioctl.c @@ -10135,7 +12195,7 @@ index 1a822ce..00dbf5f 100644 fuse_sysfs_cleanup(); fuse_fs_cleanup(); diff --git a/fs/inode.c b/fs/inode.c -index 4d8e3be..5460538 100644 +index 4d8e3be..ab63b5f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -8,10 +8,13 @@ @@ -10364,8 +12424,17 @@ index 4d8e3be..5460538 100644 } } #endif +@@ -1258,7 +1339,7 @@ int generic_detach_inode(struct inode *inode) + if (!(inode->i_state & (I_DIRTY|I_SYNC))) + list_move(&inode->i_list, &inode_unused); + inodes_stat.nr_unused++; +- if (sb->s_flags & MS_ACTIVE) { ++ if (sb->s_flags & MS_ACTIVE && !(inode->i_flags & S_NOUNUSE)) { + spin_unlock(&inode_lock); + return 0; + } diff --git a/fs/ioprio.c b/fs/ioprio.c -index c7c0b28..c14af3f 100644 +index c7c0b28..2a7e8ae 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -26,6 +26,7 @@ @@ -10389,7 +12458,7 @@ index c7c0b28..c14af3f 100644 switch (class) { case IOPRIO_CLASS_RT: -@@ -137,17 +141,23 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) +@@ -137,17 +141,25 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) if (!user) break; @@ -10407,15 +12476,17 @@ index c7c0b28..c14af3f 100644 free_uid(user); break; + case IOPRIO_WHO_UBC: -+ if (class != IOPRIO_CLASS_BE) -+ return -ERANGE; ++ if (class != IOPRIO_CLASS_BE) { ++ ret = -ERANGE; ++ break; ++ } + + ret = 0; /* bc_set_ioprio(who, data); */ + break; default: ret = -EINVAL; } -@@ -192,9 +202,9 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) +@@ -192,9 +204,9 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) { struct task_struct *g, *p; struct user_struct *user; @@ -10426,7 +12497,7 @@ index c7c0b28..c14af3f 100644 read_lock(&tasklist_lock); switch (which) { -@@ -230,7 +240,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) +@@ -230,7 +242,7 @@ 
SYSCALL_DEFINE2(ioprio_get, int, which, int, who) if (!user) break; @@ -10435,7 +12506,7 @@ index c7c0b28..c14af3f 100644 if (__task_cred(p)->uid != user->uid) continue; tmpio = get_task_ioprio(p); -@@ -240,7 +250,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) +@@ -240,7 +252,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) ret = tmpio; else ret = ioprio_best(ret, tmpio); @@ -11147,7 +13218,7 @@ index b0afbd4..84f4037 100644 return 0; diff --git a/fs/namespace.c b/fs/namespace.c -index bdc3cb4..2536eff 100644 +index bdc3cb4..d811360 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -29,6 +29,7 @@ @@ -11182,7 +13253,24 @@ index bdc3cb4..2536eff 100644 atomic_set(&mnt->mnt_count, 1); INIT_LIST_HEAD(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); -@@ -629,6 +633,7 @@ repeat: +@@ -517,7 +521,7 @@ static void commit_tree(struct vfsmount *mnt) + touch_mnt_namespace(n); + } + +-static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) ++struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) + { + struct list_head *next = p->mnt_mounts.next; + if (next == &p->mnt_mounts) { +@@ -532,6 +536,7 @@ static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) + } + return list_entry(next, struct vfsmount, mnt_child); + } ++EXPORT_SYMBOL(next_mnt); + + static struct vfsmount *skip_mnt_tree(struct vfsmount *p) + { +@@ -629,6 +634,7 @@ repeat: spin_unlock(&vfsmount_lock); acct_auto_close_mnt(mnt); security_sb_umount_close(mnt); @@ -11190,7 +13278,7 @@ index bdc3cb4..2536eff 100644 goto repeat; } } -@@ -789,15 +794,48 @@ static void show_type(struct seq_file *m, struct super_block *sb) +@@ -789,15 +795,50 @@ static void show_type(struct seq_file *m, struct super_block *sb) } } @@ -11223,9 +13311,10 @@ index bdc3cb4..2536eff 100644 - int err = 0; + int err; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; -+ char *path_buf, *path; - +- - mangle(m, mnt->mnt_devname ? 
mnt->mnt_devname : "none"); ++ char *path_buf, *path; ++ + err = prepare_mnt_root_mangle(&mnt_path, &path_buf, &path); + if (err < 0) + return (err == -EACCES ? 0 : err); @@ -11233,8 +13322,10 @@ index bdc3cb4..2536eff 100644 + if (ve_is_super(get_exec_env()) || + !(mnt->mnt_sb->s_type->fs_flags & FS_MANGLE_PROC)) + mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); -+ else ++ else { ++ seq_puts(m, "/dev/"); + mangle(m, mnt->mnt_sb->s_type->name); ++ } seq_putc(m, ' '); - seq_path(m, &mnt_path, " \t\n\\"); + mangle(m, path); @@ -11242,7 +13333,7 @@ index bdc3cb4..2536eff 100644 seq_putc(m, ' '); show_type(m, mnt->mnt_sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); -@@ -884,18 +922,27 @@ static int show_vfsstat(struct seq_file *m, void *v) +@@ -884,18 +925,27 @@ static int show_vfsstat(struct seq_file *m, void *v) { struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; @@ -11273,7 +13364,7 @@ index bdc3cb4..2536eff 100644 seq_putc(m, ' '); /* file system type */ -@@ -1107,6 +1154,34 @@ static int do_umount(struct vfsmount *mnt, int flags) +@@ -1107,6 +1157,36 @@ static int do_umount(struct vfsmount *mnt, int flags) return retval; } @@ -11296,8 +13387,10 @@ index bdc3cb4..2536eff 100644 + } + + while (!list_empty(&kill)) { ++ LIST_HEAD(kill2); + mnt = list_entry(kill.next, struct vfsmount, mnt_list); -+ umount_tree(mnt, 1, &umount_list); ++ umount_tree(mnt, 1, &kill2); ++ list_splice(&kill2, &umount_list); + } + spin_unlock(&vfsmount_lock); + up_write(&namespace_sem); @@ -11308,7 +13401,7 @@ index bdc3cb4..2536eff 100644 /* * Now umount can handle mount points as well as block devices. * This is important for filesystems which use unnamed block devices. 
-@@ -1130,7 +1205,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) +@@ -1130,7 +1210,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) goto dput_and_out; retval = -EPERM; @@ -11317,7 +13410,7 @@ index bdc3cb4..2536eff 100644 goto dput_and_out; retval = do_umount(path.mnt, flags); -@@ -1156,7 +1231,7 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) +@@ -1156,7 +1236,7 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) static int mount_is_safe(struct path *path) { @@ -11326,7 +13419,7 @@ index bdc3cb4..2536eff 100644 return 0; return -EPERM; #ifdef notyet -@@ -1425,6 +1500,8 @@ static int do_change_type(struct path *path, int flag) +@@ -1425,6 +1505,8 @@ static int do_change_type(struct path *path, int flag) if (path->dentry != path->mnt->mnt_root) return -EINVAL; @@ -11335,7 +13428,7 @@ index bdc3cb4..2536eff 100644 down_write(&namespace_sem); if (type == MS_SHARED) { -@@ -1447,7 +1524,7 @@ static int do_change_type(struct path *path, int flag) +@@ -1447,7 +1529,7 @@ static int do_change_type(struct path *path, int flag) * do loopback mount. 
*/ static int do_loopback(struct path *path, char *old_name, @@ -11344,7 +13437,7 @@ index bdc3cb4..2536eff 100644 { struct path old_path; struct vfsmount *mnt = NULL; -@@ -1477,6 +1554,7 @@ static int do_loopback(struct path *path, char *old_name, +@@ -1477,6 +1559,7 @@ static int do_loopback(struct path *path, char *old_name, if (!mnt) goto out; @@ -11352,7 +13445,7 @@ index bdc3cb4..2536eff 100644 err = graft_tree(mnt, path); if (err) { LIST_HEAD(umount_list); -@@ -1520,7 +1598,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags, +@@ -1520,7 +1603,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags, int err; struct super_block *sb = path->mnt->mnt_sb; @@ -11361,7 +13454,7 @@ index bdc3cb4..2536eff 100644 return -EPERM; if (!check_mnt(path->mnt)) -@@ -1529,6 +1607,9 @@ static int do_remount(struct path *path, int flags, int mnt_flags, +@@ -1529,6 +1612,9 @@ static int do_remount(struct path *path, int flags, int mnt_flags, if (path->dentry != path->mnt->mnt_root) return -EINVAL; @@ -11371,7 +13464,7 @@ index bdc3cb4..2536eff 100644 down_write(&sb->s_umount); if (flags & MS_BIND) err = change_mount_flags(path->mnt, flags); -@@ -1562,7 +1643,7 @@ static int do_move_mount(struct path *path, char *old_name) +@@ -1562,7 +1648,7 @@ static int do_move_mount(struct path *path, char *old_name) struct path old_path, parent_path; struct vfsmount *p; int err = 0; @@ -11380,7 +13473,7 @@ index bdc3cb4..2536eff 100644 return -EPERM; if (!old_name || !*old_name) return -EINVAL; -@@ -1570,6 +1651,10 @@ static int do_move_mount(struct path *path, char *old_name) +@@ -1570,6 +1656,10 @@ static int do_move_mount(struct path *path, char *old_name) if (err) return err; @@ -11391,7 +13484,7 @@ index bdc3cb4..2536eff 100644 down_write(&namespace_sem); while (d_mountpoint(path->dentry) && follow_down(path)) -@@ -1627,6 +1712,7 @@ out: +@@ -1627,6 +1717,7 @@ out: up_write(&namespace_sem); if (!err) path_put(&parent_path); @@ -11399,7 +13492,7 @@ 
index bdc3cb4..2536eff 100644 path_put(&old_path); return err; } -@@ -1644,7 +1730,7 @@ static int do_new_mount(struct path *path, char *type, int flags, +@@ -1644,7 +1735,7 @@ static int do_new_mount(struct path *path, char *type, int flags, return -EINVAL; /* we need capabilities... */ @@ -11408,7 +13501,7 @@ index bdc3cb4..2536eff 100644 return -EPERM; lock_kernel(); -@@ -1685,6 +1771,11 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, +@@ -1685,6 +1776,11 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, goto unlock; newmnt->mnt_flags = mnt_flags; @@ -11420,7 +13513,7 @@ index bdc3cb4..2536eff 100644 if ((err = graft_tree(newmnt, path))) goto unlock; -@@ -1959,7 +2050,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, +@@ -1959,7 +2055,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) @@ -11429,7 +13522,7 @@ index bdc3cb4..2536eff 100644 else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&path, flags); else if (flags & MS_MOVE) -@@ -2122,6 +2213,7 @@ out_dir: +@@ -2122,6 +2218,7 @@ out_dir: out_type: return ret; } @@ -11437,7 +13530,7 @@ index bdc3cb4..2536eff 100644 /* * pivot_root Semantics: -@@ -2281,7 +2373,7 @@ void __init mnt_init(void) +@@ -2281,7 +2378,7 @@ void __init mnt_init(void) init_rwsem(&namespace_sem); mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), @@ -11447,7 +13540,7 @@ index bdc3cb4..2536eff 100644 mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); diff --git a/fs/nfs/client.c b/fs/nfs/client.c -index 99ea196..986fe94 100644 +index 69d6a46..b9a8f89 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -125,6 +125,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ @@ -11514,7 +13607,7 @@ index 99ea196..986fe94 100644 if (clp->rpc_ops != data->rpc_ops) continue; diff --git 
a/fs/nfs/super.c b/fs/nfs/super.c -index 4bf23f6..79e65e4 100644 +index 4bf23f6..253438f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -53,6 +53,9 @@ @@ -11619,7 +13712,19 @@ index 4bf23f6..79e65e4 100644 #ifdef CONFIG_NFS_V4 unregister_filesystem(&nfs4_fs_type); #endif -@@ -2079,6 +2135,10 @@ static int nfs_compare_super(struct super_block *sb, void *data) +@@ -1794,6 +1850,11 @@ static int nfs_validate_mount_data(void *options, + goto out_v3_not_compiled; + #endif /* !CONFIG_NFS_V3 */ + ++ if (!(args->flags & NFS_MOUNT_VER3)) { ++ printk("NFSv2 is broken and not supported\n"); ++ return -EPROTONOSUPPORT; ++ } ++ + return 0; + + out_no_data: +@@ -2079,6 +2140,10 @@ static int nfs_compare_super(struct super_block *sb, void *data) struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb); int mntflags = sb_mntdata->mntflags; @@ -11630,7 +13735,7 @@ index 4bf23f6..79e65e4 100644 if (!nfs_compare_super_address(old, server)) return 0; /* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */ -@@ -2107,6 +2167,11 @@ static int nfs_get_sb(struct file_system_type *fs_type, +@@ -2107,6 +2172,11 @@ static int nfs_get_sb(struct file_system_type *fs_type, .mntflags = flags, }; int error = -ENOMEM; @@ -11642,7 +13747,7 @@ index 4bf23f6..79e65e4 100644 data = nfs_alloc_parsed_mount_data(3); mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); -@@ -2237,6 +2302,11 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, +@@ -2237,6 +2307,11 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, .mntflags = flags, }; int error; @@ -11968,7 +14073,7 @@ index ca44337..745983d 100644 goto path_put_and_out; diff --git a/fs/open.c b/fs/open.c -index 4f01e06..23011b6 100644 +index 4f01e06..77f73fc 100644 --- a/fs/open.c +++ b/fs/open.c @@ -25,6 +25,7 @@ @@ -12070,7 +14175,51 @@ index 4f01e06..23011b6 100644 if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); -@@ -707,6 +731,7 @@ out_release: +@@ -630,14 +654,20 @@ 
out: + return err; + } + +-SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) ++static int do_fchmodat(int dfd, const char __user *filename, mode_t mode, int flag) + { + struct path path; + struct inode *inode; + int error; + struct iattr newattrs; ++ int follow; + +- error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); ++ error = -EINVAL; ++ if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) ++ goto out; ++ ++ follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; ++ error = user_path_at(dfd, filename, follow, &path); + if (error) + goto out; + inode = path.dentry->d_inode; +@@ -659,9 +689,19 @@ out: + return error; + } + ++SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) ++{ ++ return do_fchmodat(dfd, filename, mode, 0); ++} ++ + SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode) + { +- return sys_fchmodat(AT_FDCWD, filename, mode); ++ return do_fchmodat(AT_FDCWD, filename, mode, 0); ++} ++ ++SYSCALL_DEFINE2(lchmod, const char __user *, filename, mode_t, mode) ++{ ++ return do_fchmodat(AT_FDCWD, filename, mode, AT_SYMLINK_NOFOLLOW); + } + + static int chown_common(struct dentry * dentry, uid_t user, gid_t group) +@@ -707,6 +747,7 @@ out_release: out: return error; } @@ -12078,7 +14227,7 @@ index 4f01e06..23011b6 100644 SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, gid_t, group, int, flag) -@@ -948,6 +973,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags) +@@ -948,6 +989,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags) return filp; } @@ -12086,7 +14235,7 @@ index 4f01e06..23011b6 100644 /* * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an * error. 
-@@ -972,6 +998,9 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, +@@ -972,6 +1014,9 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, return ERR_PTR(-EINVAL); } @@ -12096,7 +14245,7 @@ index 4f01e06..23011b6 100644 error = -ENFILE; f = get_empty_filp(); if (f == NULL) { -@@ -1062,6 +1091,7 @@ SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode) +@@ -1062,6 +1107,7 @@ SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode) asmlinkage_protect(3, ret, filename, flags, mode); return ret; } @@ -12405,10 +14554,18 @@ index 822c2d5..d29461e 100644 mmput(mm); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c -index 6d71c67..de26c5c 100644 +index 13b0378..eb8a70f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c -@@ -156,10 +156,14 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root) +@@ -49,6 +49,7 @@ + + #include + ++#include + #include + #include + #include +@@ -156,10 +157,14 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root) fs = task->fs; if (fs) { read_lock(&fs->lock); @@ -12426,7 +14583,7 @@ index 6d71c67..de26c5c 100644 } task_unlock(task); return result; -@@ -549,17 +553,31 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer) +@@ -550,17 +555,31 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer) static int proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; @@ -12461,7 +14618,7 @@ index 6d71c67..de26c5c 100644 } static int proc_setattr(struct dentry *dentry, struct iattr *attr) -@@ -1038,6 +1056,8 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, +@@ -1039,6 +1058,8 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && oom_adjust != OOM_DISABLE) return -EINVAL; @@ -12470,7 +14627,7 @@ index 6d71c67..de26c5c 100644 
task = get_proc_task(file->f_path.dentry->d_inode); if (!task) -@@ -1294,6 +1314,7 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +@@ -1295,6 +1316,7 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) mm->exe_file = new_exe_file; mm->num_exe_file_vmas = 0; } @@ -12478,7 +14635,7 @@ index 6d71c67..de26c5c 100644 struct file *get_mm_exe_file(struct mm_struct *mm) { -@@ -1332,10 +1353,15 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path) +@@ -1333,10 +1355,15 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path) exe_file = get_mm_exe_file(mm); mmput(mm); if (exe_file) { @@ -12497,7 +14654,7 @@ index 6d71c67..de26c5c 100644 } else return -ENOENT; } -@@ -1343,13 +1369,14 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path) +@@ -1344,13 +1371,14 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path) static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -12514,7 +14671,7 @@ index 6d71c67..de26c5c 100644 goto out; error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); -@@ -1384,12 +1411,13 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) +@@ -1385,12 +1413,13 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) { @@ -12530,7 +14687,7 @@ index 6d71c67..de26c5c 100644 goto out; error = PROC_I(inode)->op.proc_get_link(inode, &path); -@@ -1640,6 +1668,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) +@@ -1641,6 +1670,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) struct files_struct *files = NULL; struct file *file; int fd = proc_fd(inode); @@ -12538,7 +14695,7 @@ index 6d71c67..de26c5c 100644 if (task) { files = get_files_struct(task); -@@ -1652,7 +1681,8 @@ static int 
proc_fd_info(struct inode *inode, struct path *path, char *info) +@@ -1653,7 +1683,8 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) */ spin_lock(&files->file_lock); file = fcheck_files(files, fd); @@ -12548,7 +14705,7 @@ index 6d71c67..de26c5c 100644 if (path) { *path = file->f_path; path_get(&file->f_path); -@@ -1670,7 +1700,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) +@@ -1671,7 +1702,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) spin_unlock(&files->file_lock); put_files_struct(files); } @@ -12557,7 +14714,7 @@ index 6d71c67..de26c5c 100644 } static int proc_fd_link(struct inode *inode, struct path *path) -@@ -2457,7 +2487,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) +@@ -2458,7 +2489,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) struct task_struct *t = task; task_io_accounting_add(&acct, &task->signal->ioac); @@ -12566,6 +14723,42 @@ index 6d71c67..de26c5c 100644 task_io_accounting_add(&acct, &t->ioac); unlock_task_sighand(task, &flags); +@@ -3161,3 +3192,35 @@ static const struct file_operations proc_task_operations = { + .read = generic_read_dir, + .readdir = proc_task_readdir, + }; ++ ++/* Check whether dentry belongs to a task that already died */ ++int proc_dentry_of_dead_task(struct dentry *dentry) ++{ ++ if (dentry->d_inode->i_fop == &dummy_proc_pid_file_operations) ++ return 1; ++ ++ return (dentry->d_op == &pid_dentry_operations && ++ proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first == NULL); ++} ++EXPORT_SYMBOL(proc_dentry_of_dead_task); ++ ++/* Place it here to avoid use vzrst module count */ ++static ssize_t dummy_proc_pid_read(struct file * file, char __user * buf, ++ size_t count, loff_t *ppos) ++{ ++ return -ESRCH; ++} ++ ++static ssize_t dummy_proc_pid_write(struct file * file, const char * buf, ++ size_t count, loff_t *ppos) ++{ ++ return -ESRCH; ++} ++ ++struct 
file_operations dummy_proc_pid_file_operations = { ++ .read = dummy_proc_pid_read, ++ .write = dummy_proc_pid_write, ++}; ++ ++EXPORT_SYMBOL(dummy_proc_pid_file_operations); ++ diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 82676e3..2ad657d 100644 --- a/fs/proc/cmdline.c @@ -12606,8 +14799,39 @@ index 5a1e539..f7d84b5 100644 return 0; } module_init(proc_cpuinfo_init); +diff --git a/fs/proc/devices.c b/fs/proc/devices.c +index 59ee7da..d485f24 100644 +--- a/fs/proc/devices.c ++++ b/fs/proc/devices.c +@@ -2,6 +2,7 @@ + #include + #include + #include ++#include + + static int devinfo_show(struct seq_file *f, void *v) + { +@@ -25,6 +26,9 @@ static int devinfo_show(struct seq_file *f, void *v) + + static void *devinfo_start(struct seq_file *f, loff_t *pos) + { ++ if (!ve_is_super(get_exec_env())) ++ return NULL; ++ + if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + return pos; + return NULL; +@@ -64,7 +68,7 @@ static const struct file_operations proc_devinfo_operations = { + + static int __init proc_devices_init(void) + { +- proc_create("devices", 0, NULL, &proc_devinfo_operations); ++ proc_create("devices", 0, &glob_proc_root, &proc_devinfo_operations); + return 0; + } + module_init(proc_devices_init); diff --git a/fs/proc/generic.c b/fs/proc/generic.c -index fa678ab..56f268b 100644 +index fa678ab..a66517d 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -255,6 +255,10 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) @@ -12637,7 +14861,33 @@ index fa678ab..56f268b 100644 out: return error; } -@@ -411,28 +418,60 @@ static const struct dentry_operations proc_dentry_operations = +@@ -274,11 +281,22 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) + { + struct inode *inode = dentry->d_inode; +- struct proc_dir_entry *de = PROC_I(inode)->pde; +- if (de && de->nlink) +- inode->i_nlink = de->nlink; ++ struct proc_dir_entry *de = PDE(inode); ++ struct proc_dir_entry *lde 
= LPDE(inode); + + generic_fillattr(inode, stat); ++ ++ if (de && de->nlink) ++ stat->nlink = de->nlink; ++ /* if dentry is found in both trees and it is a directory ++ * then inode's nlink count must be altered, because local ++ * and global subtrees may differ. ++ * on the other hand, they may intersect, so actual nlink ++ * value is difficult to calculate - upper estimate is used ++ * instead of it. ++ */ ++ if (lde && lde != de && lde->nlink > 1) ++ stat->nlink += lde->nlink - 2; + return 0; + } + +@@ -411,28 +429,60 @@ static const struct dentry_operations proc_dentry_operations = .d_delete = proc_delete_dentry, }; @@ -12705,7 +14955,7 @@ index fa678ab..56f268b 100644 goto out_unlock; } } -@@ -446,13 +485,15 @@ out_unlock: +@@ -446,13 +496,15 @@ out_unlock: } if (de) de_put(de); @@ -12722,7 +14972,7 @@ index fa678ab..56f268b 100644 } /* -@@ -464,13 +505,14 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, +@@ -464,13 +516,14 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, * value of the readdir() call, as long as it's non-negative * for success.. 
*/ @@ -12739,7 +14989,7 @@ index fa678ab..56f268b 100644 ino = inode->i_ino; i = filp->f_pos; -@@ -491,25 +533,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, +@@ -491,25 +544,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, /* fall through */ default: spin_lock(&proc_subdir_lock); @@ -12774,7 +15024,7 @@ index fa678ab..56f268b 100644 spin_unlock(&proc_subdir_lock); if (filldir(dirent, de->name, de->namelen, filp->f_pos, de->low_ino, de->mode >> 12) < 0) { -@@ -518,10 +554,17 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, +@@ -518,10 +565,17 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, } spin_lock(&proc_subdir_lock); filp->f_pos++; @@ -12793,7 +15043,7 @@ index fa678ab..56f268b 100644 spin_unlock(&proc_subdir_lock); } ret = 1; -@@ -533,7 +576,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) +@@ -533,7 +587,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) { struct inode *inode = filp->f_path.dentry->d_inode; @@ -13111,7 +15361,7 @@ index 83adcc8..e9dce9e 100644 } diff --git a/fs/proc/root.c b/fs/proc/root.c -index b080b79..39e1923 100644 +index b080b79..36f59af 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -42,6 +42,9 @@ static int proc_get_sb(struct file_system_type *fs_type, @@ -13176,7 +15426,28 @@ index b080b79..39e1923 100644 #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) /* just give it a mountpoint */ proc_mkdir("openprom", NULL); -@@ -205,6 +219,22 @@ struct proc_dir_entry proc_root = { +@@ -141,8 +155,19 @@ void __init proc_root_init(void) + static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat + ) + { ++ struct ve_struct *ve = get_exec_env(); ++ + generic_fillattr(dentry->d_inode, stat); +- stat->nlink = proc_root.nlink + nr_processes(); ++ stat->nlink = glob_proc_root.nlink; ++ if 
(ve_is_super(ve)) ++ stat->nlink += nr_processes(); ++#ifdef CONFIG_VE ++ else ++ /* thread count. not really processes count */ ++ stat->nlink += atomic_read(&ve->pcounter); ++ /* the same logic as in the proc_getattr */ ++ stat->nlink += ve->proc_root->nlink - 2; ++#endif + return 0; + } + +@@ -205,6 +230,22 @@ struct proc_dir_entry proc_root = { .parent = &proc_root, }; @@ -13407,7 +15678,7 @@ index 68d4f6d..4c2159c 100644 + +obj-y += vzdquota/ diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c -index 2ed79a9..acfde60 100644 +index 4fdb0eb..e7aff07 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -170,8 +170,9 @@ static struct quota_format_type *find_quota_format(int id) @@ -13423,7 +15694,7 @@ index 2ed79a9..acfde60 100644 if (!actqf || !try_module_get(actqf->qf_owner)) { int qm; diff --git a/fs/quota/quota.c b/fs/quota/quota.c -index 95c5b42..41a6f18 100644 +index 95c5b42..7d9d4b4 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -18,6 +18,7 @@ @@ -13501,7 +15772,7 @@ index 95c5b42..41a6f18 100644 sb = get_super(bdev); bdput(bdev); if (!sb) -@@ -379,6 +390,215 @@ static struct super_block *quotactl_block(const char __user *special) +@@ -379,6 +390,231 @@ static struct super_block *quotactl_block(const char __user *special) #endif } @@ -13534,6 +15805,21 @@ index 95c5b42..41a6f18 100644 + __kernel_time_t dqb_itime; +}; + ++#ifdef CONFIG_COMPAT ++ ++struct compat_compat_dqblk { ++ compat_uint_t dqb_ihardlimit; ++ compat_uint_t dqb_isoftlimit; ++ compat_uint_t dqb_curinodes; ++ compat_uint_t dqb_bhardlimit; ++ compat_uint_t dqb_bsoftlimit; ++ compat_u64 dqb_curspace; ++ compat_time_t dqb_btime; ++ compat_time_t dqb_itime; ++}; ++ ++#endif ++ +struct compat_dqinfo { + unsigned int dqi_bgrace; + unsigned int dqi_igrace; @@ -13556,6 +15842,7 @@ index 95c5b42..41a6f18 100644 +}; + +asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr); ++ +static long compat_quotactl(unsigned int cmds, unsigned int type, + 
const char __user *special, qid_t id, + void __user *addr) @@ -13717,7 +16004,7 @@ index 95c5b42..41a6f18 100644 /* * This is the system call interface. This communicates with * the user-level programs. Currently this only supports diskquota -@@ -395,6 +615,11 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, +@@ -395,6 +631,11 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; @@ -13729,6 +16016,62 @@ index 95c5b42..41a6f18 100644 if (cmds != Q_SYNC || special) { sb = quotactl_block(special); if (IS_ERR(sb)) +@@ -459,6 +700,11 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, + compat_uint_t data; + u16 xdata; + long ret; ++#ifdef CONFIG_QUOTA_COMPAT ++ struct compat_dqblk __user *cdq; ++ struct compat_compat_dqblk __user *compat_cdq; ++ compat_time_t time; ++#endif + + cmds = cmd >> SUBCMDSHIFT; + +@@ -519,6 +765,43 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, + break; + ret = 0; + break; ++#ifdef CONFIG_QUOTA_COMPAT ++ case QC_GETQUOTA: ++ cdq = compat_alloc_user_space(sizeof(struct compat_dqblk)); ++ compat_cdq = addr; ++ ret = sys_quotactl(cmd, special, id, cdq); ++ if (ret) ++ break; ++ ret = -EFAULT; ++ if (copy_in_user(compat_cdq, cdq, sizeof(struct compat_compat_dqblk) - ++ offsetof(struct compat_compat_dqblk, dqb_curspace)) || ++ copy_in_user(&compat_cdq->dqb_curspace, &cdq->dqb_curspace, ++ sizeof(cdq->dqb_curspace)) || ++ get_user(time, &cdq->dqb_btime) || ++ put_user(time, &compat_cdq->dqb_btime) || ++ get_user(time, &cdq->dqb_itime) || ++ put_user(time, &compat_cdq->dqb_itime)) ++ break; ++ ret = 0; ++ break; ++ case QC_SETQUOTA: ++ case QC_SETUSE: ++ case QC_SETQLIM: ++ cdq = compat_alloc_user_space(sizeof(struct compat_dqblk)); ++ compat_cdq = addr; ++ ret = -EFAULT; ++ if (copy_in_user(cdq, compat_cdq, sizeof(struct compat_compat_dqblk) - ++ offsetof(struct compat_compat_dqblk, 
dqb_curspace)) || ++ copy_in_user(&cdq->dqb_curspace, &compat_cdq->dqb_curspace, ++ sizeof(cdq->dqb_curspace)) || ++ get_user(time, &compat_cdq->dqb_btime) || ++ put_user(time, &cdq->dqb_btime) || ++ get_user(time, &compat_cdq->dqb_itime) || ++ put_user(time, &cdq->dqb_itime)) ++ break; ++ ret = sys_quotactl(cmd, special, id, cdq); ++ break; ++#endif + default: + ret = sys_quotactl(cmd, special, id, addr); + } diff --git a/fs/quota/vzdquota/Makefile b/fs/quota/vzdquota/Makefile new file mode 100644 index 0000000..03fdee3 @@ -13741,10 +16084,10 @@ index 0000000..03fdee3 +vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_file.o diff --git a/fs/quota/vzdquota/vzdq_file.c b/fs/quota/vzdquota/vzdq_file.c new file mode 100644 -index 0000000..0355917 +index 0000000..3ac9f05 --- /dev/null +++ b/fs/quota/vzdquota/vzdq_file.c -@@ -0,0 +1,928 @@ +@@ -0,0 +1,956 @@ +/* + * + * Copyright (C) 2005 SWsoft @@ -13783,12 +16126,12 @@ index 0000000..0355917 + * File read operation + * + * FIXME: functions in this section (as well as many functions in vzdq_ugid.c, -+ * perhaps) abuse vz_quota_sem. -+ * Taking a global semaphore for lengthy and user-controlled operations inside ++ * perhaps) abuse vz_quota_mutex. ++ * Taking a global mutex for lengthy and user-controlled operations inside + * VPSs is not a good idea in general. -+ * In this case, the reasons for taking this semaphore are completely unclear, ++ * In this case, the reasons for taking this mutex are completely unclear, + * especially taking into account that the only function that has comments -+ * about the necessity to be called under this semaphore ++ * about the necessity to be called under this mutex + * (create_proc_quotafile) is actually called OUTSIDE it. 
+ * + * --------------------------------------------------------------------- */ @@ -13817,7 +16160,7 @@ index 0000000..0355917 + int type; /* type of the tree */ +}; + -+/* serialized by vz_quota_sem */ ++/* serialized by vz_quota_mutex */ +static LIST_HEAD(qf_data_head); + +static const u_int32_t vzquota_magics[] = V2_INITQMAGICS; @@ -14054,8 +16397,8 @@ index 0000000..0355917 + return -ENOMEM; + + qtd = data; -+ down(&vz_quota_sem); -+ down(&qtd->qmblk->dq_sem); ++ mutex_lock(&vz_quota_mutex); ++ mutex_lock(&qtd->qmblk->dq_mutex); + + res = 0; + tree = QUGID_TREE(qtd->qmblk, qtd->type); @@ -14094,8 +16437,8 @@ index 0000000..0355917 +out_err: + *start += count; +out_dq: -+ up(&qtd->qmblk->dq_sem); -+ up(&vz_quota_sem); ++ mutex_unlock(&qtd->qmblk->dq_mutex); ++ mutex_unlock(&vz_quota_mutex); + kfree(tmp); + + return res; @@ -14617,6 +16960,33 @@ index 0000000..0355917 + return ERR_PTR(-ENOENT); +} + ++static int vzdq_aquotd_getattr(struct vfsmount *mnt, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ struct ve_struct *ve, *old_ve; ++ struct list_head mntlist, *pos; ++ ++ generic_fillattr(dentry->d_inode, stat); ++ ve = dentry->d_sb->s_type->owner_env; ++#ifdef CONFIG_VE ++ /* ++ * The only reason of disabling getattr for the host system is that ++ * this getattr can be slow and CPU consuming with large number of VPSs ++ * (or just mount points). 
++ */ ++ if (ve_is_super(ve)) ++ return 0; ++#endif ++ INIT_LIST_HEAD(&mntlist); ++ old_ve = set_exec_env(ve); ++ if (!vzdq_aquot_buildmntlist(ve, &mntlist)) ++ list_for_each(pos, &mntlist) ++ stat->nlink++; ++ vzdq_aquot_releasemntlist(ve, &mntlist); ++ (void)set_exec_env(old_ve); ++ return 0; ++} ++ +static struct file_operations vzdq_aquotd_file_operations = { + .read = &generic_read_dir, + .readdir = &vzdq_aquotd_readdir, @@ -14624,6 +16994,7 @@ index 0000000..0355917 + +static struct inode_operations vzdq_aquotd_inode_operations = { + .lookup = &vzdq_aquotd_lookup, ++ .getattr = &vzdq_aquotd_getattr, +}; + + @@ -14675,7 +17046,7 @@ index 0000000..0355917 +} diff --git a/fs/quota/vzdquota/vzdq_mgmt.c b/fs/quota/vzdquota/vzdq_mgmt.c new file mode 100644 -index 0000000..5e078ed +index 0000000..bd066de --- /dev/null +++ b/fs/quota/vzdquota/vzdq_mgmt.c @@ -0,0 +1,754 @@ @@ -14764,7 +17135,7 @@ index 0000000..5e078ed + struct vz_quota_stat qstat; + struct vz_quota_master *qmblk; + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + + err = -EFAULT; + if (!compat) { @@ -14792,7 +17163,7 @@ index 0000000..5e078ed + if (IS_ERR(qmblk)) /* ENOMEM or EEXIST */ + err = PTR_ERR(qmblk); +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + + return err; +} @@ -14816,7 +17187,7 @@ index 0000000..5e078ed + struct super_block *dqsb; + + dqsb = NULL; -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); @@ -14856,7 +17227,7 @@ index 0000000..5e078ed + goto out_init; + qmblk->dq_state = VZDQ_WORKING; + -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + return 0; + +out_init: @@ -14871,7 +17242,7 @@ index 0000000..5e078ed +out: + if (dqsb) + vzquota_put_super(dqsb); -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + return err; +} + @@ -14889,7 +17260,7 @@ index 0000000..5e078ed + struct vz_quota_master *qmblk; + struct path root; + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + 
+ err = -ENOENT; + qmblk = vzquota_find_master(quota_id); @@ -14907,14 +17278,14 @@ index 0000000..5e078ed + + if (qmblk->dq_sb) + vzquota_put_super(qmblk->dq_sb); -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + + qmblk_put(qmblk); + path_put(&root); + return 0; + +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + return err; +} + @@ -14992,7 +17363,7 @@ index 0000000..5e078ed + int err, ret; + struct vz_quota_master *qmblk; + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); @@ -15015,7 +17386,7 @@ index 0000000..5e078ed + /* vzquota_destroy will free resources */ + qmblk->dq_state = VZDQ_STOPING; +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + + return err; +} @@ -15090,7 +17461,7 @@ index 0000000..5e078ed + struct vz_quota_stat qstat; + struct vz_quota_master *qmblk; + -+ down(&vz_quota_sem); /* for hash list protection */ ++ mutex_lock(&vz_quota_mutex); /* for hash list protection */ + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); @@ -15118,7 +17489,7 @@ index 0000000..5e078ed + qmblk_data_write_unlock(qmblk); + +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + return err; +} + @@ -15133,7 +17504,7 @@ index 0000000..5e078ed + struct vz_quota_stat qstat; + struct vz_quota_master *qmblk; + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); @@ -15160,7 +17531,7 @@ index 0000000..5e078ed + err = -EFAULT; + +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + return err; +} + @@ -15354,7 +17725,7 @@ index 0000000..5e078ed + p += len; + } + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + + /* traverse master hash table for all records */ + for (i = 0; i < vzquota_hash_size; i++) { @@ -15395,7 +17766,7 @@ index 0000000..5e078ed + + *eof = 1; /* checked all hash */ +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + + len = 0; + if (*start != 
NULL) { @@ -15435,10 +17806,10 @@ index 0000000..5e078ed +#endif diff --git a/fs/quota/vzdquota/vzdq_ops.c b/fs/quota/vzdquota/vzdq_ops.c new file mode 100644 -index 0000000..e22d573 +index 0000000..904ff5e --- /dev/null +++ b/fs/quota/vzdquota/vzdq_ops.c -@@ -0,0 +1,632 @@ +@@ -0,0 +1,644 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights reserved. @@ -15942,11 +18313,13 @@ index 0000000..e22d573 + * of vzquota. + * + * To be safe, we reacquire vzquota lock. ++ * The assumption is that it would not hurt to call ++ * vzquota_inode_drop() more than once, but it must ++ * be called at least once after S_NOQUOTA is set. + */ + inode_qmblk_lock(inode->i_sb); + inode->i_flags |= S_NOQUOTA; + inode_qmblk_unlock(inode->i_sb); -+ return; + } else { + loff_t bytes = inode_get_bytes(inode); +#ifdef CONFIG_VZ_QUOTA_UGID @@ -15969,9 +18342,8 @@ index 0000000..e22d573 +#endif + + vzquota_data_unlock(inode, &data); -+ -+ vzquota_inode_drop_call(inode); + } ++ vzquota_inode_drop_call(inode); +} + + @@ -16035,6 +18407,12 @@ index 0000000..e22d573 + NO_QUOTA : QUOTA_OK; +} + ++static void vzquota_swap_inode(struct inode *inode, struct inode *tmpl) ++{ ++ vzquota_inode_swap_call(inode, tmpl); ++} ++ ++ +#else /* CONFIG_VZ_QUOTA_UGID */ + +static int vzquota_transfer(struct inode *inode, struct iattr *iattr) @@ -16058,6 +18436,8 @@ index 0000000..e22d573 + NO_QUOTA : QUOTA_OK; +} + ++extern void vzquota_shutdown_super(struct super_block *sb); ++ +/* + * Structure of superblock diskquota operations. 
+ */ @@ -16070,6 +18450,9 @@ index 0000000..e22d573 + .free_inode = vzquota_free_inode, + .transfer = vzquota_transfer, + .rename = vzquota_rename, ++ ++ .swap_inode = vzquota_swap_inode, ++ .shutdown = vzquota_shutdown_super, +}; diff --git a/fs/quota/vzdquota/vzdq_tree.c b/fs/quota/vzdquota/vzdq_tree.c new file mode 100644 @@ -16365,10 +18748,10 @@ index 0000000..f4f2152 +} diff --git a/fs/quota/vzdquota/vzdq_ugid.c b/fs/quota/vzdquota/vzdq_ugid.c new file mode 100644 -index 0000000..60e0981 +index 0000000..a3e9e8c --- /dev/null +++ b/fs/quota/vzdquota/vzdq_ugid.c -@@ -0,0 +1,1220 @@ +@@ -0,0 +1,1216 @@ +/* + * Copyright (C) 2002 SWsoft + * All rights reserved. @@ -16410,10 +18793,6 @@ index 0000000..60e0981 + +static struct kmem_cache *vz_quota_ugid_cachep; + -+/* guard to protect vz_quota_master from destroy in quota_on/off. Also protects -+ * list on the hash table */ -+extern struct semaphore vz_quota_sem; -+ +inline struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid) +{ + if (qugid != VZ_QUOTA_UGBAD) @@ -16436,7 +18815,7 @@ index 0000000..60e0981 + +/* + * destroy ugid, if it have zero refcount, limits and usage -+ * must be called under qmblk->dq_sem ++ * must be called under qmblk->dq_mutex + */ +void vzquota_put_ugid(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid) @@ -16483,7 +18862,7 @@ index 0000000..60e0981 +} + +/* -+ * requires dq_sem ++ * requires dq_mutex + */ +struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags) @@ -16541,16 +18920,16 @@ index 0000000..60e0981 +} + +/* -+ * takes dq_sem, may schedule ++ * takes dq_mutex, may schedule + */ +struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags) +{ + struct vz_quota_ugid *qugid; + -+ down(&qmblk->dq_sem); ++ mutex_lock(&qmblk->dq_mutex); + qugid = __vzquota_find_ugid(qmblk, quota_id, type, flags); -+ up(&qmblk->dq_sem); ++ 
mutex_unlock(&qmblk->dq_mutex); + + return qugid; +} @@ -16705,7 +19084,7 @@ index 0000000..60e0981 + if (err < 0) + goto out_put; + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + mask2 = 0; + sb->dq_op = &vz_quota_operations2; + sb->s_qcop = &vz_quotactl_operations; @@ -16724,7 +19103,7 @@ index 0000000..60e0981 + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, type); + +out_sem: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); +out_put: + qmblk_put(qmblk); +out: @@ -16738,7 +19117,7 @@ index 0000000..60e0981 + int err; + + qmblk = vzquota_find_qmblk(sb); -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + err = -ESRCH; + if (qmblk == NULL) + goto out; @@ -16759,7 +19138,7 @@ index 0000000..60e0981 + err = 0; + +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; @@ -16778,7 +19157,7 @@ index 0000000..60e0981 + int err; + + qmblk = vzquota_find_qmblk(sb); -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + err = -ESRCH; + if (qmblk == NULL) + goto out; @@ -16807,13 +19186,13 @@ index 0000000..60e0981 + } + +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + -+/* must be called under vz_quota_sem */ ++/* must be called under vz_quota_mutex */ +static int __vz_set_dqblk(struct vz_quota_master *qmblk, + int type, qid_t id, struct if_dqblk *di) +{ @@ -16882,7 +19261,7 @@ index 0000000..60e0981 + int err; + + qmblk = vzquota_find_qmblk(sb); -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + err = -ESRCH; + if (qmblk == NULL) + goto out; @@ -16891,7 +19270,7 @@ index 0000000..60e0981 + goto out; + err = __vz_set_dqblk(qmblk, type, id, di); +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; @@ -16904,7 +19283,7 @@ index 0000000..60e0981 + int err; + + qmblk = 
vzquota_find_qmblk(sb); -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + err = -ESRCH; + if (qmblk == NULL) + goto out; @@ -16919,13 +19298,13 @@ index 0000000..60e0981 + ii->dqi_valid = IIF_ALL; + +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + -+/* must be called under vz_quota_sem */ ++/* must be called under vz_quota_mutex */ +static int __vz_set_dqinfo(struct vz_quota_master *qmblk, + int type, struct if_dqinfo *ii) +{ @@ -16947,7 +19326,7 @@ index 0000000..60e0981 + int err; + + qmblk = vzquota_find_qmblk(sb); -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + err = -ESRCH; + if (qmblk == NULL) + goto out; @@ -16956,7 +19335,7 @@ index 0000000..60e0981 + goto out; + err = __vz_set_dqinfo(qmblk, type, ii); +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; @@ -17003,8 +19382,8 @@ index 0000000..60e0981 + if (!kbuf) + goto out; + -+ down(&vz_quota_sem); -+ down(&qmblk->dq_sem); ++ mutex_lock(&vz_quota_mutex); ++ mutex_lock(&qmblk->dq_mutex); + for (ugid = vzquota_get_byindex(qmblk, idx, type), count = 0; + ugid != NULL && count < Q_GETQUOTI_SIZE; + count++) @@ -17019,8 +19398,8 @@ index 0000000..60e0981 + ugid = vzquota_get_next(qmblk, ugid); + BUG_ON(ugid != NULL && ugid->qugid_type != type); + } -+ up(&qmblk->dq_sem); -+ up(&vz_quota_sem); ++ mutex_unlock(&qmblk->dq_mutex); ++ mutex_unlock(&vz_quota_mutex); + + err = count; + if (copy_to_user(dqblk, kbuf, count * sizeof(*kbuf))) @@ -17060,7 +19439,7 @@ index 0000000..60e0981 + struct vz_quota_master *qmblk; + int ret; + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + + ret = -ENOENT; + qmblk = vzquota_find_master(quota_id); @@ -17114,7 +19493,7 @@ index 0000000..60e0981 + vzquota_put_ugid(qmblk, ugid); + } +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + + return ret; +} @@ -17127,7 
+19506,7 @@ index 0000000..60e0981 + struct dq_info *target; + int err, type; + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); @@ -17161,7 +19540,7 @@ index 0000000..60e0981 + target->iexpire = dq_info[type].iexpire; + } +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + + return err; +} @@ -17230,16 +19609,16 @@ index 0000000..60e0981 + if (k_ugid_buf == NULL) + return -ENOMEM; + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + -+ down(&qmblk->dq_sem); ++ mutex_lock(&qmblk->dq_mutex); + err = do_quota_ugid_getstat(qmblk, index, size, k_ugid_buf); -+ up(&qmblk->dq_sem); ++ mutex_unlock(&qmblk->dq_mutex); + if (err < 0) + goto out; + @@ -17265,7 +19644,7 @@ index 0000000..60e0981 + } + +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + vfree(k_ugid_buf); + return err; +} @@ -17278,7 +19657,7 @@ index 0000000..60e0981 + struct dq_info *target; + int err, type; + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); @@ -17307,7 +19686,7 @@ index 0000000..60e0981 +#endif + } +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + + return err; +} @@ -17319,7 +19698,7 @@ index 0000000..60e0981 + struct vz_quota_ugid_stat kinfo; + int err; + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); @@ -17334,7 +19713,7 @@ index 0000000..60e0981 + if (copy_to_user(info, &kinfo, sizeof(kinfo))) + err = -EFAULT; +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + + return err; +} @@ -17346,7 +19725,7 @@ index 0000000..60e0981 + struct vz_quota_ugid_stat kinfo; + int err; + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); @@ -17366,7 +19745,7 @@ index 0000000..60e0981 + } + 
+out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + + return err; +} @@ -17378,7 +19757,7 @@ index 0000000..60e0981 + struct vz_quota_ugid_setlimit lim; + int err; + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + + err = -ESRCH; + qmblk = vzquota_find_master(quota_id); @@ -17392,7 +19771,7 @@ index 0000000..60e0981 + err = __vz_set_dqblk(qmblk, lim.type, lim.id, &lim.dqb); + +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + + return err; +} @@ -17404,7 +19783,7 @@ index 0000000..60e0981 + struct vz_quota_ugid_setinfo info; + int err; + -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + + err = -ESRCH; + qmblk = vzquota_find_master(quota_id); @@ -17418,7 +19797,7 @@ index 0000000..60e0981 + err = __vz_set_dqinfo(qmblk, info.type, &info.dqi); + +out: -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + + return err; +} @@ -17509,14 +19888,14 @@ index 0000000..60e0981 + qmblk = vzquota_find_qmblk(sb); + if ((qmblk == NULL) || (qmblk == VZ_QUOTA_BAD)) + return; -+ down(&vz_quota_sem); ++ mutex_lock(&vz_quota_mutex); + if (qmblk->dq_flags & VZDQ_USRQUOTA) + sb->s_dquot.flags |= dquot_state_flag(DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED, USRQUOTA); + if (qmblk->dq_flags & VZDQ_GRPQUOTA) + sb->s_dquot.flags |= dquot_state_flag(DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED, GRPQUOTA); -+ up(&vz_quota_sem); ++ mutex_unlock(&vz_quota_mutex); + qmblk_put(qmblk); +} + @@ -17591,10 +19970,10 @@ index 0000000..60e0981 +} diff --git a/fs/quota/vzdquota/vzdquot.c b/fs/quota/vzdquota/vzdquot.c new file mode 100644 -index 0000000..6f2f22a +index 0000000..f091943 --- /dev/null +++ b/fs/quota/vzdquota/vzdquot.c -@@ -0,0 +1,1961 @@ +@@ -0,0 +1,1994 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights reserved. @@ -17639,7 +20018,7 @@ index 0000000..6f2f22a + * Serializes on/off and all other do_vzquotactl operations. + * Protects qmblk hash. 
+ */ -+struct semaphore vz_quota_sem; ++struct mutex vz_quota_mutex; + +/* + * Data access locks @@ -17703,7 +20082,7 @@ index 0000000..6f2f22a + * + * Master hash table handling. + * -+ * SMP not safe, serialied by vz_quota_sem within quota syscalls ++ * SMP not safe, serialied by vz_quota_mutex within quota syscalls + * + * --------------------------------------------------------------------- */ + @@ -17757,7 +20136,7 @@ index 0000000..6f2f22a +#endif + + qmblk->dq_state = VZDQ_STARTING; -+ init_MUTEX(&qmblk->dq_sem); ++ mutex_init(&qmblk->dq_mutex); + spin_lock_init(&qmblk->dq_data_lock); + + qmblk->dq_id = quota_id; @@ -17811,7 +20190,7 @@ index 0000000..6f2f22a + * vzquota_find_master - find master record with given id + * + * Returns qmblk without touching its refcounter. -+ * Called under vz_quota_sem. ++ * Called under vz_quota_mutex. + */ +struct vz_quota_master *vzquota_find_master(unsigned int quota_id) +{ @@ -17830,7 +20209,7 @@ index 0000000..6f2f22a + * vzquota_free_master - release resources taken by qmblk, freeing memory + * + * qmblk is assumed to be already taken out from the hash. -+ * Should be called outside vz_quota_sem. ++ * Should be called outside vz_quota_mutex. + */ +void vzquota_free_master(struct vz_quota_master *qmblk) +{ @@ -17912,7 +20291,7 @@ index 0000000..6f2f22a + * quotas. We keep a counter of such subtrees and set VZ quota operations or + * reset the default ones. + * -+ * Called under vz_quota_sem (from quota_on). ++ * Called under vz_quota_mutex (from quota_on). + */ +int vzquota_get_super(struct super_block *sb) +{ @@ -17954,7 +20333,7 @@ index 0000000..6f2f22a + __module_get(THIS_MODULE); + up(&sb->s_dquot.dqonoff_sem); + } -+ /* protected by vz_quota_sem */ ++ /* protected by vz_quota_mutex */ + __VZ_QUOTA_SBREF(sb)++; + return 0; +} @@ -17962,7 +20341,7 @@ index 0000000..6f2f22a +/** + * quota_put_super - release superblock when one quota tree goes away + * -+ * Called under vz_quota_sem. ++ * Called under vz_quota_mutex. 
+ */ +void vzquota_put_super(struct super_block *sb) +{ @@ -18004,28 +20383,17 @@ index 0000000..6f2f22a + +#else + -+struct vzquota_new_sop { -+ struct super_operations new_op; -+ const struct super_operations *old_op; -+}; -+ +/** + * vzquota_shutdown_super - callback on umount + */ +void vzquota_shutdown_super(struct super_block *sb) +{ + struct vz_quota_master *qmblk; -+ struct vzquota_new_sop *sop; + + qmblk = __VZ_QUOTA_NOQUOTA(sb); + __VZ_QUOTA_NOQUOTA(sb) = NULL; + if (qmblk != NULL) + qmblk_put(qmblk); -+ sop = container_of(sb->s_op, struct vzquota_new_sop, new_op); -+ sb->s_op = sop->old_op; -+ kfree(sop); -+ if (sb->s_op->put_super != NULL) -+ (*sb->s_op->put_super)(sb); +} + +/** @@ -18034,12 +20402,11 @@ index 0000000..6f2f22a + * One superblock can have multiple directory subtrees with different VZ + * quotas. + * -+ * Called under vz_quota_sem (from vzquota_on). ++ * Called under vz_quota_mutex (from vzquota_on). + */ +int vzquota_get_super(struct super_block *sb) +{ + struct vz_quota_master *qnew; -+ struct vzquota_new_sop *sop; + int err; + + mutex_lock(&sb->s_dquot.dqonoff_mutex); @@ -18059,17 +20426,6 @@ index 0000000..6f2f22a + } + + if (sb->dq_op != &vz_quota_operations) { -+ sop = kmalloc(sizeof(*sop), GFP_KERNEL); -+ if (sop == NULL) { -+ vzquota_free_master(__VZ_QUOTA_NOQUOTA(sb)); -+ __VZ_QUOTA_NOQUOTA(sb) = NULL; -+ goto out_up; -+ } -+ memcpy(&sop->new_op, sb->s_op, sizeof(sop->new_op)); -+ sop->new_op.put_super = &vzquota_shutdown_super; -+ sop->old_op = sb->s_op; -+ sb->s_op = &sop->new_op; -+ + sb->dq_op = &vz_quota_operations; +#ifdef CONFIG_VZ_QUOTA_UGID + sb->s_qcop = &vz_quotactl_operations; @@ -18115,7 +20471,7 @@ index 0000000..6f2f22a +/** + * vzquota_put_super - one quota tree less on this superblock + * -+ * Called under vz_quota_sem. ++ * Called under vz_quota_mutex. 
+ */ +void vzquota_put_super(struct super_block *sb) +{ @@ -18194,12 +20550,12 @@ index 0000000..6f2f22a + quid = qlnk->qugid[USRQUOTA]; + qgid = qlnk->qugid[GRPQUOTA]; + if (quid != NULL || qgid != NULL) { -+ down(&qmblk->dq_sem); ++ mutex_lock(&qmblk->dq_mutex); + if (qgid != NULL) + vzquota_put_ugid(qmblk, qgid); + if (quid != NULL) + vzquota_put_ugid(qmblk, quid); -+ up(&qmblk->dq_sem); ++ mutex_unlock(&qmblk->dq_mutex); + } + } +#endif @@ -18315,10 +20671,10 @@ index 0000000..6f2f22a + spin_unlock(&dcache_lock); + inode_qmblk_unlock(inode->i_sb); + -+ down(&qmblk->dq_sem); ++ mutex_lock(&qmblk->dq_mutex); + quid = __vzquota_find_ugid(qmblk, inode->i_uid, USRQUOTA, 0); + qgid = __vzquota_find_ugid(qmblk, inode->i_gid, GRPQUOTA, 0); -+ up(&qmblk->dq_sem); ++ mutex_unlock(&qmblk->dq_mutex); + + inode_qmblk_lock(inode->i_sb); + spin_lock(&dcache_lock); @@ -18361,14 +20717,14 @@ index 0000000..6f2f22a + qmblk_data_write_unlock(qmblk); + inode_qmblk_unlock(inode->i_sb); + -+ down(&qmblk->dq_sem); ++ mutex_lock(&qmblk->dq_mutex); + if (mask & (1 << USRQUOTA)) + quid = __vzquota_find_ugid(qmblk, iattr->ia_uid, + USRQUOTA, 0); + if (mask & (1 << GRPQUOTA)) + qgid = __vzquota_find_ugid(qmblk, iattr->ia_gid, + GRPQUOTA, 0); -+ up(&qmblk->dq_sem); ++ mutex_unlock(&qmblk->dq_mutex); + + inode_qmblk_lock(inode->i_sb); + qmblk_data_write_lock(qmblk); @@ -18529,6 +20885,29 @@ index 0000000..6f2f22a + return qmblk; +} + ++/* NFS root is disconnected dentry. 
*/ ++ ++static int is_nfs_root(struct inode * inode) ++{ ++ struct dentry *de; ++ ++ if (inode->i_sb->s_magic != 0x6969) ++ return 0; ++ ++ if (list_empty(&inode->i_dentry)) ++ return 0; ++ ++ list_for_each_entry(de, &inode->i_dentry, d_alias) { ++ if (de->d_parent != de) ++ return 0; ++ if (d_unhashed(de)) ++ return 0; ++ if (!(de->d_flags & DCACHE_DISCONNECTED)) ++ return 0; ++ } ++ return 1; ++} ++ +static void vzquota_dbranch_actualize(struct inode *inode, + struct inode *refinode) +{ @@ -18539,7 +20918,7 @@ index 0000000..6f2f22a + vzquota_qlnk_init(&qlnk); + +start: -+ if (inode == inode->i_sb->s_root->d_inode) { ++ if (inode == inode->i_sb->s_root->d_inode || is_nfs_root(inode)) { + /* filesystem root */ + atomic_inc(&inode->i_count); + do { @@ -18594,7 +20973,7 @@ index 0000000..6f2f22a + struct inode *pinode; + struct vz_quota_master *qmblk; + -+ if (inode == inode->i_sb->s_root->d_inode) { ++ if (inode == inode->i_sb->s_root->d_inode || is_nfs_root(inode)) { + /* filesystem root */ + do { + qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); @@ -18859,6 +21238,39 @@ index 0000000..6f2f22a + spin_unlock(&dcache_lock); +} + ++void vzquota_inode_swap_call(struct inode *inode, struct inode *tmpl) ++{ ++ struct vz_quota_master *qmblk; ++ ++ __vzquota_inode_init(inode, VZ_QUOTAO_INIT); ++ ++ might_sleep(); ++ ++ inode_qmblk_lock(tmpl->i_sb); ++ if (unlikely(tmpl->i_flags & S_NOQUOTA)) { ++ inode_qmblk_unlock(tmpl->i_sb); ++ return; ++ } ++ __vzquota_inode_init(tmpl, VZ_QUOTAO_INICAL); ++ ++ qmblk = INODE_QLNK(tmpl)->qmblk; ++ if (qmblk != VZ_QUOTA_BAD) { ++ void * uq; ++ list_del_init(&INODE_QLNK(tmpl)->list); ++ vzquota_qlnk_swap(INODE_QLNK(tmpl), INODE_QLNK(inode)); ++ uq = inode->i_dquot[USRQUOTA]; ++ inode->i_dquot[USRQUOTA] = tmpl->i_dquot[USRQUOTA]; ++ tmpl->i_dquot[USRQUOTA] = uq; ++ tmpl->i_flags |= S_NOQUOTA; ++ inode_qmblk_unlock(inode->i_sb); ++ ++ vzquota_inode_drop(tmpl); ++ } else { ++ inode_qmblk_unlock(tmpl->i_sb); ++ } ++} ++ ++ +/** + * 
vzquota_inode_drop_call - call from DQUOT_DROP + */ @@ -19513,7 +21925,7 @@ index 0000000..6f2f22a + goto out_ugid; +#endif + -+ init_MUTEX(&vz_quota_sem); ++ mutex_init(&vz_quota_mutex); + vzioctl_register(&vzdqcalls); + virtinfo_notifier_register(VITYPE_QUOTA, "a_notifier_block); +#if defined(CONFIG_VZ_QUOTA_UGID) && defined(CONFIG_PROC_FS) @@ -19893,10 +22305,10 @@ index b07565c..5b872c3 100644 size_t, sizemask) diff --git a/fs/simfs.c b/fs/simfs.c new file mode 100644 -index 0000000..2fccd6d +index 0000000..e21f911 --- /dev/null +++ b/fs/simfs.c -@@ -0,0 +1,335 @@ +@@ -0,0 +1,339 @@ +/* + * fs/simfs.c + * @@ -20032,7 +22444,7 @@ index 0000000..2fccd6d + + err = -ENOSYS; + if (lsb && lsb->s_op && lsb->s_op->statfs) -+ err = lsb->s_op->statfs(lsb->s_root, &statbuf); ++ err = lsb->s_op->statfs(sb->s_root, &statbuf); + if (err) + return err; + @@ -20074,10 +22486,12 @@ index 0000000..2fccd6d + return (err ? NOTIFY_BAD : NOTIFY_OK); +} + ++#ifdef CONFIG_QUOTA +static struct inode *sim_quota_root(struct super_block *sb) +{ + return sb->s_root->d_inode; +} ++#endif + +/* + * NOTE: We need to setup s_bdev field on super block, since sys_quotactl() @@ -20124,7 +22538,9 @@ index 0000000..2fccd6d +} + +static struct super_operations sim_super_ops = { ++#ifdef CONFIG_QUOTA + .get_quota_root = sim_quota_root, ++#endif +}; + +static int sim_fill_super(struct super_block *s, void *data) @@ -20277,7 +22693,7 @@ index c4ecd52..37e6cd9 100644 return inode->i_op->getattr(mnt, dentry, stat); diff --git a/fs/super.c b/fs/super.c -index aff046b..a2e26f4 100644 +index aff046b..cce99ab 100644 --- a/fs/super.c +++ b/fs/super.c @@ -37,12 +37,15 @@ @@ -20314,7 +22730,13 @@ index aff046b..a2e26f4 100644 /* * sget() can have s_umount recursion. 
* -@@ -311,7 +316,7 @@ void generic_shutdown_super(struct super_block *sb) +@@ -307,11 +312,13 @@ void generic_shutdown_super(struct super_block *sb) + /* bad name - it should be evict_inodes() */ + invalidate_inodes(sb); + ++ if (sb->dq_op && sb->dq_op->shutdown) ++ sb->dq_op->shutdown(sb); + if (sop->put_super) sop->put_super(sb); /* Forget any remaining inodes */ @@ -20323,7 +22745,7 @@ index aff046b..a2e26f4 100644 printk("VFS: Busy inodes after unmount of %s. " "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); -@@ -531,17 +536,26 @@ rescan: +@@ -531,17 +538,26 @@ rescan: spin_unlock(&sb_lock); return NULL; } @@ -20354,7 +22776,7 @@ index aff046b..a2e26f4 100644 err = vfs_statfs(s->s_root, &sbuf); drop_super(s); if (err) -@@ -653,6 +667,13 @@ static DEFINE_IDA(unnamed_dev_ida); +@@ -653,6 +669,13 @@ static DEFINE_IDA(unnamed_dev_ida); static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ static int unnamed_dev_start = 0; /* don't bother trying below it */ @@ -20368,7 +22790,7 @@ index aff046b..a2e26f4 100644 int set_anon_super(struct super_block *s, void *data) { int dev; -@@ -672,7 +693,7 @@ int set_anon_super(struct super_block *s, void *data) +@@ -672,7 +695,7 @@ int set_anon_super(struct super_block *s, void *data) else if (error) return -EAGAIN; @@ -20377,7 +22799,7 @@ index aff046b..a2e26f4 100644 spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, dev); if (unnamed_dev_start > dev) -@@ -680,7 +701,7 @@ int set_anon_super(struct super_block *s, void *data) +@@ -680,7 +703,7 @@ int set_anon_super(struct super_block *s, void *data) spin_unlock(&unnamed_dev_lock); return -EMFILE; } @@ -20386,7 +22808,7 @@ index aff046b..a2e26f4 100644 return 0; } -@@ -688,8 +709,9 @@ EXPORT_SYMBOL(set_anon_super); +@@ -688,8 +711,9 @@ EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { @@ -20856,6 +23278,51 @@ index af4c4e7..561271d 100644 extern struct kmem_cache *sysfs_dir_cachep; /* +diff --git a/fs/utimes.c 
b/fs/utimes.c +index e4c75db..86a62a1 100644 +--- a/fs/utimes.c ++++ b/fs/utimes.c +@@ -40,6 +40,20 @@ SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) + + #endif + ++SYSCALL_DEFINE2(lutime, char __user *, filename, struct utimbuf __user *, times) ++{ ++ struct timespec tv[2]; ++ ++ if (times) { ++ if (get_user(tv[0].tv_sec, ×->actime) || ++ get_user(tv[1].tv_sec, ×->modtime)) ++ return -EFAULT; ++ tv[0].tv_nsec = 0; ++ tv[1].tv_nsec = 0; ++ } ++ return do_utimes(AT_FDCWD, filename, times ? tv : NULL, AT_SYMLINK_NOFOLLOW); ++} ++ + static bool nsec_valid(long nsec) + { + if (nsec == UTIME_OMIT || nsec == UTIME_NOW) +diff --git a/fs/xattr.c b/fs/xattr.c +index 6d4f6d3..3243bd7 100644 +--- a/fs/xattr.c ++++ b/fs/xattr.c +@@ -115,6 +115,15 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, + struct inode *inode = dentry->d_inode; + int error; + ++#if defined(CONFIG_VE) && defined(CONFIG_SYSCTL) ++ if (!ve_is_super(get_exec_env())) { ++ if (ve_xattr_policy == VE_XATTR_POLICY_IGNORE) ++ return 0; ++ else if (ve_xattr_policy == VE_XATTR_POLICY_REJECT) ++ return -EPERM; ++ } ++#endif ++ + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; diff --git a/include/asm-generic/mman.h b/include/asm-generic/mman.h index 32c8bd6..cb151a4 100644 --- a/include/asm-generic/mman.h @@ -20870,10 +23337,10 @@ index 32c8bd6..cb151a4 100644 #define MCL_FUTURE 2 /* lock all future mappings */ diff --git a/include/bc/beancounter.h b/include/bc/beancounter.h new file mode 100644 -index 0000000..aabbc72 +index 0000000..7ba4c77 --- /dev/null +++ b/include/bc/beancounter.h -@@ -0,0 +1,454 @@ +@@ -0,0 +1,453 @@ +/* + * include/bc/beancounter.h + * @@ -20953,12 +23420,12 @@ index 0000000..aabbc72 +/* Add new resources here */ + +#define UB_NUMXTENT 23 -+#define UB_RESOURCES 24 ++#define UB_SWAPPAGES 24 ++#define UB_RESOURCES 25 + +#define UB_UNUSEDPRIVVM (UB_RESOURCES + 0) +#define UB_TMPFSPAGES (UB_RESOURCES + 1) 
-+#define UB_SWAPPAGES (UB_RESOURCES + 2) -+#define UB_HELDPAGES (UB_RESOURCES + 3) ++#define UB_HELDPAGES (UB_RESOURCES + 2) + +struct ubparm { + /* @@ -21017,7 +23484,7 @@ index 0000000..aabbc72 +struct page_private { + unsigned long ubp_unused_privvmpages; + unsigned long ubp_tmpfs_respages; -+ unsigned long ubp_swap_pages; ++ unsigned long ubp_pbcs; + unsigned long long ubp_held_pages; +}; + @@ -21046,7 +23513,6 @@ index 0000000..aabbc72 +#ifdef CONFIG_BC_DEBUG_KMEM + long pages_charged; + long vmalloc_charged; -+ long pbcs; +#endif + unsigned long sync; + unsigned long sync_done; @@ -21080,6 +23546,7 @@ index 0000000..aabbc72 + + spinlock_t ub_lock; + uid_t ub_uid; ++ unsigned int ub_cookie; + + struct ub_rate_info ub_limit_rl; + int ub_oom_noproc; @@ -21087,8 +23554,8 @@ index 0000000..aabbc72 + struct page_private ppriv; +#define ub_unused_privvmpages ppriv.ubp_unused_privvmpages +#define ub_tmpfs_respages ppriv.ubp_tmpfs_respages -+#define ub_swap_pages ppriv.ubp_swap_pages +#define ub_held_pages ppriv.ubp_held_pages ++#define ub_pbcs ppriv.ubp_pbcs + struct sock_private spriv; +#define ub_rmem_thres spriv.ubp_rmem_thres +#define ub_maxadvmss spriv.ubp_maxadvmss @@ -21100,6 +23567,7 @@ index 0000000..aabbc72 +#define ub_tw_count spriv.ubp_tw_count + + struct user_beancounter *parent; ++ int ub_childs; + void *private_data; + unsigned long ub_aflags; + @@ -21125,6 +23593,8 @@ index 0000000..aabbc72 +#endif +}; + ++extern int ub_count; ++ +enum ub_severity { UB_HARD, UB_SOFT, UB_FORCE }; + +#define UB_AFLAG_NOTIF_PAGEIN 0 @@ -21189,16 +23659,12 @@ index 0000000..aabbc72 +#else /* CONFIG_BEANCOUNTERS */ + +#define ub_percpu_add(ub, field, v) do { \ -+ if (ub->ub_percpu == NULL) \ -+ break; \ + per_cpu_ptr(ub->ub_percpu, get_cpu())->field += (v); \ + put_cpu(); \ + } while (0) +#define ub_percpu_inc(ub, field) ub_percpu_add(ub, field, 1) + +#define ub_percpu_sub(ub, field, v) do { \ -+ if (ub->ub_percpu == NULL) \ -+ break; \ + per_cpu_ptr(ub->ub_percpu, 
get_cpu())->field -= (v); \ + put_cpu(); \ + } while (0) @@ -21491,10 +23957,10 @@ index 0000000..23306e9 +#endif /* __dcache_op.h_ */ diff --git a/include/bc/debug.h b/include/bc/debug.h new file mode 100644 -index 0000000..7b1feb6 +index 0000000..58c64f3 --- /dev/null +++ b/include/bc/debug.h -@@ -0,0 +1,109 @@ +@@ -0,0 +1,103 @@ +/* + * include/bc/debug.h + * @@ -21588,17 +24054,11 @@ index 0000000..7b1feb6 + ub_percpu_sub(ub, vmalloc_charged, \ + vm->nr_pages); \ + } while (0) -+ -+#define inc_pbc_count(ub) ub_percpu_inc(ub, pbcs) -+#define dec_pbc_count(ub) ub_percpu_dec(ub, pbcs) +#else +#define init_cache_counters() do { } while (0) +#define inc_vmalloc_charged(vm, f) do { } while (0) +#define dec_vmalloc_charged(vm) do { } while (0) + -+#define inc_pbc_count(ub) do { } while (0) -+#define dec_pbc_count(ub) do { } while (0) -+ +#define ub_free_counters(ub) do { } while (0) +#define ub_kmemcache_free(cachep) do { } while (0) +#endif @@ -21695,7 +24155,7 @@ index 0000000..b2afb69 +#endif /* _LINUX_UBHASH_H */ diff --git a/include/bc/io_acct.h b/include/bc/io_acct.h new file mode 100644 -index 0000000..d84bf5a +index 0000000..361b26c --- /dev/null +++ b/include/bc/io_acct.h @@ -0,0 +1,113 @@ @@ -21714,6 +24174,8 @@ index 0000000..d84bf5a +#ifndef __UB_IO_ACCT_H_ +#define __UB_IO_ACCT_H_ + ++#define PAGE_IO_MARK (0x1UL) ++ +#ifdef CONFIG_BC_IO_ACCOUNTING +#include +#include @@ -21748,8 +24210,6 @@ index 0000000..d84bf5a +extern void ub_io_save_context(struct page *, size_t); +extern void ub_io_release_context(struct page *pg, size_t size); + -+#define PAGE_IO_MARK (0x1UL) -+ +static inline struct page_beancounter *iopb_to_pb(struct page_beancounter *pb) +{ + if (!((unsigned long)pb & PAGE_IO_MARK)) @@ -22876,7 +25336,7 @@ index aea219d..89cab9b 100644 + #endif /* __LINUX__AIO_H */ diff --git a/include/linux/capability.h b/include/linux/capability.h -index c8f2a5f..3f85123 100644 +index c8f2a5f..301d709 100644 --- a/include/linux/capability.h +++ 
b/include/linux/capability.h @@ -197,12 +197,9 @@ struct cpu_vfs_cap_data { @@ -22926,7 +25386,7 @@ index c8f2a5f..3f85123 100644 /* Allow setting readahead and flushing buffers on block devices */ /* Allow setting geometry in floppy driver */ /* Allow turning DMA on/off in xd driver */ -@@ -340,6 +333,50 @@ struct cpu_vfs_cap_data { +@@ -340,6 +333,61 @@ struct cpu_vfs_cap_data { #define CAP_SETFCAP 31 @@ -22938,10 +25398,21 @@ index c8f2a5f..3f85123 100644 + */ + +/* Allow access to all information. In the other case some structures will be -+ hiding to ensure different Virtual Environment non-interaction on the same -+ node */ ++ * hiding to ensure different Virtual Environment non-interaction on the same ++ * node (NOW OBSOLETED) ++ */ +#define CAP_SETVEID 29 + ++#define capable_setveid() ({ \ ++ ve_is_super(get_exec_env()) && \ ++ (capable(CAP_SYS_ADMIN) || \ ++ capable(CAP_VE_ADMIN)); \ ++ }) ++ ++/* ++ * coinsides with CAP_AUDIT_CONTROL but we don't care, since ++ * audit is disabled in Virtuozzo ++ */ +#define CAP_VE_ADMIN 30 + +#ifdef CONFIG_VE @@ -22977,7 +25448,7 @@ index c8f2a5f..3f85123 100644 /* Override MAC access. The base kernel enforces no MAC policy. 
An LSM may enforce a MAC policy, and if it does and it chooses -@@ -418,7 +455,16 @@ struct cpu_vfs_cap_data { +@@ -418,7 +466,16 @@ struct cpu_vfs_cap_data { #define CAP_INIT_INH_SET CAP_EMPTY_SET # define cap_clear(c) do { (c) = __cap_empty_set; } while (0) @@ -22994,7 +25465,7 @@ index c8f2a5f..3f85123 100644 # define cap_set_init_eff(c) do { (c) = __cap_init_eff_set; } while (0) #define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag)) -@@ -536,6 +582,10 @@ extern const kernel_cap_t __cap_empty_set; +@@ -536,6 +593,10 @@ extern const kernel_cap_t __cap_empty_set; extern const kernel_cap_t __cap_full_set; extern const kernel_cap_t __cap_init_eff_set; @@ -23061,10 +25532,10 @@ index 0000000..be88d2d + diff --git a/include/linux/cpt_image.h b/include/linux/cpt_image.h new file mode 100644 -index 0000000..6ab78b7 +index 0000000..8185d4e --- /dev/null +++ b/include/linux/cpt_image.h -@@ -0,0 +1,1799 @@ +@@ -0,0 +1,1842 @@ +/* + * + * include/linux/cpt_image.h @@ -23187,11 +25658,15 @@ index 0000000..6ab78b7 +#define CPT_VERSION_16 0x200 +#define CPT_VERSION_18 0x300 +#define CPT_VERSION_18_1 0x301 ++#define CPT_VERSION_18_2 0x302 ++#define CPT_VERSION_18_3 0x303 +#define CPT_VERSION_20 0x400 +#define CPT_VERSION_24 0x500 +#define CPT_VERSION_26 0x600 +#define CPT_VERSION_27 0x700 ++#define CPT_VERSION_27_3 0x703 +#define CPT_VERSION_32 0x800 ++#define CPT_CURRENT_VERSION CPT_VERSION_32 + __u16 cpt_os_arch; /* Architecture */ +#define CPT_OS_ARCH_I386 0 +#define CPT_OS_ARCH_EMT64 1 @@ -23238,6 +25713,7 @@ index 0000000..6ab78b7 +#define CPT_BIND_MOUNT 21 +#define CPT_UNSUPPORTED_NETDEV 22 +#define CPT_UNSUPPORTED_MISC 23 ++#define CPT_SLM_DMPRST 24 + +/* This mask is used to determine whether VE + has some unsupported features or not */ @@ -23291,6 +25767,7 @@ index 0000000..6ab78b7 + CPT_SECT_VSYSCALL, + CPT_SECT_INOTIFY, + CPT_SECT_SYSV_MSG, ++ CPT_SECT_SNMP_STATS, + CPT_SECT_MAX +}; + @@ -23380,7 +25857,7 @@ index 0000000..6ab78b7 + + /* later 
extension */ + __u32 last_pid; -+ __u32 pad1; ++ __u32 rnd_va_space; + __u64 reserved[8]; +} __attribute__ ((aligned (8))); + @@ -23409,6 +25886,8 @@ index 0000000..6ab78b7 +#define CPT_DENTRY_INOTIFY 0x40 +#define CPT_DENTRY_FUTEX 0x80 +#define CPT_DENTRY_TUNTAP 0x100 ++#define CPT_DENTRY_PROCPID_DEAD 0x200 ++#define CPT_DENTRY_HARDLINKED 0x400 +#define CPT_DENTRY_SIGNALFD 0x800 + __u64 cpt_inode; + __u64 cpt_priv; @@ -24373,6 +26852,8 @@ index 0000000..6ab78b7 + + __u64 cpt_state; + __u64 cpt_flags; ++#define CPT_TASK_FLAGS_MASK (PF_EXITING | PF_FORKNOEXEC | \ ++ PF_SUPERPRIV | PF_DUMPCORE | PF_SIGNALED) + __u64 cpt_ptrace; + __u32 cpt_prio; + __u32 cpt_static_prio; @@ -24771,6 +27252,39 @@ index 0000000..6ab78b7 + __u32 cpt_mark; +} __attribute__ ((aligned (8))); + ++/* cpt_ip_conntrack_image struct from 2.6.9 kernel */ ++struct cpt_ip_conntrack_image_compat ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ struct cpt_ipct_tuple cpt_tuple[2]; ++ __u64 cpt_status; ++ __u64 cpt_timeout; ++ __u32 cpt_index; ++ __u8 cpt_ct_helper; ++ __u8 cpt_nat_helper; ++ __u16 __cpt_pad1; ++ ++ /* union ip_conntrack_proto. Used by tcp and icmp. */ ++ __u32 cpt_proto_data[12]; ++ ++ /* union ip_conntrack_help. Used only by ftp helper. 
*/ ++ __u32 cpt_help_data[4]; ++ ++ /* nat info */ ++ __u32 cpt_initialized; ++ __u32 cpt_num_manips; ++ struct cpt_nat_manip cpt_nat_manips[6]; ++ ++ struct cpt_nat_seq cpt_nat_seq[2]; ++ ++ __u32 cpt_masq_index; ++ __u32 __cpt_pad2; ++} __attribute__ ((aligned (8))); ++ +struct cpt_ubparm +{ + __u64 barrier; @@ -24789,7 +27303,7 @@ index 0000000..6ab78b7 + + __u64 cpt_parent; + __u32 cpt_id; -+ __u32 __cpt_pad; ++ __u32 cpt_ub_resources; + struct cpt_ubparm cpt_parms[32 * 2]; +} __attribute__ ((aligned (8))); + @@ -24866,10 +27380,10 @@ index 0000000..6ab78b7 +#endif /* __CPT_IMAGE_H_ */ diff --git a/include/linux/cpt_ioctl.h b/include/linux/cpt_ioctl.h new file mode 100644 -index 0000000..b8e83cc +index 0000000..f31b66c --- /dev/null +++ b/include/linux/cpt_ioctl.h -@@ -0,0 +1,43 @@ +@@ -0,0 +1,45 @@ +/* + * + * include/linux/cpt_ioctl.h @@ -24911,6 +27425,8 @@ index 0000000..b8e83cc +#define CPT_SET_ERRORFD _IOW(CPTCTLTYPE, 21, int) + +#define CPT_ITER _IOW(CPTCTLTYPE, 23, int) ++#define CPT_LINKDIR_ADD _IOW(CPTCTLTYPE, 24, int) ++#define CPT_HARDLNK_ON _IOW(CPTCTLTYPE, 25, int) + +#endif diff --git a/include/linux/dcache.h b/include/linux/dcache.h @@ -25153,10 +27669,10 @@ index f6856a5..a7f552f 100644 static inline void eventpoll_init_file(struct file *file) {} diff --git a/include/linux/fairsched.h b/include/linux/fairsched.h new file mode 100644 -index 0000000..e08c84d +index 0000000..521455c --- /dev/null +++ b/include/linux/fairsched.h -@@ -0,0 +1,86 @@ +@@ -0,0 +1,92 @@ +/* + * Fair Scheduler + * @@ -25229,6 +27745,9 @@ index 0000000..e08c84d +asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned int weight); +asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate); + ++int fairsched_new_node(int id, unsigned int vcpus); ++void fairsched_drop_node(int id); ++ +#else /* CONFIG_VZ_FAIRSCHED */ + +static inline void fairsched_init_early(void) { } @@ -25237,6 +27756,9 @@ index 0000000..e08c84d +static inline void 
get_task_fairsched_node(struct task_struct *p) { } +static inline void put_task_fairsched_node(struct task_struct *p) { } + ++static inline int fairsched_new_node(int id, unsigned int vcpus) { return 0; } ++static inline void fairsched_drop_node(int id) { } ++ +#define INIT_VZ_FAIRSCHED + +#endif /* CONFIG_VZ_FAIRSCHED */ @@ -25306,10 +27828,10 @@ index 335a0a5..8e31c51 100644 + #endif /* __LINUX_FILE_H */ diff --git a/include/linux/freezer.h b/include/linux/freezer.h -index 5a361f8..9426083 100644 +index da7e52b..099191c 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h -@@ -160,6 +160,8 @@ static inline void set_freezable_with_signal(void) +@@ -163,6 +163,8 @@ static inline void set_freezable_with_signal(void) } while (try_to_freeze()); \ __retval; \ }) @@ -25319,7 +27841,7 @@ index 5a361f8..9426083 100644 static inline int frozen(struct task_struct *p) { return 0; } static inline int freezing(struct task_struct *p) { return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h -index 692a3ee..53547b0 100644 +index 9b67805..3fef9ef 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -53,6 +53,7 @@ struct inodes_stat_t { @@ -25348,7 +27870,17 @@ index 692a3ee..53547b0 100644 #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() * during rename() internally. -@@ -370,7 +375,6 @@ struct inodes_stat_t { +@@ -235,6 +240,9 @@ struct inodes_stat_t { + #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ + #define S_PRIVATE 512 /* Inode is fs-internal */ + ++/* VZ flags -- These are not upstream! */ ++#define S_NOUNUSE (1 << 17) /* just destroy inode in cleanup */ ++ + /* + * Note that nosuid etc flags are inode-specific: setting some file-system + * flags just means all the inodes inherit those flags by default. 
It might be +@@ -370,7 +378,6 @@ struct inodes_stat_t { #include #include #include @@ -25356,7 +27888,7 @@ index 692a3ee..53547b0 100644 #include #include #include -@@ -405,6 +409,7 @@ extern int get_max_files(void); +@@ -405,6 +412,7 @@ extern int get_max_files(void); extern int sysctl_nr_open; extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; @@ -25364,7 +27896,7 @@ index 692a3ee..53547b0 100644 #ifdef CONFIG_DNOTIFY extern int dir_notify_enable; #endif -@@ -464,10 +469,15 @@ struct iattr { +@@ -464,10 +472,15 @@ struct iattr { struct file *ia_file; }; @@ -25380,7 +27912,7 @@ index 692a3ee..53547b0 100644 /** * enum positive_aop_returns - aop return codes with specific semantics -@@ -754,6 +764,9 @@ struct inode { +@@ -754,6 +767,9 @@ struct inode { #ifdef CONFIG_QUOTA struct dquot *i_dquot[MAXQUOTAS]; #endif @@ -25390,7 +27922,7 @@ index 692a3ee..53547b0 100644 struct list_head i_devices; union { struct pipe_inode_info *i_pipe; -@@ -809,6 +822,8 @@ enum inode_i_mutex_lock_class +@@ -809,6 +825,8 @@ enum inode_i_mutex_lock_class I_MUTEX_QUOTA }; @@ -25399,7 +27931,7 @@ index 692a3ee..53547b0 100644 /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic -@@ -929,6 +944,7 @@ struct file { +@@ -929,6 +947,7 @@ struct file { struct fown_struct f_owner; const struct cred *f_cred; struct file_ra_state f_ra; @@ -25407,7 +27939,7 @@ index 692a3ee..53547b0 100644 u64 f_version; #ifdef CONFIG_SECURITY -@@ -945,6 +961,7 @@ struct file { +@@ -945,6 +964,7 @@ struct file { #ifdef CONFIG_DEBUG_WRITECOUNT unsigned long f_mnt_write_state; #endif @@ -25415,7 +27947,7 @@ index 692a3ee..53547b0 100644 }; extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); -@@ -1063,6 +1080,9 @@ struct file_lock { +@@ -1063,6 +1083,9 @@ struct file_lock { fl_owner_t fl_owner; unsigned char fl_flags; unsigned char fl_type; @@ -25425,7 +27957,7 @@ index 692a3ee..53547b0 100644 unsigned 
int fl_pid; struct pid *fl_nspid; wait_queue_head_t fl_wait; -@@ -1509,6 +1529,7 @@ struct file_operations { +@@ -1509,6 +1532,7 @@ struct file_operations { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); @@ -25433,7 +27965,7 @@ index 692a3ee..53547b0 100644 }; struct inode_operations { -@@ -1578,6 +1599,7 @@ struct super_operations { +@@ -1578,6 +1602,7 @@ struct super_operations { #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); @@ -25441,7 +27973,7 @@ index 692a3ee..53547b0 100644 #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); }; -@@ -1755,8 +1777,14 @@ struct file_system_type { +@@ -1755,8 +1780,14 @@ struct file_system_type { struct lock_class_key i_mutex_key; struct lock_class_key i_mutex_dir_key; struct lock_class_key i_alloc_sem_key; @@ -25456,7 +27988,7 @@ index 692a3ee..53547b0 100644 extern int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int), struct vfsmount *mnt); -@@ -1800,6 +1828,11 @@ extern int register_filesystem(struct file_system_type *); +@@ -1800,13 +1831,20 @@ extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); #define kern_mount(type) kern_mount_data(type, NULL) @@ -25466,9 +27998,10 @@ index 692a3ee..53547b0 100644 +extern void umount_ve_fs_type(struct file_system_type *local_fs_type); +#define kern_umount mntput extern int may_umount_tree(struct vfsmount *); ++extern struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root); extern int may_umount(struct 
vfsmount *); extern long do_mount(char *, char *, char *, unsigned long, void *); -@@ -1807,6 +1840,7 @@ extern struct vfsmount *collect_mounts(struct path *); + extern struct vfsmount *collect_mounts(struct path *); extern void drop_collected_mounts(struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); @@ -25476,7 +28009,7 @@ index 692a3ee..53547b0 100644 extern int current_umask(void); -@@ -2065,7 +2099,8 @@ extern int check_disk_change(struct block_device *); +@@ -2065,7 +2103,8 @@ extern int check_disk_change(struct block_device *); extern int __invalidate_device(struct block_device *); extern int invalidate_partition(struct gendisk *, int); #endif @@ -25486,7 +28019,7 @@ index 692a3ee..53547b0 100644 unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); -@@ -2477,6 +2512,17 @@ ssize_t simple_attr_read(struct file *file, char __user *buf, +@@ -2478,6 +2517,17 @@ ssize_t simple_attr_read(struct file *file, char __user *buf, ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); @@ -26009,7 +28542,7 @@ index 6a8ca98..ee562c3 100644 + #endif diff --git a/include/linux/mm.h b/include/linux/mm.h -index 24c3956..d38e63e 100644 +index 24c3956..7bb1cf3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -712,6 +712,7 @@ extern void pagefault_out_of_memory(void); @@ -26040,6 +28573,20 @@ index 24c3956..d38e63e 100644 int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); +@@ -1294,7 +1297,12 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, + #ifndef CONFIG_MMU + #define randomize_va_space 0 + #else +-extern int randomize_va_space; ++extern int _randomize_va_space; ++#ifndef CONFIG_VE ++#define randomize_va_space _randomize_va_space ++#else ++#define randomize_va_space (get_exec_env()->_randomize_va_space) ++#endif + #endif + + const char * arch_vma_name(struct 
vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 84a524a..8ecf0ec 100644 --- a/include/linux/mm_types.h @@ -26607,8 +29154,21 @@ index 0000000..e9f10ba +}; + +#endif /*_IPT_OWNER_H*/ +diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h +index d09db1b..5b36364 100644 +--- a/include/linux/nfs_fs.h ++++ b/include/linux/nfs_fs.h +@@ -374,7 +374,7 @@ extern const struct address_space_operations nfs_file_aops; + + static inline struct nfs_open_context *nfs_file_open_context(struct file *filp) + { +- return filp->private_data; ++ return file_private(filp); + } + + static inline struct rpc_cred *nfs_file_cred(struct file *file) diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h -index 320569e..8e0d228 100644 +index b26dc51..643e380 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -91,6 +91,7 @@ struct nfs_client { @@ -26619,6 +29179,17 @@ index 320569e..8e0d228 100644 }; /* +diff --git a/include/linux/nmi.h b/include/linux/nmi.h +index b752e80..ed9d975 100644 +--- a/include/linux/nmi.h ++++ b/include/linux/nmi.h +@@ -47,4 +47,6 @@ static inline bool trigger_all_cpu_backtrace(void) + } + #endif + ++extern void nmi_show_regs(struct pt_regs *regs, int in_nmi); ++extern int do_nmi_show_regs(struct pt_regs *regs, int cpu); + #endif diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 44428d2..a3a0a02 100644 --- a/include/linux/notifier.h @@ -26763,10 +29334,10 @@ index 6673743..977e52b 100644 #endif /* KERNEL */ diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h -index 379eaed..52c8b17 100644 +index 379eaed..80bd26a 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h -@@ -103,6 +103,8 @@ struct vmcore { +@@ -103,9 +103,14 @@ struct vmcore { #ifdef CONFIG_PROC_FS extern void proc_root_init(void); @@ -26775,7 +29346,13 @@ index 379eaed..52c8b17 100644 void proc_flush_task(struct task_struct *task); -@@ -149,6 +151,8 @@ extern struct 
proc_dir_entry *proc_mkdir(const char *,struct proc_dir_entry *); ++extern int proc_dentry_of_dead_task(struct dentry *dentry); ++extern struct file_operations dummy_proc_pid_file_operations; ++ + extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, + struct proc_dir_entry *parent); + struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, +@@ -149,6 +154,8 @@ extern struct proc_dir_entry *proc_mkdir(const char *,struct proc_dir_entry *); extern struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, struct proc_dir_entry *parent); @@ -26784,7 +29361,16 @@ index 379eaed..52c8b17 100644 static inline struct proc_dir_entry *proc_create(const char *name, mode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops) { -@@ -268,6 +272,9 @@ struct proc_inode { +@@ -184,6 +191,8 @@ extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm); + #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) + static inline void proc_net_remove(struct net *net, const char *name) {} + ++static inline int proc_dentry_of_dead_task(struct dentry *dentry) { return 0; } ++ + static inline void proc_flush_task(struct task_struct *task) + { + } +@@ -268,6 +277,9 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; @@ -26794,7 +29380,7 @@ index 379eaed..52c8b17 100644 struct inode vfs_inode; }; -@@ -281,6 +288,15 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode) +@@ -281,6 +293,15 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode) return PROC_I(inode)->pde; } @@ -26811,7 +29397,7 @@ index 379eaed..52c8b17 100644 { return pde->parent->data; diff --git a/include/linux/quota.h b/include/linux/quota.h -index 8fd8efc..8cd6b71 100644 +index 8fd8efc..5fa291e 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -173,6 +173,10 @@ enum { @@ -26834,11 +29420,14 @@ index 
8fd8efc..8cd6b71 100644 /* Operations working with dquots */ struct dquot_operations { int (*initialize) (struct inode *, int); -@@ -316,9 +322,11 @@ struct dquot_operations { +@@ -316,9 +322,14 @@ struct dquot_operations { /* get reserved quota for delayed alloc, value returned is managed by * quota code only */ qsize_t *(*get_reserved_space) (struct inode *); + int (*rename) (struct inode *, struct inode *, struct inode *); ++ ++ void (*swap_inode) (struct inode *, struct inode *); ++ void (*shutdown) (struct super_block *); }; /* Operations handling requests from userspace */ @@ -26846,7 +29435,7 @@ index 8fd8efc..8cd6b71 100644 struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, char *, int); int (*quota_off)(struct super_block *, int, int); -@@ -331,6 +339,10 @@ struct quotactl_ops { +@@ -331,6 +342,10 @@ struct quotactl_ops { int (*set_xstate)(struct super_block *, unsigned int, int); int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); @@ -26857,7 +29446,7 @@ index 8fd8efc..8cd6b71 100644 }; struct quota_format_type { -@@ -385,6 +397,10 @@ struct quota_info { +@@ -385,6 +400,10 @@ struct quota_info { struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ @@ -26869,7 +29458,7 @@ index 8fd8efc..8cd6b71 100644 int register_quota_format(struct quota_format_type *fmt); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h -index a529d86..579a15c 100644 +index a529d86..bdbe1f7 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -264,6 +264,19 @@ static inline void vfs_dq_free_inode(struct inode *inode) @@ -26892,7 +29481,43 @@ index a529d86..579a15c 100644 /* Cannot be called inside a transaction */ static inline int vfs_dq_off(struct super_block *sb, int remount) { -@@ 
-363,6 +376,12 @@ static inline int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) +@@ -274,6 +287,35 @@ static inline int vfs_dq_off(struct super_block *sb, int remount) + return ret; + } + ++static __inline__ void DQUOT_SWAP(struct inode *inode, struct inode *tmpl) ++{ ++ if (sb_any_quota_active(tmpl->i_sb) && ++ tmpl->i_sb->dq_op->swap_inode) ++ tmpl->i_sb->dq_op->swap_inode(inode, tmpl); ++} ++ ++static __inline__ int DQUOT_CHECK_SPACE(struct inode *inode) ++{ ++ if (vfs_dq_alloc_space_nodirty(inode, 512)) ++ return -EDQUOT; ++ vfs_dq_free_space_nodirty(inode, 512); ++ return 0; ++} ++ ++static __inline__ void DQUOT_SYNC_BLOCKS(struct inode *inode, blkcnt_t blocks) ++{ ++ if (sb_any_quota_active(inode->i_sb)) { ++ if (blocks > inode->i_blocks) ++ inode->i_sb->dq_op->alloc_space(inode, ++ (qsize_t)(blocks-inode->i_blocks)*512, ++ 13 /*DQUOT_CMD_FORCE*/); ++ else if (blocks < inode->i_blocks) ++ inode->i_sb->dq_op->free_space(inode, (qsize_t)(inode->i_blocks-blocks)*512); ++ } else ++ inode->i_blocks = blocks; ++} ++ ++ + #else + + static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type) +@@ -363,6 +405,12 @@ static inline int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) return 0; } @@ -26905,6 +29530,22 @@ index a529d86..579a15c 100644 static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr) { inode_add_bytes(inode, nr); +@@ -416,6 +464,15 @@ static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr) + mark_inode_dirty(inode); + } + ++static inline void DQUOT_SWAP(struct inode *inode, struct inode *tmpl) ++{ ++} ++ ++static inline void DQUOT_SYNC_BLOCKS(struct inode *inode, blkcnt_t blocks) ++{ ++ inode->i_blocks = blocks; ++} ++ + #endif /* CONFIG_QUOTA */ + + static inline int vfs_dq_prealloc_block_nodirty(struct inode *inode, qsize_t nr) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index cb0ba70..b14f124 100644 --- a/include/linux/rmap.h @@ -26919,7 +29560,7 @@ index 
cb0ba70..b14f124 100644 static inline void page_dup_rmap(struct page *page) { diff --git a/include/linux/sched.h b/include/linux/sched.h -index 70abfd3..fa44cc6 100644 +index 70abfd3..d6155c1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -94,6 +94,8 @@ struct sched_param { @@ -26979,15 +29620,7 @@ index 70abfd3..fa44cc6 100644 extern void calc_global_load(void); -@@ -286,6 +313,7 @@ static inline void show_state(void) - } - - extern void show_regs(struct pt_regs *); -+extern void smp_show_regs(struct pt_regs *, void *); - - /* - * TASK is a pointer to the task whose backtrace we want to see (or NULL for current -@@ -553,6 +581,9 @@ struct thread_group_cputimer { +@@ -553,6 +580,9 @@ struct thread_group_cputimer { spinlock_t lock; }; @@ -26997,7 +29630,7 @@ index 70abfd3..fa44cc6 100644 /* * NOTE! "signal_struct" does not have it's own * locking, because a shared signal_struct always -@@ -1283,6 +1314,7 @@ struct task_struct { +@@ -1283,6 +1313,7 @@ struct task_struct { unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ unsigned in_iowait:1; @@ -27005,7 +29638,7 @@ index 70abfd3..fa44cc6 100644 /* Revert to default priority/policy when forking */ -@@ -1498,6 +1530,14 @@ struct task_struct { +@@ -1498,6 +1529,14 @@ struct task_struct { struct rcu_head rcu; /* @@ -27020,7 +29653,7 @@ index 70abfd3..fa44cc6 100644 * cache last used pipe for splice */ struct pipe_inode_info *splice_pipe; -@@ -1542,6 +1582,19 @@ struct task_struct { +@@ -1542,6 +1581,19 @@ struct task_struct { unsigned long trace_recursion; #endif /* CONFIG_TRACING */ unsigned long stack_start; @@ -27040,7 +29673,7 @@ index 70abfd3..fa44cc6 100644 }; /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ -@@ -1727,6 +1780,43 @@ extern cputime_t task_utime(struct task_struct *p); +@@ -1727,6 +1779,43 @@ extern cputime_t task_utime(struct task_struct *p); extern cputime_t task_stime(struct task_struct *p); extern cputime_t task_gtime(struct task_struct *p); @@ -27084,7 +29717,7 @@ index 70abfd3..fa44cc6 100644 /* * Per process flags */ -@@ -1736,6 +1826,7 @@ extern cputime_t task_gtime(struct task_struct *p); +@@ -1736,6 +1825,7 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ @@ -27092,7 +29725,7 @@ index 70abfd3..fa44cc6 100644 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ -@@ -1872,6 +1963,21 @@ extern unsigned long long +@@ -1872,6 +1962,21 @@ extern unsigned long long task_sched_runtime(struct task_struct *task); extern unsigned long long thread_group_sched_runtime(struct task_struct *task); @@ -27114,7 +29747,7 @@ index 70abfd3..fa44cc6 100644 /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP extern void sched_exec(void); -@@ -2151,6 +2257,13 @@ extern int disallow_signal(int); +@@ -2151,6 +2256,13 @@ extern int disallow_signal(int); extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); @@ -27128,7 +29761,7 @@ index 70abfd3..fa44cc6 100644 struct task_struct *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); -@@ -2168,11 +2281,11 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, +@@ -2168,11 +2280,11 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, } #endif @@ -27143,7 +29776,7 @@ 
index 70abfd3..fa44cc6 100644 extern bool current_is_single_threaded(void); -@@ -2180,10 +2293,10 @@ extern bool current_is_single_threaded(void); +@@ -2180,10 +2292,10 @@ extern bool current_is_single_threaded(void); * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. */ @@ -27157,7 +29790,7 @@ index 70abfd3..fa44cc6 100644 while ((t = next_thread(t)) != g) /* de_thread depends on thread_group_leader not being a pid based check */ -@@ -2208,8 +2321,14 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2) +@@ -2208,8 +2320,14 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2) static inline struct task_struct *next_thread(const struct task_struct *p) { @@ -27173,7 +29806,7 @@ index 70abfd3..fa44cc6 100644 } static inline int thread_group_empty(struct task_struct *p) -@@ -2254,6 +2373,98 @@ static inline void unlock_task_sighand(struct task_struct *tsk, +@@ -2254,6 +2372,98 @@ static inline void unlock_task_sighand(struct task_struct *tsk, spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); } @@ -27690,42 +30323,6 @@ index 5ad70a6..8f3d203 100644 if (!s) return ZERO_SIZE_PTR; -diff --git a/include/linux/smp.h b/include/linux/smp.h -index 39c64ba..7b81017 100644 ---- a/include/linux/smp.h -+++ b/include/linux/smp.h -@@ -13,6 +13,9 @@ - - extern void cpu_idle(void); - -+struct pt_regs; -+typedef void (*smp_nmi_function)(struct pt_regs *regs, void *info); -+ - struct call_single_data { - struct list_head list; - void (*func) (void *info); -@@ -66,6 +69,8 @@ extern int __cpu_up(unsigned int cpunum); - */ - extern void smp_cpus_done(unsigned int max_cpus); - -+extern int smp_nmi_call_function(smp_nmi_function func, void *info, int wait); -+ - /* - * Call a function on all other processors - */ -@@ -140,6 +145,12 @@ static inline void smp_send_reschedule(int cpu) { } - static inline void init_call_single_data(void) - { - } -+static inline int 
smp_nmi_call_function(smp_nmi_function func, -+ void *info, int wait) -+{ -+ return 0; -+} -+ - #endif /* !SMP */ - - /* diff --git a/include/linux/socket.h b/include/linux/socket.h index 3273a0c..87cf3d1 100644 --- a/include/linux/socket.h @@ -28205,10 +30802,10 @@ index 69f3997..6c74733 100644 static inline void get_uts_ns(struct uts_namespace *ns) diff --git a/include/linux/ve.h b/include/linux/ve.h new file mode 100644 -index 0000000..8f8d083 +index 0000000..e0e045a --- /dev/null +++ b/include/linux/ve.h -@@ -0,0 +1,361 @@ +@@ -0,0 +1,367 @@ +/* + * include/linux/ve.h + * @@ -28340,9 +30937,9 @@ index 0000000..8f8d083 + cycles_t strt_idle_time; + cycles_t used_time; + seqcount_t stat_lock; -+ int nr_running; -+ int nr_unint; -+ int nr_iowait; ++ unsigned long nr_running; ++ unsigned long nr_unint; ++ unsigned long nr_iowait; + cputime64_t user; + cputime64_t nice; + cputime64_t system; @@ -28481,6 +31078,7 @@ index 0000000..8f8d083 + struct ve_monitor *monitor; + struct proc_dir_entry *monitor_proc; + unsigned long meminfo_val; ++ int _randomize_va_space; + +#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE) \ + || defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) @@ -28491,6 +31089,14 @@ index 0000000..8f8d083 + struct svc_rqst* _nlmsvc_rqst; +#endif + ++#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) ++ struct file_system_type *bm_fs_type; ++ struct vfsmount *bm_mnt; ++ int bm_enabled; ++ int bm_entry_count; ++ struct list_head bm_entries; ++#endif ++ + struct nsproxy *ve_ns; + struct user_namespace *user_ns; + struct net *ve_netns; @@ -28507,10 +31113,7 @@ index 0000000..8f8d083 +extern struct ve_cpu_stats static_ve_cpu_stats; +static inline struct ve_cpu_stats *VE_CPU_STATS(struct ve_struct *ve, int cpu) +{ -+ if (ve->cpu_stats == NULL) -+ return &static_ve_cpu_stats; -+ else -+ return per_cpu_ptr(ve->cpu_stats, cpu); ++ return per_cpu_ptr(ve->cpu_stats, cpu); +} + +extern int nr_ve; @@ -28608,10 +31211,10 @@ index 
0000000..8f2e8f8 +#endif diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h new file mode 100644 -index 0000000..3364e33 +index 0000000..8bc4e01 --- /dev/null +++ b/include/linux/ve_proto.h -@@ -0,0 +1,89 @@ +@@ -0,0 +1,96 @@ +/* + * include/linux/ve_proto.h + * @@ -28629,6 +31232,13 @@ index 0000000..3364e33 + +struct ve_struct; + ++struct seq_file; ++ ++typedef void (*ve_seq_print_t)(struct seq_file *, struct ve_struct *); ++ ++void vzmon_register_veaddr_print_cb(ve_seq_print_t); ++void vzmon_unregister_veaddr_print_cb(ve_seq_print_t); ++ +#ifdef CONFIG_INET +void tcp_v4_kill_ve_sockets(struct ve_struct *envid); +#ifdef CONFIG_VE_NETDEV @@ -28803,10 +31413,10 @@ index 0000000..745f1ec +#endif diff --git a/include/linux/venet.h b/include/linux/venet.h new file mode 100644 -index 0000000..1554037 +index 0000000..dd26f11 --- /dev/null +++ b/include/linux/venet.h -@@ -0,0 +1,86 @@ +@@ -0,0 +1,95 @@ +/* + * include/linux/venet.h + * @@ -28845,12 +31455,19 @@ index 0000000..1554037 + struct list_head ve_list; +}; + ++struct ext_entry_struct ++{ ++ struct list_head list; ++ struct ve_addr_struct addr; ++}; ++ +struct veip_struct +{ + struct list_head src_lh; + struct list_head dst_lh; + struct list_head ip_lh; + struct list_head list; ++ struct list_head ext_lh; + envid_t veid; +}; + @@ -28884,6 +31501,8 @@ index 0000000..1554037 +int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr); +int veip_entry_del(envid_t veid, struct ve_addr_struct *addr); +int venet_change_skb_owner(struct sk_buff *skb); ++struct ext_entry_struct *venet_ext_lookup(struct ve_struct *ve, ++ struct ve_addr_struct *addr); + +extern struct list_head ip_entry_hash_table[]; +extern rwlock_t veip_hash_lock; @@ -29092,10 +31711,10 @@ index 0000000..b0dad07 +#endif /* __LINUX_VIRTINFO_H */ diff --git a/include/linux/virtinfoscp.h b/include/linux/virtinfoscp.h new file mode 100644 -index 0000000..9e7584f +index 0000000..5661c0d --- /dev/null +++ b/include/linux/virtinfoscp.h 
-@@ -0,0 +1,21 @@ +@@ -0,0 +1,23 @@ +#ifndef __VIRTINFO_SCP_H__ +#define __VIRTINFO_SCP_H__ + @@ -29114,6 +31733,8 @@ index 0000000..9e7584f +#define VIRTINFO_SCP_RSTTSK 0x20 +#define VIRTINFO_SCP_RSTMM 0x21 + ++#define VIRTINFO_SCP_TEST 0x30 ++ +#define VIRTNOTIFY_CHANGE 0x100 + +#endif /* __VIRTINFO_SCP_H__ */ @@ -29502,10 +32123,10 @@ index 0000000..6d36cdd +#endif /* __LINUX_VZCTL_QUOTA_H__ */ diff --git a/include/linux/vzctl_venet.h b/include/linux/vzctl_venet.h new file mode 100644 -index 0000000..4797a50 +index 0000000..8c02cd4 --- /dev/null +++ b/include/linux/vzctl_venet.h -@@ -0,0 +1,51 @@ +@@ -0,0 +1,53 @@ +/* + * include/linux/vzctl_venet.h + * @@ -29533,6 +32154,8 @@ index 0000000..4797a50 + int op; +#define VE_IP_ADD 1 +#define VE_IP_DEL 2 ++#define VE_IP_EXT_ADD 3 ++#define VE_IP_EXT_DEL 4 + struct sockaddr *addr; + int addrlen; +}; @@ -29818,10 +32441,10 @@ index 0000000..204e9d8 +#endif /* _LINUX_VZIPTABLE_DEFS_H */ diff --git a/include/linux/vzquota.h b/include/linux/vzquota.h new file mode 100644 -index 0000000..e16605e +index 0000000..1dba5fa --- /dev/null +++ b/include/linux/vzquota.h -@@ -0,0 +1,379 @@ +@@ -0,0 +1,380 @@ +/* + * + * Copyright (C) 2001-2005 SWsoft @@ -30031,7 +32654,7 @@ index 0000000..e16605e + struct dq_info dq_info; /* grace times and flags */ + spinlock_t dq_data_lock; /* for dq_stat */ + -+ struct semaphore dq_sem; /* semaphore to protect ++ struct mutex dq_mutex; /* mutex to protect + ugid tree */ + + struct list_head dq_ilink_list; /* list of vz_quota_ilink */ @@ -30096,7 +32719,8 @@ index 0000000..e16605e +#define DQUOT_CMD_CHECK 12 +#define DQUOT_CMD_FORCE 13 + -+extern struct semaphore vz_quota_sem; ++extern struct mutex vz_quota_mutex; ++ +void inode_qmblk_lock(struct super_block *sb); +void inode_qmblk_unlock(struct super_block *sb); +void qmblk_data_read_lock(struct vz_quota_master *qmblk); @@ -30106,6 +32730,7 @@ index 0000000..e16605e + +/* for quota operations */ +void vzquota_inode_init_call(struct inode 
*inode); ++void vzquota_inode_swap_call(struct inode *, struct inode *); +void vzquota_inode_drop_call(struct inode *inode); +int vzquota_inode_transfer_call(struct inode *, struct iattr *); +struct vz_quota_master *vzquota_inode_data(struct inode *inode, @@ -30193,7 +32818,6 @@ index 0000000..e16605e +int vzquota_proc_init(void); +void vzquota_proc_release(void); +struct vz_quota_master *vzquota_find_qmblk(struct super_block *); -+extern struct semaphore vz_quota_sem; + +void vzaquota_init(void); +void vzaquota_fini(void); @@ -30268,7 +32892,7 @@ index 0000000..f26baad +#endif /* __VZ_RATELIMIT_H__ */ diff --git a/include/linux/vzstat.h b/include/linux/vzstat.h new file mode 100644 -index 0000000..5c23ea4 +index 0000000..c7dfd1f --- /dev/null +++ b/include/linux/vzstat.h @@ -0,0 +1,182 @@ @@ -30310,7 +32934,7 @@ index 0000000..5c23ea4 + cycles_t avg[3]; +}; +struct kstat_lat_pcpu_struct { -+ struct kstat_lat_pcpu_snap_struct cur[NR_CPUS]; ++ struct kstat_lat_pcpu_snap_struct *cur; + cycles_t max_snap; + struct kstat_lat_snap_struct last; + cycles_t avg[3]; @@ -30395,7 +33019,7 @@ index 0000000..5c23ea4 +{ + struct kstat_lat_pcpu_snap_struct *cur; + -+ cur = &p->cur[cpu]; ++ cur = per_cpu_ptr(p->cur, cpu); + write_seqcount_begin(&cur->lock); + cur->count++; + if (cur->maxlat < dur) @@ -30426,8 +33050,8 @@ index 0000000..5c23ea4 + cycles_t m; + + memset(&p->last, 0, sizeof(p->last)); -+ for (cpu = 0; cpu < NR_CPUS; cpu++) { -+ cur = &p->cur[cpu]; ++ for_each_online_cpu(cpu) { ++ cur = per_cpu_ptr(p->cur, cpu); + do { + i = read_seqcount_begin(&cur->lock); + memcpy(&snap, cur, sizeof(snap)); @@ -30454,6 +33078,24 @@ index 0000000..5c23ea4 +} + +#endif /* __VZSTAT_H__ */ +diff --git a/include/linux/xattr.h b/include/linux/xattr.h +index 5c84af8..12bd3c3 100644 +--- a/include/linux/xattr.h ++++ b/include/linux/xattr.h +@@ -10,6 +10,13 @@ + #ifndef _LINUX_XATTR_H + #define _LINUX_XATTR_H + ++#ifdef CONFIG_VE ++extern int ve_xattr_policy; ++#define VE_XATTR_POLICY_ACCEPT 
0 ++#define VE_XATTR_POLICY_IGNORE 1 ++#define VE_XATTR_POLICY_REJECT 2 ++#endif ++ + #define XATTR_CREATE 0x1 /* set value, fail if attr already exists */ + #define XATTR_REPLACE 0x2 /* set value, fail if attr does not exist */ + diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 0f7c378..e2a9043 100644 --- a/include/net/addrconf.h @@ -31026,7 +33668,7 @@ index 6eb48e5..b07e8d6 100644 printed = true; } diff --git a/init/main.c b/init/main.c -index bc109c7..d7f4866 100644 +index bc109c7..d06cdc8 100644 --- a/init/main.c +++ b/init/main.c @@ -70,6 +70,9 @@ @@ -31056,20 +33698,20 @@ index bc109c7..d7f4866 100644 /* * Boot command-line arguments */ -@@ -516,6 +529,9 @@ asmlinkage void __init start_kernel(void) +@@ -516,6 +529,8 @@ asmlinkage void __init start_kernel(void) smp_setup_processor_id(); + prepare_ve0_process(&init_task); -+ init_ve0(); + /* * Need to run as early as possible, to initialize the * lockdep hash: -@@ -548,6 +564,7 @@ asmlinkage void __init start_kernel(void) +@@ -548,6 +563,8 @@ asmlinkage void __init start_kernel(void) setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); ++ init_ve0(); + ub_init_early(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ @@ -32131,10 +34773,10 @@ index 0000000..95ee497 +obj-$(CONFIG_BC_IO_ACCOUNTING) += io_acct.o diff --git a/kernel/bc/beancounter.c b/kernel/bc/beancounter.c new file mode 100644 -index 0000000..6513257 +index 0000000..fdf3bb8 --- /dev/null +++ b/kernel/bc/beancounter.c -@@ -0,0 +1,688 @@ +@@ -0,0 +1,715 @@ +/* + * linux/kernel/bc/beancounter.c + * @@ -32168,6 +34810,7 @@ index 0000000..6513257 +#include +#include +#include ++#include + +#include +#include @@ -32204,9 +34847,9 @@ index 0000000..6513257 + "dummy", + "dummy", + "numiptent", ++ "swappages", + "unused_privvmpages", /* UB_RESOURCES */ + "tmpfs_respages", -+ "swap_pages", + "held_pages", +}; + @@ -32309,6 +34952,25 @@ index 0000000..6513257 + return NULL; +} + ++int ub_count; ++ ++/* 
next two must be called under ub_hash_lock */ ++static inline void ub_count_inc(struct user_beancounter *ub) ++{ ++ if (ub->parent) ++ ub->parent->ub_childs++; ++ else ++ ub_count++; ++} ++ ++static inline void ub_count_dec(struct user_beancounter *ub) ++{ ++ if (ub->parent) ++ ub->parent->ub_childs--; ++ else ++ ub_count--; ++} ++ +struct user_beancounter *get_beancounter_byuid(uid_t uid, int create) +{ + struct user_beancounter *new_ub, *ub; @@ -32337,6 +34999,7 @@ index 0000000..6513257 + if (new_ub != NULL) { + list_add_rcu(&new_ub->ub_list, &ub_list_head); + hlist_add_head(&new_ub->ub_hash, hash); ++ ub_count_inc(new_ub); + spin_unlock_irqrestore(&ub_hash_lock, flags); + return new_ub; + } @@ -32382,6 +35045,7 @@ index 0000000..6513257 + if (new_ub != NULL) { + list_add_rcu(&new_ub->ub_list, &ub_list_head); + hlist_add_head(&new_ub->ub_hash, hash); ++ ub_count_inc(new_ub); + spin_unlock_irqrestore(&ub_hash_lock, flags); + return new_ub; + } @@ -32431,7 +35095,6 @@ index 0000000..6513257 + + clean &= verify_res(ub, UB_UNUSEDPRIVVM, ub->ub_unused_privvmpages); + clean &= verify_res(ub, UB_TMPFSPAGES, ub->ub_tmpfs_respages); -+ clean &= verify_res(ub, UB_SWAPPAGES, ub->ub_swap_pages); + clean &= verify_res(ub, UB_HELDPAGES, (unsigned long)ub->ub_held_pages); + + ub_debug_trace(!clean, 5, 60*HZ); @@ -32460,6 +35123,7 @@ index 0000000..6513257 + } + + hlist_del(&ub->ub_hash); ++ ub_count_dec(ub); + list_del_rcu(&ub->ub_list); + spin_unlock_irqrestore(&ub_hash_lock, flags); + @@ -32716,6 +35380,7 @@ index 0000000..6513257 +static void init_beancounter_struct(struct user_beancounter *ub) +{ + ub->ub_magic = UB_MAGIC; ++ ub->ub_cookie = get_random_int(); + atomic_set(&ub->ub_refcount, 1); + spin_lock_init(&ub->ub_lock); + INIT_LIST_HEAD(&ub->ub_tcp_sk_list); @@ -32775,6 +35440,7 @@ index 0000000..6513257 + ub->ub_parms[UB_NUMSIGINFO].limit = 1024; + ub->ub_parms[UB_DCACHESIZE].limit = 1024*1024; + ub->ub_parms[UB_NUMFILE].limit = 1024; ++ 
ub->ub_parms[UB_SWAPPAGES].limit = UB_MAXVALUE; + + for (k = 0; k < UB_RESOURCES; k++) + ub->ub_parms[k].barrier = ub->ub_parms[k].limit; @@ -32783,6 +35449,8 @@ index 0000000..6513257 + ub->ub_limit_rl.interval = 300*HZ; +} + ++static DEFINE_PER_CPU(struct ub_percpu_struct, ub0_percpu); ++ +void __init ub_init_early(void) +{ + struct user_beancounter *ub; @@ -32794,7 +35462,7 @@ index 0000000..6513257 + init_beancounter_nolimits(ub); + init_beancounter_store(ub); + init_beancounter_struct(ub); -+ ub->ub_percpu = NULL; ++ ub->ub_percpu = &per_cpu__ub0_percpu; + + memset(¤t->task_bc, 0, sizeof(struct task_beancounter)); + (void)set_exec_ub(ub); @@ -32806,6 +35474,7 @@ index 0000000..6513257 + + hlist_add_head(&ub->ub_hash, &ub_hash[ub->ub_uid]); + list_add(&ub->ub_list, &ub_list_head); ++ ub_count_inc(ub); +} + +void __init ub_init_late(void) @@ -33737,10 +36406,10 @@ index 0000000..428220f +#endif diff --git a/kernel/bc/kmem.c b/kernel/bc/kmem.c new file mode 100644 -index 0000000..74c4179 +index 0000000..7068e57 --- /dev/null +++ b/kernel/bc/kmem.c -@@ -0,0 +1,406 @@ +@@ -0,0 +1,405 @@ +/* + * kernel/bc/kmem.c + * @@ -33911,16 +36580,15 @@ index 0000000..74c4179 +{ + struct user_beancounter *ub; + struct ub_cache_counter *cc; -+ long pages, vmpages, pbc; ++ long pages, vmpages; + int i; + + ub = seq_beancounter(f); + -+ pages = vmpages = pbc = 0; ++ pages = vmpages = 0; + for_each_online_cpu(i) { + pages += per_cpu_ptr(ub->ub_percpu, i)->pages_charged; + vmpages += per_cpu_ptr(ub->ub_percpu, i)->vmalloc_charged; -+ pbc += per_cpu_ptr(ub->ub_percpu, i)->pbcs; + } + if (pages < 0) + pages = 0; @@ -33929,7 +36597,7 @@ index 0000000..74c4179 + + seq_printf(f, bc_proc_lu_lu_fmt, "pages", pages, PAGE_SIZE); + seq_printf(f, bc_proc_lu_lu_fmt, "vmalloced", vmpages, PAGE_SIZE); -+ seq_printf(f, bc_proc_lu_lu_fmt, "pbcs", pbc, ++ seq_printf(f, bc_proc_lu_lu_fmt, "pbcs", ub->ub_pbcs, + sizeof(struct page_beancounter)); + + spin_lock_irq(&cc_lock); @@ -34149,10 +36817,10 @@ 
index 0000000..74c4179 +EXPORT_SYMBOL(mem_ub); diff --git a/kernel/bc/misc.c b/kernel/bc/misc.c new file mode 100644 -index 0000000..a47b355 +index 0000000..15e7aa4 --- /dev/null +++ b/kernel/bc/misc.c -@@ -0,0 +1,454 @@ +@@ -0,0 +1,460 @@ +/* + * kernel/bc/misc.c + * @@ -34447,28 +37115,34 @@ index 0000000..a47b355 + return err; +} + ++static inline int task_precharge_farnr(struct task_beancounter *task_bc) ++{ ++ return (task_bc->file_precharged < (1UL << task_bc->file_quant)); ++} ++ +void ub_file_uncharge(struct file *f) +{ + struct user_beancounter *ub, *pub; + struct task_beancounter *task_bc; -+ unsigned long nr; ++ int nr; + + ub = f->f_ub; + task_bc = ¤t->task_bc; + if (likely(ub == task_bc->task_ub)) { + task_bc->file_precharged++; + pub = top_beancounter(ub); -+ if (ub_barrier_farnr(pub, UB_NUMFILE) && ++ if (task_precharge_farnr(task_bc) && + ub_barrier_farsz(pub, UB_KMEMSIZE)) + return; -+ if (task_bc->file_precharged < (1UL << task_bc->file_quant)) -+ return; + nr = task_bc->file_precharged + - (1UL << (task_bc->file_quant - 1)); -+ task_bc->file_precharged -= nr; -+ __put_beancounter_batch(ub, nr); -+ uncharge_beancounter(ub, UB_NUMFILE, nr); -+ uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(nr)); ++ if (nr > 0) { ++ task_bc->file_precharged -= nr; ++ __put_beancounter_batch(ub, nr); ++ uncharge_beancounter(ub, UB_NUMFILE, nr); ++ uncharge_beancounter(ub, UB_KMEMSIZE, ++ ub_file_kmemsize(nr)); ++ } + } else { + uncharge_beancounter(ub, UB_NUMFILE, 1); + uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(1)); @@ -35974,10 +38648,10 @@ index 0000000..c79e826 +EXPORT_SYMBOL(ub_out_of_memory); diff --git a/kernel/bc/proc.c b/kernel/bc/proc.c new file mode 100644 -index 0000000..4bfc03c +index 0000000..dd96e38 --- /dev/null +++ b/kernel/bc/proc.c -@@ -0,0 +1,682 @@ +@@ -0,0 +1,703 @@ +/* + * kernel/bc/proc.c + * @@ -36265,7 +38939,7 @@ index 0000000..4bfc03c + + ret = 0xbc000000; + if (ub->parent) -+ ret |= ((ub->parent->ub_uid) << 4); ++ ret 
|= ((ub->parent->ub_uid + 1) << 4); + ret |= (ub->ub_uid + 1); + return ret; +} @@ -36583,6 +39257,17 @@ index 0000000..4bfc03c + return bc_lookup(ub, dir, dentry); +} + ++static int bc_entry_getattr(struct vfsmount *mnt, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ struct user_beancounter *ub; ++ ++ generic_fillattr(dentry->d_inode, stat); ++ ub = (struct user_beancounter *)dentry->d_fsdata; ++ stat->nlink = ub->ub_childs + 2; ++ return 0; ++} ++ +static struct file_operations bc_entry_fops = { + .read = generic_read_dir, + .readdir = bc_entry_readdir, @@ -36590,6 +39275,7 @@ index 0000000..4bfc03c + +static struct inode_operations bc_entry_iops = { + .lookup = bc_entry_lookup, ++ .getattr = bc_entry_getattr, +}; + +/* @@ -36627,6 +39313,14 @@ index 0000000..4bfc03c + return bc_lookup(ub, dir, dentry); +} + ++static int bc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ generic_fillattr(dentry->d_inode, stat); ++ stat->nlink = ub_count + 2; ++ return 0; ++} ++ +static struct file_operations bc_root_fops = { + .read = generic_read_dir, + .readdir = bc_root_readdir, @@ -36634,6 +39328,7 @@ index 0000000..4bfc03c + +static struct inode_operations bc_root_iops = { + .lookup = bc_root_lookup, ++ .getattr = bc_root_getattr, +}; + +static int __init ub_init_proc(void) @@ -36662,10 +39357,10 @@ index 0000000..4bfc03c +core_initcall(ub_init_proc); diff --git a/kernel/bc/rss_pages.c b/kernel/bc/rss_pages.c new file mode 100644 -index 0000000..7b3d872 +index 0000000..2f64be5 --- /dev/null +++ b/kernel/bc/rss_pages.c -@@ -0,0 +1,438 @@ +@@ -0,0 +1,454 @@ +/* + * kernel/bc/rss_pages.c + * @@ -36754,6 +39449,22 @@ index 0000000..7b3d872 +} + +/* ++ * ++ and -- beyond are protected with pb_lock ++ */ ++ ++static inline void inc_pbc_count(struct user_beancounter *ub) ++{ ++ for (; ub != NULL; ub = ub->parent) ++ ub->ub_pbcs++; ++} ++ ++static inline void dec_pbc_count(struct user_beancounter *ub) ++{ ++ for (; ub != NULL; ub = 
ub->parent) ++ ub->ub_pbcs--; ++} ++ ++/* + * Alloc - free + */ + @@ -36865,7 +39576,7 @@ index 0000000..7b3d872 + +static inline int pb_hash(struct user_beancounter *ub, struct page *page) +{ -+ return (page_to_pfn(page) + (ub->ub_uid << 10)) & pb_hash_mask; ++ return (page_to_pfn(page) ^ ub->ub_cookie) & pb_hash_mask; +} + +/* pb_lock should be held */ @@ -37565,10 +40276,10 @@ index 0000000..bf6354b +module_init(ubstatd_init); diff --git a/kernel/bc/sys.c b/kernel/bc/sys.c new file mode 100644 -index 0000000..a997944 +index 0000000..8fb942e --- /dev/null +++ b/kernel/bc/sys.c -@@ -0,0 +1,176 @@ +@@ -0,0 +1,184 @@ +/* + * kernel/bc/sys.c + * @@ -37724,18 +40435,26 @@ index 0000000..a997944 +} + +#ifdef CONFIG_COMPAT -+asmlinkage long compat_sys_setublimit(uid_t uid, int resource, -+ unsigned int __user *limits) ++#define UB_MAXVALUE_COMPAT ((1UL << (sizeof(compat_long_t) * 8 - 1)) - 1) ++ ++asmlinkage long compat_sys_setublimit(uid_t uid, ++ compat_long_t resource, ++ compat_long_t __user *limits) +{ -+ unsigned int u_new_limits[2]; ++ compat_long_t u_new_limits[2]; + unsigned long new_limits[2]; + -+ if (copy_from_user(&u_new_limits, limits, sizeof(u_new_limits))) -+ return -EFAULT; ++ if (copy_from_user(&u_new_limits, limits, sizeof(u_new_limits))) ++ return -EFAULT; + + new_limits[0] = u_new_limits[0]; + new_limits[1] = u_new_limits[1]; + ++ if (u_new_limits[0] == UB_MAXVALUE_COMPAT) ++ new_limits[0] = UB_MAXVALUE; ++ if (u_new_limits[1] == UB_MAXVALUE_COMPAT) ++ new_limits[1] = UB_MAXVALUE; ++ + return do_setublimit(uid, resource, new_limits); +} + @@ -37747,10 +40466,10 @@ index 0000000..a997944 +#endif diff --git a/kernel/bc/vm_pages.c b/kernel/bc/vm_pages.c new file mode 100644 -index 0000000..e98134b +index 0000000..9b4ef0e --- /dev/null +++ b/kernel/bc/vm_pages.c -@@ -0,0 +1,549 @@ +@@ -0,0 +1,546 @@ +/* + * kernel/bc/vm_pages.c + * @@ -37858,7 +40577,8 @@ index 0000000..e98134b +void __ub_update_oomguarpages(struct user_beancounter *ub) +{ + 
ub->ub_parms[UB_OOMGUARPAGES].held = -+ ub->ub_parms[UB_PHYSPAGES].held + ub->ub_swap_pages; ++ ub->ub_parms[UB_PHYSPAGES].held + ++ ub->ub_parms[UB_SWAPPAGES].held; + ub_adjust_maxheld(ub, UB_OOMGUARPAGES); +} + @@ -38160,7 +40880,7 @@ index 0000000..e98134b + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); -+ ub->ub_swap_pages++; ++ __charge_beancounter_locked(ub, UB_SWAPPAGES, 1, UB_FORCE); + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} @@ -38179,10 +40899,7 @@ index 0000000..e98134b + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); -+ if (ub->ub_swap_pages <= 0) -+ uncharge_warn(ub, UB_SWAPPAGES, 1, ub->ub_swap_pages); -+ else -+ ub->ub_swap_pages--; ++ __uncharge_beancounter_locked(ub, UB_SWAPPAGES, 1); + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} @@ -38280,8 +40997,7 @@ index 0000000..e98134b + ub->ub_unused_privvmpages); + seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_TMPFSPAGES], + ub->ub_tmpfs_respages); -+ seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_SWAPPAGES], -+ ub->ub_swap_pages); ++ seq_printf(f, bc_proc_lu_fmt, "rss", ub->ub_pbcs); + + seq_printf(f, bc_proc_lu_fmt, "swapin", swap); + seq_printf(f, bc_proc_lu_fmt, "unmap", unmap); @@ -39015,7 +41731,7 @@ index 0000000..19dcf32 +#endif diff --git a/kernel/cpt/cpt_context.c b/kernel/cpt/cpt_context.c new file mode 100644 -index 0000000..bfba186 +index 0000000..f095a73 --- /dev/null +++ b/kernel/cpt/cpt_context.c @@ -0,0 +1,285 @@ @@ -39171,7 +41887,7 @@ index 0000000..bfba186 + hdr.cpt_signature[2] = CPT_SIGNATURE2; + hdr.cpt_signature[3] = CPT_SIGNATURE3; + hdr.cpt_hdrlen = sizeof(hdr); -+ hdr.cpt_image_version = CPT_VERSION_32; ++ hdr.cpt_image_version = CPT_CURRENT_VERSION; +#ifdef CONFIG_X86_64 + hdr.cpt_os_arch = CPT_OS_ARCH_EMT64; +#elif defined(CONFIG_X86_32) @@ -39306,10 +42022,10 @@ index 0000000..bfba186 +} diff --git a/kernel/cpt/cpt_context.h b/kernel/cpt/cpt_context.h new file mode 100644 
-index 0000000..e4f82f9 +index 0000000..9eb851a --- /dev/null +++ b/kernel/cpt/cpt_context.h -@@ -0,0 +1,215 @@ +@@ -0,0 +1,225 @@ +#include +#include +#include @@ -39415,6 +42131,16 @@ index 0000000..e4f82f9 + and restore them before resuming */ + struct ubparm saved_ubc[UB_RESOURCES]; +#endif ++ ++ int tcp_cb_convert; ++#define CPT_TCP_CB_CONV 1 ++#define CPT_TCP_CB_NOT_CONV 2 ++ ++#define CPT_MAX_LINKDIRS 1 ++ struct file *linkdirs[CPT_MAX_LINKDIRS]; ++ int linkdirs_num; ++ unsigned int linkcnt; /* for create hardlinked files */ ++ int hardlinked_on; +} cpt_context_t; + +typedef struct { @@ -39527,10 +42253,10 @@ index 0000000..e4f82f9 +} diff --git a/kernel/cpt/cpt_dump.c b/kernel/cpt/cpt_dump.c new file mode 100644 -index 0000000..7a36b4e +index 0000000..08ae5e6 --- /dev/null +++ b/kernel/cpt/cpt_dump.c -@@ -0,0 +1,1248 @@ +@@ -0,0 +1,1271 @@ +/* + * + * kernel/cpt/cpt_dump.c @@ -40327,6 +43053,7 @@ index 0000000..7a36b4e + i->start_jiffies_delta = get_jiffies_64() - ve->start_jiffies; + + i->last_pid = ve->ve_ns->pid_ns->last_pid; ++ i->rnd_va_space = ve->_randomize_va_space + 1; + + ctx->write(i, sizeof(*i), ctx); + cpt_release_buf(ctx); @@ -40692,8 +43419,10 @@ index 0000000..7a36b4e + + p.dentry = mnt->mnt_root; + p.mnt = mnt; ++ spin_lock(&dcache_lock); + path = __d_path(&p, &env->root_path, + path_buf, PAGE_SIZE); ++ spin_unlock(&dcache_lock); + if (IS_ERR(path)) + continue; + @@ -40714,7 +43443,7 @@ index 0000000..7a36b4e + struct nsproxy *old_ns; + struct mnt_namespace *n; + int err; -+ unsigned int flags = test_cpu_caps(); ++ unsigned int flags = test_cpu_caps_and_features(); + + if (!ctx->ve_id) + return -EINVAL; @@ -40723,8 +43452,26 @@ index 0000000..7a36b4e + if (env == NULL) + return -ESRCH; + ++ down_read(&env->op_sem); ++ err = -ESRCH; ++ if (!env->is_running) { ++ eprintk_ctx("CT is not running\n"); ++ goto out_noenv; ++ } ++ ++ err = -EBUSY; ++ if (env->is_locked) { ++ eprintk_ctx("CT is locked\n"); ++ goto out_noenv; ++ } ++ + *caps = flags 
& (1<nsproxy; + current->nsproxy = env->ve_ns; @@ -40775,6 +43522,8 @@ index 0000000..7a36b4e +out: + current->nsproxy = old_ns; + set_exec_env(old_env); ++out_noenv: ++ up_read(&env->op_sem); + put_ve(env); + + return err; @@ -40941,10 +43690,10 @@ index 0000000..f492331 +EXPORT_SYMBOL(lookup_cpt_obj_bypos); diff --git a/kernel/cpt/cpt_files.c b/kernel/cpt/cpt_files.c new file mode 100644 -index 0000000..f013331 +index 0000000..3ada205 --- /dev/null +++ b/kernel/cpt/cpt_files.c -@@ -0,0 +1,1648 @@ +@@ -0,0 +1,1783 @@ +/* + * + * kernel/cpt/cpt_files.c @@ -40973,6 +43722,7 @@ index 0000000..f013331 +#include +#include +#include ++#include +#include +#include +#include @@ -41020,15 +43770,29 @@ index 0000000..f013331 +} + +int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, -+ cpt_context_t *ctx) ++ int verify, cpt_context_t *ctx) +{ ++ if (d->d_inode->i_sb->s_magic == FSMAGIC_PROC && ++ proc_dentry_of_dead_task(d)) ++ return 0; ++ + if (path[0] == '/' && !(!IS_ROOT(d) && d_unhashed(d))) { + struct nameidata nd; + if (path_lookup(path, 0, &nd)) { + eprintk_ctx("d_path cannot be looked up %s\n", path); + return -EINVAL; + } -+ if (nd.path.dentry != d || nd.path.mnt != mnt) { ++ if (nd.path.dentry != d || (verify && nd.path.mnt != mnt)) { ++ if (!strcmp(path, "/dev/null")) { ++ /* ++ * epic kludge to workaround the case, when the ++ * init opens a /dev/null and then udevd ++ * overmounts the /dev with tmpfs ++ */ ++ path_put(&nd.path); ++ return 0; ++ } ++ + eprintk_ctx("d_path is invisible %s\n", path); + path_put(&nd.path); + return -EINVAL; @@ -41090,7 +43854,7 @@ index 0000000..f013331 +} + +static int cpt_dump_dentry(struct dentry *d, struct vfsmount *mnt, -+ int replaced, cpt_context_t *ctx) ++ int replaced, int verify, cpt_context_t *ctx) +{ + int len; + char *path; @@ -41155,7 +43919,7 @@ index 0000000..f013331 + o.cpt_content = CPT_CONTENT_NAME; + path[len] = 0; + -+ if (cpt_verify_overmount(path, d, mnt, ctx)) { ++ if 
(cpt_verify_overmount(path, d, mnt, verify, ctx)) { + __cpt_release_buf(ctx); + return -EINVAL; + } @@ -41194,7 +43958,7 @@ index 0000000..f013331 +static int +cpt_dump_filename(struct file *file, int replaced, cpt_context_t *ctx) +{ -+ return cpt_dump_dentry(file->f_dentry, file->f_vfsmnt, replaced, ctx); ++ return cpt_dump_dentry(file->f_dentry, file->f_vfsmnt, replaced, 1, ctx); +} + +int cpt_dump_inode(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) @@ -41435,25 +44199,33 @@ index 0000000..f013331 + + v->cpt_i_mode = sbuf.mode; + v->cpt_lflags = 0; ++ ++ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC) { ++ v->cpt_lflags |= CPT_DENTRY_PROC; ++ if (proc_dentry_of_dead_task(file->f_dentry)) ++ v->cpt_lflags |= CPT_DENTRY_PROCPID_DEAD; ++ } ++ + if (IS_ROOT(file->f_dentry)) + v->cpt_lflags |= CPT_DENTRY_ROOT; + else if (d_unhashed(file->f_dentry)) { + if (cpt_replaced(file->f_dentry, file->f_vfsmnt, ctx)) { + v->cpt_lflags |= CPT_DENTRY_REPLACED; + replaced = 1; -+ } else { ++ } else if (!(v->cpt_lflags & CPT_DENTRY_PROCPID_DEAD)) + v->cpt_lflags |= CPT_DENTRY_DELETED; -+ } + } + if (is_cloning_inode(file->f_dentry->d_inode)) + v->cpt_lflags |= CPT_DENTRY_CLONING; -+ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC) -+ v->cpt_lflags |= CPT_DENTRY_PROC; ++ + v->cpt_inode = CPT_NULL; + if (!(v->cpt_lflags & CPT_DENTRY_REPLACED)) { + iobj = lookup_cpt_object(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); -+ if (iobj) ++ if (iobj) { + v->cpt_inode = iobj->o_pos; ++ if (iobj->o_flags & CPT_INODE_HARDLINKED) ++ v->cpt_lflags |= CPT_DENTRY_HARDLINKED; ++ } + } + v->cpt_priv = CPT_NULL; + v->cpt_fown_fd = -1; @@ -41604,14 +44376,17 @@ index 0000000..f013331 + + if (!(file->f_mode & FMODE_READ) || + (file->f_flags & O_DIRECT)) { -+ file = dentry_open(dget(file->f_dentry), -+ mntget(file->f_vfsmnt), O_RDONLY, ++ struct file *filp; ++ filp = dentry_open(dget(file->f_dentry), ++ mntget(file->f_vfsmnt), ++ O_RDONLY | O_LARGEFILE, + NULL /* 
not checked */); -+ if (IS_ERR(file)) { ++ if (IS_ERR(filp)) { + cpt_printk_dentry(file->f_dentry, file->f_vfsmnt); -+ eprintk_ctx("cannot reopen file for read %ld\n", PTR_ERR(file)); -+ return PTR_ERR(file); ++ eprintk_ctx("cannot reopen file for read %ld\n", PTR_ERR(filp)); ++ return PTR_ERR(filp); + } ++ file = filp; + } else { + atomic_long_inc(&file->f_count); + } @@ -41858,7 +44633,7 @@ index 0000000..f013331 + } + spin_unlock(&dcache_lock); + if (found) { -+ err = cpt_dump_dentry(found, mnt, 0, ctx); ++ err = cpt_dump_dentry(found, mnt, 0, 1, ctx); + dput(found); + if (!err) { + dprintk_ctx("dentry found in aliases\n"); @@ -41872,7 +44647,7 @@ index 0000000..f013331 + return -EINVAL; + + mntget(mnt); -+ f = dentry_open(de, mnt, O_RDONLY, NULL); ++ f = dentry_open(de, mnt, O_RDONLY | O_LARGEFILE, NULL); + if (IS_ERR(f)) + return PTR_ERR(f); + @@ -41897,7 +44672,7 @@ index 0000000..f013331 + + dprintk_ctx("dentry found in dir\n"); + __cpt_release_buf(ctx); -+ err = cpt_dump_dentry(found, mnt, 0, ctx); ++ err = cpt_dump_dentry(found, mnt, 0, 1, ctx); + +err_lookup: + dput(found); @@ -41907,6 +44682,86 @@ index 0000000..f013331 + return err; +} + ++static struct dentry *find_linkdir(struct vfsmount *mnt, struct cpt_context *ctx) ++{ ++ int i; ++ ++ for (i = 0; i < ctx->linkdirs_num; i++) ++ if (ctx->linkdirs[i]->f_vfsmnt == mnt) ++ return ctx->linkdirs[i]->f_dentry; ++ return NULL; ++} ++ ++struct dentry *cpt_fake_link(struct dentry *d, struct vfsmount *mnt, ++ struct inode *ino, struct cpt_context *ctx) ++{ ++ int err; ++ int order = 8; ++ const char *prefix = ".cpt_hardlink."; ++ int preflen = strlen(prefix) + order; ++ char name[preflen + 1]; ++ struct dentry *dirde, *hardde; ++ ++ dirde = find_linkdir(mnt, ctx); ++ if (!dirde) { ++ err = -ENOENT; ++ goto out; ++ } ++ ++ ctx->linkcnt++; ++ snprintf(name, sizeof(name), "%s%0*u", prefix, order, ctx->linkcnt); ++ ++ mutex_lock(&dirde->d_inode->i_mutex); ++ hardde = lookup_one_len(name, dirde, strlen(name)); ++ 
if (IS_ERR(hardde)) { ++ err = PTR_ERR(hardde); ++ goto out_unlock; ++ } ++ ++ if (hardde->d_inode) { ++ /* Userspace should clean hardlinked files from previous ++ * dump/undump ++ */ ++ eprintk_ctx("Hardlinked file already exists: %s\n", name); ++ err = -EEXIST; ++ goto out_put; ++ } ++ ++ if (d == NULL) ++ err = vfs_create(dirde->d_inode, hardde, 0600, NULL); ++ else ++ err = vfs_link(d, dirde->d_inode, hardde); ++ if (err) { ++ eprintk_ctx("error hardlink %s, %d\n", name, err); ++ goto out_put; ++ } ++ ++out_unlock: ++ mutex_unlock(&dirde->d_inode->i_mutex); ++out: ++ return err ? ERR_PTR(err) : hardde; ++ ++out_put: ++ dput(hardde); ++ goto out_unlock; ++} ++ ++static int create_dump_hardlink(struct dentry *d, struct vfsmount *mnt, ++ struct inode *ino, struct cpt_context *ctx) ++{ ++ int err; ++ struct dentry *hardde; ++ ++ hardde = cpt_fake_link(d, mnt, ino, ctx); ++ if (IS_ERR(hardde)) ++ return PTR_ERR(hardde); ++ ++ err = cpt_dump_dentry(hardde, mnt, 0, 1, ctx); ++ dput(hardde); ++ ++ return err; ++} ++ +static int dump_one_inode(struct file *file, struct dentry *d, + struct vfsmount *mnt, struct cpt_context *ctx) +{ @@ -41922,6 +44777,10 @@ index 0000000..f013331 + if (iobj->o_pos >= 0) + return 0; + ++ if (ino->i_sb->s_magic == FSMAGIC_PROC && ++ proc_dentry_of_dead_task(d)) ++ return 0; ++ + if ((!IS_ROOT(d) && d_unhashed(d)) && + !cpt_replaced(d, mnt, ctx)) + dump_it = 1; @@ -41948,6 +44807,14 @@ index 0000000..f013331 + * process group. 
*/ + if (ino->i_nlink != 0) { + err = find_linked_dentry(d, mnt, ino, ctx); ++ if (err && S_ISREG(ino->i_mode)) { ++ err = create_dump_hardlink(d, mnt, ino, ctx); ++ iobj->o_flags |= CPT_INODE_HARDLINKED; ++ } else if (S_ISCHR(ino->i_mode) || ++ S_ISBLK(ino->i_mode) || ++ S_ISFIFO(ino->i_mode)) ++ err = 0; ++ + if (err) { + eprintk_ctx("deleted reference to existing inode, checkpointing is impossible: %d\n", err); + return -EBUSY; @@ -42305,6 +45172,7 @@ index 0000000..f013331 +{ + int* pfd; + char* path; ++ envid_t veid; +}; + +static int dumptmpfs(void *arg) @@ -42316,7 +45184,7 @@ index 0000000..f013331 + char *path = args->path; + char *argv[] = { "tar", "-c", "-S", "--numeric-owner", path, NULL }; + -+ i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); ++ i = real_env_create(args->veid, VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); + if (i < 0) { + eprintk("cannot enter ve to dump tmpfs\n"); + module_put(THIS_MODULE); @@ -42363,16 +45231,20 @@ index 0000000..f013331 + int status; + mm_segment_t oldfs; + sigset_t ignore, blocked; ++ struct ve_struct *oldenv; + + err = sc_pipe(pfd); + if (err < 0) + return err; + args.pfd = pfd; + args.path = path; ++ args.veid = VEID(get_exec_env()); + ignore.sig[0] = CPT_SIG_IGNORE_MASK; + sigprocmask(SIG_BLOCK, &ignore, &blocked); ++ oldenv = set_exec_env(get_ve0()); + err = pid = local_kernel_thread(dumptmpfs, (void*)&args, + SIGCHLD | CLONE_VFORK, 0); ++ set_exec_env(oldenv); + if (err < 0) { + eprintk_ctx("tmpfs local_kernel_thread: %d\n", err); + goto out; @@ -42454,7 +45326,7 @@ index 0000000..f013331 + + /* One special case: mount --bind /a /a */ + if (mnt->mnt_root == mnt->mnt_mountpoint) -+ return cpt_dump_dentry(mnt->mnt_root, mnt, 0, ctx); ++ return cpt_dump_dentry(mnt->mnt_root, mnt, 0, 0, ctx); + + list_for_each_prev(p, &mnt->mnt_list) { + struct vfsmount * m; @@ -42467,7 +45339,7 @@ index 0000000..f013331 + if (m->mnt_sb != mnt->mnt_sb) + continue; + -+ err = cpt_dump_dentry(mnt->mnt_root, m, 0, 
ctx); ++ err = cpt_dump_dentry(mnt->mnt_root, m, 0, 1, ctx); + if (err == 0) + break; + } @@ -42517,19 +45389,30 @@ index 0000000..f013331 + cpt_dump_string(path, ctx); + cpt_dump_string(mnt->mnt_sb->s_type->name, ctx); + -+ if (v.cpt_mntflags & CPT_MNT_BIND) ++ if (v.cpt_mntflags & CPT_MNT_BIND) { + err = cpt_dump_bind_mnt(mnt, ctx); -+ else if (!(v.cpt_mntflags & CPT_MNT_EXT) && -+ strcmp(mnt->mnt_sb->s_type->name, "tmpfs") == 0) { -+ mntget(mnt); -+ up_read(&namespace_sem); -+ err = cpt_dump_tmpfs(path, ctx); -+ down_read(&namespace_sem); -+ if (!err) { -+ if (list_empty(&mnt->mnt_list)) -+ err = -EBUSY; ++ ++ /* Temporary solution for Ubuntu 8.04 */ ++ if (err == -EINVAL && !strcmp(path, "/dev/.static/dev")) { ++ cpt_dump_string("/dev", ctx); ++ err = 0; ++ } ++ } ++ else if (!(v.cpt_mntflags & CPT_MNT_EXT)) { ++ ++ if (mnt->mnt_sb->s_type->fs_flags & FS_REQUIRES_DEV) { ++ eprintk_ctx("Checkpoint supports only nodev fs: %s\n", ++ mnt->mnt_sb->s_type->name); ++ err = -EXDEV; ++ } else if (!strcmp(mnt->mnt_sb->s_type->name, "tmpfs")) { ++ mntget(mnt); ++ up_read(&namespace_sem); ++ err = cpt_dump_tmpfs(path, ctx); ++ down_read(&namespace_sem); ++ if (!err && list_empty(&mnt->mnt_list)) ++ err = -EBUSY; ++ mntput(mnt); + } -+ mntput(mnt); + } + + cpt_pop_object(&saved_obj, ctx); @@ -42547,7 +45430,7 @@ index 0000000..f013331 +{ + struct mnt_namespace *n = obj->o_obj; + struct cpt_object_hdr v; -+ struct list_head *p; ++ struct vfsmount *rootmnt, *p; + loff_t saved_obj; + int err = 0; + @@ -42563,8 +45446,9 @@ index 0000000..f013331 + cpt_push_object(&saved_obj, ctx); + + down_read(&namespace_sem); -+ list_for_each(p, &n->list) { -+ err = dump_vfsmount(list_entry(p, struct vfsmount, mnt_list), ctx); ++ rootmnt = n->root; ++ for (p = rootmnt; p; p = next_mnt(p, rootmnt)) { ++ err = dump_vfsmount(p, ctx); + if (err) + break; + } @@ -42595,10 +45479,10 @@ index 0000000..f013331 +} diff --git a/kernel/cpt/cpt_files.h b/kernel/cpt/cpt_files.h new file mode 100644 -index 
0000000..e0ebd97 +index 0000000..bc66731 --- /dev/null +++ b/kernel/cpt/cpt_files.h -@@ -0,0 +1,73 @@ +@@ -0,0 +1,77 @@ +int cpt_collect_files(cpt_context_t *); +int cpt_collect_fs(cpt_context_t *); +int cpt_collect_namespace(cpt_context_t *); @@ -42619,6 +45503,7 @@ index 0000000..e0ebd97 + +struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx); +int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx); ++int rst_files_std(struct cpt_task_image *ti, struct cpt_context *ctx); +__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_restore_fs(struct cpt_context *ctx); @@ -42658,9 +45543,11 @@ index 0000000..e0ebd97 + unsigned flags, + struct cpt_context *ctx); + ++struct dentry *cpt_fake_link(struct dentry *d, struct vfsmount *mnt, ++ struct inode *ino, struct cpt_context *ctx); + +int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, -+ cpt_context_t *ctx); ++ int verify, cpt_context_t *ctx); + +#define check_one_vfsmount(mnt) \ + (strcmp(mnt->mnt_sb->s_type->name, "rootfs") != 0 && \ @@ -42671,7 +45558,8 @@ index 0000000..e0ebd97 + strcmp(mnt->mnt_sb->s_type->name, "tmpfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "devpts") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "proc") != 0 && \ -+ strcmp(mnt->mnt_sb->s_type->name, "sysfs") != 0) ++ strcmp(mnt->mnt_sb->s_type->name, "sysfs") != 0 && \ ++ strcmp(mnt->mnt_sb->s_type->name, "binfmt_misc") != 0) diff --git a/kernel/cpt/cpt_fsmagic.h b/kernel/cpt/cpt_fsmagic.h new file mode 100644 index 0000000..7e79789 @@ -42697,10 +45585,10 @@ index 0000000..7e79789 +#define FSMAGIC_ANON 0x09041934 diff --git a/kernel/cpt/cpt_inotify.c b/kernel/cpt/cpt_inotify.c new file mode 100644 -index 0000000..87f6bfd +index 0000000..4f2abb0 --- /dev/null +++ b/kernel/cpt/cpt_inotify.c -@@ -0,0 +1,151 @@ +@@ -0,0 +1,174 @@ +/* + * + * kernel/cpt/cpt_inotify.c @@ -42744,6 
+45632,29 @@ index 0000000..87f6bfd +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + ++static int dump_watch_inode(struct path *path, cpt_context_t *ctx) ++{ ++ int err; ++ struct dentry *d; ++ ++ d = path->dentry; ++ if (IS_ROOT(d) || !d_unhashed(d)) ++ goto dump_dir; ++ ++ d = cpt_fake_link(d->d_inode->i_nlink ? d : NULL, ++ path->mnt, d->d_inode, ctx); ++ ++ if (IS_ERR(d)) ++ return PTR_ERR(d); ++ ++dump_dir: ++ err = cpt_dump_dir(d, path->mnt, ctx); ++ if (d != path->dentry) ++ dput(d); ++ ++ return err; ++} ++ +static int cpt_dump_watches(struct fsnotify_group *g, struct cpt_context *ctx) +{ + int err = 0; @@ -42783,7 +45694,7 @@ index 0000000..87f6bfd + path_get(&path); + spin_unlock(&fse->lock); + -+ err = cpt_dump_dir(path.dentry, path.mnt, ctx); ++ err = dump_watch_inode(&path, ctx); + cpt_pop_object(&saved_obj, ctx); + path_put(&path); + @@ -42854,10 +45765,10 @@ index 0000000..87f6bfd +} diff --git a/kernel/cpt/cpt_kernel.c b/kernel/cpt/cpt_kernel.c new file mode 100644 -index 0000000..3272d81 +index 0000000..10fa5d6 --- /dev/null +++ b/kernel/cpt/cpt_kernel.c -@@ -0,0 +1,178 @@ +@@ -0,0 +1,185 @@ +/* + * + * kernel/cpt/cpt_kernel.c @@ -42880,6 +45791,8 @@ index 0000000..3272d81 +#include +#endif +#include ++#include ++#include + +#include "cpt_kernel.h" +#include "cpt_syscalls.h" @@ -42952,7 +45865,9 @@ index 0000000..3272d81 + } + if (!try_module_get(THIS_MODULE)) + return -EBUSY; -+ ret = asm_kernel_thread(fn, arg, flags, pid); ++ while ((ret = asm_kernel_thread(fn, arg, flags, pid)) == ++ -ERESTARTNOINTR) ++ cond_resched(); + if (ret < 0) + module_put(THIS_MODULE); + return ret; @@ -42981,7 +45896,7 @@ index 0000000..3272d81 + return ret; +} + -+unsigned int test_cpu_caps(void) ++unsigned int test_cpu_caps_and_features(void) +{ + unsigned int flags = 0; + @@ -43023,6 +45938,9 @@ index 0000000..3272d81 + flags |= 1 << CPT_CPU_X86_IA64; + flags |= 1 << CPT_CPU_X86_FXSR; +#endif ++ if (virtinfo_notifier_call(VITYPE_SCP, ++ VIRTINFO_SCP_TEST, NULL) 
& NOTIFY_FAIL) ++ flags |= 1 << CPT_SLM_DMPRST; + return flags; +} + @@ -43038,7 +45956,7 @@ index 0000000..3272d81 +} diff --git a/kernel/cpt/cpt_kernel.h b/kernel/cpt/cpt_kernel.h new file mode 100644 -index 0000000..9254778 +index 0000000..8bbd402 --- /dev/null +++ b/kernel/cpt/cpt_kernel.h @@ -0,0 +1,99 @@ @@ -43092,7 +46010,7 @@ index 0000000..9254778 +static inline void vefs_track_notify(struct dentry *vdentry, int track_cow) { }; +#endif + -+unsigned int test_cpu_caps(void); ++unsigned int test_cpu_caps_and_features(void); +unsigned int test_kernel_config(void); + +#define test_one_flag_old(src, dst, flag, message, ret) \ @@ -44113,10 +47031,10 @@ index 0000000..dc2c483 +extern struct vm_operations_struct special_mapping_vmops; diff --git a/kernel/cpt/cpt_net.c b/kernel/cpt/cpt_net.c new file mode 100644 -index 0000000..9e09675 +index 0000000..4e183ba --- /dev/null +++ b/kernel/cpt/cpt_net.c -@@ -0,0 +1,544 @@ +@@ -0,0 +1,652 @@ +/* + * + * kernel/cpt/cpt_net.c @@ -44514,13 +47432,20 @@ index 0000000..9e09675 + return err; +} + ++struct args_t ++{ ++ int* pfd; ++ envid_t veid; ++}; ++ +static int dumpfn(void *arg) +{ + int i; -+ int *pfd = arg; ++ struct args_t *args = arg; ++ int *pfd = args->pfd; + char *argv[] = { "iptables-save", "-c", NULL }; + -+ i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); ++ i = real_env_create(args->veid, VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); + if (i < 0) { + eprintk("cannot enter ve to dump iptables\n"); + module_put(THIS_MODULE); @@ -44560,6 +47485,8 @@ index 0000000..9e09675 + int status; + mm_segment_t oldfs; + sigset_t ignore, blocked; ++ struct args_t args; ++ struct ve_struct *oldenv; + + if (!(get_exec_env()->_iptables_modules & VE_IP_IPTABLES_MOD)) + return 0; @@ -44569,9 +47496,14 @@ index 0000000..9e09675 + eprintk_ctx("sc_pipe: %d\n", err); + return err; + } ++ args.pfd = pfd; ++ args.veid = VEID(get_exec_env()); + ignore.sig[0] = CPT_SIG_IGNORE_MASK; + sigprocmask(SIG_BLOCK, &ignore, 
&blocked); -+ err = pid = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0); ++ oldenv = set_exec_env(get_ve0()); ++ err = pid = local_kernel_thread(dumpfn, (void*)&args, ++ SIGCHLD | CLONE_VFORK, 0); ++ set_exec_env(oldenv); + if (err < 0) { + eprintk_ctx("local_kernel_thread: %d\n", err); + goto out; @@ -44646,6 +47578,98 @@ index 0000000..9e09675 + return err; +} + ++static unsigned long fold_field(void *mib[], int offt) ++{ ++ unsigned long res = 0; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); ++ res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); ++ } ++ return res; ++} ++ ++static void cpt_dump_snmp_stat(struct cpt_context *ctx, void *mib[], int n) ++{ ++ int i; ++ struct cpt_object_hdr o; ++ __u32 *stats; ++ ++ stats = cpt_get_buf(ctx); ++ ++ cpt_open_object(NULL, ctx); ++ ++ for (i = 0; i < n; i++) ++ stats[i] = fold_field(mib, i); ++ ++ o.cpt_next = CPT_NULL; ++ o.cpt_object = CPT_OBJ_BITS; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_DATA; ++ ++ ctx->write(&o, sizeof(o), ctx); ++ ctx->write(stats, n * sizeof(*stats), ctx); ++ ctx->align(ctx); ++ ++ cpt_close_object(ctx); ++ ++ cpt_release_buf(ctx); ++} ++ ++static void cpt_dump_snmp_stub(struct cpt_context *ctx) ++{ ++ struct cpt_object_hdr o; ++ ++ cpt_open_object(NULL, ctx); ++ o.cpt_next = CPT_NULL; ++ o.cpt_object = CPT_OBJ_BITS; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_VOID; ++ ctx->write(&o, sizeof(o), ctx); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++} ++ ++static int cpt_dump_snmp(struct cpt_context *ctx) ++{ ++ struct ve_struct *ve; ++ struct net *net; ++ ++ ve = get_exec_env(); ++ net = ve->ve_netns; ++ ++ cpt_open_section(ctx, CPT_SECT_SNMP_STATS); ++ ++ cpt_dump_snmp_stat(ctx, (void **)&net->mib.net_statistics, ++ LINUX_MIB_MAX); ++ cpt_dump_snmp_stat(ctx, (void **)&net->mib.ip_statistics, ++ IPSTATS_MIB_MAX); ++ cpt_dump_snmp_stat(ctx, (void **)&net->mib.tcp_statistics, ++ 
TCP_MIB_MAX); ++ cpt_dump_snmp_stat(ctx, (void **)&net->mib.udp_statistics, ++ UDP_MIB_MAX); ++ cpt_dump_snmp_stat(ctx, (void **)&net->mib.icmp_statistics, ++ ICMP_MIB_MAX); ++ cpt_dump_snmp_stat(ctx, (void **)&net->mib.icmpmsg_statistics, ++ ICMPMSG_MIB_MAX); ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ cpt_dump_snmp_stat(ctx, (void **)&ve->_ipv6_statistics, ++ IPSTATS_MIB_MAX); ++ cpt_dump_snmp_stat(ctx, (void **)&ve->_udp_stats_in6, ++ UDP_MIB_MAX); ++ cpt_dump_snmp_stat(ctx, (void **)&ve->_icmpv6_statistics, ++ ICMP6_MIB_MAX); ++#else ++ cpt_dump_snmp_stub(ctx); ++ cpt_dump_snmp_stub(ctx); ++ cpt_dump_snmp_stub(ctx); ++#endif ++ cpt_close_section(ctx); ++ ++ return 0; ++} ++ +int cpt_dump_ifinfo(struct cpt_context * ctx) +{ + int err; @@ -44659,6 +47683,8 @@ index 0000000..9e09675 + err = cpt_dump_route(ctx); + if (!err) + err = cpt_dump_iptables(ctx); ++ if (!err) ++ err = cpt_dump_snmp(ctx); + return err; +} diff --git a/kernel/cpt/cpt_net.h b/kernel/cpt/cpt_net.h @@ -44676,10 +47702,10 @@ index 0000000..5d33877 +int rst_restore_ip_conntrack(struct cpt_context * ctx); diff --git a/kernel/cpt/cpt_obj.c b/kernel/cpt/cpt_obj.c new file mode 100644 -index 0000000..7ab23d7 +index 0000000..341d2ab --- /dev/null +++ b/kernel/cpt/cpt_obj.c -@@ -0,0 +1,162 @@ +@@ -0,0 +1,163 @@ +/* + * + * kernel/cpt/cpt_obj.c @@ -44720,6 +47746,7 @@ index 0000000..7ab23d7 + obj->o_index = CPT_NOINDEX; + obj->o_obj = NULL; + obj->o_image = NULL; ++ obj->o_flags = 0; + ctx->objcount++; + } + return obj; @@ -44844,10 +47871,10 @@ index 0000000..7ab23d7 +} diff --git a/kernel/cpt/cpt_obj.h b/kernel/cpt/cpt_obj.h new file mode 100644 -index 0000000..7762623 +index 0000000..2dca39b --- /dev/null +++ b/kernel/cpt/cpt_obj.h -@@ -0,0 +1,62 @@ +@@ -0,0 +1,64 @@ +#ifndef __CPT_OBJ_H_ +#define __CPT_OBJ_H_ 1 + @@ -44867,6 +47894,8 @@ index 0000000..7762623 + void *o_image; + void *o_parent; + struct list_head o_alist; ++ unsigned int o_flags; ++#define CPT_INODE_HARDLINKED 0x1 +} 
cpt_object_t; + +struct cpt_context; @@ -44912,10 +47941,10 @@ index 0000000..7762623 +#endif /* __CPT_OBJ_H_ */ diff --git a/kernel/cpt/cpt_proc.c b/kernel/cpt/cpt_proc.c new file mode 100644 -index 0000000..918fe2a +index 0000000..a7d2d82 --- /dev/null +++ b/kernel/cpt/cpt_proc.c -@@ -0,0 +1,594 @@ +@@ -0,0 +1,623 @@ +/* + * + * kernel/cpt/cpt_proc.c @@ -45001,6 +48030,8 @@ index 0000000..918fe2a + +void cpt_context_release(cpt_context_t *ctx) +{ ++ int i; ++ + list_del(&ctx->ctx_list); + spin_unlock(&cpt_context_lock); + @@ -45027,6 +48058,8 @@ index 0000000..918fe2a + fput(ctx->errorfile); + ctx->errorfile = NULL; + } ++ for (i = 0; i < ctx->linkdirs_num; i++) ++ fput(ctx->linkdirs[i]); + if (ctx->error_msg) { + free_page((unsigned long)ctx->error_msg); + ctx->error_msg = NULL; @@ -45122,7 +48155,7 @@ index 0000000..918fe2a + unsigned int src_flags, dst_flags = arg; + + err = 0; -+ src_flags = test_cpu_caps(); ++ src_flags = test_cpu_caps_and_features(); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); @@ -45244,6 +48277,26 @@ index 0000000..918fe2a + fput(ctx->file); + ctx->file = dfile; + break; ++ case CPT_LINKDIR_ADD: ++ if (ctx->linkdirs_num >= CPT_MAX_LINKDIRS) { ++ err = -EMLINK; ++ break; ++ } ++ ++ dfile = fget(arg); ++ if (!dfile) { ++ err = -EBADFD; ++ break; ++ } ++ ++ if (!S_ISDIR(dfile->f_dentry->d_inode->i_mode)) { ++ err = -ENOTDIR; ++ fput(dfile); ++ break; ++ } ++ ++ ctx->linkdirs[ctx->linkdirs_num++] = dfile; ++ break; + case CPT_SET_ERRORFD: + if (arg >= 0) { + dfile = fget(arg); @@ -45304,7 +48357,7 @@ index 0000000..918fe2a + break; + } + ctx->dst_cpu_flags = arg; -+ ctx->src_cpu_flags = test_cpu_caps(); ++ ctx->src_cpu_flags = test_cpu_caps_and_features(); + break; + case CPT_SUSPEND: + if (cpt_context_lookup_veid(ctx->ve_id) || @@ -45378,6 +48431,11 @@ index 
0000000..918fe2a + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_IA64, "ia64", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL, "syscall", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL32, "syscall32", err); ++ if (dst_flags & (1 << CPT_SLM_DMPRST)) { ++ eprintk_ctx("SLM is enabled on destination node, but slm_dmprst module is not loaded\n"); ++ err = 1; ++ } ++ + if (src_flags & CPT_UNSUPPORTED_MASK) + err = 2; + break; @@ -45512,10 +48570,10 @@ index 0000000..918fe2a +module_exit(exit_cpt); diff --git a/kernel/cpt/cpt_process.c b/kernel/cpt/cpt_process.c new file mode 100644 -index 0000000..2afc171 +index 0000000..6314bee --- /dev/null +++ b/kernel/cpt/cpt_process.c -@@ -0,0 +1,1383 @@ +@@ -0,0 +1,1379 @@ +/* + * + * kernel/cpt/cpt_process.c @@ -46241,10 +49299,6 @@ index 0000000..2afc171 + +int cpt_check_unsupported(struct task_struct *tsk, cpt_context_t *ctx) +{ -+ if (tsk->splice_pipe) { -+ eprintk_ctx("splice is used by " CPT_FID "\n", CPT_TID(tsk)); -+ return -EBUSY; -+ } +#ifdef CONFIG_KEYS + if (tsk->cred->request_key_auth || tsk->cred->thread_keyring) { + eprintk_ctx("keys are used by " CPT_FID "\n", CPT_TID(tsk)); @@ -46321,7 +49375,7 @@ index 0000000..2afc171 + return -EBUSY; + } + -+ v->cpt_flags = tsk->flags&~(PF_FROZEN|PF_EXIT_RESTART); ++ v->cpt_flags = tsk->flags & CPT_TASK_FLAGS_MASK; + v->cpt_ptrace = tsk->ptrace; + v->cpt_prio = tsk->prio; + v->cpt_exit_code = tsk->exit_code; @@ -46920,10 +49974,10 @@ index 0000000..b9f28af +struct pid *alloc_vpid_safe(pid_t vnr); diff --git a/kernel/cpt/cpt_socket.c b/kernel/cpt/cpt_socket.c new file mode 100644 -index 0000000..939fb30 +index 0000000..3943b60 --- /dev/null +++ b/kernel/cpt/cpt_socket.c -@@ -0,0 +1,790 @@ +@@ -0,0 +1,802 @@ +/* + * + * kernel/cpt/cpt_socket.c @@ -47105,7 +50159,7 @@ index 0000000..939fb30 +} + +int cpt_dump_skb(int type, int owner, struct sk_buff *skb, -+ struct cpt_context *ctx) ++ struct sock *sk, struct cpt_context *ctx) +{ + struct 
cpt_skb_image *v = cpt_get_buf(ctx); + loff_t saved_obj; @@ -47129,7 +50183,19 @@ index 0000000..939fb30 + v->cpt_nh = skb_network_header(skb) - skb->head; + v->cpt_mac = skb_mac_header(skb) - skb->head; + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v->cpt_cb)); -+ memcpy(v->cpt_cb, skb->cb, sizeof(v->cpt_cb)); ++ memset(v->cpt_cb, 0, sizeof(v->cpt_cb)); ++#if !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) ++ if (sk->sk_protocol == IPPROTO_TCP) { ++ /* Save control block according to tcp_skb_cb with IPv6 */ ++ BUG_ON(sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm) > ++ sizeof(v->cpt_cb) - sizeof(struct inet6_skb_parm)); ++ memcpy(v->cpt_cb, skb->cb, sizeof(struct inet_skb_parm)); ++ memcpy((void *)v->cpt_cb + sizeof(struct inet6_skb_parm), ++ skb->cb + sizeof(struct inet_skb_parm), ++ sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm)); ++ } else ++#endif ++ memcpy(v->cpt_cb, skb->cb, sizeof(v->cpt_cb)); + if (sizeof(skb->cb) > sizeof(v->cpt_cb)) { + int i; + for (i=sizeof(v->cpt_cb); icb); i++) { @@ -47256,7 +50322,7 @@ index 0000000..939fb30 + } + } + -+ err = cpt_dump_skb(CPT_SKB_RQ, idx, skb, ctx); ++ err = cpt_dump_skb(CPT_SKB_RQ, idx, skb, sk, ctx); + if (err) + return err; + @@ -47274,7 +50340,7 @@ index 0000000..939fb30 + + skb = skb_peek(&sk->sk_write_queue); + while (skb && skb != (struct sk_buff*)&sk->sk_write_queue) { -+ int err = cpt_dump_skb(CPT_SKB_WQ, idx, skb, ctx); ++ int err = cpt_dump_skb(CPT_SKB_WQ, idx, skb, sk, ctx); + if (err) + return err; + @@ -47438,7 +50504,7 @@ index 0000000..939fb30 + } else { + wprintk_ctx("af_unix path is too long: %s (%s)\n", path, ((char*)v->cpt_laddr)+2); + } -+ err = cpt_verify_overmount(path, d, unix_sk(sk)->mnt, ctx); ++ err = cpt_verify_overmount(path, d, unix_sk(sk)->mnt, 1, ctx); + } else { + eprintk_ctx("cannot get path of an af_unix socket\n"); + err = PTR_ERR(path); @@ -47716,10 +50782,10 @@ index 0000000..939fb30 +} diff --git a/kernel/cpt/cpt_socket.h b/kernel/cpt/cpt_socket.h new file 
mode 100644 -index 0000000..6489184 +index 0000000..9c64399 --- /dev/null +++ b/kernel/cpt/cpt_socket.h -@@ -0,0 +1,33 @@ +@@ -0,0 +1,37 @@ +struct sock; + +int cpt_collect_passedfds(cpt_context_t *); @@ -47733,7 +50799,8 @@ index 0000000..6489184 +int cpt_dump_orphaned_sockets(struct cpt_context *ctx); + +int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx); -+struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx); ++struct sk_buff * rst_skb(struct sock *sk, loff_t *pos_p, __u32 *owner, ++ __u32 *queue, struct cpt_context *ctx); + +void cpt_unlock_sockets(cpt_context_t *); +void cpt_kill_sockets(cpt_context_t *); @@ -47742,11 +50809,14 @@ index 0000000..6489184 +int cpt_kill_socket(struct sock *, cpt_context_t *); +int cpt_dump_socket_in(struct cpt_sock_image *, struct sock *, struct cpt_context*); +int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *, struct cpt_context *ctx); ++int rst_listen_socket_in(struct sock *sk, struct cpt_sock_image *si, ++ loff_t pos, struct cpt_context *ctx); +__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx); +int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *); +int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, loff_t pos, struct cpt_context *ctx); +int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx); -+int cpt_dump_skb(int type, int owner, struct sk_buff *skb, struct cpt_context *ctx); ++int cpt_dump_skb(int type, int owner, struct sk_buff *skb, struct sock *sk, ++ struct cpt_context *ctx); +int cpt_dump_mcfilter(struct sock *sk, struct cpt_context *ctx); + +int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v, @@ -47755,7 +50825,7 @@ index 0000000..6489184 + loff_t pos, cpt_context_t *ctx); diff --git a/kernel/cpt/cpt_socket_in.c b/kernel/cpt/cpt_socket_in.c new file mode 100644 -index 0000000..9c25d70 +index 0000000..d565745 --- /dev/null +++ 
b/kernel/cpt/cpt_socket_in.c @@ -0,0 +1,448 @@ @@ -47820,7 +50890,7 @@ index 0000000..9c25d70 + while (skb && skb != (struct sk_buff*)&tp->out_of_order_queue) { + int err; + -+ err = cpt_dump_skb(CPT_SKB_OFOQ, idx, skb, ctx); ++ err = cpt_dump_skb(CPT_SKB_OFOQ, idx, skb, sk, ctx); + if (err) + return err; + @@ -48073,7 +51143,7 @@ index 0000000..9c25d70 + v->cpt_snt_isn = tcp_rsk(req)->snt_isn; + v->cpt_rmt_port = inet_rsk(req)->rmt_port; + v->cpt_mss = req->mss; -+ // // v->cpt_family = (req->class == &or_ipv4 ? AF_INET : AF_INET6); ++ v->cpt_family = req->rsk_ops->family; + v->cpt_retrans = req->retrans; + v->cpt_snd_wscale = inet_rsk(req)->snd_wscale; + v->cpt_rcv_wscale = inet_rsk(req)->rcv_wscale; @@ -48946,10 +52016,10 @@ index 0000000..8ac9417 +} diff --git a/kernel/cpt/cpt_ubc.c b/kernel/cpt/cpt_ubc.c new file mode 100644 -index 0000000..5746184 +index 0000000..0fc4f5f --- /dev/null +++ b/kernel/cpt/cpt_ubc.c -@@ -0,0 +1,133 @@ +@@ -0,0 +1,135 @@ +/* + * + * kernel/cpt/cpt_ubc.c @@ -49020,13 +52090,15 @@ index 0000000..5746184 + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_UBC; + v->cpt_hdrlen = sizeof(*v); -+ v->cpt_content = CPT_CONTENT_VOID; ++ v->cpt_content = CPT_CONTENT_ARRAY; + + if (obj->o_parent != NULL) + v->cpt_parent = ((cpt_object_t *)obj->o_parent)->o_pos; + else + v->cpt_parent = CPT_NULL; + v->cpt_id = (obj->o_parent != NULL) ? 
bc->ub_uid : 0; ++ v->cpt_ub_resources = UB_RESOURCES; ++ BUILD_BUG_ON(ARRAY_SIZE(v->cpt_parms) < UB_RESOURCES * 2); + for (i = 0; i < UB_RESOURCES; i++) { + dump_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0); + dump_one_bc_parm(v->cpt_parms + i * 2 + 1, bc->ub_store + i, 1); @@ -49187,10 +52259,10 @@ index 0000000..0d5e361 + diff --git a/kernel/cpt/rst_conntrack.c b/kernel/cpt/rst_conntrack.c new file mode 100644 -index 0000000..4c31f32 +index 0000000..b863ac4 --- /dev/null +++ b/kernel/cpt/rst_conntrack.c -@@ -0,0 +1,283 @@ +@@ -0,0 +1,328 @@ +/* + * + * kernel/cpt/rst_conntrack.c @@ -49249,17 +52321,33 @@ index 0000000..4c31f32 + int index; +}; + -+static void decode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple, int dir) ++static int decode_tuple(struct cpt_ipct_tuple *v, ++ struct ip_conntrack_tuple *tuple, int dir, ++ cpt_context_t *ctx) +{ + tuple->dst.ip = v->cpt_dst; + tuple->dst.u.all = v->cpt_dstport; -+ tuple->dst.protonum = v->cpt_protonum; -+ tuple->dst.dir = v->cpt_dir; -+ if (dir != tuple->dst.dir) -+ wprintk("dir != tuple->dst.dir\n"); ++ if (ctx->image_version < CPT_VERSION_16) { ++ /* In 2.6.9 kernel protonum has short type */ ++ __u16 protonum = *(__u16 *)&v->cpt_protonum; ++ if (protonum > 0xff && protonum < 0xffff) { ++ eprintk_ctx("tuple: protonum > 255: %u\n", protonum); ++ return -EINVAL; ++ } ++ tuple->dst.protonum = protonum; ++ tuple->dst.dir = dir; ++ } else { ++ tuple->dst.protonum = v->cpt_protonum; ++ tuple->dst.dir = v->cpt_dir; ++ if (dir != tuple->dst.dir) { ++ eprintk_ctx("dir != tuple->dst.dir\n"); ++ return -EINVAL; ++ } ++ } + + tuple->src.ip = v->cpt_src; + tuple->src.u.all = v->cpt_srcport; ++ return 0; +} + + @@ -49314,16 +52402,13 @@ index 0000000..4c31f32 + return -ENOMEM; + } + -+ if (ct->helper->timeout && !del_timer(&exp->timeout)) { -+ /* Dying already. We can do nothing. 
*/ ++ if (decode_tuple(&v.cpt_tuple, &exp->tuple, 0, ctx) || ++ decode_tuple(&v.cpt_mask, &exp->mask, 0, ctx)) { ++ ip_conntrack_expect_put(exp); + write_unlock_bh(&ip_conntrack_lock); -+ dprintk_ctx("conntrack expectation is dying\n"); -+ continue; ++ return -EINVAL; + } + -+ decode_tuple(&v.cpt_tuple, &exp->tuple, 0); -+ decode_tuple(&v.cpt_mask, &exp->mask, 0); -+ + exp->master = ct; + nf_conntrack_get(&ct->ct_general); + ip_conntrack_expect_insert(exp); @@ -49337,11 +52422,12 @@ index 0000000..4c31f32 + } else +#endif + if (ct->helper->timeout) { -+ exp->timeout.expires = jiffies + v.cpt_timeout; -+ add_timer(&exp->timeout); ++ mod_timer(&exp->timeout, jiffies + v.cpt_timeout); + } + write_unlock_bh(&ip_conntrack_lock); + ++ ip_conntrack_expect_put(exp); ++ + pos += v.cpt_next; + } + return 0; @@ -49359,8 +52445,11 @@ index 0000000..4c31f32 + if (c == NULL) + return -ENOMEM; + -+ decode_tuple(&ci->cpt_tuple[0], &orig, 0); -+ decode_tuple(&ci->cpt_tuple[1], &repl, 1); ++ if (decode_tuple(&ci->cpt_tuple[0], &orig, 0, ctx) || ++ decode_tuple(&ci->cpt_tuple[1], &repl, 1, ctx)) { ++ kfree(c); ++ return -EINVAL; ++ } + + conntrack = ip_conntrack_alloc(&orig, &repl, get_exec_env()->_ip_conntrack->ub); + if (!conntrack || IS_ERR(conntrack)) { @@ -49373,14 +52462,15 @@ index 0000000..4c31f32 + *ct_list = c; + c->index = ci->cpt_index; + -+ decode_tuple(&ci->cpt_tuple[0], &conntrack->tuplehash[0].tuple, 0); -+ decode_tuple(&ci->cpt_tuple[1], &conntrack->tuplehash[1].tuple, 1); -+ + conntrack->status = ci->cpt_status; + + memcpy(&conntrack->proto, ci->cpt_proto_data, sizeof(conntrack->proto)); + memcpy(&conntrack->help, ci->cpt_help_data, sizeof(conntrack->help)); + ++#if defined(CONFIG_IP_NF_CONNTRACK_MARK) ++ conntrack->mark = ci->cpt_mark; ++#endif ++ +#ifdef CONFIG_IP_NF_NAT_NEEDED +#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ + defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) @@ -49412,9 +52502,34 @@ index 0000000..4c31f32 + if (err == 0 && ci->cpt_next > 
ci->cpt_hdrlen) + err = undump_expect_list(conntrack, ci, pos, *ct_list, ctx); + ++ if (conntrack->helper) ++ ip_conntrack_helper_put(conntrack->helper); ++ + return err; +} + ++static void convert_conntrack_image(struct cpt_ip_conntrack_image *ci) ++{ ++ struct cpt_ip_conntrack_image_compat img; ++ ++ memcpy(&img, ci, sizeof(struct cpt_ip_conntrack_image_compat)); ++ /* ++ * Size of cpt_help_data in 2.6.9 kernel is 16 bytes, ++ * in 2.6.18 cpt_help_data size is 24 bytes, so zero the rest 8 bytes ++ */ ++ memset(ci->cpt_help_data + 4, 0, 8); ++ ci->cpt_initialized = img.cpt_initialized; ++ ci->cpt_num_manips = img.cpt_num_manips; ++ memcpy(ci->cpt_nat_manips, img.cpt_nat_manips, sizeof(img.cpt_nat_manips)); ++ memcpy(ci->cpt_nat_seq, img.cpt_nat_seq, sizeof(img.cpt_nat_seq)); ++ ci->cpt_masq_index = img.cpt_masq_index; ++ /* Id will be assigned in ip_conntrack_hash_insert(), so make it 0 here */ ++ ci->cpt_id = 0; ++ /* mark was not supported in 2.6.9, so set it to default 0 value */ ++ ci->cpt_mark = 0; ++ ++} ++ +int rst_restore_ip_conntrack(struct cpt_context * ctx) +{ + int err = 0; @@ -49445,6 +52560,8 @@ index 0000000..4c31f32 + err = rst_get_object(CPT_OBJ_NET_CONNTRACK, sec, &ci, ctx); + if (err) + break; ++ if (ctx->image_version < CPT_VERSION_16) ++ convert_conntrack_image(&ci); + err = undump_one_ct(&ci, sec, &ct_list, ctx); + if (err) + break; @@ -49476,10 +52593,10 @@ index 0000000..4c31f32 +#endif diff --git a/kernel/cpt/rst_context.c b/kernel/cpt/rst_context.c new file mode 100644 -index 0000000..c68e807 +index 0000000..0007197 --- /dev/null +++ b/kernel/cpt/rst_context.c -@@ -0,0 +1,330 @@ +@@ -0,0 +1,331 @@ +/* + * + * kernel/cpt/rst_context.c @@ -49662,8 +52779,9 @@ index 0000000..c68e807 + ctx->start_time.tv_nsec = h.cpt_start_nsec; + ctx->kernel_config_flags = h.cpt_kernel_config[0]; + ctx->iptables_mask = h.cpt_iptables_mask; -+ if (h.cpt_image_version > CPT_VERSION_32 || -+ CPT_VERSION_MINOR(h.cpt_image_version) > 1) { ++ if 
(h.cpt_image_version > CPT_CURRENT_VERSION || ++ CPT_VERSION_MINOR(h.cpt_image_version) > ++ CPT_VERSION_MINOR(CPT_CURRENT_VERSION)) { + eprintk_ctx("Unknown image version: %x. Can't restore.\n", + h.cpt_image_version); + err = -EINVAL; @@ -49987,10 +53105,10 @@ index 0000000..0ac4cae +} diff --git a/kernel/cpt/rst_files.c b/kernel/cpt/rst_files.c new file mode 100644 -index 0000000..4b21b04 +index 0000000..a84e3d3 --- /dev/null +++ b/kernel/cpt/rst_files.c -@@ -0,0 +1,1698 @@ +@@ -0,0 +1,1779 @@ +/* + * + * kernel/cpt/rst_files.c @@ -50030,6 +53148,7 @@ index 0000000..4b21b04 +#include +#include +#include ++#include + +#include "cpt_obj.h" +#include "cpt_context.h" @@ -50523,7 +53642,7 @@ index 0000000..4b21b04 + fput(file); + file = dentry_open(dget(file->f_dentry), + mntget(file->f_vfsmnt), -+ O_WRONLY, NULL); ++ O_WRONLY | O_LARGEFILE, NULL); + if (IS_ERR(file)) { + __cpt_release_buf(ctx); + return PTR_ERR(file); @@ -50825,6 +53944,7 @@ index 0000000..4b21b04 + struct cpt_file_image fi; + __u8 *name = NULL; + struct file *file; ++ struct proc_dir_entry *proc_dead_file; + int flags; + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, pos, ctx); @@ -50896,6 +54016,12 @@ index 0000000..4b21b04 + err = -EINVAL; + goto err_out; + } ++ if ((fi.cpt_lflags & CPT_DENTRY_HARDLINKED) && ++ !ctx->hardlinked_on) { ++ eprintk_ctx("Open hardlinked is off\n"); ++ err = -EPERM; ++ goto err_out; ++ } + goto open_file; + } + } @@ -50963,8 +54089,32 @@ index 0000000..4b21b04 + goto map_file; + } + ++ /* This hook is needed to open file /proc// ++ * but there is no proccess with pid . 
++ */ ++ proc_dead_file = NULL; ++ if (fi.cpt_lflags & CPT_DENTRY_PROCPID_DEAD) { ++ sprintf(name, "/proc/rst_dead_pid_file_%d", task_pid_vnr(current)); ++ ++ proc_dead_file = create_proc_entry(name + 6, S_IRUGO|S_IWUGO, ++ NULL); ++ if (!proc_dead_file) { ++ eprintk_ctx("can't create proc entry %s\n", name); ++ err = -ENOMEM; ++ goto err_out; ++ } ++#ifdef CONFIG_PROC_FS ++ proc_dead_file->proc_fops = &dummy_proc_pid_file_operations; ++#endif ++ } ++ + file = filp_open(name, flags, 0); + ++ if (proc_dead_file) { ++ remove_proc_entry(proc_dead_file->name, NULL); ++ if (!IS_ERR(file)) ++ d_drop(file->f_dentry); ++ } +map_file: + if (!IS_ERR(file)) { + fixup_file_flags(file, &fi, was_dentry_open, pos, ctx); @@ -51009,7 +54159,8 @@ index 0000000..4b21b04 + goto err_put; + } + } else { -+ if (fi.cpt_lflags & CPT_DENTRY_PROC) { ++ if ((fi.cpt_lflags & CPT_DENTRY_PROC) && ++ !(fi.cpt_lflags & CPT_DENTRY_PROCPID_DEAD)) { + dprintk_ctx("rst_file /proc delayed\n"); + file = NULL; + } else if (name) @@ -51073,7 +54224,8 @@ index 0000000..4b21b04 +extern int expand_fdtable(struct files_struct *files, int nr); + + -+int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx) ++static int rst_files(struct cpt_task_image *ti, struct cpt_context *ctx, ++ int from, int to) +{ + struct cpt_files_struct_image fi; + struct files_struct *f = current->files; @@ -51088,6 +54240,14 @@ index 0000000..4b21b04 + return 0; + } + ++ if (from == 3) { ++ err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx); ++ if (err) ++ return err; ++ ++ goto just_do_it; ++ } ++ + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx); + if (obj) { + if (obj->o_obj != f) { @@ -51113,6 +54273,7 @@ index 0000000..4b21b04 + return err; + } + ++just_do_it: + pos = ti->cpt_files + fi.cpt_hdrlen; + endpos = ti->cpt_files + fi.cpt_next; + while (pos < endpos) { @@ -51122,6 +54283,9 @@ index 0000000..4b21b04 + err = rst_get_object(CPT_OBJ_FILEDESC, pos, &fdi, ctx); + if (err) + return 
err; ++ if (fdi.cpt_fd < from || fdi.cpt_fd > to) ++ goto skip; ++ + filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); + if (IS_ERR(filp)) { + eprintk_ctx("rst_file: %ld %Lu\n", PTR_ERR(filp), @@ -51139,6 +54303,8 @@ index 0000000..4b21b04 + if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) + FD_SET(fdi.cpt_fd, f->fdt->close_on_exec); + } ++ ++skip: + pos += fdi.cpt_next; + } + f->next_fd = fi.cpt_next_fd; @@ -51151,6 +54317,16 @@ index 0000000..4b21b04 + return 0; +} + ++int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ return rst_files(ti, ctx, (ti->cpt_pid == 1) ? 3 : 0, INT_MAX); ++} ++ ++int rst_files_std(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ return rst_files(ti, ctx, 0, 2); ++} ++ +int rst_do_filejobs(cpt_context_t *ctx) +{ + struct filejob *j; @@ -51260,8 +54436,31 @@ index 0000000..4b21b04 + return err; + + file = rst_file(*pos, -2, ctx); -+ if (IS_ERR(file)) ++ if (IS_ERR(file)) { ++ if (PTR_ERR(file) == -EINVAL && S_ISLNK(fi.cpt_i_mode)) { ++ /* One special case: inotify on symlink */ ++ struct nameidata nd; ++ __u8 *name = NULL; ++ ++ if (fi.cpt_next > fi.cpt_hdrlen) ++ name = rst_get_name(*pos + sizeof(fi), ctx); ++ if (!name) { ++ eprintk_ctx("can't get name for file\n"); ++ return -EINVAL; ++ } ++ if ((err = path_lookup(name, 0, &nd)) != 0) { ++ eprintk_ctx("path_lookup %s: %d\n", name, err); ++ rst_put_name(name, ctx); ++ return -EINVAL; ++ } ++ *dp = nd.path.dentry; ++ *mp = nd.path.mnt; ++ *pos += fi.cpt_next; ++ rst_put_name(name, ctx); ++ return 0; ++ } + return PTR_ERR(file); ++ } + + *dp = dget(file->f_dentry); + *mp = mntget(file->f_vfsmnt); @@ -53041,10 +56240,10 @@ index 0000000..78627cc +} diff --git a/kernel/cpt/rst_net.c b/kernel/cpt/rst_net.c new file mode 100644 -index 0000000..dc5de80 +index 0000000..4c8d482 --- /dev/null +++ b/kernel/cpt/rst_net.c -@@ -0,0 +1,628 @@ +@@ -0,0 +1,745 @@ +/* + * + * kernel/cpt/rst_net.c @@ -53638,6 +56837,7 @@ index 0000000..dc5de80 + err = (status & 0xff00) >> 
8; + if (err != 0) { + eprintk_ctx("iptables-restore exited with %d\n", err); ++ eprintk_ctx("Most probably some iptables modules are not loaded\n"); + err = -EINVAL; + } + } else { @@ -53658,6 +56858,120 @@ index 0000000..dc5de80 + return err; +} + ++static int rst_restore_snmp_stat(struct cpt_context *ctx, void *mib[], int n, ++ loff_t *ppos, loff_t endpos) ++{ ++ int err, in, i; ++ struct cpt_object_hdr o; ++ __u32 *stats; ++ ++ err = rst_get_object(CPT_OBJ_BITS, *ppos, &o, ctx); ++ if (err) ++ return err; ++ ++ in = o.cpt_next - o.cpt_hdrlen; ++ if (in >= PAGE_SIZE - 4) { ++ eprintk_ctx("Too long SNMP buf (%d)\n", in); ++ return -EINVAL; ++ } ++ ++ if (o.cpt_content != CPT_CONTENT_DATA) { ++ if (o.cpt_content == CPT_CONTENT_VOID) ++ return 1; ++ ++ eprintk_ctx("Corrupted SNMP stats\n"); ++ return -EINVAL; ++ } ++ ++ stats = cpt_get_buf(ctx); ++ err = ctx->pread(stats, in, ctx, (*ppos) + o.cpt_hdrlen); ++ if (err) ++ goto out; ++ ++ in /= sizeof(*stats); ++ if (in > n) ++ wprintk_ctx("SNMP stats trimmed\n"); ++ else ++ n = in; ++ ++ for (i = 0; i < n; i++) ++ *((unsigned long *)(per_cpu_ptr(mib[0], 0)) + i) = stats[i]; ++ ++ *ppos += o.cpt_next; ++ if (*ppos < endpos) ++ err = 1; /* go on restoring */ ++out: ++ cpt_release_buf(ctx); ++ return err; ++} ++ ++static int rst_restore_snmp(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_SNMP_STATS]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct ve_struct *ve; ++ struct net *net; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_SNMP_STATS || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ ve = get_exec_env(); ++ net = ve->ve_netns; ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ if (sec >= endsec) ++ goto out; ++ ++ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.net_statistics, ++ LINUX_MIB_MAX, &sec, endsec); ++ if (err <= 0) ++ goto out; ++ err = 
rst_restore_snmp_stat(ctx, (void **)&net->mib.ip_statistics, ++ IPSTATS_MIB_MAX, &sec, endsec); ++ if (err <= 0) ++ goto out; ++ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.tcp_statistics, ++ TCP_MIB_MAX, &sec, endsec); ++ if (err <= 0) ++ goto out; ++ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.udp_statistics, ++ UDP_MIB_MAX, &sec, endsec); ++ if (err <= 0) ++ goto out; ++ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.icmp_statistics, ++ ICMP_MIB_MAX, &sec, endsec); ++ if (err <= 0) ++ goto out; ++ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.icmpmsg_statistics, ++ ICMPMSG_MIB_MAX, &sec, endsec); ++ if (err <= 0) ++ goto out; ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ err = rst_restore_snmp_stat(ctx, (void **)&ve->_ipv6_statistics, ++ IPSTATS_MIB_MAX, &sec, endsec); ++ if (err <= 0) ++ goto out; ++ err = rst_restore_snmp_stat(ctx, (void **)&ve->_udp_stats_in6, ++ UDP_MIB_MAX, &sec, endsec); ++ if (err <= 0) ++ goto out; ++ err = rst_restore_snmp_stat(ctx, (void **)&ve->_icmpv6_statistics, ++ ICMP6_MIB_MAX, &sec, endsec); ++#endif ++ if (err == 1) ++ err = 0; ++out: ++ return err; ++} ++ +int rst_restore_net(struct cpt_context *ctx) +{ + int err; @@ -53671,14 +56985,16 @@ index 0000000..dc5de80 + err = rst_restore_iptables(ctx); + if (!err) + err = rst_restore_ip_conntrack(ctx); ++ if (!err) ++ err = rst_restore_snmp(ctx); + return err; +} diff --git a/kernel/cpt/rst_proc.c b/kernel/cpt/rst_proc.c new file mode 100644 -index 0000000..2b0b283 +index 0000000..beaaa3f --- /dev/null +++ b/kernel/cpt/rst_proc.c -@@ -0,0 +1,579 @@ +@@ -0,0 +1,582 @@ +/* + * + * kernel/cpt/rst_proc.c @@ -53887,7 +57203,7 @@ index 0000000..2b0b283 + unlock_kernel(); + + if (cmd == CPT_TEST_CAPS) { -+ err = test_cpu_caps(); ++ err = test_cpu_caps_and_features(); + goto out_lock; + } + @@ -54087,6 +57403,9 @@ index 0000000..2b0b283 + fput(ctx->errorfile); + ctx->errorfile = dfile; + break; ++ case CPT_HARDLNK_ON: ++ ctx->hardlinked_on = 1; ++ 
break; + case CPT_SET_VEID: + if (ctx->ctx_state > 0) { + err = -EBUSY; @@ -54260,10 +57579,10 @@ index 0000000..2b0b283 +module_exit(exit_rst); diff --git a/kernel/cpt/rst_process.c b/kernel/cpt/rst_process.c new file mode 100644 -index 0000000..19915b3 +index 0000000..000e0b9 --- /dev/null +++ b/kernel/cpt/rst_process.c -@@ -0,0 +1,1614 @@ +@@ -0,0 +1,1661 @@ +/* + * + * kernel/cpt/rst_process.c @@ -54687,8 +58006,13 @@ index 0000000..19915b3 + } + } + -+ if (si->cpt_curr_target) ++ if (si->cpt_curr_target) { + current->signal->curr_target = find_task_by_vpid(si->cpt_curr_target); ++ if (current->signal->curr_target == NULL) { ++ wprintk_ctx("oops, curr_target=NULL, pid=%u\n", si->cpt_curr_target); ++ current->signal->curr_target = current; ++ } ++ } + current->signal->flags = 0; + *exiting = si->cpt_group_exit; + current->signal->group_exit_code = si->cpt_group_exit_code; @@ -55449,7 +58773,7 @@ index 0000000..19915b3 +#ifdef CONFIG_X86_32 + unsigned int flags; + -+ flags = test_cpu_caps(); ++ flags = test_cpu_caps_and_features(); + + /* if cpu does not support sse2 mask 6 bit (DAZ flag) and 16-31 bits + in MXCSR to avoid general protection fault */ @@ -55462,6 +58786,32 @@ index 0000000..19915b3 +#include +#endif + ++#define RLIM_INFINITY32 0xffffffff ++#define RLIM_INFINITY64 (~0ULL) ++ ++#ifdef CONFIG_X86_64 ++#define rst_rlim_32_to_64(a, i, t, im) \ ++do { \ ++ if (im->cpt_rlim_##a[i] == RLIM_INFINITY32) \ ++ t->signal->rlim[i].rlim_##a = RLIM_INFINITY64; \ ++ else \ ++ t->signal->rlim[i].rlim_##a = im->cpt_rlim_##a[i]; \ ++} while (0) ++#elif defined(CONFIG_X86_32) ++#define rst_rlim_64_to_32(a, i, t, im) \ ++do { \ ++ if (im->cpt_rlim_##a[i] == RLIM_INFINITY64) \ ++ t->signal->rlim[i].rlim_##a = RLIM_INFINITY32; \ ++ else if (im->cpt_rlim_##a[i] > RLIM_INFINITY32) { \ ++ eprintk_ctx("rlimit %Lu is too high for 32-bit task, " \ ++ "dump file is corrupted\n", \ ++ im->cpt_rlim_##a[i]); \ ++ return -EINVAL; \ ++ } else \ ++ t->signal->rlim[i].rlim_##a = 
im->cpt_rlim_##a[i]; \ ++} while (0) ++#endif ++ +int rst_restore_process(struct cpt_context *ctx) +{ + cpt_object_t *obj; @@ -55574,8 +58924,23 @@ index 0000000..19915b3 + tsk->signal->cmaj_flt = ti->cpt_cmaj_flt; + + for (i=0; isignal->rlim[i].rlim_cur = ti->cpt_rlim_cur[i]; -+ tsk->signal->rlim[i].rlim_max = ti->cpt_rlim_max[i]; ++#ifdef CONFIG_X86_64 ++ if (ctx->image_arch == CPT_OS_ARCH_I386) { ++ rst_rlim_32_to_64(cur, i, tsk, ti); ++ rst_rlim_32_to_64(max, i, tsk, ti); ++ } else ++#elif defined(CONFIG_X86_32) ++ if (ctx->image_arch == CPT_OS_ARCH_EMT64) { ++ rst_rlim_64_to_32(cur, i, tsk, ti); ++ rst_rlim_64_to_32(max, i, tsk, ti); ++ } else ++#endif ++ { ++ tsk->signal->rlim[i].rlim_cur = ++ ti->cpt_rlim_cur[i]; ++ tsk->signal->rlim[i].rlim_max = ++ ti->cpt_rlim_max[i]; ++ } + } + } +#endif @@ -55809,7 +59174,8 @@ index 0000000..19915b3 + } + + tsk->ptrace = ti->cpt_ptrace; -+ tsk->flags = ti->cpt_flags & ~PF_FROZEN; ++ tsk->flags = (tsk->flags & PF_USED_MATH) | ++ (ti->cpt_flags & CPT_TASK_FLAGS_MASK); + clear_tsk_thread_flag(tsk, TIF_FREEZE); + tsk->exit_signal = ti->cpt_exit_signal; + @@ -55880,10 +59246,10 @@ index 0000000..19915b3 +} diff --git a/kernel/cpt/rst_socket.c b/kernel/cpt/rst_socket.c new file mode 100644 -index 0000000..22e1d1b +index 0000000..78cc4ff --- /dev/null +++ b/kernel/cpt/rst_socket.c -@@ -0,0 +1,918 @@ +@@ -0,0 +1,993 @@ +/* + * + * kernel/cpt/rst_socket.c @@ -56121,7 +59487,7 @@ index 0000000..22e1d1b + struct sk_buff *skb; + __u32 type; + -+ skb = rst_skb(&pos, NULL, &type, ctx); ++ skb = rst_skb(sk, &pos, NULL, &type, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; @@ -56374,8 +59740,10 @@ index 0000000..22e1d1b + + setup_sock_common(sock->sk, si, pos, ctx); + -+ if (si->cpt_family == AF_INET || si->cpt_family == AF_INET6) ++ if (si->cpt_family == AF_INET || si->cpt_family == AF_INET6) { ++ rst_listen_socket_in(sock->sk, si, pos, ctx); + rst_restore_synwait_queue(sock->sk, si, pos, ctx); ++ } + + return 
0; + @@ -56456,7 +59824,53 @@ index 0000000..22e1d1b + return err; +} + -+struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx) ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++static void rst_tcp_cb_ipv4_to_ipv6(struct cpt_skb_image *v, struct sk_buff *skb) ++{ ++ BUG_ON(sizeof(skb->cb) - sizeof(struct inet6_skb_parm) < ++ sizeof(struct tcp_skb_cb) - sizeof(struct inet6_skb_parm)); ++ memcpy(skb->cb, v->cpt_cb, sizeof(struct inet_skb_parm)); ++ memcpy(skb->cb + sizeof(struct inet6_skb_parm), ++ (void *)v->cpt_cb + sizeof(struct inet_skb_parm), ++ sizeof(struct tcp_skb_cb) - sizeof(struct inet6_skb_parm)); ++} ++#else ++static void rst_tcp_cb_ipv6_to_ipv4(struct cpt_skb_image *v, struct sk_buff *skb) ++{ ++ BUG_ON(sizeof(v->cpt_cb) - sizeof(struct inet6_skb_parm) < ++ sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm)); ++ memcpy(skb->cb, v->cpt_cb, sizeof(struct inet_skb_parm)); ++ memcpy(skb->cb + sizeof(struct inet_skb_parm), ++ (void *)v->cpt_cb + sizeof(struct inet6_skb_parm), ++ sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm)); ++} ++#endif ++ ++struct tcp_skb_cb_ipv6 { ++ union { ++ struct inet_skb_parm h4; ++ struct inet6_skb_parm h6; ++ } header; ++ __u32 seq; ++ __u32 end_seq; ++ __u32 when; ++ __u8 flags; ++ __u8 sacked; ++ __u16 urg_ptr; ++ __u32 ack_seq; ++}; ++ ++#define check_tcp_cb_conv(op1, op2) do { \ ++ if (!ctx->tcp_cb_convert) \ ++ ctx->tcp_cb_convert = CPT_TCP_CB_##op1; \ ++ else if (ctx->tcp_cb_convert == CPT_TCP_CB_##op2) { \ ++ kfree_skb(skb); \ ++ return ERR_PTR(-EINVAL); \ ++ } \ ++} while (0) ++ ++struct sk_buff * rst_skb(struct sock *sk, loff_t *pos_p, __u32 *owner, ++ __u32 *queue, struct cpt_context *ctx) +{ + int err; + struct sk_buff *skb; @@ -56490,7 +59904,34 @@ index 0000000..22e1d1b + skb->mac_header = skb->head + v.cpt_mac; +#endif + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v.cpt_cb)); -+ memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb)); ++ if (sk->sk_protocol == 
IPPROTO_TCP) { ++ /* ++ * According to Alexey all packets in queue have non-zero ++ * flags, as at least TCPCB_FLAG_ACK is set on them. ++ * Luckily for us, offset of field flags in tcp_skb_cb struct ++ * with IPv6 is higher then total size of tcp_skb_cb struct ++ * without IPv6. ++ */ ++ if (ctx->image_version >= CPT_VERSION_18_2 || ++ ((struct tcp_skb_cb_ipv6 *)&v.cpt_cb)->flags) { ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ check_tcp_cb_conv(NOT_CONV, CONV); ++ memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb)); ++#else ++ check_tcp_cb_conv(CONV, NOT_CONV); ++ rst_tcp_cb_ipv6_to_ipv4(&v, skb); ++#endif ++ } else { ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ check_tcp_cb_conv(CONV, NOT_CONV); ++ rst_tcp_cb_ipv4_to_ipv6(&v, skb); ++#else ++ check_tcp_cb_conv(NOT_CONV, CONV); ++ memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb)); ++#endif ++ } ++ } else ++ memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb)); + skb->mac_len = v.cpt_mac_len; + + skb->csum = v.cpt_csum; @@ -56568,7 +60009,7 @@ index 0000000..22e1d1b + struct sock *owner_sk; + __u32 owner; + -+ skb = rst_skb(&pos, &owner, NULL, ctx); ++ skb = rst_skb(sk, &pos, &owner, NULL, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; @@ -56804,10 +60245,10 @@ index 0000000..22e1d1b + diff --git a/kernel/cpt/rst_socket_in.c b/kernel/cpt/rst_socket_in.c new file mode 100644 -index 0000000..f63df90 +index 0000000..08bf907 --- /dev/null +++ b/kernel/cpt/rst_socket_in.c -@@ -0,0 +1,492 @@ +@@ -0,0 +1,578 @@ +/* + * + * kernel/cpt/rst_socket_in.c @@ -56869,7 +60310,7 @@ index 0000000..f63df90 + struct sk_buff *skb; + __u32 type; + -+ skb = rst_skb(&pos, NULL, &type, ctx); ++ skb = rst_skb(sk, &pos, NULL, &type, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; @@ -57104,6 +60545,62 @@ index 0000000..f63df90 + return 0; +} + ++static void rst_listen_socket_tcp(struct cpt_sock_image *si, struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ tp->rcv_tstamp = 
tcp_jiffies_import(si->cpt_rcv_tstamp); ++ tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime); ++ tp->tcp_header_len = si->cpt_tcp_header_len; ++ inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept; ++ ++ /* Next options are inherited by children */ ++ tp->mss_cache = si->cpt_mss_cache; ++ inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len; ++ tp->reordering = si->cpt_reordering; ++ tp->nonagle = si->cpt_nonagle; ++ tp->keepalive_probes = si->cpt_keepalive_probes; ++ tp->rx_opt.user_mss = si->cpt_user_mss; ++ inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries; ++ tp->keepalive_time = si->cpt_keepalive_time; ++ tp->keepalive_intvl = si->cpt_keepalive_intvl; ++ tp->linger2 = si->cpt_linger2; ++} ++ ++int rst_listen_socket_in( struct sock *sk, struct cpt_sock_image *si, ++ loff_t pos, struct cpt_context *ctx) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ ++ lock_sock(sk); ++ ++ inet->uc_ttl = si->cpt_uc_ttl; ++ inet->tos = si->cpt_tos; ++ inet->cmsg_flags = si->cpt_cmsg_flags; ++ inet->pmtudisc = si->cpt_pmtudisc; ++ inet->recverr = si->cpt_recverr; ++ inet->freebind = si->cpt_freebind; ++ inet->id = si->cpt_idcounter; ++ ++ if (sk->sk_family == AF_INET6) { ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ ++ np->frag_size = si->cpt_frag_size6; ++ np->hop_limit = si->cpt_hop_limit6; ++ ++ np->rxopt.all = si->cpt_rxopt6; ++ np->mc_loop = si->cpt_mc_loop6; ++ np->recverr = si->cpt_recverr6; ++ np->pmtudisc = si->cpt_pmtudisc6; ++ np->ipv6only = si->cpt_ipv6only6; ++ } ++ ++ if (sk->sk_protocol == IPPROTO_TCP) ++ rst_listen_socket_tcp(si, sk); ++ ++ release_sock(sk); ++ return 0; ++} + +int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *sk, + struct cpt_context *ctx) @@ -57215,26 +60712,49 @@ index 0000000..f63df90 + loff_t pos, struct cpt_context *ctx) +{ + int err; -+ loff_t end = si->cpt_next; ++ loff_t end = pos + si->cpt_next; + + pos += si->cpt_hdrlen; ++ ++ lock_sock(sk); + while (pos < end) { + struct cpt_openreq_image 
oi; + + err = rst_get_object(CPT_OBJ_OPENREQ, pos, &oi, ctx); + if (err) { + err = rst_sock_attr(&pos, sk, ctx); -+ if (err) ++ if (err) { ++ release_sock(sk); + return err; ++ } ++ + continue; + } + + if (oi.cpt_object == CPT_OBJ_OPENREQ) { -+ struct request_sock *req = reqsk_alloc(&tcp_request_sock_ops); -+ if (req == NULL) -+ return -ENOMEM; ++ struct request_sock *req; ++ ++ if (oi.cpt_family == AF_INET6 && ++ sk->sk_family != AF_INET6) ++ /* related to non initialized cpt_family bug */ ++ goto next; ++ ++ if (oi.cpt_family == AF_INET6) { ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ req = reqsk_alloc(&tcp6_request_sock_ops); ++#else ++ release_sock(sk); ++ return -EINVAL; ++#endif ++ } else { ++ req = reqsk_alloc(&tcp_request_sock_ops); ++ } ++ ++ if (req == NULL) { ++ release_sock(sk); ++ return -ENOMEM; ++ } + -+ memset(req, 0, sizeof(*req)); + tcp_rsk(req)->rcv_isn = oi.cpt_rcv_isn; + tcp_rsk(req)->snt_isn = oi.cpt_snt_isn; + inet_rsk(req)->rmt_port = oi.cpt_rmt_port; @@ -57247,26 +60767,33 @@ index 0000000..f63df90 + inet_rsk(req)->wscale_ok = oi.cpt_wscale_ok; + inet_rsk(req)->ecn_ok = oi.cpt_ecn_ok; + inet_rsk(req)->acked = oi.cpt_acked; ++ inet_rsk(req)->opt = NULL; + req->window_clamp = oi.cpt_window_clamp; + req->rcv_wnd = oi.cpt_rcv_wnd; + req->ts_recent = oi.cpt_ts_recent; + req->expires = jiffies_import(oi.cpt_expires); ++ req->sk = NULL; ++ req->secid = 0; ++ req->peer_secid = 0; + -+ if (oi.cpt_family == AF_INET) { -+ memcpy(&inet_rsk(req)->loc_addr, oi.cpt_loc_addr, 4); -+ memcpy(&inet_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 4); -+ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); -+ } else { ++ if (oi.cpt_family == AF_INET6) { +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ inet6_rsk(req)->pktopts = NULL; + memcpy(&inet6_rsk(req)->loc_addr, oi.cpt_loc_addr, 16); + memcpy(&inet6_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 16); + inet6_rsk(req)->iif = oi.cpt_iif; + inet6_csk_reqsk_queue_hash_add(sk, req, 
TCP_TIMEOUT_INIT); +#endif ++ } else { ++ memcpy(&inet_rsk(req)->loc_addr, oi.cpt_loc_addr, 4); ++ memcpy(&inet_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 4); ++ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } + } ++next: + pos += oi.cpt_next; + } ++ release_sock(sk); + return 0; +} + @@ -57302,10 +60829,10 @@ index 0000000..f63df90 +#endif diff --git a/kernel/cpt/rst_sysvipc.c b/kernel/cpt/rst_sysvipc.c new file mode 100644 -index 0000000..0f21493 +index 0000000..b5e62a7 --- /dev/null +++ b/kernel/cpt/rst_sysvipc.c -@@ -0,0 +1,634 @@ +@@ -0,0 +1,639 @@ +/* + * + * kernel/cpt/rst_sysvipc.c @@ -57468,8 +60995,11 @@ index 0000000..0f21493 + u.shmi.cpt_segsz, u.shmi.cpt_mode); + if (!IS_ERR(file)) { + err = fixup_shm(file, &u.shmi); -+ if (err != -EEXIST && dpos < epos) ++ if (err != -EEXIST && dpos < epos) { + err = fixup_shm_data(file, dpos, epos, ctx); ++ if (err) ++ goto err_put; ++ } + } else if (IS_ERR(file) && PTR_ERR(file) == -EEXIST) { + struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; + struct shmid_kernel *shp; @@ -57482,6 +61012,8 @@ index 0000000..0f21493 + } + return file; + ++err_put: ++ fput(file); +err_out: + return ERR_PTR(err); +} @@ -58332,10 +61864,10 @@ index 0000000..929ca26 +} diff --git a/kernel/cpt/rst_ubc.c b/kernel/cpt/rst_ubc.c new file mode 100644 -index 0000000..e7f717e +index 0000000..db1f982 --- /dev/null +++ b/kernel/cpt/rst_ubc.c -@@ -0,0 +1,133 @@ +@@ -0,0 +1,144 @@ +/* + * + * kernel/cpt/rst_ubc.c @@ -58396,7 +61928,7 @@ index 0000000..e7f717e +{ + struct user_beancounter *bc; + cpt_object_t *pobj; -+ int i; ++ int resources, i; + + if (v->cpt_parent != CPT_NULL) { + pobj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, v->cpt_parent, ctx); @@ -58417,7 +61949,15 @@ index 0000000..e7f717e + CPT_VERSION_MINOR(ctx->image_version) < 1) + goto out; + -+ for (i = 0; i < UB_RESOURCES; i++) { ++ if (v->cpt_content == CPT_CONTENT_ARRAY) ++ resources = v->cpt_ub_resources; ++ else ++ resources = UB_RESOURCES_COMPAT; ++ ++ if (resources > 
UB_RESOURCES) ++ return -EINVAL; ++ ++ for (i = 0; i < resources; i++) { + restore_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0); + restore_one_bc_parm(v->cpt_parms + i * 2 + 1, + bc->ub_store + i, 1); @@ -58454,9 +61994,12 @@ index 0000000..e7f717e + cpt_obj_setpos(obj, start, ctx); + intern_cpt_object(CPT_OBJ_UBC, obj, ctx); + -+ restore_one_bc(v, obj, ctx); ++ err = restore_one_bc(v, obj, ctx); + + cpt_release_buf(ctx); ++ if (err) ++ return err; ++ + start += v->cpt_next; + } + return 0; @@ -58471,10 +62014,10 @@ index 0000000..e7f717e +} diff --git a/kernel/cpt/rst_undump.c b/kernel/cpt/rst_undump.c new file mode 100644 -index 0000000..aadddcb +index 0000000..68cc6c2 --- /dev/null +++ b/kernel/cpt/rst_undump.c -@@ -0,0 +1,1069 @@ +@@ -0,0 +1,1077 @@ +/* + * + * kernel/cpt/rst_undump.c @@ -58589,6 +62132,8 @@ index 0000000..aadddcb + // // ve->start_cycles -= (s64)i->start_jiffies_delta * cycles_per_jiffy; + + ctx->last_vpid = i->last_pid; ++ if (i->rnd_va_space) ++ ve->_randomize_va_space = i->rnd_va_space - 1; + + err = 0; +out_rel: @@ -58626,7 +62171,7 @@ index 0000000..aadddcb + param.known_features = (ctx->image_version < CPT_VERSION_18) ? 
+ VE_FEATURES_OLD : ~(__u64)0; + -+ err = real_env_create(ctx->ve_id, VE_CREATE|VE_LOCK, 2, ++ err = real_env_create(ctx->ve_id, VE_CREATE|VE_LOCK|VE_EXCLUSIVE, 2, + ¶m, sizeof(param)); + if (err < 0) + eprintk_ctx("real_env_create: %d\n", err); @@ -58769,6 +62314,12 @@ index 0000000..aadddcb + goto out; + } + ++ err = rst_files_std(ti, ctx); ++ if (err) { ++ eprintk_ctx("rst_root_stds: %d\n", err); ++ goto out; ++ } ++ + err = rst_root_namespace(ctx); + if (err) { + eprintk_ctx("rst_namespace: %d\n", err); @@ -59558,7 +63109,7 @@ index 291ac58..63381db 100644 (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) diff --git a/kernel/exit.c b/kernel/exit.c -index f7864ac..7773280 100644 +index f7864ac..38b3e22 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -22,6 +22,9 @@ @@ -59621,7 +63172,16 @@ index f7864ac..7773280 100644 call_rcu(&p->rcu, delayed_put_task_struct); p = leader; -@@ -526,6 +540,7 @@ void put_files_struct(struct files_struct *files) +@@ -422,6 +436,8 @@ void daemonize(const char *name, ...) + va_list args; + sigset_t blocked; + ++ (void)virtinfo_gencall(VIRTINFO_DOEXIT, NULL); ++ + va_start(args, name); + vsnprintf(current->comm, sizeof(current->comm), name, args); + va_end(args); +@@ -526,6 +542,7 @@ void put_files_struct(struct files_struct *files) free_fdtable(fdt); } } @@ -59629,7 +63189,7 @@ index f7864ac..7773280 100644 void reset_files_struct(struct files_struct *files) { -@@ -598,10 +613,10 @@ retry: +@@ -598,10 +615,10 @@ retry: * Search through everything else. We should not get * here often */ @@ -59642,7 +63202,7 @@ index f7864ac..7773280 100644 read_unlock(&tasklist_lock); /* -@@ -640,7 +655,7 @@ assign_new_owner: +@@ -640,7 +657,7 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. 
*/ @@ -59651,7 +63211,7 @@ index f7864ac..7773280 100644 { struct mm_struct *mm = tsk->mm; struct core_state *core_state; -@@ -648,6 +663,10 @@ static void exit_mm(struct task_struct * tsk) +@@ -648,6 +665,10 @@ static void exit_mm(struct task_struct * tsk) mm_release(tsk, mm); if (!mm) return; @@ -59662,7 +63222,7 @@ index f7864ac..7773280 100644 /* * Serialize with any possible pending coredump. * We must hold mmap_sem around checking core_state -@@ -692,6 +711,7 @@ static void exit_mm(struct task_struct * tsk) +@@ -692,6 +713,7 @@ static void exit_mm(struct task_struct * tsk) mm_update_next_owner(mm); mmput(mm); } @@ -59670,7 +63230,7 @@ index f7864ac..7773280 100644 /* * When we die, we re-parent all our children. -@@ -706,7 +726,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) +@@ -706,7 +728,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) struct task_struct *thread; thread = father; @@ -59679,7 +63239,7 @@ index f7864ac..7773280 100644 if (thread->flags & PF_EXITING) continue; if (unlikely(pid_ns->child_reaper == father)) -@@ -839,11 +859,16 @@ static void exit_notify(struct task_struct *tsk, int group_dead) +@@ -839,11 +861,16 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->self_exec_id != tsk->parent_exec_id)) tsk->exit_signal = SIGCHLD; @@ -59696,7 +63256,7 @@ index f7864ac..7773280 100644 /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && -@@ -900,6 +925,7 @@ NORET_TYPE void do_exit(long code) +@@ -900,6 +927,7 @@ NORET_TYPE void do_exit(long code) panic("Attempted to kill the idle task!"); tracehook_report_exit(&code); @@ -59704,7 +63264,7 @@ index f7864ac..7773280 100644 validate_creds_for_do_exit(tsk); -@@ -983,7 +1009,15 @@ NORET_TYPE void do_exit(long code) +@@ -983,7 +1011,15 @@ NORET_TYPE void do_exit(long code) */ perf_event_exit_task(tsk); @@ -59721,7 +63281,7 @@ index f7864ac..7773280 100644 #ifdef CONFIG_NUMA mpol_put(tsk->mempolicy); 
tsk->mempolicy = NULL; -@@ -1626,7 +1660,7 @@ repeat: +@@ -1626,7 +1662,7 @@ repeat: if (wo->wo_flags & __WNOTHREAD) break; @@ -59730,7 +63290,7 @@ index f7864ac..7773280 100644 read_unlock(&tasklist_lock); notask: -@@ -1753,6 +1787,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, +@@ -1753,6 +1789,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, asmlinkage_protect(4, ret, upid, stat_addr, options, ru); return ret; } @@ -59740,10 +63300,10 @@ index f7864ac..7773280 100644 diff --git a/kernel/fairsched.c b/kernel/fairsched.c new file mode 100644 -index 0000000..bfa5c33 +index 0000000..7cbd309 --- /dev/null +++ b/kernel/fairsched.c -@@ -0,0 +1,633 @@ +@@ -0,0 +1,683 @@ +/* + * Fair Scheduler + * @@ -59861,7 +63421,7 @@ index 0000000..bfa5c33 +{ + int retval; + -+ if (!capable(CAP_SETVEID)) ++ if (!capable_setveid()) + return -EPERM; + + mutex_lock(&fairsched_mutex); @@ -59902,7 +63462,7 @@ index 0000000..bfa5c33 +{ + int retval; + -+ if (!capable(CAP_SETVEID)) ++ if (!capable_setveid()) + return -EPERM; + + mutex_lock(&fairsched_mutex); @@ -59936,7 +63496,7 @@ index 0000000..bfa5c33 +{ + int retval; + -+ if (!capable(CAP_SETVEID)) ++ if (!capable_setveid()) + return -EPERM; + + mutex_lock(&fairsched_mutex); @@ -59964,7 +63524,7 @@ index 0000000..bfa5c33 +{ + int retval; + -+ if (!capable(CAP_SETVEID)) ++ if (!capable_setveid()) + return -EPERM; + + mutex_lock(&fairsched_mutex); @@ -60015,7 +63575,7 @@ index 0000000..bfa5c33 +{ + int retval; + -+ if (!capable(CAP_SETVEID)) ++ if (!capable_setveid()) + return -EPERM; + + mutex_lock(&fairsched_mutex); @@ -60063,7 +63623,7 @@ index 0000000..bfa5c33 +{ + int retval; + -+ if (!capable(CAP_SETVEID)) ++ if (!capable_setveid()) + return -EPERM; + + mutex_lock(&fairsched_mutex); @@ -60074,6 +63634,56 @@ index 0000000..bfa5c33 +} +EXPORT_SYMBOL(sys_fairsched_mvpr); + ++int fairsched_new_node(int id, unsigned int vcpus) ++{ ++ int err; ++ ++ mutex_lock(&fairsched_mutex); ++ /* ++ * We refuse to 
switch to an already existing node since nodes ++ * keep a pointer to their ve_struct... ++ */ ++ err = do_fairsched_mknod(0, 1, id); ++ if (err < 0) { ++ printk(KERN_WARNING "Can't create fairsched node %d\n", id); ++ goto out; ++ } ++#if 0 ++ err = do_fairsched_vcpus(id, vcpus); ++ if (err) { ++ printk(KERN_WARNING "Can't set sched vcpus on node %d\n", id); ++ goto cleanup; ++ } ++#endif ++ err = do_fairsched_mvpr(current->pid, id); ++ if (err) { ++ printk(KERN_WARNING "Can't switch to fairsched node %d\n", id); ++ goto cleanup; ++ } ++ mutex_unlock(&fairsched_mutex); ++ return 0; ++ ++cleanup: ++ if (do_fairsched_rmnod(id)) ++ printk(KERN_ERR "Can't clean fairsched node %d\n", id); ++out: ++ mutex_unlock(&fairsched_mutex); ++ return err; ++} ++EXPORT_SYMBOL(fairsched_new_node); ++ ++void fairsched_drop_node(int id) ++{ ++ mutex_lock(&fairsched_mutex); ++ if (task_fairsched_node_id(current) == id) ++ if (do_fairsched_mvpr(current->pid, FAIRSCHED_INIT_NODE_ID)) ++ printk(KERN_WARNING "Can't leave sched node %d\n", id); ++ if (do_fairsched_rmnod(id)) ++ printk(KERN_ERR "Can't remove fairsched node %d\n", id); ++ mutex_unlock(&fairsched_mutex); ++} ++EXPORT_SYMBOL(fairsched_drop_node); ++ +#ifdef CONFIG_PROC_FS + +/*********************************************************************/ @@ -61016,7 +64626,7 @@ index 84027cf..d3151a1 100644 /** * kthread_stop - stop a thread created by kthread_create(). 
diff --git a/kernel/lockdep.c b/kernel/lockdep.c -index 9af5672..99c3c9b 100644 +index f672d51..bc200db 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3742,7 +3742,7 @@ retry: @@ -61038,7 +64648,7 @@ index 9af5672..99c3c9b 100644 printk("\n"); printk("=============================================\n\n"); diff --git a/kernel/module.c b/kernel/module.c -index dfa33e8..48a2edc 100644 +index a4aae35..6d7a625 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2915,6 +2915,8 @@ static char *module_flags(struct module *mod, char *buf) @@ -61656,7 +65266,7 @@ index 4954407..da76c51 100644 rcu_read_unlock(); /* If we failed to send the signal the timer stops. */ diff --git a/kernel/power/process.c b/kernel/power/process.c -index cc2e553..3122fcb 100644 +index e7cd671..732f532 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -15,6 +15,8 @@ @@ -61739,7 +65349,7 @@ index cc2e553..3122fcb 100644 continue; @@ -142,8 +148,10 @@ static void thaw_tasks(bool nosig_only) - if (cgroup_frozen(p)) + if (cgroup_freezing_or_frozen(p)) continue; - thaw_process(p); @@ -61752,7 +65362,7 @@ index cc2e553..3122fcb 100644 } diff --git a/kernel/printk.c b/kernel/printk.c -index f38b07f..517bd6a 100644 +index f38b07f..1041e53 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -31,7 +31,9 @@ @@ -61973,20 +65583,21 @@ index f38b07f..517bd6a 100644 boot_delay_msec(); printk_delay(); -@@ -705,6 +754,12 @@ asmlinkage int vprintk(const char *fmt, va_list args) +@@ -705,6 +754,13 @@ asmlinkage int vprintk(const char *fmt, va_list args) spin_lock(&logbuf_lock); printk_cpu = this_cpu; + err = ve_log_init(); + if (err) { -+ spin_unlock_irqrestore(&logbuf_lock, flags); -+ return err; ++ spin_unlock(&logbuf_lock); ++ printed_len = err; ++ goto out_lockdep; + } + if (recursion_bug) { recursion_bug = 0; strcpy(printk_buf, recursion_bug_msg); -@@ -788,7 +843,13 @@ asmlinkage int vprintk(const char *fmt, va_list args) +@@ -788,19 +844,67 @@ asmlinkage int vprintk(const char 
*fmt, va_list args) * will release 'logbuf_lock' regardless of whether it * actually gets the semaphore or not. */ @@ -61994,14 +65605,24 @@ index f38b07f..517bd6a 100644 + if (!ve_is_super(get_exec_env())) { + need_wake = (ve_log_start != ve_log_end); + printk_cpu = UINT_MAX; -+ spin_unlock_irqrestore(&logbuf_lock, flags); ++ spin_unlock(&logbuf_lock); ++ lockdep_on(); ++ raw_local_irq_restore(flags); + if (!oops_in_progress && need_wake) + wake_up_interruptible(&ve_log_wait); ++ goto out_preempt; + } else if (acquire_console_semaphore_for_printk(this_cpu)) release_console_sem(); ++out_lockdep: lockdep_on(); -@@ -801,6 +862,41 @@ out_restore_irqs: + out_restore_irqs: + raw_local_irq_restore(flags); + ++out_preempt: + preempt_enable(); + return printed_len; + } EXPORT_SYMBOL(printk); EXPORT_SYMBOL(vprintk); @@ -62019,12 +65640,14 @@ index f38b07f..517bd6a 100644 +asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args) +{ + int printed_len; ++ va_list args2; + + printed_len = 0; ++ va_copy(args2, args); + if (ve_is_super(get_exec_env()) || (dst & VE0_LOG)) + printed_len = vprintk(fmt, args); + if (!ve_is_super(get_exec_env()) && (dst & VE_LOG)) -+ printed_len = __vprintk(fmt, args); ++ printed_len = __vprintk(fmt, args2); + return printed_len; +} + @@ -62043,7 +65666,7 @@ index f38b07f..517bd6a 100644 #else static void call_console_drivers(unsigned start, unsigned end) -@@ -1058,6 +1154,7 @@ void release_console_sem(void) +@@ -1058,6 +1162,7 @@ void release_console_sem(void) _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ @@ -62051,7 +65674,7 @@ index f38b07f..517bd6a 100644 spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); -@@ -1066,6 +1163,7 @@ void release_console_sem(void) +@@ -1066,6 +1171,7 @@ void release_console_sem(void) } console_locked = 0; up(&console_sem); @@ -62059,7 +65682,7 @@ index f38b07f..517bd6a 100644 
spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) wake_up_klogd(); -@@ -1382,6 +1480,36 @@ int printk_ratelimit(void) +@@ -1382,6 +1488,36 @@ int printk_ratelimit(void) } EXPORT_SYMBOL(printk_ratelimit); @@ -62096,6 +65719,72 @@ index f38b07f..517bd6a 100644 /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state +@@ -1405,3 +1541,65 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies, + } + EXPORT_SYMBOL(printk_timed_ratelimit); + #endif ++ ++static cpumask_t nmi_show_regs_cpus = CPU_MASK_NONE; ++static unsigned long nmi_show_regs_timeout; ++ ++void __attribute__((weak)) send_nmi_ipi_allbutself(void) ++{ ++ cpus_clear(nmi_show_regs_cpus); ++} ++ ++static void busted_show_regs(struct pt_regs *regs, int in_nmi) ++{ ++ if (!regs || (in_nmi && spin_is_locked(&logbuf_lock))) ++ return; ++ ++ bust_spinlocks(1); ++ printk("----------- IPI show regs -----------\n"); ++ show_regs(regs); ++ bust_spinlocks(0); ++} ++ ++void nmi_show_regs(struct pt_regs *regs, int in_nmi) ++{ ++ if (cpus_empty(nmi_show_regs_cpus)) ++ goto doit; ++ ++ /* Previous request still in progress */ ++ if (time_before(jiffies, nmi_show_regs_timeout)) ++ return; ++ ++ if (!in_nmi || !spin_is_locked(&logbuf_lock)) { ++ int cpu; ++ ++ bust_spinlocks(1); ++ printk("previous show regs lost IPI to: "); ++ for_each_cpu_mask(cpu, nmi_show_regs_cpus) ++ printk("%d ", cpu); ++ printk("\n"); ++ bust_spinlocks(0); ++ } ++ ++doit: ++ nmi_show_regs_timeout = jiffies + HZ/10; ++ nmi_show_regs_cpus = cpu_online_map; ++ cpu_clear(raw_smp_processor_id(), nmi_show_regs_cpus); ++ busted_show_regs(regs, in_nmi); ++ send_nmi_ipi_allbutself(); ++} ++ ++/* call only from nmi handler */ ++int do_nmi_show_regs(struct pt_regs *regs, int cpu) ++{ ++ static DEFINE_SPINLOCK(nmi_show_regs_lock); ++ ++ if (!cpu_isset(cpu, nmi_show_regs_cpus)) ++ return 0; ++ ++ spin_lock(&nmi_show_regs_lock); ++ busted_show_regs(regs, 1); ++ cpu_clear(cpu, 
nmi_show_regs_cpus); ++ spin_unlock(&nmi_show_regs_lock); ++ return 1; ++} diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 23bd09c..8967db7 100644 --- a/kernel/ptrace.c @@ -62159,7 +65848,7 @@ index 23bd09c..8967db7 100644 child = find_task_by_vpid(pid); if (child) diff --git a/kernel/sched.c b/kernel/sched.c -index ed61192..e66f256 100644 +index 34d924e..bf1165c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,8 @@ @@ -62190,7 +65879,7 @@ index ed61192..e66f256 100644 struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; -@@ -647,6 +654,11 @@ static inline int cpu_of(struct rq *rq) +@@ -647,6 +654,12 @@ static inline int cpu_of(struct rq *rq) #endif } @@ -62198,11 +65887,12 @@ index ed61192..e66f256 100644 +DEFINE_SPINLOCK(kstat_glb_lock); +EXPORT_SYMBOL(kstat_glob); +EXPORT_SYMBOL(kstat_glb_lock); ++static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, glob_kstat_lat); + /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. -@@ -998,6 +1010,220 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) +@@ -998,6 +1011,220 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) spin_unlock_irqrestore(&rq->lock, *flags); } @@ -62423,7 +66113,7 @@ index ed61192..e66f256 100644 /* * this_rq_lock - lock this runqueue and disable interrupts. 
*/ -@@ -1943,11 +2169,21 @@ static int effective_prio(struct task_struct *p) +@@ -1943,11 +2170,21 @@ static int effective_prio(struct task_struct *p) */ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { @@ -62446,7 +66136,7 @@ index ed61192..e66f256 100644 } /* -@@ -1955,11 +2191,31 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) +@@ -1955,11 +2192,31 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) */ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) { @@ -62479,7 +66169,7 @@ index ed61192..e66f256 100644 } /** -@@ -2278,6 +2534,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) +@@ -2276,6 +2533,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) return ncsw; } @@ -62487,7 +66177,7 @@ index ed61192..e66f256 100644 /*** * kick_process - kick a running thread to enter/exit the kernel -@@ -2374,8 +2631,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, +@@ -2372,8 +2630,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, * * First fix up the nr_uninterruptible count: */ @@ -62500,7 +66190,7 @@ index ed61192..e66f256 100644 p->state = TASK_WAKING; task_rq_unlock(rq, &flags); -@@ -2609,6 +2869,10 @@ void sched_fork(struct task_struct *p, int clone_flags) +@@ -2607,6 +2868,10 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. 
*/ task_thread_info(p)->preempt_count = 1; #endif @@ -62511,7 +66201,7 @@ index ed61192..e66f256 100644 plist_node_init(&p->pushable_tasks, MAX_PRIO); put_cpu(); -@@ -2639,6 +2903,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +@@ -2637,6 +2902,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) */ p->sched_class->task_new(rq, p); inc_nr_running(rq); @@ -62520,7 +66210,7 @@ index ed61192..e66f256 100644 } trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); -@@ -2841,6 +3107,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) +@@ -2839,6 +3106,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } @@ -62528,7 +66218,7 @@ index ed61192..e66f256 100644 /* * context_switch - switch to the new MM and the new -@@ -2912,6 +3179,7 @@ unsigned long nr_running(void) +@@ -2910,6 +3178,7 @@ unsigned long nr_running(void) return sum; } @@ -62536,7 +66226,7 @@ index ed61192..e66f256 100644 unsigned long nr_uninterruptible(void) { -@@ -2929,6 +3197,7 @@ unsigned long nr_uninterruptible(void) +@@ -2927,6 +3196,7 @@ unsigned long nr_uninterruptible(void) return sum; } @@ -62544,7 +66234,7 @@ index ed61192..e66f256 100644 unsigned long long nr_context_switches(void) { -@@ -2964,6 +3233,72 @@ unsigned long this_cpu_load(void) +@@ -2962,6 +3232,72 @@ unsigned long this_cpu_load(void) } @@ -62617,7 +66307,7 @@ index ed61192..e66f256 100644 /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; -@@ -2985,6 +3320,16 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +@@ -2983,6 +3319,16 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) loads[2] = (avenrun[2] + offset) << shift; } @@ -62634,7 +66324,7 @@ index ed61192..e66f256 100644 static unsigned long calc_load(unsigned long load, unsigned long exp, 
unsigned long active) { -@@ -2993,6 +3338,35 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) +@@ -2991,6 +3337,35 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) return load >> FSHIFT; } @@ -62670,7 +66360,7 @@ index ed61192..e66f256 100644 /* * calc_load - update the avenrun load estimates 10 ticks after the * CPUs have updated calc_load_tasks. -@@ -3012,6 +3386,8 @@ void calc_global_load(void) +@@ -3010,6 +3385,8 @@ void calc_global_load(void) avenrun[1] = calc_load(avenrun[1], EXP_5, active); avenrun[2] = calc_load(avenrun[2], EXP_15, active); @@ -62679,7 +66369,7 @@ index ed61192..e66f256 100644 calc_load_update += LOAD_FREQ; } -@@ -3076,6 +3452,16 @@ static void update_cpu_load(struct rq *this_rq) +@@ -3074,6 +3451,16 @@ static void update_cpu_load(struct rq *this_rq) } } @@ -62696,7 +66386,7 @@ index ed61192..e66f256 100644 #ifdef CONFIG_SMP /* -@@ -3176,8 +3562,15 @@ void sched_exec(void) +@@ -3174,8 +3561,15 @@ void sched_exec(void) static void pull_task(struct rq *src_rq, struct task_struct *p, struct rq *this_rq, int this_cpu) { @@ -62712,7 +66402,7 @@ index ed61192..e66f256 100644 activate_task(this_rq, p, 0); check_preempt_curr(this_rq, p, 0); } -@@ -5054,10 +5447,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime, +@@ -5052,10 +5446,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime, /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -62728,7 +66418,7 @@ index ed61192..e66f256 100644 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); /* Account for user time used */ -@@ -5114,6 +5510,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, +@@ -5112,6 +5509,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, /* Add system time to cpustat. 
*/ tmp = cputime_to_cputime64(cputime); @@ -62736,7 +66426,7 @@ index ed61192..e66f256 100644 if (hardirq_count() - hardirq_offset) cpustat->irq = cputime64_add(cpustat->irq, tmp); else if (softirq_count()) -@@ -5492,6 +5889,8 @@ need_resched_nonpreemptible: +@@ -5490,6 +5888,8 @@ need_resched_nonpreemptible: next = pick_next_task(rq); if (likely(prev != next)) { @@ -62745,7 +66435,7 @@ index ed61192..e66f256 100644 sched_info_switch(prev, next); perf_event_task_sched_out(prev, next, cpu); -@@ -5499,6 +5898,22 @@ need_resched_nonpreemptible: +@@ -5497,6 +5897,22 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; @@ -62768,7 +66458,7 @@ index ed61192..e66f256 100644 context_switch(rq, prev, next); /* unlocks the rq */ /* * the context switch might have flipped the stack from under -@@ -5506,8 +5921,10 @@ need_resched_nonpreemptible: +@@ -5504,8 +5920,10 @@ need_resched_nonpreemptible: */ cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -62780,7 +66470,7 @@ index ed61192..e66f256 100644 post_schedule(rq); -@@ -6291,7 +6708,7 @@ recheck: +@@ -6289,7 +6707,7 @@ recheck: /* * Allow unprivileged RT tasks to decrease priority: */ @@ -62789,7 +66479,7 @@ index ed61192..e66f256 100644 if (rt_policy(policy)) { unsigned long rlim_rtprio; -@@ -6798,11 +7215,16 @@ EXPORT_SYMBOL(yield); +@@ -6800,11 +7218,16 @@ EXPORT_SYMBOL(yield); void __sched io_schedule(void) { struct rq *rq = raw_rq(); @@ -62806,7 +66496,7 @@ index ed61192..e66f256 100644 current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); -@@ -6813,11 +7235,16 @@ long __sched io_schedule_timeout(long timeout) +@@ -6815,11 +7238,16 @@ long __sched io_schedule_timeout(long timeout) { struct rq *rq = raw_rq(); long ret; @@ -62823,7 +66513,7 @@ index ed61192..e66f256 100644 current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); -@@ -6924,17 +7351,7 @@ void sched_show_task(struct task_struct *p) +@@ -6926,17 +7354,7 @@ void sched_show_task(struct task_struct *p) state = 
p->state ? __ffs(p->state) + 1 : 0; printk(KERN_INFO "%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); @@ -62842,7 +66532,7 @@ index ed61192..e66f256 100644 #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif -@@ -6951,13 +7368,13 @@ void show_state_filter(unsigned long state_filter) +@@ -6953,13 +7371,13 @@ void show_state_filter(unsigned long state_filter) #if BITS_PER_LONG == 32 printk(KERN_INFO @@ -62859,7 +66549,7 @@ index ed61192..e66f256 100644 /* * reset the NMI-timeout, listing all files on a slow * console might take alot of time: -@@ -6965,7 +7382,7 @@ void show_state_filter(unsigned long state_filter) +@@ -6967,7 +7385,7 @@ void show_state_filter(unsigned long state_filter) touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); @@ -62868,7 +66558,7 @@ index ed61192..e66f256 100644 touch_all_softlockup_watchdogs(); -@@ -7331,13 +7748,13 @@ static void migrate_live_tasks(int src_cpu) +@@ -7336,13 +7754,13 @@ static void migrate_live_tasks(int src_cpu) read_lock(&tasklist_lock); @@ -62884,7 +66574,15 @@ index ed61192..e66f256 100644 read_unlock(&tasklist_lock); } -@@ -9498,7 +9915,7 @@ void __init sched_init(void) +@@ -9490,6 +9908,7 @@ void __init sched_init(void) + update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), + __alignof__(unsigned long)); + #endif ++ kstat_glob.sched_lat.cur = &per_cpu__glob_kstat_lat; + for_each_possible_cpu(i) { + struct rq *rq; + +@@ -9503,7 +9922,7 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); @@ -62893,7 +66591,7 @@ index ed61192..e66f256 100644 /* * How much cpu bandwidth does init_task_group get? 
* -@@ -9544,7 +9961,7 @@ void __init sched_init(void) +@@ -9549,7 +9968,7 @@ void __init sched_init(void) rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); @@ -62902,7 +66600,7 @@ index ed61192..e66f256 100644 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); #elif defined CONFIG_USER_SCHED init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); -@@ -9610,6 +10027,7 @@ void __init sched_init(void) +@@ -9615,6 +10034,7 @@ void __init sched_init(void) * During early bootup we pretend to be a normal task: */ current->sched_class = &fair_sched_class; @@ -62910,7 +66608,7 @@ index ed61192..e66f256 100644 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); -@@ -9688,7 +10106,7 @@ void normalize_rt_tasks(void) +@@ -9693,7 +10113,7 @@ void normalize_rt_tasks(void) struct rq *rq; read_lock_irqsave(&tasklist_lock, flags); @@ -62919,7 +66617,7 @@ index ed61192..e66f256 100644 /* * Only normalize user tasks: */ -@@ -9719,7 +10137,7 @@ void normalize_rt_tasks(void) +@@ -9724,7 +10144,7 @@ void normalize_rt_tasks(void) __task_rq_unlock(rq); spin_unlock(&p->pi_lock); @@ -62928,7 +66626,7 @@ index ed61192..e66f256 100644 read_unlock_irqrestore(&tasklist_lock, flags); } -@@ -10165,10 +10583,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg) +@@ -10170,10 +10590,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg) { struct task_struct *g, *p; @@ -63457,17 +67155,19 @@ index ce17760..3073c3e 100644 if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c -index e06d0b8..da15284 100644 +index e06d0b8..7216e06 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c -@@ -179,3 +179,15 @@ cond_syscall(sys_eventfd2); +@@ -179,3 +179,17 @@ cond_syscall(sys_eventfd2); /* performance counters: */ cond_syscall(sys_perf_event_open); +cond_syscall(sys_getluid); +cond_syscall(sys_setluid); 
+cond_syscall(sys_setublimit); ++cond_syscall(compat_sys_setublimit); +cond_syscall(sys_ubstat); ++cond_syscall(compat_sys_lutime); + +/* fairsched compat */ +cond_syscall(sys_fairsched_mknod); @@ -63477,10 +67177,18 @@ index e06d0b8..da15284 100644 +cond_syscall(sys_fairsched_chwt); +cond_syscall(sys_fairsched_rate); diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index b8bd058..d2d9eec 100644 +index b8bd058..5ef2188 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c -@@ -83,6 +83,21 @@ extern int pid_max_min, pid_max_max; +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -83,6 +84,21 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; @@ -63502,7 +67210,17 @@ index b8bd058..d2d9eec 100644 extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU -@@ -178,9 +193,31 @@ static struct ctl_table_header root_table_header = { +@@ -169,6 +185,9 @@ static int proc_taint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + #endif + ++static int proc_dointvec_ve(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++ + static struct ctl_table root_table[]; + static struct ctl_table_root sysctl_table_root; + static struct ctl_table_header root_table_header = { +@@ -178,9 +197,31 @@ static struct ctl_table_header root_table_header = { .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }; @@ -63535,7 +67253,7 @@ index b8bd058..d2d9eec 100644 }; static struct ctl_table kern_table[]; -@@ -504,6 +541,20 @@ static struct ctl_table kern_table[] = { +@@ -504,6 +545,20 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif @@ -63556,7 +67274,7 @@ index b8bd058..d2d9eec 100644 #ifdef __hppa__ { .ctl_name = KERN_HPPA_PWRSW, -@@ -699,6 +750,24 @@ static struct ctl_table kern_table[] = { +@@ -699,6 +754,24 @@ 
static struct ctl_table kern_table[] = { .extra1 = &pid_max_min, .extra2 = &pid_max_max, }, @@ -63581,7 +67299,22 @@ index b8bd058..d2d9eec 100644 { .ctl_name = KERN_PANIC_ON_OOPS, .procname = "panic_on_oops", -@@ -1424,6 +1493,21 @@ static struct ctl_table vm_table[] = { +@@ -824,10 +897,12 @@ static struct ctl_table kern_table[] = { + { + .ctl_name = KERN_RANDOMIZE, + .procname = "randomize_va_space", +- .data = &randomize_va_space, ++ .data = &_randomize_va_space, ++ .extra1 = (void *)offsetof(struct ve_struct, ++ _randomize_va_space), + .maxlen = sizeof(int), + .mode = 0644, +- .proc_handler = &proc_dointvec, ++ .proc_handler = &proc_dointvec_ve, + }, + #endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) +@@ -1424,6 +1499,21 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif @@ -63603,7 +67336,7 @@ index b8bd058..d2d9eec 100644 /* * NOTE: do not add new entries to this table unless you have read -@@ -1600,6 +1684,13 @@ static struct ctl_table fs_table[] = { +@@ -1600,6 +1690,13 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { @@ -63617,7 +67350,7 @@ index b8bd058..d2d9eec 100644 #if defined(CONFIG_X86) || defined(CONFIG_PPC) { .ctl_name = CTL_UNNUMBERED, -@@ -2150,10 +2241,27 @@ struct ctl_table_header *__register_sysctl_paths( +@@ -2150,10 +2247,27 @@ struct ctl_table_header *__register_sysctl_paths( struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, struct ctl_table *table) { @@ -63645,7 +67378,7 @@ index b8bd058..d2d9eec 100644 /** * register_sysctl_table - register a sysctl table hierarchy * @table: the top-level table structure -@@ -2170,6 +2278,14 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table) +@@ -2170,6 +2284,14 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table) return register_sysctl_paths(null_path, table); } @@ -63660,7 +67393,7 @@ index b8bd058..d2d9eec 100644 /** * unregister_sysctl_table - unregister a sysctl 
table hierarchy * @header: the header returned from register_sysctl_table -@@ -2231,6 +2347,18 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, +@@ -2231,6 +2353,18 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, return NULL; } @@ -63679,7 +67412,33 @@ index b8bd058..d2d9eec 100644 void unregister_sysctl_table(struct ctl_table_header * table) { } -@@ -3236,6 +3364,56 @@ static int deprecated_sysctl_warning(struct __sysctl_args *args) +@@ -2902,6 +3036,25 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, + return 0; + } + ++#ifdef CONFIG_VE ++static int proc_dointvec_ve(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table tmp_table; ++ ++ tmp_table = *table; ++ tmp_table.data = (char *)get_exec_env() + (unsigned long)table->extra1; ++ ++ return proc_dointvec(&tmp_table, write, buffer, lenp, ppos); ++} ++#else ++static int proc_dointvec_ve(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ return proc_dointvec(table, write, buffer, lenp, ppos); ++} ++#endif /* CONFIG_VE */ ++ + #else /* CONFIG_PROC_FS */ + + int proc_dostring(struct ctl_table *table, int write, +@@ -3236,6 +3389,56 @@ static int deprecated_sysctl_warning(struct __sysctl_args *args) return 0; } @@ -63736,7 +67495,7 @@ index b8bd058..d2d9eec 100644 /* * No sense putting this after each symbol definition, twice, * exception granted :-) -@@ -3249,7 +3427,9 @@ EXPORT_SYMBOL(proc_dostring); +@@ -3249,7 +3452,9 @@ EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); EXPORT_SYMBOL(register_sysctl_table); @@ -64165,10 +67924,10 @@ index 0000000..1b82c35 + diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c new file mode 100644 -index 0000000..8b59ff7 +index 0000000..85c42c3 --- /dev/null +++ b/kernel/ve/ve.c -@@ -0,0 +1,119 @@ +@@ -0,0 +1,129 @@ +/* + * linux/kernel/ve/ve.c 
+ * @@ -64252,6 +68011,12 @@ index 0000000..8b59ff7 +#endif + .features = VE_FEATURE_SIT | VE_FEATURE_IPIP | + VE_FEATURE_PPP, ++ ._randomize_va_space = ++#ifdef CONFIG_COMPAT_BRK ++ 1, ++#else ++ 2, ++#endif +}; + +EXPORT_SYMBOL(ve0); @@ -64269,12 +68034,16 @@ index 0000000..8b59ff7 +EXPORT_SYMBOL(ve_cleanup_list); +EXPORT_SYMBOL(ve_cleanup_thread); + ++static DEFINE_PER_CPU(struct ve_cpu_stats, ve0_cpustats); ++static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, ve0_lat_stats); ++ +void init_ve0(void) +{ + struct ve_struct *ve; + + ve = get_ve0(); -+ ve->cpu_stats = NULL; ++ ve->cpu_stats = &per_cpu__ve0_cpustats; ++ ve->sched_lat_ve.cur = &per_cpu__ve0_lat_stats; + list_add(&ve->ve_list, &ve_list_head); +} + @@ -64290,10 +68059,10 @@ index 0000000..8b59ff7 +} diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c new file mode 100644 -index 0000000..29b455d +index 0000000..cc27878 --- /dev/null +++ b/kernel/ve/vecalls.c -@@ -0,0 +1,2264 @@ +@@ -0,0 +1,2335 @@ +/* + * linux/kernel/ve/vecalls.c + * @@ -64353,6 +68122,7 @@ index 0000000..29b455d +#include +#include +#include ++#include + +#include +#include @@ -64481,7 +68251,7 @@ index 0000000..29b455d + struct ve_struct *ve; + int err; + -+ if (!capable(CAP_SETVEID) || veid == 0) ++ if (!capable_setveid() || veid == 0) + return -EPERM; + + if ((ve = get_ve_by_id(veid)) == NULL) @@ -64863,44 +68633,18 @@ index 0000000..29b455d + +static int init_ve_sched(struct ve_struct *ve) +{ -+#ifdef CONFIG_VZ_FAIRSCHED + int err; + -+ /* -+ * We refuse to switch to an already existing node since nodes -+ * keep a pointer to their ve_struct... 
-+ */ -+ err = sys_fairsched_mknod(0, 1, ve->veid); -+ if (err < 0) { -+ printk(KERN_WARNING "Can't create fairsched node %d\n", -+ ve->veid); -+ return err; -+ } -+ err = sys_fairsched_mvpr(current->pid, ve->veid); -+ if (err) { -+ printk(KERN_WARNING "Can't switch to fairsched node %d\n", -+ ve->veid); -+ if (sys_fairsched_rmnod(ve->veid)) -+ printk(KERN_ERR "Can't clean fairsched node %d\n", -+ ve->veid); -+ return err; -+ } -+#endif -+ ve_sched_attach(ve); -+ return 0; ++ err = fairsched_new_node(ve->veid, 0); ++ if (err == 0) ++ ve_sched_attach(ve); ++ ++ return err; +} + +static void fini_ve_sched(struct ve_struct *ve) +{ -+#ifdef CONFIG_VZ_FAIRSCHED -+ if (task_fairsched_node_id(current) == ve->veid) -+ if (sys_fairsched_mvpr(current->pid, FAIRSCHED_INIT_NODE_ID)) -+ printk(KERN_WARNING "Can't leave fairsched node %d\n", -+ ve->veid); -+ if (sys_fairsched_rmnod(ve->veid)) -+ printk(KERN_ERR "Can't remove fairsched node %d\n", -+ ve->veid); -+#endif ++ fairsched_drop_node(ve->veid); +} + +/* @@ -65023,6 +68767,8 @@ index 0000000..29b455d + ve->start_jiffies = get_jiffies_64(); + ve->start_cycles = get_cycles(); + ++ ve->_randomize_va_space = ve0._randomize_va_space; ++ + return 0; +} + @@ -65077,7 +68823,6 @@ index 0000000..29b455d +{ + /* required for real_setdevperms from register_ve_ above */ + memcpy(&ve->ve_cap_bset, &tsk->cred->cap_effective, sizeof(kernel_cap_t)); -+ cap_lower(ve->ve_cap_bset, CAP_SETVEID); +} + +static int ve_list_add(struct ve_struct *ve) @@ -65135,6 +68880,10 @@ index 0000000..29b455d + /* setup capabilities before enter */ + set_task_ve_caps(new, new_creds); + ++ /* Drop OOM protection. 
*/ ++ if (tsk->signal->oom_adj == OOM_DISABLE) ++ tsk->signal->oom_adj = 0; ++ + old = tsk->ve_task_info.owner_env; + tsk->ve_task_info.owner_env = new; + tsk->ve_task_info.exec_env = new; @@ -65193,13 +68942,24 @@ index 0000000..29b455d +static inline int init_ve_cpustats(struct ve_struct *ve) +{ + ve->cpu_stats = alloc_percpu(struct ve_cpu_stats); -+ return ve->cpu_stats == NULL ? -ENOMEM : 0; ++ if (ve->cpu_stats == NULL) ++ return -ENOMEM; ++ ve->sched_lat_ve.cur = alloc_percpu(struct kstat_lat_pcpu_snap_struct); ++ if (ve->sched_lat_ve.cur == NULL) ++ goto fail; ++ return 0; ++ ++fail: ++ free_percpu(ve->cpu_stats); ++ return -ENOMEM; +} + +static inline void free_ve_cpustats(struct ve_struct *ve) +{ + free_percpu(ve->cpu_stats); + ve->cpu_stats = NULL; ++ free_percpu(ve->sched_lat_ve.cur); ++ ve->sched_lat_ve.cur = NULL; +} + +static int alone_in_pgrp(struct task_struct *tsk) @@ -65469,7 +69229,7 @@ index 0000000..29b455d + } + + status = -EPERM; -+ if (!capable(CAP_SETVEID)) ++ if (!capable_setveid()) + goto out; + + status = -EINVAL; @@ -65814,6 +69574,8 @@ index 0000000..29b455d +#ifdef CONFIG_UNIX98_PTYS + free_ve_tty_driver(ve->ptm_driver); + free_ve_tty_driver(ve->pts_driver); ++ if (ve->allocated_ptys) ++ ida_destroy(ve->allocated_ptys); + kfree(ve->allocated_ptys); + ve->ptm_driver = ve->pts_driver = NULL; + ve->allocated_ptys = NULL; @@ -65998,7 +69760,7 @@ index 0000000..29b455d + +int real_ve_dev_map(envid_t veid, int op, char *dev_name) +{ -+ if (!capable(CAP_SETVEID)) ++ if (!capable_setveid()) + return -EPERM; + switch (op) { + case VE_NETDEV_ADD: @@ -66182,6 +69944,20 @@ index 0000000..29b455d + ub->ub_parms[UB_PRIVVMPAGES].held ; +} + ++static void ve_swapinfo(struct sysinfo *val, struct user_beancounter *ub) ++{ ++ unsigned long size, used; ++ ++ size = ub->ub_parms[UB_SWAPPAGES].limit; ++ used = ub->ub_parms[UB_SWAPPAGES].held; ++ ++ if (size == UB_MAXVALUE) ++ size = 0; ++ ++ val->totalswap = size; ++ val->freeswap = size > used ? 
size - used : 0; ++} ++ +static inline int ve_mi_replace(struct meminfo *mi, int old_ret) +{ +#ifdef CONFIG_BEANCOUNTERS @@ -66198,7 +69974,7 @@ index 0000000..29b455d + return NOTIFY_DONE | NOTIFY_STOP_MASK; /* No virtualization */ + + nodettram = mi->si.totalram; -+ ub = current->mm->mm_ub; ++ ub = top_beancounter(current->mm->mm_ub); + usedmem = ve_used_mem(ub); + + memset(mi, 0, sizeof(*mi)); @@ -66208,6 +69984,8 @@ index 0000000..29b455d + mi->si.freeram = (mi->si.totalram > usedmem) ? + (mi->si.totalram - usedmem) : 0; + ++ ve_swapinfo(&mi->si, ub); ++ + return NOTIFY_OK | NOTIFY_STOP_MASK; +#else + return NOTIFY_DONE; @@ -66228,6 +70006,62 @@ index 0000000..29b455d + .notifier_call = meminfo_call +}; + ++/* /proc/vz/veinfo */ ++ ++static ve_seq_print_t veaddr_seq_print_cb; ++ ++void vzmon_register_veaddr_print_cb(ve_seq_print_t cb) ++{ ++ rcu_assign_pointer(veaddr_seq_print_cb, cb); ++} ++EXPORT_SYMBOL(vzmon_register_veaddr_print_cb); ++ ++void vzmon_unregister_veaddr_print_cb(ve_seq_print_t cb) ++{ ++ rcu_assign_pointer(veaddr_seq_print_cb, NULL); ++ synchronize_rcu(); ++} ++EXPORT_SYMBOL(vzmon_unregister_veaddr_print_cb); ++ ++static int veinfo_seq_show(struct seq_file *m, void *v) ++{ ++ struct ve_struct *ve; ++ ve_seq_print_t veaddr_seq_print; ++ ++ ve = list_entry((struct list_head *)v, struct ve_struct, ve_list); ++ ++ seq_printf(m, "%10u %5u %5u", ve->veid, ++ ve->class_id, atomic_read(&ve->pcounter)); ++ ++ rcu_read_lock(); ++ veaddr_seq_print = rcu_dereference(veaddr_seq_print_cb); ++ if (veaddr_seq_print) ++ veaddr_seq_print(m, ve); ++ rcu_read_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static struct seq_operations veinfo_seq_op = { ++ .start = ve_seq_start, ++ .next = ve_seq_next, ++ .stop = ve_seq_stop, ++ .show = veinfo_seq_show, ++}; ++ ++static int veinfo_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &veinfo_seq_op); ++} ++ ++static struct file_operations proc_veinfo_operations = { ++ .open = 
veinfo_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ +static int __init init_vecalls_proc(void) +{ + struct proc_dir_entry *de; @@ -66247,6 +70081,11 @@ index 0000000..29b455d + if (!de) + printk(KERN_WARNING "VZMON: can't make version proc entry\n"); + ++ de = proc_create("veinfo", S_IFREG | S_IRUSR, proc_vz_dir, ++ &proc_veinfo_operations); ++ if (!de) ++ printk(KERN_WARNING "VZMON: can't make veinfo proc entry\n"); ++ + virtinfo_notifier_register(VITYPE_GENERAL, &meminfo_notifier_block); + return 0; +} @@ -66256,6 +70095,7 @@ index 0000000..29b455d + remove_proc_entry("version", proc_vz_dir); + remove_proc_entry("devperms", proc_vz_dir); + remove_proc_entry("vestat", proc_vz_dir); ++ remove_proc_entry("veinfo", proc_vz_dir); + virtinfo_notifier_unregister(VITYPE_GENERAL, &meminfo_notifier_block); +} +#else @@ -66560,10 +70400,10 @@ index 0000000..29b455d +module_exit(vecalls_exit) diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c new file mode 100644 -index 0000000..3889411 +index 0000000..50f4d9a --- /dev/null +++ b/kernel/ve/veowner.c -@@ -0,0 +1,150 @@ +@@ -0,0 +1,160 @@ +/* + * kernel/ve/veowner.c + * @@ -66590,6 +70430,7 @@ index 0000000..3889411 +#include +#include +#include ++#include +#include +#include + @@ -66641,6 +70482,7 @@ index 0000000..3889411 + * OpenVZ sysctl + * ------------------------------------------------------------------------ + */ ++int ve_xattr_policy = VE_XATTR_POLICY_ACCEPT; +extern int ve_area_access_check; + +#ifdef CONFIG_INET @@ -66671,6 +70513,14 @@ index 0000000..3889411 + .mode = 0644, + .proc_handler = proc_dointvec, + }, ++ { ++ .ctl_name = 228, ++ .procname = "ve-xattr-policy", ++ .data = &ve_xattr_policy, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, + { 0 } +}; + @@ -67550,7 +71400,7 @@ index dacc641..9d28f5c 100644 if (!task_early_kill(tsk)) diff --git a/mm/memory.c b/mm/memory.c -index 4e59455..fcdb9fb 100644 +index 4e59455..220dc95 
100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -42,6 +42,9 @@ @@ -67575,6 +71425,15 @@ index 4e59455..fcdb9fb 100644 #include #include #include +@@ -94,7 +102,7 @@ EXPORT_SYMBOL(high_memory); + * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, + * as ancient (libc5 based) binaries can segfault. ) + */ +-int randomize_va_space __read_mostly = ++int _randomize_va_space __read_mostly = + #ifdef CONFIG_COMPAT_BRK + 1; + #else @@ -132,18 +140,21 @@ void pgd_clear_bad(pgd_t *pgd) pgd_ERROR(*pgd); pgd_clear(pgd); @@ -68216,7 +72075,7 @@ index 2e05c97..1ebf6e1 100644 static int do_mlockall(int flags) { diff --git a/mm/mmap.c b/mm/mmap.c -index ae19746..a5dd0bf 100644 +index ae19746..991a1ac 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -29,6 +29,7 @@ @@ -68279,6 +72138,15 @@ index ae19746..a5dd0bf 100644 goto out; set_brk: mm->brk = brk; +@@ -927,7 +946,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + prot |= PROT_EXEC; + + if (!len) +- return -EINVAL; ++ return strncmp(current->comm, "rpm", 3) ? 
-EINVAL : addr; + + if (!(flags & MAP_FIXED)) + addr = round_hint_to_min(addr); @@ -1106,6 +1125,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; @@ -69179,7 +73047,7 @@ index dd43373..c9752f0 100644 dec_mm_counter(mm, file_rss); (*mapcount)--; diff --git a/mm/shmem.c b/mm/shmem.c -index 356dd99..141b181 100644 +index 356dd99..bc74e50 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -31,7 +31,11 @@ @@ -69203,7 +73071,41 @@ index 356dd99..141b181 100644 #include #include #include -@@ -214,7 +220,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) +@@ -107,14 +113,31 @@ enum sgp_type { + }; + + #ifdef CONFIG_TMPFS ++ ++#include ++ ++static unsigned long tmpfs_ram_pages(void) ++{ ++ struct meminfo mi; ++ ++ if (ve_is_super(get_exec_env())) ++ return totalram_pages; ++ ++ memset(&mi, 0, sizeof(mi)); ++ si_meminfo(&mi.si); ++ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi) & NOTIFY_FAIL) ++ return 0; ++ return mi.si.totalram; ++} ++ + static unsigned long shmem_default_max_blocks(void) + { +- return totalram_pages / 2; ++ return tmpfs_ram_pages() / 2; + } + + static unsigned long shmem_default_max_inodes(void) + { +- return min(totalram_pages - totalhigh_pages, totalram_pages / 2); ++ return min(totalram_pages - totalhigh_pages, tmpfs_ram_pages() / 2); + } + #endif + +@@ -214,7 +237,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) static const struct super_operations shmem_ops; static const struct address_space_operations shmem_aops; @@ -69212,7 +73114,7 @@ index 356dd99..141b181 100644 static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; -@@ -277,7 +283,7 @@ static void shmem_free_inode(struct super_block *sb) +@@ -277,7 +300,7 @@ static void shmem_free_inode(struct super_block 
*sb) * * It has to be called with the spinlock held. */ @@ -69221,7 +73123,7 @@ index 356dd99..141b181 100644 { struct shmem_inode_info *info = SHMEM_I(inode); long freed; -@@ -287,6 +293,8 @@ static void shmem_recalc_inode(struct inode *inode) +@@ -287,6 +310,8 @@ static void shmem_recalc_inode(struct inode *inode) info->alloced -= freed; shmem_unacct_blocks(info->flags, freed); shmem_free_blocks(inode, freed); @@ -69230,7 +73132,7 @@ index 356dd99..141b181 100644 } } -@@ -391,6 +399,11 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns +@@ -391,6 +416,11 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns struct page *page = kmap_atomic_to_page(entry); set_page_private(page, page_private(page) + incdec); } @@ -69242,7 +73144,7 @@ index 356dd99..141b181 100644 } /** -@@ -407,14 +420,24 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long +@@ -407,14 +437,24 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct page *page = NULL; swp_entry_t *entry; @@ -69269,7 +73171,7 @@ index 356dd99..141b181 100644 /* * Test free_blocks against 1 not 0, since we have 1 data * page (and perhaps indirect index pages) yet to allocate: -@@ -424,7 +447,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long +@@ -424,7 +464,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long spin_lock(&sbinfo->stat_lock); if (sbinfo->free_blocks <= 1) { spin_unlock(&sbinfo->stat_lock); @@ -69279,7 +73181,7 @@ index 356dd99..141b181 100644 } sbinfo->free_blocks--; inode->i_blocks += BLOCKS_PER_PAGE; -@@ -432,31 +456,43 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long +@@ -432,31 +473,43 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long } spin_unlock(&info->lock); @@ -69329,7 +73231,7 @@ 
index 356dd99..141b181 100644 } /** -@@ -564,6 +600,7 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) +@@ -564,6 +617,7 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) return; spin_lock(&info->lock); @@ -69337,7 +73239,7 @@ index 356dd99..141b181 100644 info->flags |= SHMEM_TRUNCATE; if (likely(end == (loff_t) -1)) { limit = info->next_index; -@@ -750,7 +787,7 @@ done2: +@@ -750,7 +804,7 @@ done2: info->swapped -= nr_swaps_freed; if (nr_pages_to_free) shmem_free_blocks(inode, nr_pages_to_free); @@ -69346,7 +73248,7 @@ index 356dd99..141b181 100644 spin_unlock(&info->lock); /* -@@ -833,6 +870,7 @@ static void shmem_delete_inode(struct inode *inode) +@@ -833,6 +887,7 @@ static void shmem_delete_inode(struct inode *inode) } } BUG_ON(inode->i_blocks); @@ -69354,7 +73256,7 @@ index 356dd99..141b181 100644 shmem_free_inode(inode->i_sb); clear_inode(inode); } -@@ -1020,6 +1058,12 @@ int shmem_unuse(swp_entry_t entry, struct page *page) +@@ -1020,6 +1075,12 @@ int shmem_unuse(swp_entry_t entry, struct page *page) out: return found; /* 0 or 1 or -ENOMEM */ } @@ -69367,7 +73269,7 @@ index 356dd99..141b181 100644 /* * Move the page from the page cache to the swap cache. */ -@@ -1051,7 +1095,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) +@@ -1051,7 +1112,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * discarded. 
*/ if (wbc->for_reclaim) @@ -69376,7 +73278,7 @@ index 356dd99..141b181 100644 else swap.val = 0; -@@ -1069,7 +1113,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) +@@ -1069,7 +1130,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) free_swap_and_cache(*entry); shmem_swp_set(info, entry, 0); } @@ -69385,7 +73287,7 @@ index 356dd99..141b181 100644 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { remove_from_page_cache(page); -@@ -1252,7 +1296,7 @@ repeat: +@@ -1252,7 +1313,7 @@ repeat: } spin_lock(&info->lock); @@ -69394,7 +73296,7 @@ index 356dd99..141b181 100644 entry = shmem_swp_alloc(info, idx, sgp); if (IS_ERR(entry)) { spin_unlock(&info->lock); -@@ -1455,6 +1499,7 @@ repeat: +@@ -1455,6 +1516,7 @@ repeat: clear_highpage(filepage); flush_dcache_page(filepage); SetPageUptodate(filepage); @@ -69402,7 +73304,7 @@ index 356dd99..141b181 100644 if (sgp == SGP_DIRTY) set_page_dirty(filepage); } -@@ -1512,20 +1557,27 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) +@@ -1512,20 +1574,27 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) spin_lock(&info->lock); if (lock && !(info->flags & VM_LOCKED)) { @@ -69431,7 +73333,7 @@ index 356dd99..141b181 100644 spin_unlock(&info->lock); return retval; } -@@ -1559,6 +1611,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode, +@@ -1559,6 +1628,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode, inode->i_generation = get_seconds(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); @@ -69439,7 +73341,16 @@ index 356dd99..141b181 100644 spin_lock_init(&info->lock); info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); -@@ -2424,7 +2477,7 @@ static const struct address_space_operations shmem_aops = { +@@ -2182,7 +2252,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, + size = 
memparse(value,&rest); + if (*rest == '%') { + size <<= PAGE_SHIFT; +- size *= totalram_pages; ++ size *= tmpfs_ram_pages(); + do_div(size, 100); + rest++; + } +@@ -2424,7 +2494,7 @@ static const struct address_space_operations shmem_aops = { .error_remove_page = generic_error_remove_page, }; @@ -69448,7 +73359,7 @@ index 356dd99..141b181 100644 .mmap = shmem_mmap, #ifdef CONFIG_TMPFS .llseek = generic_file_llseek, -@@ -2437,6 +2490,7 @@ static const struct file_operations shmem_file_operations = { +@@ -2437,6 +2507,7 @@ static const struct file_operations shmem_file_operations = { .splice_write = generic_file_splice_write, #endif }; @@ -69456,7 +73367,7 @@ index 356dd99..141b181 100644 static const struct inode_operations shmem_inode_operations = { .truncate = shmem_truncate, -@@ -2506,6 +2560,10 @@ static const struct vm_operations_struct shmem_vm_ops = { +@@ -2506,6 +2577,10 @@ static const struct vm_operations_struct shmem_vm_ops = { #endif }; @@ -69467,7 +73378,7 @@ index 356dd99..141b181 100644 static int shmem_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) -@@ -2513,12 +2571,13 @@ static int shmem_get_sb(struct file_system_type *fs_type, +@@ -2513,12 +2588,13 @@ static int shmem_get_sb(struct file_system_type *fs_type, return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); } @@ -69482,7 +73393,7 @@ index 356dd99..141b181 100644 int __init init_tmpfs(void) { -@@ -2608,6 +2667,36 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) +@@ -2608,6 +2684,36 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) /* common code */ @@ -69519,7 +73430,7 @@ index 356dd99..141b181 100644 /** * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc//maps -@@ -2653,6 +2742,9 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags +@@ -2653,6 +2759,9 @@ struct file *shmem_file_setup(const 
char *name, loff_t size, unsigned long flags d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ @@ -69529,7 +73440,7 @@ index 356dd99..141b181 100644 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, &shmem_file_operations); -@@ -2689,6 +2781,8 @@ int shmem_zero_setup(struct vm_area_struct *vma) +@@ -2689,6 +2798,8 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); @@ -70738,7 +74649,7 @@ index 6d1daeb..8e4805b 100644 * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory diff --git a/mm/swapfile.c b/mm/swapfile.c -index 9c590ee..f5bc813 100644 +index 9c590ee..9ce0143 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -35,6 +35,8 @@ @@ -71084,7 +74995,56 @@ index 9c590ee..f5bc813 100644 #ifdef CONFIG_PROC_FS /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) -@@ -1743,7 +1817,7 @@ static const struct file_operations proc_swaps_operations = { +@@ -1729,21 +1803,55 @@ static const struct seq_operations swaps_op = { + .show = swap_show + }; + ++#include ++ ++static int swap_show_ve(struct seq_file *swap, void *v) ++{ ++ struct meminfo mi; ++ ++ memset(&mi, 0, sizeof(mi)); ++ si_swapinfo(&mi.si); ++ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi) ++ & NOTIFY_FAIL) ++ goto out; ++ ++ seq_printf(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); ++ if (!mi.si.totalswap) ++ goto out; ++ seq_printf(swap, "%-40s%s\t%lu\t%lu\t%d\n", ++ "/dev/null", ++ "partition", ++ mi.si.totalswap << (PAGE_SHIFT - 10), ++ (mi.si.totalswap - mi.si.freeswap) << (PAGE_SHIFT - 10), ++ -1); ++out: ++ return 0; ++} ++ + static int swaps_open(struct inode *inode, struct file *file) + { ++ if (!ve_is_super(get_exec_env())) ++ return single_open(file, &swap_show_ve, NULL); + return seq_open(file, &swaps_op); + } + ++static int swaps_release(struct inode *inode, struct file *file) ++{ ++ if (!ve_is_super(file->owner_env)) 
++ return single_release(inode, file); ++ return seq_release(inode, file); ++} ++ + static const struct file_operations proc_swaps_operations = { + .open = swaps_open, + .read = seq_read, + .llseek = seq_lseek, +- .release = seq_release, ++ .release = swaps_release, + }; static int __init procswaps_init(void) { @@ -71093,7 +75053,7 @@ index 9c590ee..f5bc813 100644 return 0; } __initcall(procswaps_init); -@@ -1973,6 +2047,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) +@@ -1973,6 +2081,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } @@ -71105,7 +75065,7 @@ index 9c590ee..f5bc813 100644 if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { p->flags |= SWP_SOLIDSTATE; -@@ -1991,6 +2070,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) +@@ -1991,6 +2104,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->prio = --least_priority; p->swap_map = swap_map; p->flags |= SWP_WRITEOK; @@ -71114,7 +75074,7 @@ index 9c590ee..f5bc813 100644 nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; -@@ -2049,6 +2130,8 @@ out: +@@ -2049,6 +2164,8 @@ out: return error; } @@ -71123,7 +75083,7 @@ index 9c590ee..f5bc813 100644 void si_swapinfo(struct sysinfo *val) { unsigned int i; -@@ -2146,6 +2229,8 @@ void swap_duplicate(swp_entry_t entry) +@@ -2146,6 +2263,8 @@ void swap_duplicate(swp_entry_t entry) __swap_duplicate(entry, SWAP_MAP); } @@ -72129,7 +76089,7 @@ index 4ade301..9732b07 100644 else set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); diff --git a/net/core/dev.c b/net/core/dev.c -index 74d0cce..48199c3 100644 +index 74d0cce..ee00d53 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -130,6 +130,9 @@ @@ -72256,13 +76216,14 @@ index 74d0cce..48199c3 100644 dev->name, (dev->flags & IFF_PROMISC) ? 
"entered" : "left"); if (audit_enabled) { -@@ -4547,11 +4576,20 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +@@ -4547,16 +4576,25 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) * - require strict serialization. * - do not return a value */ + case SIOCSIFMTU: + case SIOCSIFHWADDR: case SIOCSIFFLAGS: ++ case SIOCSIFTXQLEN: + if (!capable(CAP_NET_ADMIN) && + !capable(CAP_VE_NET_ADMIN)) + return -EPERM; @@ -72279,6 +76240,11 @@ index 74d0cce..48199c3 100644 case SIOCSIFSLAVE: case SIOCADDMULTI: case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: +- case SIOCSIFTXQLEN: + case SIOCSMIIREG: + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: @@ -4619,12 +4657,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) */ static int dev_new_index(struct net *net) @@ -75614,7 +79580,7 @@ index f1813bc..f2d3769 100644 limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); max_share = min(4UL*1024*1024, limit); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index d86784b..46b61f5 100644 +index 2433bcd..0eb9c17 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -72,6 +72,8 @@ @@ -75751,7 +79717,7 @@ index d86784b..46b61f5 100644 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c -index 7cda24b..b0f93fd 100644 +index 7cda24b..e141833 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -72,6 +72,8 @@ @@ -75847,7 +79813,7 @@ index 7cda24b..b0f93fd 100644 __free_page(sk->sk_sndmsg_page); sk->sk_sndmsg_page = NULL; } -@@ -2478,6 +2493,87 @@ void __init tcp_v4_init(void) +@@ -2478,6 +2493,93 @@ void __init tcp_v4_init(void) panic("Failed to create the TCP control socket.\n"); } @@ -75858,16 +79824,13 @@ index 7cda24b..b0f93fd 100644 + + /* Check the assumed state of the socket. 
*/ + if (!sock_flag(sk, SOCK_DEAD)) { -+ static int printed; -+invalid: -+ if (!printed) -+ printk(KERN_DEBUG "Killing sk: dead %d, state %d, " -+ "wrseq %u unseq %u, wrqu %d.\n", -+ sock_flag(sk, SOCK_DEAD), sk->sk_state, -+ tp->write_seq, tp->snd_una, -+ !skb_queue_empty(&sk->sk_write_queue)); -+ printed = 1; -+ return; ++ printk(KERN_WARNING "Killing sk: dead %d, state %d, " ++ "wrseq %u unseq %u, wrqu %d.\n", ++ sock_flag(sk, SOCK_DEAD), sk->sk_state, ++ tp->write_seq, tp->snd_una, ++ !skb_queue_empty(&sk->sk_write_queue)); ++ sk->sk_err = ECONNRESET; ++ sk->sk_error_report(sk); + } + + tcp_send_active_reset(sk, GFP_ATOMIC); @@ -75886,22 +79849,21 @@ index 7cda24b..b0f93fd 100644 + */ + tcp_time_wait(sk, TCP_FIN_WAIT2, 0); + break; -+ case TCP_LAST_ACK: ++ default: + /* Just jump into CLOSED state. */ + tcp_done(sk); + break; -+ default: -+ /* The socket must be already close()d. */ -+ goto invalid; + } +} + +void tcp_v4_kill_ve_sockets(struct ve_struct *envid) +{ + struct inet_ehash_bucket *head; -+ int i; ++ int i, retry; + + /* alive */ ++again: ++ retry = 0; + local_bh_disable(); + head = tcp_hashinfo.ehash; + for (i = 0; i < tcp_hashinfo.ehash_size; i++) { @@ -75916,6 +79878,12 @@ index 7cda24b..b0f93fd 100644 + spin_unlock(lock); + + bh_lock_sock(sk); ++ if (sock_owned_by_user(sk)) { ++ retry = 1; ++ bh_unlock_sock(sk); ++ sock_put(sk); ++ break; ++ } + /* sk might have disappeared from the hash before + * we got the lock */ + if (sk->sk_state != TCP_CLOSE) @@ -75928,6 +79896,10 @@ index 7cda24b..b0f93fd 100644 + spin_unlock(lock); + } + local_bh_enable(); ++ if (retry) { ++ schedule_timeout_interruptible(HZ); ++ goto again; ++ } +} +EXPORT_SYMBOL(tcp_v4_kill_ve_sockets); +#endif @@ -76355,8 +80327,20 @@ index cdb2ca7..78846e4 100644 sock_put(sk); + (void)set_exec_env(ve); } +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index 0fa9f70..ca1c6bf 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -138,6 +138,7 @@ static int udp_lib_lport_inuse(struct net 
*net, __u16 num, + sk2 != sk && + (bitmap || sk2->sk_hash == num) && + (!sk2->sk_reuse || !sk->sk_reuse) && ++ sk->sk_reuse != 2 && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if + || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (*saddr_comp)(sk, sk2)) { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c -index d1f77cc..d62bbca 100644 +index d1f77cc..7fc4efd 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -407,9 +407,8 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) @@ -76397,7 +80381,17 @@ index d1f77cc..d62bbca 100644 static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx, unsigned int plen) -@@ -2202,7 +2202,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) +@@ -2188,7 +2188,8 @@ static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx, + disable IPv6 on this interface. + */ + if (idev->addr_list == NULL) +- addrconf_ifdown(idev->dev, 1); ++ addrconf_ifdown(idev->dev, ++ !(idev->dev->flags & IFF_LOOPBACK)); + return 0; + } + } +@@ -2202,7 +2203,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) struct in6_ifreq ireq; int err; @@ -76406,7 +80400,7 @@ index d1f77cc..d62bbca 100644 return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) -@@ -2221,7 +2221,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) +@@ -2221,7 +2222,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) struct in6_ifreq ireq; int err; @@ -76415,7 +80409,7 @@ index d1f77cc..d62bbca 100644 return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) -@@ -2731,6 +2731,9 @@ static int addrconf_ifdown(struct net_device *dev, int how) +@@ -2731,6 +2732,9 @@ static int addrconf_ifdown(struct net_device *dev, int how) static void addrconf_rs_timer(unsigned long data) { struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; @@ -76425,7 +80419,7 @@ index d1f77cc..d62bbca 100644 if (ifp->idev->cnf.forwarding) goto out; -@@ -2765,6 +2768,7 @@ 
static void addrconf_rs_timer(unsigned long data) +@@ -2765,6 +2769,7 @@ static void addrconf_rs_timer(unsigned long data) out: in6_ifa_put(ifp); @@ -76433,7 +80427,15 @@ index d1f77cc..d62bbca 100644 } /* -@@ -2841,7 +2845,9 @@ static void addrconf_dad_timer(unsigned long data) +@@ -2801,6 +2806,7 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) + if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || + idev->cnf.accept_dad < 1 || + !(ifp->flags&IFA_F_TENTATIVE) || ++ dev->owner_env->disable_net || + ifp->flags & IFA_F_NODAD) { + ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); + spin_unlock_bh(&ifp->lock); +@@ -2841,7 +2847,9 @@ static void addrconf_dad_timer(unsigned long data) struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; struct inet6_dev *idev = ifp->idev; struct in6_addr mcaddr; @@ -76443,7 +80445,7 @@ index d1f77cc..d62bbca 100644 read_lock_bh(&idev->lock); if (idev->dead) { read_unlock_bh(&idev->lock); -@@ -2872,6 +2878,7 @@ static void addrconf_dad_timer(unsigned long data) +@@ -2872,6 +2880,7 @@ static void addrconf_dad_timer(unsigned long data) ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); out: in6_ifa_put(ifp); @@ -76451,7 +80453,7 @@ index d1f77cc..d62bbca 100644 } static void addrconf_dad_completed(struct inet6_ifaddr *ifp) -@@ -3093,6 +3100,7 @@ static void addrconf_verify(unsigned long foo) +@@ -3093,6 +3102,7 @@ static void addrconf_verify(unsigned long foo) struct inet6_ifaddr *ifp; unsigned long now, next; int i; @@ -76459,7 +80461,7 @@ index d1f77cc..d62bbca 100644 spin_lock_bh(&addrconf_verify_lock); now = jiffies; -@@ -3113,6 +3121,8 @@ restart: +@@ -3113,6 +3123,8 @@ restart: if (ifp->flags & IFA_F_PERMANENT) continue; @@ -76468,7 +80470,7 @@ index d1f77cc..d62bbca 100644 spin_lock(&ifp->lock); age = (now - ifp->tstamp) / HZ; -@@ -3128,9 +3138,11 @@ restart: +@@ -3128,9 +3140,11 @@ restart: in6_ifa_hold(ifp); read_unlock(&addrconf_hash_lock); ipv6_del_addr(ifp); @@ -76480,7 
+80482,7 @@ index d1f77cc..d62bbca 100644 continue; } else if (age >= ifp->prefered_lft) { /* jiffies - ifp->tstamp > age >= ifp->prefered_lft */ -@@ -3152,6 +3164,7 @@ restart: +@@ -3152,6 +3166,7 @@ restart: ipv6_ifa_notify(0, ifp); in6_ifa_put(ifp); @@ -76488,7 +80490,7 @@ index d1f77cc..d62bbca 100644 goto restart; } #ifdef CONFIG_IPV6_PRIVACY -@@ -3173,6 +3186,7 @@ restart: +@@ -3173,6 +3188,7 @@ restart: ipv6_create_tempaddr(ifpub, ifp); in6_ifa_put(ifpub); in6_ifa_put(ifp); @@ -76496,7 +80498,7 @@ index d1f77cc..d62bbca 100644 goto restart; } } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) -@@ -3185,6 +3199,7 @@ restart: +@@ -3185,6 +3201,7 @@ restart: next = ifp->tstamp + ifp->prefered_lft * HZ; spin_unlock(&ifp->lock); } @@ -77335,7 +81337,7 @@ index dbd19a7..9fb663a 100644 return err; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c -index 21d100b..0ecd5b4 100644 +index 21d100b..1c534b7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -61,6 +61,8 @@ @@ -77356,7 +81358,15 @@ index 21d100b..0ecd5b4 100644 static const struct inet_connection_sock_af_ops ipv6_specific; #ifdef CONFIG_TCP_MD5SIG static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; -@@ -1496,6 +1498,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +@@ -892,6 +894,7 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { + .destructor = tcp_v6_reqsk_destructor, + .send_reset = tcp_v6_send_reset + }; ++EXPORT_SYMBOL(tcp6_request_sock_ops); + + #ifdef CONFIG_TCP_MD5SIG + static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { +@@ -1496,6 +1499,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp; struct sk_buff *opt_skb = NULL; @@ -77364,7 +81374,7 @@ index 21d100b..0ecd5b4 100644 /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. 
-@@ -1508,6 +1511,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +@@ -1508,6 +1512,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); @@ -77373,7 +81383,7 @@ index 21d100b..0ecd5b4 100644 #ifdef CONFIG_TCP_MD5SIG if (tcp_v6_inbound_md5_hash (sk, skb)) goto discard; -@@ -1544,7 +1549,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +@@ -1544,7 +1550,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) TCP_CHECK_TIMER(sk); if (opt_skb) goto ipv6_pktoptions; @@ -77382,7 +81392,7 @@ index 21d100b..0ecd5b4 100644 } if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) -@@ -1565,7 +1570,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +@@ -1565,7 +1571,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) goto reset; if (opt_skb) __kfree_skb(opt_skb); @@ -77391,7 +81401,7 @@ index 21d100b..0ecd5b4 100644 } } -@@ -1575,6 +1580,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +@@ -1575,6 +1581,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) TCP_CHECK_TIMER(sk); if (opt_skb) goto ipv6_pktoptions; @@ -77401,7 +81411,7 @@ index 21d100b..0ecd5b4 100644 return 0; reset: -@@ -1583,7 +1591,7 @@ discard: +@@ -1583,7 +1592,7 @@ discard: if (opt_skb) __kfree_skb(opt_skb); kfree_skb(skb); @@ -77410,7 +81420,7 @@ index 21d100b..0ecd5b4 100644 csum_err: TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; -@@ -1614,7 +1622,7 @@ ipv6_pktoptions: +@@ -1614,7 +1623,7 @@ ipv6_pktoptions: } kfree_skb(opt_skb); @@ -77419,7 +81429,7 @@ index 21d100b..0ecd5b4 100644 } static int tcp_v6_rcv(struct sk_buff *skb) -@@ -1793,7 +1801,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { +@@ -1793,7 +1802,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { * TCP over IPv4 via INET6 API */ @@ -77428,7 +81438,7 @@ index 21d100b..0ecd5b4 100644 
.queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, -@@ -1812,6 +1820,8 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { +@@ -1812,6 +1821,8 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { #endif }; @@ -80219,7 +84229,7 @@ index 8cce921..9685220 100644 cleanup_socket_xprt(); svc_cleanup_xprt_sock(); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c -index 1c246a4..f969dee 100644 +index 70b0a22..f66b225 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -229,6 +229,9 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) @@ -80241,7 +84251,7 @@ index 1c246a4..f969dee 100644 return len; } -@@ -1436,8 +1441,9 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, +@@ -1437,8 +1442,9 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, error = sock_create_kern(family, type, protocol, &sock); if (error < 0) @@ -80252,7 +84262,7 @@ index 1c246a4..f969dee 100644 svc_reclassify_socket(sock); /* -@@ -1488,6 +1494,8 @@ static void svc_sock_detach(struct svc_xprt *xprt) +@@ -1489,6 +1495,8 @@ static void svc_sock_detach(struct svc_xprt *xprt) dprintk("svc: svc_sock_detach(%p)\n", svsk); @@ -80668,7 +84678,7 @@ index fb363cd..dbfa601 100644 This allows you to choose different security modules to be configured into your kernel. 
diff --git a/security/commoncap.c b/security/commoncap.c -index fe30751..6110691 100644 +index fe30751..3579774 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -58,6 +58,10 @@ int cap_netlink_send(struct sock *sk, struct sk_buff *skb) @@ -80700,15 +84710,18 @@ index fe30751..6110691 100644 return -EPERM; return 0; } -@@ -962,7 +966,7 @@ error: +@@ -962,8 +966,9 @@ error: */ int cap_syslog(int type) { - if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) -+ if ((type != 3 && type != 10) && !capable(CAP_VE_SYS_ADMIN)) - return -EPERM; +- return -EPERM; ++ if ((type != 3 && type != 10) && ++ !capable(CAP_VE_SYS_ADMIN) && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; return 0; } + diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 6cf8fd2..02aeae6 100644 --- a/security/device_cgroup.c